drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
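/*
 * Worked example (editor's note, not part of the original source): IOVA_PFN()
 * just drops the page offset, so with 4KB pages (PAGE_SHIFT == 12)
 * IOVA_PFN(0xfee00000) is 0xfee00 and DMA_32BIT_PFN is 0xfffff, the last page
 * frame reachable through a 32-bit DMA mask.
 */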
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 /*
65  * 0: Present
66  * 1-11: Reserved
67  * 12-63: Context Ptr (12 - (haw-1))
68  * 64-127: Reserved
69  */
70 struct root_entry {
71         u64     val;
72         u64     rsvd1;
73 };
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75 static inline bool root_present(struct root_entry *root)
76 {
77         return (root->val & 1);
78 }
79 static inline void set_root_present(struct root_entry *root)
80 {
81         root->val |= 1;
82 }
83 static inline void set_root_value(struct root_entry *root, unsigned long value)
84 {
85         root->val |= value & VTD_PAGE_MASK;
86 }
87
88 static inline struct context_entry *
89 get_context_addr_from_root(struct root_entry *root)
90 {
91         return (struct context_entry *)
92                 (root_present(root)?phys_to_virt(
93                 root->val & VTD_PAGE_MASK) :
94                 NULL);
95 }
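/*
 * Illustrative sketch (editor's example, not part of the original driver):
 * how the helpers above encode a root entry for a hypothetical context-table
 * page at physical address 0x3c0d6000.  The function name and the address are
 * made up, and the block is never compiled.
 */
#if 0
static void example_root_entry(void)
{
        struct root_entry re = { 0, 0 };

        set_root_value(&re, 0x3c0d6000);        /* bits 12-63: context table */
        set_root_present(&re);                  /* bit 0: present */
        /*
         * re.val is now 0x3c0d6001, and get_context_addr_from_root(&re)
         * would return phys_to_virt(0x3c0d6000).
         */
}
#endif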
96
97 /*
98  * low 64 bits:
99  * 0: present
100  * 1: fault processing disable
101  * 2-3: translation type
102  * 12-63: address space root
103  * high 64 bits:
104  * 0-2: address width
105  * 3-6: available
106  * 8-23: domain id
107  */
108 struct context_entry {
109         u64 lo;
110         u64 hi;
111 };
112
113 static inline bool context_present(struct context_entry *context)
114 {
115         return (context->lo & 1);
116 }
117 static inline void context_set_present(struct context_entry *context)
118 {
119         context->lo |= 1;
120 }
121
122 static inline void context_set_fault_enable(struct context_entry *context)
123 {
124         context->lo &= (((u64)-1) << 2) | 1;
125 }
126
127 #define CONTEXT_TT_MULTI_LEVEL 0
128
129 static inline void context_set_translation_type(struct context_entry *context,
130                                                 unsigned long value)
131 {
132         context->lo &= (((u64)-1) << 4) | 3;
133         context->lo |= (value & 3) << 2;
134 }
135
136 static inline void context_set_address_root(struct context_entry *context,
137                                             unsigned long value)
138 {
139         context->lo |= value & VTD_PAGE_MASK;
140 }
141
142 static inline void context_set_address_width(struct context_entry *context,
143                                              unsigned long value)
144 {
145         context->hi |= value & 7;
146 }
147
148 static inline void context_set_domain_id(struct context_entry *context,
149                                          unsigned long value)
150 {
151         context->hi |= (value & ((1 << 16) - 1)) << 8;
152 }
153
154 static inline void context_clear_entry(struct context_entry *context)
155 {
156         context->lo = 0;
157         context->hi = 0;
158 }
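/*
 * Illustrative sketch (editor's example, not part of the original driver):
 * programming a context entry with the helpers above, in the same order
 * domain_context_mapping_one() uses them below.  The domain id (5), AGAW (2)
 * and page-table root (physical 0x1b0000) are made-up values, and the block
 * is never compiled.
 */
#if 0
static void example_context_entry(void)
{
        struct context_entry ce = { 0, 0 };

        context_set_domain_id(&ce, 5);            /* hi |= 5 << 8            */
        context_set_address_width(&ce, 2);        /* hi |= 2  -> hi == 0x502 */
        context_set_address_root(&ce, 0x1b0000);  /* lo |= pgd physical addr */
        context_set_translation_type(&ce, CONTEXT_TT_MULTI_LEVEL);
        context_set_fault_enable(&ce);            /* clear bit 1             */
        context_set_present(&ce);                 /* lo == 0x1b0001          */
}
#endif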
159
160 /*
161  * 0: readable
162  * 1: writable
163  * 2-6: reserved
164  * 7: super page
165  * 8-11: available
166  * 12-63: Host physical address
167  */
168 struct dma_pte {
169         u64 val;
170 };
171
172 static inline void dma_clear_pte(struct dma_pte *pte)
173 {
174         pte->val = 0;
175 }
176
177 static inline void dma_set_pte_readable(struct dma_pte *pte)
178 {
179         pte->val |= DMA_PTE_READ;
180 }
181
182 static inline void dma_set_pte_writable(struct dma_pte *pte)
183 {
184         pte->val |= DMA_PTE_WRITE;
185 }
186
187 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
188 {
189         pte->val = (pte->val & ~3) | (prot & 3);
190 }
191
192 static inline u64 dma_pte_addr(struct dma_pte *pte)
193 {
194         return (pte->val & VTD_PAGE_MASK);
195 }
196
197 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
198 {
199         pte->val |= (addr & VTD_PAGE_MASK);
200 }
201
202 static inline bool dma_pte_present(struct dma_pte *pte)
203 {
204         return (pte->val & 3) != 0;
205 }
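/*
 * Illustrative sketch (editor's example, not part of the original driver):
 * composing a last-level PTE with the helpers above, the same way
 * domain_page_mapping() does further down.  The physical address 0x1000 is a
 * made-up example, and the block is never compiled.
 */
#if 0
static void example_dma_pte(void)
{
        struct dma_pte pte;

        dma_clear_pte(&pte);
        dma_set_pte_addr(&pte, 0x1000);           /* bits 12-63: page frame */
        dma_set_pte_prot(&pte, DMA_PTE_READ | DMA_PTE_WRITE);
        /* pte.val is now 0x1003: the frame address plus read and write bits */
}
#endif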
206
207 /* devices under the same p2p bridge are owned in one domain */
208 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
209
210 /* domain represents a virtual machine; more than one device
211  * across iommus may be owned by one domain, e.g. a kvm guest.
212  */
213 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
214
215 struct dmar_domain {
216         int     id;                     /* domain id */
217         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
218
219         struct list_head devices;       /* all devices' list */
220         struct iova_domain iovad;       /* iova's that belong to this domain */
221
222         struct dma_pte  *pgd;           /* virtual address */
223         spinlock_t      mapping_lock;   /* page table lock */
224         int             gaw;            /* max guest address width */
225
226         /* adjusted guest address width, 0 is level 2 30-bit */
227         int             agaw;
228
229         int             flags;          /* flags to find out type of domain */
230
231         int             iommu_coherency;/* indicate coherency of iommu access */
232         int             iommu_count;    /* reference count of iommu */
233         spinlock_t      iommu_lock;     /* protect iommu set in domain */
234         u64             max_addr;       /* maximum mapped address */
235 };
236
237 /* PCI domain-device relationship */
238 struct device_domain_info {
239         struct list_head link;  /* link to domain siblings */
240         struct list_head global; /* link to global list */
241         u8 bus;                 /* PCI bus number */
242         u8 devfn;               /* PCI devfn number */
243         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
244         struct dmar_domain *domain; /* pointer to domain */
245 };
246
247 static void flush_unmaps_timeout(unsigned long data);
248
249 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
250
251 #define HIGH_WATER_MARK 250
252 struct deferred_flush_tables {
253         int next;
254         struct iova *iova[HIGH_WATER_MARK];
255         struct dmar_domain *domain[HIGH_WATER_MARK];
256 };
257
258 static struct deferred_flush_tables *deferred_flush;
259
260 /* number of intel_iommus; sizes g_iommus and bounds the iommu bitmaps */
261 static int g_num_of_iommus;
262
263 static DEFINE_SPINLOCK(async_umap_flush_lock);
264 static LIST_HEAD(unmaps_to_do);
265
266 static int timer_on;
267 static long list_size;
268
269 static void domain_remove_dev_info(struct dmar_domain *domain);
270
271 int dmar_disabled;
272 static int __initdata dmar_map_gfx = 1;
273 static int dmar_forcedac;
274 static int intel_iommu_strict;
275
276 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
277 static DEFINE_SPINLOCK(device_domain_lock);
278 static LIST_HEAD(device_domain_list);
279
280 static struct iommu_ops intel_iommu_ops;
281
282 static int __init intel_iommu_setup(char *str)
283 {
284         if (!str)
285                 return -EINVAL;
286         while (*str) {
287                 if (!strncmp(str, "off", 3)) {
288                         dmar_disabled = 1;
289                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
290                 } else if (!strncmp(str, "igfx_off", 8)) {
291                         dmar_map_gfx = 0;
292                         printk(KERN_INFO
293                                 "Intel-IOMMU: disable GFX device mapping\n");
294                 } else if (!strncmp(str, "forcedac", 8)) {
295                         printk(KERN_INFO
296                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
297                         dmar_forcedac = 1;
298                 } else if (!strncmp(str, "strict", 6)) {
299                         printk(KERN_INFO
300                                 "Intel-IOMMU: disable batched IOTLB flush\n");
301                         intel_iommu_strict = 1;
302                 }
303
304                 str += strcspn(str, ",");
305                 while (*str == ',')
306                         str++;
307         }
308         return 0;
309 }
310 __setup("intel_iommu=", intel_iommu_setup);
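/*
 * Usage example (editor's note): the parser above takes a comma-separated
 * list on the kernel command line, e.g.
 *
 *      intel_iommu=igfx_off,strict
 *
 * which keeps translation enabled but skips the GFX device mapping and
 * disables batched IOTLB flushing.
 */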
311
312 static struct kmem_cache *iommu_domain_cache;
313 static struct kmem_cache *iommu_devinfo_cache;
314 static struct kmem_cache *iommu_iova_cache;
315
316 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
317 {
318         unsigned int flags;
319         void *vaddr;
320
321         /* trying to avoid low memory issues */
322         flags = current->flags & PF_MEMALLOC;
323         current->flags |= PF_MEMALLOC;
324         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
325         current->flags &= (~PF_MEMALLOC | flags);
326         return vaddr;
327 }
328
329
330 static inline void *alloc_pgtable_page(void)
331 {
332         unsigned int flags;
333         void *vaddr;
334
335         /* trying to avoid low memory issues */
336         flags = current->flags & PF_MEMALLOC;
337         current->flags |= PF_MEMALLOC;
338         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
339         current->flags &= (~PF_MEMALLOC | flags);
340         return vaddr;
341 }
342
343 static inline void free_pgtable_page(void *vaddr)
344 {
345         free_page((unsigned long)vaddr);
346 }
347
348 static inline void *alloc_domain_mem(void)
349 {
350         return iommu_kmem_cache_alloc(iommu_domain_cache);
351 }
352
353 static void free_domain_mem(void *vaddr)
354 {
355         kmem_cache_free(iommu_domain_cache, vaddr);
356 }
357
358 static inline void * alloc_devinfo_mem(void)
359 {
360         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
361 }
362
363 static inline void free_devinfo_mem(void *vaddr)
364 {
365         kmem_cache_free(iommu_devinfo_cache, vaddr);
366 }
367
368 struct iova *alloc_iova_mem(void)
369 {
370         return iommu_kmem_cache_alloc(iommu_iova_cache);
371 }
372
373 void free_iova_mem(struct iova *iova)
374 {
375         kmem_cache_free(iommu_iova_cache, iova);
376 }
377
378
379 static inline int width_to_agaw(int width);
380
381 /* calculate agaw for each iommu.
382  * "SAGAW" may be different across iommus, use a default agaw, and
383  * get a smaller supported agaw for iommus that don't support the default agaw.
384  */
385 int iommu_calculate_agaw(struct intel_iommu *iommu)
386 {
387         unsigned long sagaw;
388         int agaw = -1;
389
390         sagaw = cap_sagaw(iommu->cap);
391         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
392              agaw >= 0; agaw--) {
393                 if (test_bit(agaw, &sagaw))
394                         break;
395         }
396
397         return agaw;
398 }
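/*
 * Worked example (editor's note): DEFAULT_DOMAIN_ADDRESS_WIDTH is 48, so the
 * loop starts at agaw = width_to_agaw(48) = (48 - 30) / 9 = 2.  If this
 * iommu's SAGAW field only has bit 1 set (39-bit, 3-level tables), the loop
 * steps down and returns agaw = 1; if no bit at or below 2 is set, it
 * returns -1.
 */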
399
400 /* in native case, each domain is related to only one iommu */
401 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
402 {
403         int iommu_id;
404
405         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
406
407         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
408         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
409                 return NULL;
410
411         return g_iommus[iommu_id];
412 }
413
414 /* "Coherency" capability may be different across iommus */
415 static void domain_update_iommu_coherency(struct dmar_domain *domain)
416 {
417         int i;
418
419         domain->iommu_coherency = 1;
420
421         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
422         for (; i < g_num_of_iommus; ) {
423                 if (!ecap_coherent(g_iommus[i]->ecap)) {
424                         domain->iommu_coherency = 0;
425                         break;
426                 }
427                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
428         }
429 }
430
431 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
432 {
433         struct dmar_drhd_unit *drhd = NULL;
434         int i;
435
436         for_each_drhd_unit(drhd) {
437                 if (drhd->ignored)
438                         continue;
439
440                 for (i = 0; i < drhd->devices_cnt; i++)
441                         if (drhd->devices[i] &&
442                             drhd->devices[i]->bus->number == bus &&
443                             drhd->devices[i]->devfn == devfn)
444                                 return drhd->iommu;
445
446                 if (drhd->include_all)
447                         return drhd->iommu;
448         }
449
450         return NULL;
451 }
452
453 static void domain_flush_cache(struct dmar_domain *domain,
454                                void *addr, int size)
455 {
456         if (!domain->iommu_coherency)
457                 clflush_cache_range(addr, size);
458 }
459
460 /* Gets context entry for a given bus and devfn */
461 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
462                 u8 bus, u8 devfn)
463 {
464         struct root_entry *root;
465         struct context_entry *context;
466         unsigned long phy_addr;
467         unsigned long flags;
468
469         spin_lock_irqsave(&iommu->lock, flags);
470         root = &iommu->root_entry[bus];
471         context = get_context_addr_from_root(root);
472         if (!context) {
473                 context = (struct context_entry *)alloc_pgtable_page();
474                 if (!context) {
475                         spin_unlock_irqrestore(&iommu->lock, flags);
476                         return NULL;
477                 }
478                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
479                 phy_addr = virt_to_phys((void *)context);
480                 set_root_value(root, phy_addr);
481                 set_root_present(root);
482                 __iommu_flush_cache(iommu, root, sizeof(*root));
483         }
484         spin_unlock_irqrestore(&iommu->lock, flags);
485         return &context[devfn];
486 }
487
488 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
489 {
490         struct root_entry *root;
491         struct context_entry *context;
492         int ret;
493         unsigned long flags;
494
495         spin_lock_irqsave(&iommu->lock, flags);
496         root = &iommu->root_entry[bus];
497         context = get_context_addr_from_root(root);
498         if (!context) {
499                 ret = 0;
500                 goto out;
501         }
502         ret = context_present(&context[devfn]);
503 out:
504         spin_unlock_irqrestore(&iommu->lock, flags);
505         return ret;
506 }
507
508 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
509 {
510         struct root_entry *root;
511         struct context_entry *context;
512         unsigned long flags;
513
514         spin_lock_irqsave(&iommu->lock, flags);
515         root = &iommu->root_entry[bus];
516         context = get_context_addr_from_root(root);
517         if (context) {
518                 context_clear_entry(&context[devfn]);
519                 __iommu_flush_cache(iommu, &context[devfn], \
520                         sizeof(*context));
521         }
522         spin_unlock_irqrestore(&iommu->lock, flags);
523 }
524
525 static void free_context_table(struct intel_iommu *iommu)
526 {
527         struct root_entry *root;
528         int i;
529         unsigned long flags;
530         struct context_entry *context;
531
532         spin_lock_irqsave(&iommu->lock, flags);
533         if (!iommu->root_entry) {
534                 goto out;
535         }
536         for (i = 0; i < ROOT_ENTRY_NR; i++) {
537                 root = &iommu->root_entry[i];
538                 context = get_context_addr_from_root(root);
539                 if (context)
540                         free_pgtable_page(context);
541         }
542         free_pgtable_page(iommu->root_entry);
543         iommu->root_entry = NULL;
544 out:
545         spin_unlock_irqrestore(&iommu->lock, flags);
546 }
547
548 /* page table handling */
549 #define LEVEL_STRIDE            (9)
550 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
551
552 static inline int agaw_to_level(int agaw)
553 {
554         return agaw + 2;
555 }
556
557 static inline int agaw_to_width(int agaw)
558 {
559         return 30 + agaw * LEVEL_STRIDE;
560
561 }
562
563 static inline int width_to_agaw(int width)
564 {
565         return (width - 30) / LEVEL_STRIDE;
566 }
567
568 static inline unsigned int level_to_offset_bits(int level)
569 {
570         return (12 + (level - 1) * LEVEL_STRIDE);
571 }
572
573 static inline int address_level_offset(u64 addr, int level)
574 {
575         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
576 }
577
578 static inline u64 level_mask(int level)
579 {
580         return ((u64)-1 << level_to_offset_bits(level));
581 }
582
583 static inline u64 level_size(int level)
584 {
585         return ((u64)1 << level_to_offset_bits(level));
586 }
587
588 static inline u64 align_to_level(u64 addr, int level)
589 {
590         return ((addr + level_size(level) - 1) & level_mask(level));
591 }
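/*
 * Worked example (editor's note): for a domain with agaw = 2 the walk uses
 * agaw_to_level(2) = 4 levels.  level_to_offset_bits() gives 12, 21, 30 and
 * 39 for levels 1-4, i.e. level 1 indexes address bits 12-20, level 2 bits
 * 21-29, and so on, LEVEL_STRIDE (9) bits per level.  level_size(2) is 2MB,
 * so align_to_level(0x123000, 2) rounds up to the next 2MB boundary,
 * 0x200000.
 */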
592
593 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
594 {
595         int addr_width = agaw_to_width(domain->agaw);
596         struct dma_pte *parent, *pte = NULL;
597         int level = agaw_to_level(domain->agaw);
598         int offset;
599         unsigned long flags;
600
601         BUG_ON(!domain->pgd);
602
603         addr &= (((u64)1) << addr_width) - 1;
604         parent = domain->pgd;
605
606         spin_lock_irqsave(&domain->mapping_lock, flags);
607         while (level > 0) {
608                 void *tmp_page;
609
610                 offset = address_level_offset(addr, level);
611                 pte = &parent[offset];
612                 if (level == 1)
613                         break;
614
615                 if (!dma_pte_present(pte)) {
616                         tmp_page = alloc_pgtable_page();
617
618                         if (!tmp_page) {
619                                 spin_unlock_irqrestore(&domain->mapping_lock,
620                                         flags);
621                                 return NULL;
622                         }
623                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
624                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
625                         /*
626                          * higher-level tables always set r/w; the last-level
627                          * page table controls read/write
628                          */
629                         dma_set_pte_readable(pte);
630                         dma_set_pte_writable(pte);
631                         domain_flush_cache(domain, pte, sizeof(*pte));
632                 }
633                 parent = phys_to_virt(dma_pte_addr(pte));
634                 level--;
635         }
636
637         spin_unlock_irqrestore(&domain->mapping_lock, flags);
638         return pte;
639 }
640
641 /* return the address's pte at a specific level */
642 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
643                 int level)
644 {
645         struct dma_pte *parent, *pte = NULL;
646         int total = agaw_to_level(domain->agaw);
647         int offset;
648
649         parent = domain->pgd;
650         while (level <= total) {
651                 offset = address_level_offset(addr, total);
652                 pte = &parent[offset];
653                 if (level == total)
654                         return pte;
655
656                 if (!dma_pte_present(pte))
657                         break;
658                 parent = phys_to_virt(dma_pte_addr(pte));
659                 total--;
660         }
661         return NULL;
662 }
663
664 /* clear one page's page table */
665 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
666 {
667         struct dma_pte *pte = NULL;
668
669         /* get last level pte */
670         pte = dma_addr_level_pte(domain, addr, 1);
671
672         if (pte) {
673                 dma_clear_pte(pte);
674                 domain_flush_cache(domain, pte, sizeof(*pte));
675         }
676 }
677
678 /* clear last-level ptes; a tlb flush should follow */
679 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
680 {
681         int addr_width = agaw_to_width(domain->agaw);
682
683         start &= (((u64)1) << addr_width) - 1;
684         end &= (((u64)1) << addr_width) - 1;
685         /* in case it's a partial page */
686         start = PAGE_ALIGN(start);
687         end &= PAGE_MASK;
688
689         /* we don't need lock here, nobody else touches the iova range */
690         while (start < end) {
691                 dma_pte_clear_one(domain, start);
692                 start += VTD_PAGE_SIZE;
693         }
694 }
695
696 /* free page table pages. last level pte should already be cleared */
697 static void dma_pte_free_pagetable(struct dmar_domain *domain,
698         u64 start, u64 end)
699 {
700         int addr_width = agaw_to_width(domain->agaw);
701         struct dma_pte *pte;
702         int total = agaw_to_level(domain->agaw);
703         int level;
704         u64 tmp;
705
706         start &= (((u64)1) << addr_width) - 1;
707         end &= (((u64)1) << addr_width) - 1;
708
709         /* we don't need lock here, nobody else touches the iova range */
710         level = 2;
711         while (level <= total) {
712                 tmp = align_to_level(start, level);
713                 if (tmp >= end || (tmp + level_size(level) > end))
714                         return;
715
716                 while (tmp < end) {
717                         pte = dma_addr_level_pte(domain, tmp, level);
718                         if (pte) {
719                                 free_pgtable_page(
720                                         phys_to_virt(dma_pte_addr(pte)));
721                                 dma_clear_pte(pte);
722                                 domain_flush_cache(domain, pte, sizeof(*pte));
723                         }
724                         tmp += level_size(level);
725                 }
726                 level++;
727         }
728         /* free pgd */
729         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
730                 free_pgtable_page(domain->pgd);
731                 domain->pgd = NULL;
732         }
733 }
734
735 /* iommu handling */
736 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
737 {
738         struct root_entry *root;
739         unsigned long flags;
740
741         root = (struct root_entry *)alloc_pgtable_page();
742         if (!root)
743                 return -ENOMEM;
744
745         __iommu_flush_cache(iommu, root, ROOT_SIZE);
746
747         spin_lock_irqsave(&iommu->lock, flags);
748         iommu->root_entry = root;
749         spin_unlock_irqrestore(&iommu->lock, flags);
750
751         return 0;
752 }
753
754 static void iommu_set_root_entry(struct intel_iommu *iommu)
755 {
756         void *addr;
757         u32 cmd, sts;
758         unsigned long flag;
759
760         addr = iommu->root_entry;
761
762         spin_lock_irqsave(&iommu->register_lock, flag);
763         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
764
765         cmd = iommu->gcmd | DMA_GCMD_SRTP;
766         writel(cmd, iommu->reg + DMAR_GCMD_REG);
767
768         /* Make sure hardware completes it */
769         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
770                 readl, (sts & DMA_GSTS_RTPS), sts);
771
772         spin_unlock_irqrestore(&iommu->register_lock, flag);
773 }
774
775 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
776 {
777         u32 val;
778         unsigned long flag;
779
780         if (!cap_rwbf(iommu->cap))
781                 return;
782         val = iommu->gcmd | DMA_GCMD_WBF;
783
784         spin_lock_irqsave(&iommu->register_lock, flag);
785         writel(val, iommu->reg + DMAR_GCMD_REG);
786
787         /* Make sure hardware completes it */
788         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
789                         readl, (!(val & DMA_GSTS_WBFS)), val);
790
791         spin_unlock_irqrestore(&iommu->register_lock, flag);
792 }
793
794 /* return value determines whether we need a write buffer flush */
795 static int __iommu_flush_context(struct intel_iommu *iommu,
796         u16 did, u16 source_id, u8 function_mask, u64 type,
797         int non_present_entry_flush)
798 {
799         u64 val = 0;
800         unsigned long flag;
801
802         /*
803          * In the non-present entry flush case, if hardware doesn't cache
804          * non-present entries we do nothing; if it does cache them, we flush
805          * the entries of domain 0 (domain id 0 is used to cache any
806          * non-present entries)
807          */
808         if (non_present_entry_flush) {
809                 if (!cap_caching_mode(iommu->cap))
810                         return 1;
811                 else
812                         did = 0;
813         }
814
815         switch (type) {
816         case DMA_CCMD_GLOBAL_INVL:
817                 val = DMA_CCMD_GLOBAL_INVL;
818                 break;
819         case DMA_CCMD_DOMAIN_INVL:
820                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
821                 break;
822         case DMA_CCMD_DEVICE_INVL:
823                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
824                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
825                 break;
826         default:
827                 BUG();
828         }
829         val |= DMA_CCMD_ICC;
830
831         spin_lock_irqsave(&iommu->register_lock, flag);
832         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
833
834         /* Make sure hardware completes it */
835         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
836                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
837
838         spin_unlock_irqrestore(&iommu->register_lock, flag);
839
840         /* flushing a context entry implicitly flushes the write buffer */
841         return 0;
842 }
843
844 /* return value determines whether we need a write buffer flush */
845 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
846         u64 addr, unsigned int size_order, u64 type,
847         int non_present_entry_flush)
848 {
849         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
850         u64 val = 0, val_iva = 0;
851         unsigned long flag;
852
853         /*
854          * In the non-present entry flush case, if hardware doesn't cache
855          * non-present entries we do nothing; if it does cache them, we flush
856          * the entries of domain 0 (domain id 0 is used to cache any
857          * non-present entries)
858          */
859         if (non_present_entry_flush) {
860                 if (!cap_caching_mode(iommu->cap))
861                         return 1;
862                 else
863                         did = 0;
864         }
865
866         switch (type) {
867         case DMA_TLB_GLOBAL_FLUSH:
868                 /* global flush doesn't need to set IVA_REG */
869                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
870                 break;
871         case DMA_TLB_DSI_FLUSH:
872                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
873                 break;
874         case DMA_TLB_PSI_FLUSH:
875                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
876                 /* Note: always flush non-leaf currently */
877                 val_iva = size_order | addr;
878                 break;
879         default:
880                 BUG();
881         }
882         /* Note: set drain read/write */
883 #if 0
884         /*
885          * This is probably meant to be extra safe.  It looks like we can
886          * ignore it without any impact.
887          */
888         if (cap_read_drain(iommu->cap))
889                 val |= DMA_TLB_READ_DRAIN;
890 #endif
891         if (cap_write_drain(iommu->cap))
892                 val |= DMA_TLB_WRITE_DRAIN;
893
894         spin_lock_irqsave(&iommu->register_lock, flag);
895         /* Note: Only uses first TLB reg currently */
896         if (val_iva)
897                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
898         dmar_writeq(iommu->reg + tlb_offset + 8, val);
899
900         /* Make sure hardware completes it */
901         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
902                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
903
904         spin_unlock_irqrestore(&iommu->register_lock, flag);
905
906         /* check IOTLB invalidation granularity */
907         if (DMA_TLB_IAIG(val) == 0)
908                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
909         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
910                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
911                         (unsigned long long)DMA_TLB_IIRG(type),
912                         (unsigned long long)DMA_TLB_IAIG(val));
913         /* flushing an iotlb entry implicitly flushes the write buffer */
914         return 0;
915 }
916
917 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
918         u64 addr, unsigned int pages, int non_present_entry_flush)
919 {
920         unsigned int mask;
921
922         BUG_ON(addr & (~VTD_PAGE_MASK));
923         BUG_ON(pages == 0);
924
925         /* Fallback to domain selective flush if no PSI support */
926         if (!cap_pgsel_inv(iommu->cap))
927                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
928                                                 DMA_TLB_DSI_FLUSH,
929                                                 non_present_entry_flush);
930
931         /*
932          * PSI requires the region size to be a power-of-two number of pages,
933          * with the base address naturally aligned to that size
934          */
935         mask = ilog2(__roundup_pow_of_two(pages));
936         /* Fallback to domain selective flush if size is too big */
937         if (mask > cap_max_amask_val(iommu->cap))
938                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
939                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
940
941         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
942                                         DMA_TLB_PSI_FLUSH,
943                                         non_present_entry_flush);
944 }
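/*
 * Worked example (editor's note): flushing 5 pages rounds up to 8, so
 * mask = ilog2(8) = 3 and the PSI invalidation covers a 2^3-page (32KB)
 * region.  Had that mask exceeded cap_max_amask_val(), the code above would
 * have fallen back to a domain-selective flush.
 */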
945
946 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
947 {
948         u32 pmen;
949         unsigned long flags;
950
951         spin_lock_irqsave(&iommu->register_lock, flags);
952         pmen = readl(iommu->reg + DMAR_PMEN_REG);
953         pmen &= ~DMA_PMEN_EPM;
954         writel(pmen, iommu->reg + DMAR_PMEN_REG);
955
956         /* wait for the protected region status bit to clear */
957         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
958                 readl, !(pmen & DMA_PMEN_PRS), pmen);
959
960         spin_unlock_irqrestore(&iommu->register_lock, flags);
961 }
962
963 static int iommu_enable_translation(struct intel_iommu *iommu)
964 {
965         u32 sts;
966         unsigned long flags;
967
968         spin_lock_irqsave(&iommu->register_lock, flags);
969         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
970
971         /* Make sure hardware completes it */
972         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
973                 readl, (sts & DMA_GSTS_TES), sts);
974
975         iommu->gcmd |= DMA_GCMD_TE;
976         spin_unlock_irqrestore(&iommu->register_lock, flags);
977         return 0;
978 }
979
980 static int iommu_disable_translation(struct intel_iommu *iommu)
981 {
982         u32 sts;
983         unsigned long flag;
984
985         spin_lock_irqsave(&iommu->register_lock, flag);
986         iommu->gcmd &= ~DMA_GCMD_TE;
987         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
988
989         /* Make sure hardware completes it */
990         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
991                 readl, (!(sts & DMA_GSTS_TES)), sts);
992
993         spin_unlock_irqrestore(&iommu->register_lock, flag);
994         return 0;
995 }
996
997 /* iommu interrupt handling. Most of it is MSI-like. */
998
999 static const char *fault_reason_strings[] =
1000 {
1001         "Software",
1002         "Present bit in root entry is clear",
1003         "Present bit in context entry is clear",
1004         "Invalid context entry",
1005         "Access beyond MGAW",
1006         "PTE Write access is not set",
1007         "PTE Read access is not set",
1008         "Next page table ptr is invalid",
1009         "Root table address invalid",
1010         "Context table ptr is invalid",
1011         "non-zero reserved fields in RTP",
1012         "non-zero reserved fields in CTP",
1013         "non-zero reserved fields in PTE",
1014 };
1015 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1016
1017 const char *dmar_get_fault_reason(u8 fault_reason)
1018 {
1019         if (fault_reason > MAX_FAULT_REASON_IDX)
1020                 return "Unknown";
1021         else
1022                 return fault_reason_strings[fault_reason];
1023 }
1024
1025 void dmar_msi_unmask(unsigned int irq)
1026 {
1027         struct intel_iommu *iommu = get_irq_data(irq);
1028         unsigned long flag;
1029
1030         /* unmask it */
1031         spin_lock_irqsave(&iommu->register_lock, flag);
1032         writel(0, iommu->reg + DMAR_FECTL_REG);
1033         /* Read back a register to flush the posted write */
1034         readl(iommu->reg + DMAR_FECTL_REG);
1035         spin_unlock_irqrestore(&iommu->register_lock, flag);
1036 }
1037
1038 void dmar_msi_mask(unsigned int irq)
1039 {
1040         unsigned long flag;
1041         struct intel_iommu *iommu = get_irq_data(irq);
1042
1043         /* mask it */
1044         spin_lock_irqsave(&iommu->register_lock, flag);
1045         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1046         /* Read back a register to flush the posted write */
1047         readl(iommu->reg + DMAR_FECTL_REG);
1048         spin_unlock_irqrestore(&iommu->register_lock, flag);
1049 }
1050
1051 void dmar_msi_write(int irq, struct msi_msg *msg)
1052 {
1053         struct intel_iommu *iommu = get_irq_data(irq);
1054         unsigned long flag;
1055
1056         spin_lock_irqsave(&iommu->register_lock, flag);
1057         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1058         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1059         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1060         spin_unlock_irqrestore(&iommu->register_lock, flag);
1061 }
1062
1063 void dmar_msi_read(int irq, struct msi_msg *msg)
1064 {
1065         struct intel_iommu *iommu = get_irq_data(irq);
1066         unsigned long flag;
1067
1068         spin_lock_irqsave(&iommu->register_lock, flag);
1069         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1070         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1071         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1072         spin_unlock_irqrestore(&iommu->register_lock, flag);
1073 }
1074
1075 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1076                 u8 fault_reason, u16 source_id, unsigned long long addr)
1077 {
1078         const char *reason;
1079
1080         reason = dmar_get_fault_reason(fault_reason);
1081
1082         printk(KERN_ERR
1083                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1084                 "fault addr %llx \n"
1085                 "DMAR:[fault reason %02d] %s\n",
1086                 (type ? "DMA Read" : "DMA Write"),
1087                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1088                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1089         return 0;
1090 }
1091
1092 #define PRIMARY_FAULT_REG_LEN (16)
1093 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1094 {
1095         struct intel_iommu *iommu = dev_id;
1096         int reg, fault_index;
1097         u32 fault_status;
1098         unsigned long flag;
1099
1100         spin_lock_irqsave(&iommu->register_lock, flag);
1101         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1102
1103         /* TBD: ignore advanced fault log currently */
1104         if (!(fault_status & DMA_FSTS_PPF))
1105                 goto clear_overflow;
1106
1107         fault_index = dma_fsts_fault_record_index(fault_status);
1108         reg = cap_fault_reg_offset(iommu->cap);
1109         while (1) {
1110                 u8 fault_reason;
1111                 u16 source_id;
1112                 u64 guest_addr;
1113                 int type;
1114                 u32 data;
1115
1116                 /* highest 32 bits */
1117                 data = readl(iommu->reg + reg +
1118                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1119                 if (!(data & DMA_FRCD_F))
1120                         break;
1121
1122                 fault_reason = dma_frcd_fault_reason(data);
1123                 type = dma_frcd_type(data);
1124
1125                 data = readl(iommu->reg + reg +
1126                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1127                 source_id = dma_frcd_source_id(data);
1128
1129                 guest_addr = dmar_readq(iommu->reg + reg +
1130                                 fault_index * PRIMARY_FAULT_REG_LEN);
1131                 guest_addr = dma_frcd_page_addr(guest_addr);
1132                 /* clear the fault */
1133                 writel(DMA_FRCD_F, iommu->reg + reg +
1134                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1135
1136                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1137
1138                 iommu_page_fault_do_one(iommu, type, fault_reason,
1139                                 source_id, guest_addr);
1140
1141                 fault_index++;
1142                 if (fault_index > cap_num_fault_regs(iommu->cap))
1143                         fault_index = 0;
1144                 spin_lock_irqsave(&iommu->register_lock, flag);
1145         }
1146 clear_overflow:
1147         /* clear primary fault overflow */
1148         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1149         if (fault_status & DMA_FSTS_PFO)
1150                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1151
1152         spin_unlock_irqrestore(&iommu->register_lock, flag);
1153         return IRQ_HANDLED;
1154 }
1155
1156 int dmar_set_interrupt(struct intel_iommu *iommu)
1157 {
1158         int irq, ret;
1159
1160         irq = create_irq();
1161         if (!irq) {
1162                 printk(KERN_ERR "IOMMU: no free vectors\n");
1163                 return -EINVAL;
1164         }
1165
1166         set_irq_data(irq, iommu);
1167         iommu->irq = irq;
1168
1169         ret = arch_setup_dmar_msi(irq);
1170         if (ret) {
1171                 set_irq_data(irq, NULL);
1172                 iommu->irq = 0;
1173                 destroy_irq(irq);
1174                 return 0;
1175         }
1176
1177         /* Make sure any pending faults are cleared */
1178         iommu_page_fault(irq, iommu);
1179
1180         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1181         if (ret)
1182                 printk(KERN_ERR "IOMMU: can't request irq\n");
1183         return ret;
1184 }
1185
1186 static int iommu_init_domains(struct intel_iommu *iommu)
1187 {
1188         unsigned long ndomains;
1189         unsigned long nlongs;
1190
1191         ndomains = cap_ndoms(iommu->cap);
1192         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1193         nlongs = BITS_TO_LONGS(ndomains);
1194
1195         /* TBD: there might be 64K domains,
1196          * consider other allocation for future chip
1197          */
1198         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1199         if (!iommu->domain_ids) {
1200                 printk(KERN_ERR "Allocating domain id array failed\n");
1201                 return -ENOMEM;
1202         }
1203         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1204                         GFP_KERNEL);
1205         if (!iommu->domains) {
1206                 printk(KERN_ERR "Allocating domain array failed\n");
1207                 kfree(iommu->domain_ids);
1208                 return -ENOMEM;
1209         }
1210
1211         spin_lock_init(&iommu->lock);
1212
1213         /*
1214          * if Caching mode is set, then invalid translations are tagged
1215          * with domainid 0. Hence we need to pre-allocate it.
1216          */
1217         if (cap_caching_mode(iommu->cap))
1218                 set_bit(0, iommu->domain_ids);
1219         return 0;
1220 }
1221
1222
1223 static void domain_exit(struct dmar_domain *domain);
1224 static void vm_domain_exit(struct dmar_domain *domain);
1225
1226 void free_dmar_iommu(struct intel_iommu *iommu)
1227 {
1228         struct dmar_domain *domain;
1229         int i;
1230         unsigned long flags;
1231
1232         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1233         for (; i < cap_ndoms(iommu->cap); ) {
1234                 domain = iommu->domains[i];
1235                 clear_bit(i, iommu->domain_ids);
1236
1237                 spin_lock_irqsave(&domain->iommu_lock, flags);
1238                 if (--domain->iommu_count == 0) {
1239                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1240                                 vm_domain_exit(domain);
1241                         else
1242                                 domain_exit(domain);
1243                 }
1244                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1245
1246                 i = find_next_bit(iommu->domain_ids,
1247                         cap_ndoms(iommu->cap), i+1);
1248         }
1249
1250         if (iommu->gcmd & DMA_GCMD_TE)
1251                 iommu_disable_translation(iommu);
1252
1253         if (iommu->irq) {
1254                 set_irq_data(iommu->irq, NULL);
1255                 /* This will mask the irq */
1256                 free_irq(iommu->irq, iommu);
1257                 destroy_irq(iommu->irq);
1258         }
1259
1260         kfree(iommu->domains);
1261         kfree(iommu->domain_ids);
1262
1263         g_iommus[iommu->seq_id] = NULL;
1264
1265         /* if all iommus are freed, free g_iommus */
1266         for (i = 0; i < g_num_of_iommus; i++) {
1267                 if (g_iommus[i])
1268                         break;
1269         }
1270
1271         if (i == g_num_of_iommus)
1272                 kfree(g_iommus);
1273
1274         /* free context mapping */
1275         free_context_table(iommu);
1276 }
1277
1278 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1279 {
1280         unsigned long num;
1281         unsigned long ndomains;
1282         struct dmar_domain *domain;
1283         unsigned long flags;
1284
1285         domain = alloc_domain_mem();
1286         if (!domain)
1287                 return NULL;
1288
1289         ndomains = cap_ndoms(iommu->cap);
1290
1291         spin_lock_irqsave(&iommu->lock, flags);
1292         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1293         if (num >= ndomains) {
1294                 spin_unlock_irqrestore(&iommu->lock, flags);
1295                 free_domain_mem(domain);
1296                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1297                 return NULL;
1298         }
1299
1300         set_bit(num, iommu->domain_ids);
1301         domain->id = num;
1302         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1303         set_bit(iommu->seq_id, &domain->iommu_bmp);
1304         domain->flags = 0;
1305         iommu->domains[num] = domain;
1306         spin_unlock_irqrestore(&iommu->lock, flags);
1307
1308         return domain;
1309 }
1310
1311 static void iommu_free_domain(struct dmar_domain *domain)
1312 {
1313         unsigned long flags;
1314         struct intel_iommu *iommu;
1315
1316         iommu = domain_get_iommu(domain);
1317
1318         spin_lock_irqsave(&iommu->lock, flags);
1319         clear_bit(domain->id, iommu->domain_ids);
1320         spin_unlock_irqrestore(&iommu->lock, flags);
1321 }
1322
1323 static struct iova_domain reserved_iova_list;
1324 static struct lock_class_key reserved_alloc_key;
1325 static struct lock_class_key reserved_rbtree_key;
1326
1327 static void dmar_init_reserved_ranges(void)
1328 {
1329         struct pci_dev *pdev = NULL;
1330         struct iova *iova;
1331         int i;
1332         u64 addr, size;
1333
1334         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1335
1336         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1337                 &reserved_alloc_key);
1338         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1339                 &reserved_rbtree_key);
1340
1341         /* IOAPIC ranges shouldn't be accessed by DMA */
1342         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1343                 IOVA_PFN(IOAPIC_RANGE_END));
1344         if (!iova)
1345                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1346
1347         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1348         for_each_pci_dev(pdev) {
1349                 struct resource *r;
1350
1351                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1352                         r = &pdev->resource[i];
1353                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1354                                 continue;
1355                         addr = r->start;
1356                         addr &= PAGE_MASK;
1357                         size = r->end - addr;
1358                         size = PAGE_ALIGN(size);
1359                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1360                                 IOVA_PFN(size + addr) - 1);
1361                         if (!iova)
1362                                 printk(KERN_ERR "Reserve iova failed\n");
1363                 }
1364         }
1365
1366 }
1367
1368 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1369 {
1370         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1371 }
1372
1373 static inline int guestwidth_to_adjustwidth(int gaw)
1374 {
1375         int agaw;
1376         int r = (gaw - 12) % 9;
1377
1378         if (r == 0)
1379                 agaw = gaw;
1380         else
1381                 agaw = gaw + 9 - r;
1382         if (agaw > 64)
1383                 agaw = 64;
1384         return agaw;
1385 }
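/*
 * Worked example (editor's note): a guest width of 48 is already 12 plus a
 * multiple of 9, so it is returned unchanged; a guest width of 36 gives
 * r = (36 - 12) % 9 = 6 and is rounded up to 36 + 9 - 6 = 39, the next width
 * the page-table levels can actually represent (capped at 64).
 */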
1386
1387 static int domain_init(struct dmar_domain *domain, int guest_width)
1388 {
1389         struct intel_iommu *iommu;
1390         int adjust_width, agaw;
1391         unsigned long sagaw;
1392
1393         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1394         spin_lock_init(&domain->mapping_lock);
1395         spin_lock_init(&domain->iommu_lock);
1396
1397         domain_reserve_special_ranges(domain);
1398
1399         /* calculate AGAW */
1400         iommu = domain_get_iommu(domain);
1401         if (guest_width > cap_mgaw(iommu->cap))
1402                 guest_width = cap_mgaw(iommu->cap);
1403         domain->gaw = guest_width;
1404         adjust_width = guestwidth_to_adjustwidth(guest_width);
1405         agaw = width_to_agaw(adjust_width);
1406         sagaw = cap_sagaw(iommu->cap);
1407         if (!test_bit(agaw, &sagaw)) {
1408                 /* hardware doesn't support it, choose a bigger one */
1409                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1410                 agaw = find_next_bit(&sagaw, 5, agaw);
1411                 if (agaw >= 5)
1412                         return -ENODEV;
1413         }
1414         domain->agaw = agaw;
1415         INIT_LIST_HEAD(&domain->devices);
1416
1417         if (ecap_coherent(iommu->ecap))
1418                 domain->iommu_coherency = 1;
1419         else
1420                 domain->iommu_coherency = 0;
1421
1422         domain->iommu_count = 1;
1423
1424         /* always allocate the top pgd */
1425         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1426         if (!domain->pgd)
1427                 return -ENOMEM;
1428         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1429         return 0;
1430 }
1431
1432 static void domain_exit(struct dmar_domain *domain)
1433 {
1434         u64 end;
1435
1436         /* Domain 0 is reserved, so don't process it */
1437         if (!domain)
1438                 return;
1439
1440         domain_remove_dev_info(domain);
1441         /* destroy iovas */
1442         put_iova_domain(&domain->iovad);
1443         end = DOMAIN_MAX_ADDR(domain->gaw);
1444         end = end & (~PAGE_MASK);
1445
1446         /* clear ptes */
1447         dma_pte_clear_range(domain, 0, end);
1448
1449         /* free page tables */
1450         dma_pte_free_pagetable(domain, 0, end);
1451
1452         iommu_free_domain(domain);
1453         free_domain_mem(domain);
1454 }
1455
1456 static int domain_context_mapping_one(struct dmar_domain *domain,
1457                 u8 bus, u8 devfn)
1458 {
1459         struct context_entry *context;
1460         unsigned long flags;
1461         struct intel_iommu *iommu;
1462         struct dma_pte *pgd;
1463         unsigned long num;
1464         unsigned long ndomains;
1465         int id;
1466         int agaw;
1467
1468         pr_debug("Set context mapping for %02x:%02x.%d\n",
1469                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1470         BUG_ON(!domain->pgd);
1471
1472         iommu = device_to_iommu(bus, devfn);
1473         if (!iommu)
1474                 return -ENODEV;
1475
1476         context = device_to_context_entry(iommu, bus, devfn);
1477         if (!context)
1478                 return -ENOMEM;
1479         spin_lock_irqsave(&iommu->lock, flags);
1480         if (context_present(context)) {
1481                 spin_unlock_irqrestore(&iommu->lock, flags);
1482                 return 0;
1483         }
1484
1485         id = domain->id;
1486         pgd = domain->pgd;
1487
1488         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1489                 int found = 0;
1490
1491                 /* find an available domain id for this device in iommu */
1492                 ndomains = cap_ndoms(iommu->cap);
1493                 num = find_first_bit(iommu->domain_ids, ndomains);
1494                 for (; num < ndomains; ) {
1495                         if (iommu->domains[num] == domain) {
1496                                 id = num;
1497                                 found = 1;
1498                                 break;
1499                         }
1500                         num = find_next_bit(iommu->domain_ids,
1501                                             cap_ndoms(iommu->cap), num+1);
1502                 }
1503
1504                 if (found == 0) {
1505                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1506                         if (num >= ndomains) {
1507                                 spin_unlock_irqrestore(&iommu->lock, flags);
1508                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1509                                 return -EFAULT;
1510                         }
1511
1512                         set_bit(num, iommu->domain_ids);
1513                         iommu->domains[num] = domain;
1514                         id = num;
1515                 }
1516
1517                 /* Skip top levels of page tables for
1518                  * an iommu whose agaw is smaller than the default.
1519                  */
1520                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1521                         pgd = phys_to_virt(dma_pte_addr(pgd));
1522                         if (!dma_pte_present(pgd)) {
1523                                 spin_unlock_irqrestore(&iommu->lock, flags);
1524                                 return -ENOMEM;
1525                         }
1526                 }
1527         }
1528
1529         context_set_domain_id(context, id);
1530         context_set_address_width(context, iommu->agaw);
1531         context_set_address_root(context, virt_to_phys(pgd));
1532         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1533         context_set_fault_enable(context);
1534         context_set_present(context);
1535         domain_flush_cache(domain, context, sizeof(*context));
1536
1537         /* it's a non-present to present mapping */
1538         if (iommu->flush.flush_context(iommu, domain->id,
1539                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1540                 DMA_CCMD_DEVICE_INVL, 1))
1541                 iommu_flush_write_buffer(iommu);
1542         else
1543                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1544
1545         spin_unlock_irqrestore(&iommu->lock, flags);
1546
1547         spin_lock_irqsave(&domain->iommu_lock, flags);
1548         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1549                 domain->iommu_count++;
1550                 domain_update_iommu_coherency(domain);
1551         }
1552         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1553         return 0;
1554 }
1555
1556 static int
1557 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1558 {
1559         int ret;
1560         struct pci_dev *tmp, *parent;
1561
1562         ret = domain_context_mapping_one(domain, pdev->bus->number,
1563                 pdev->devfn);
1564         if (ret)
1565                 return ret;
1566
1567         /* dependent device mapping */
1568         tmp = pci_find_upstream_pcie_bridge(pdev);
1569         if (!tmp)
1570                 return 0;
1571         /* Secondary interface's bus number and devfn 0 */
1572         parent = pdev->bus->self;
1573         while (parent != tmp) {
1574                 ret = domain_context_mapping_one(domain, parent->bus->number,
1575                         parent->devfn);
1576                 if (ret)
1577                         return ret;
1578                 parent = parent->bus->self;
1579         }
1580         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1581                 return domain_context_mapping_one(domain,
1582                         tmp->subordinate->number, 0);
1583         else /* this is a legacy PCI bridge */
1584                 return domain_context_mapping_one(domain,
1585                         tmp->bus->number, tmp->devfn);
1586 }
1587
1588 static int domain_context_mapped(struct pci_dev *pdev)
1589 {
1590         int ret;
1591         struct pci_dev *tmp, *parent;
1592         struct intel_iommu *iommu;
1593
1594         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1595         if (!iommu)
1596                 return -ENODEV;
1597
1598         ret = device_context_mapped(iommu,
1599                 pdev->bus->number, pdev->devfn);
1600         if (!ret)
1601                 return ret;
1602         /* dependent device mapping */
1603         tmp = pci_find_upstream_pcie_bridge(pdev);
1604         if (!tmp)
1605                 return ret;
1606         /* Secondary interface's bus number and devfn 0 */
1607         parent = pdev->bus->self;
1608         while (parent != tmp) {
1609                 ret = device_context_mapped(iommu, parent->bus->number,
1610                         parent->devfn);
1611                 if (!ret)
1612                         return ret;
1613                 parent = parent->bus->self;
1614         }
1615         if (tmp->is_pcie)
1616                 return device_context_mapped(iommu,
1617                         tmp->subordinate->number, 0);
1618         else
1619                 return device_context_mapped(iommu,
1620                         tmp->bus->number, tmp->devfn);
1621 }
1622
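/*
 * Map the host physical range [hpa, hpa + size) into the domain starting
 * at @iova, one VT-d page at a time.  The BUG_ON below catches attempts to
 * overwrite a PTE that is already present, so callers must map each iova
 * page at most once.
 */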
1623 static int
1624 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1625                         u64 hpa, size_t size, int prot)
1626 {
1627         u64 start_pfn, end_pfn;
1628         struct dma_pte *pte;
1629         int index;
1630         int addr_width = agaw_to_width(domain->agaw);
1631
1632         hpa &= (((u64)1) << addr_width) - 1;
1633
1634         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1635                 return -EINVAL;
1636         iova &= PAGE_MASK;
1637         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1638         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1639         index = 0;
1640         while (start_pfn < end_pfn) {
1641                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1642                 if (!pte)
1643                         return -ENOMEM;
1644                 /* We don't need a lock here; nobody else
1645                  * touches this iova range.
1646                  */
1647                 BUG_ON(dma_pte_addr(pte));
1648                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1649                 dma_set_pte_prot(pte, prot);
1650                 domain_flush_cache(domain, pte, sizeof(*pte));
1651                 start_pfn++;
1652                 index++;
1653         }
1654         return 0;
1655 }
1656
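/*
 * Clear the context entry for (bus, devfn) and invalidate the context
 * cache and IOTLB.  Note that global rather than device-selective
 * invalidation is used here.
 */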
1657 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1658 {
1659         if (!iommu)
1660                 return;
1661
1662         clear_context_table(iommu, bus, devfn);
1663         iommu->flush.flush_context(iommu, 0, 0, 0,
1664                                            DMA_CCMD_GLOBAL_INVL, 0);
1665         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1666                                          DMA_TLB_GLOBAL_FLUSH, 0);
1667 }
1668
1669 static void domain_remove_dev_info(struct dmar_domain *domain)
1670 {
1671         struct device_domain_info *info;
1672         unsigned long flags;
1673         struct intel_iommu *iommu;
1674
1675         spin_lock_irqsave(&device_domain_lock, flags);
1676         while (!list_empty(&domain->devices)) {
1677                 info = list_entry(domain->devices.next,
1678                         struct device_domain_info, link);
1679                 list_del(&info->link);
1680                 list_del(&info->global);
1681                 if (info->dev)
1682                         info->dev->dev.archdata.iommu = NULL;
1683                 spin_unlock_irqrestore(&device_domain_lock, flags);
1684
1685                 iommu = device_to_iommu(info->bus, info->devfn);
1686                 iommu_detach_dev(iommu, info->bus, info->devfn);
1687                 free_devinfo_mem(info);
1688
1689                 spin_lock_irqsave(&device_domain_lock, flags);
1690         }
1691         spin_unlock_irqrestore(&device_domain_lock, flags);
1692 }
1693
1694 /*
1695  * find_domain
1696  * Note: struct pci_dev->dev.archdata.iommu stores the domain info
1697  */
1698 static struct dmar_domain *
1699 find_domain(struct pci_dev *pdev)
1700 {
1701         struct device_domain_info *info;
1702
1703         /* No lock here; assume no concurrent domain_exit() in the normal case */
1704         info = pdev->dev.archdata.iommu;
1705         if (info)
1706                 return info->domain;
1707         return NULL;
1708 }
1709
1710 /* Find or create the domain for a device; the returned domain is initialized */
1711 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1712 {
1713         struct dmar_domain *domain, *found = NULL;
1714         struct intel_iommu *iommu;
1715         struct dmar_drhd_unit *drhd;
1716         struct device_domain_info *info, *tmp;
1717         struct pci_dev *dev_tmp;
1718         unsigned long flags;
1719         int bus = 0, devfn = 0;
1720
1721         domain = find_domain(pdev);
1722         if (domain)
1723                 return domain;
1724
1725         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1726         if (dev_tmp) {
1727                 if (dev_tmp->is_pcie) {
1728                         bus = dev_tmp->subordinate->number;
1729                         devfn = 0;
1730                 } else {
1731                         bus = dev_tmp->bus->number;
1732                         devfn = dev_tmp->devfn;
1733                 }
1734                 spin_lock_irqsave(&device_domain_lock, flags);
1735                 list_for_each_entry(info, &device_domain_list, global) {
1736                         if (info->bus == bus && info->devfn == devfn) {
1737                                 found = info->domain;
1738                                 break;
1739                         }
1740                 }
1741                 spin_unlock_irqrestore(&device_domain_lock, flags);
1742                 /* the pcie-to-pci bridge already has a domain; use it */
1743                 if (found) {
1744                         domain = found;
1745                         goto found_domain;
1746                 }
1747         }
1748
1749         /* Allocate new domain for the device */
1750         drhd = dmar_find_matched_drhd_unit(pdev);
1751         if (!drhd) {
1752                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1753                         pci_name(pdev));
1754                 return NULL;
1755         }
1756         iommu = drhd->iommu;
1757
1758         domain = iommu_alloc_domain(iommu);
1759         if (!domain)
1760                 goto error;
1761
1762         if (domain_init(domain, gaw)) {
1763                 domain_exit(domain);
1764                 goto error;
1765         }
1766
1767         /* register pcie-to-pci device */
1768         if (dev_tmp) {
1769                 info = alloc_devinfo_mem();
1770                 if (!info) {
1771                         domain_exit(domain);
1772                         goto error;
1773                 }
1774                 info->bus = bus;
1775                 info->devfn = devfn;
1776                 info->dev = NULL;
1777                 info->domain = domain;
1778                 /* This domain is shared by devices under p2p bridge */
1779                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1780
1781                 /* the pcie-to-pci bridge may already have a domain by now; use it */
1782                 found = NULL;
1783                 spin_lock_irqsave(&device_domain_lock, flags);
1784                 list_for_each_entry(tmp, &device_domain_list, global) {
1785                         if (tmp->bus == bus && tmp->devfn == devfn) {
1786                                 found = tmp->domain;
1787                                 break;
1788                         }
1789                 }
1790                 if (found) {
1791                         free_devinfo_mem(info);
1792                         domain_exit(domain);
1793                         domain = found;
1794                 } else {
1795                         list_add(&info->link, &domain->devices);
1796                         list_add(&info->global, &device_domain_list);
1797                 }
1798                 spin_unlock_irqrestore(&device_domain_lock, flags);
1799         }
1800
1801 found_domain:
1802         info = alloc_devinfo_mem();
1803         if (!info)
1804                 goto error;
1805         info->bus = pdev->bus->number;
1806         info->devfn = pdev->devfn;
1807         info->dev = pdev;
1808         info->domain = domain;
1809         spin_lock_irqsave(&device_domain_lock, flags);
1810         /* somebody else was faster and already set up the domain */
1811         found = find_domain(pdev);
1812         if (found != NULL) {
1813                 spin_unlock_irqrestore(&device_domain_lock, flags);
1814                 if (found != domain) {
1815                         domain_exit(domain);
1816                         domain = found;
1817                 }
1818                 free_devinfo_mem(info);
1819                 return domain;
1820         }
1821         list_add(&info->link, &domain->devices);
1822         list_add(&info->global, &device_domain_list);
1823         pdev->dev.archdata.iommu = info;
1824         spin_unlock_irqrestore(&device_domain_lock, flags);
1825         return domain;
1826 error:
1827         /* recheck here; another thread may have set it in the meantime */
1828         return find_domain(pdev);
1829 }
1830
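/*
 * Install a 1:1 (identity) mapping of [start, end) for @pdev: reserve the
 * matching iova range, clear any stale PTEs, map the range read/write and
 * finally set up the context entries.  Used for RMRR regions and for the
 * graphics and ISA workarounds below.
 */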
1831 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1832                                       unsigned long long start,
1833                                       unsigned long long end)
1834 {
1835         struct dmar_domain *domain;
1836         unsigned long size;
1837         unsigned long long base;
1838         int ret;
1839
1840         printk(KERN_INFO
1841                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1842                 pci_name(pdev), start, end);
1843         /* page table init */
1844         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1845         if (!domain)
1846                 return -ENOMEM;
1847
1848         /* The address might not be aligned */
1849         base = start & PAGE_MASK;
1850         size = end - base;
1851         size = PAGE_ALIGN(size);
1852         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1853                         IOVA_PFN(base + size) - 1)) {
1854                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1855                 ret = -ENOMEM;
1856                 goto error;
1857         }
1858
1859         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1860                 size, base, pci_name(pdev));
1861         /*
1862          * RMRR range might have overlap with physical memory range,
1863          * clear it first
1864          */
1865         dma_pte_clear_range(domain, base, base + size);
1866
1867         ret = domain_page_mapping(domain, base, base, size,
1868                 DMA_PTE_READ|DMA_PTE_WRITE);
1869         if (ret)
1870                 goto error;
1871
1872         /* context entry init */
1873         ret = domain_context_mapping(domain, pdev);
1874         if (!ret)
1875                 return 0;
1876 error:
1877         domain_exit(domain);
1878         return ret;
1879
1880 }
1881
1882 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1883         struct pci_dev *pdev)
1884 {
1885         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1886                 return 0;
1887         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1888                 rmrr->end_address + 1);
1889 }
1890
1891 #ifdef CONFIG_DMAR_GFX_WA
1892 struct iommu_prepare_data {
1893         struct pci_dev *pdev;
1894         int ret;
1895 };
1896
1897 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1898                                          unsigned long end_pfn, void *datax)
1899 {
1900         struct iommu_prepare_data *data;
1901
1902         data = (struct iommu_prepare_data *)datax;
1903
1904         data->ret = iommu_prepare_identity_map(data->pdev,
1905                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1906         return data->ret;
1907
1908 }
1909
1910 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1911 {
1912         int nid;
1913         struct iommu_prepare_data data;
1914
1915         data.pdev = pdev;
1916         data.ret = 0;
1917
1918         for_each_online_node(nid) {
1919                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1920                 if (data.ret)
1921                         return data.ret;
1922         }
1923         return data.ret;
1924 }
1925
1926 static void __init iommu_prepare_gfx_mapping(void)
1927 {
1928         struct pci_dev *pdev = NULL;
1929         int ret;
1930
1931         for_each_pci_dev(pdev) {
1932                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1933                                 !IS_GFX_DEVICE(pdev))
1934                         continue;
1935                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1936                         pci_name(pdev));
1937                 ret = iommu_prepare_with_active_regions(pdev);
1938                 if (ret)
1939                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1940         }
1941 }
1942 #else /* !CONFIG_DMAR_GFX_WA */
1943 static inline void iommu_prepare_gfx_mapping(void)
1944 {
1945         return;
1946 }
1947 #endif
1948
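/*
 * Floppy workaround: identity-map the low 16MB for the ISA/LPC bridge so
 * that legacy ISA DMA (which is confined to the low 16MB and shows up with
 * the bridge's requester id) keeps working once translation is enabled.
 */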
1949 #ifdef CONFIG_DMAR_FLOPPY_WA
1950 static inline void iommu_prepare_isa(void)
1951 {
1952         struct pci_dev *pdev;
1953         int ret;
1954
1955         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1956         if (!pdev)
1957                 return;
1958
1959         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1960         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1961
1962         if (ret)
1963                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1964                         "floppy might not work\n");
1965
1966 }
1967 #else
1968 static inline void iommu_prepare_isa(void)
1969 {
1970         return;
1971 }
1972 #endif /* CONFIG_DMAR_FLOPPY_WA */
1973
1974 static int __init init_dmars(void)
1975 {
1976         struct dmar_drhd_unit *drhd;
1977         struct dmar_rmrr_unit *rmrr;
1978         struct pci_dev *pdev;
1979         struct intel_iommu *iommu;
1980         int i, ret, unit = 0;
1981
1982         /*
1983          * for each drhd
1984          *    allocate root
1985          *    initialize and program root entry to not present
1986          * endfor
1987          */
1988         for_each_drhd_unit(drhd) {
1989                 g_num_of_iommus++;
1990                 /*
1991                  * lock not needed as this is only incremented in the
1992                  * single-threaded kernel __init code path; all other
1993                  * accesses are read-only
1994                  */
1995         }
1996
1997         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1998                         GFP_KERNEL);
1999         if (!g_iommus) {
2000                 printk(KERN_ERR "Allocating global iommu array failed\n");
2001                 ret = -ENOMEM;
2002                 goto error;
2003         }
2004
2005         deferred_flush = kzalloc(g_num_of_iommus *
2006                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2007         if (!deferred_flush) {
2008                 kfree(g_iommus);
2009                 ret = -ENOMEM;
2010                 goto error;
2011         }
2012
2013         for_each_drhd_unit(drhd) {
2014                 if (drhd->ignored)
2015                         continue;
2016
2017                 iommu = drhd->iommu;
2018                 g_iommus[iommu->seq_id] = iommu;
2019
2020                 ret = iommu_init_domains(iommu);
2021                 if (ret)
2022                         goto error;
2023
2024                 /*
2025                  * TBD:
2026                  * we could share the same root & context tables
2027                  * among all IOMMUs.  Needs to be split out later.
2028                  */
2029                 ret = iommu_alloc_root_entry(iommu);
2030                 if (ret) {
2031                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2032                         goto error;
2033                 }
2034         }
2035
2036         for_each_drhd_unit(drhd) {
2037                 if (drhd->ignored)
2038                         continue;
2039
2040                 iommu = drhd->iommu;
2041                 if (dmar_enable_qi(iommu)) {
2042                         /*
2043                          * Queued Invalidate not enabled, use Register Based
2044                          * Invalidate
2045                          */
2046                         iommu->flush.flush_context = __iommu_flush_context;
2047                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2048                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2049                                "invalidation\n",
2050                                (unsigned long long)drhd->reg_base_addr);
2051                 } else {
2052                         iommu->flush.flush_context = qi_flush_context;
2053                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2054                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2055                                "invalidation\n",
2056                                (unsigned long long)drhd->reg_base_addr);
2057                 }
2058         }
2059
2060         /*
2061          * For each rmrr
2062          *   for each dev attached to rmrr
2063          *   do
2064          *     locate drhd for dev, alloc domain for dev
2065          *     allocate free domain
2066          *     allocate page table entries for rmrr
2067          *     if context not allocated for bus
2068          *           allocate and init context
2069          *           set present in root table for this bus
2070          *     init context with domain, translation etc
2071          *    endfor
2072          * endfor
2073          */
2074         for_each_rmrr_units(rmrr) {
2075                 for (i = 0; i < rmrr->devices_cnt; i++) {
2076                         pdev = rmrr->devices[i];
2077                         /* some BIOSes list non-existent devices in the DMAR table */
2078                         if (!pdev)
2079                                 continue;
2080                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2081                         if (ret)
2082                                 printk(KERN_ERR
2083                                  "IOMMU: mapping reserved region failed\n");
2084                 }
2085         }
2086
2087         iommu_prepare_gfx_mapping();
2088
2089         iommu_prepare_isa();
2090
2091         /*
2092          * for each drhd
2093          *   enable fault log
2094          *   global invalidate context cache
2095          *   global invalidate iotlb
2096          *   enable translation
2097          */
2098         for_each_drhd_unit(drhd) {
2099                 if (drhd->ignored)
2100                         continue;
2101                 iommu = drhd->iommu;
2102                 sprintf(iommu->name, "dmar%d", unit++);
2103
2104                 iommu_flush_write_buffer(iommu);
2105
2106                 ret = dmar_set_interrupt(iommu);
2107                 if (ret)
2108                         goto error;
2109
2110                 iommu_set_root_entry(iommu);
2111
2112                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2113                                            0);
2114                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2115                                          0);
2116                 iommu_disable_protect_mem_regions(iommu);
2117
2118                 ret = iommu_enable_translation(iommu);
2119                 if (ret)
2120                         goto error;
2121         }
2122
2123         return 0;
2124 error:
2125         for_each_drhd_unit(drhd) {
2126                 if (drhd->ignored)
2127                         continue;
2128                 iommu = drhd->iommu;
2129                 free_iommu(iommu);
2130         }
2131         kfree(g_iommus);
2132         return ret;
2133 }
2134
2135 static inline u64 aligned_size(u64 host_addr, size_t size)
2136 {
2137         u64 addr;
2138         addr = (host_addr & (~PAGE_MASK)) + size;
2139         return PAGE_ALIGN(addr);
2140 }
2141
2142 struct iova *
2143 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2144 {
2145         struct iova *piova;
2146
2147         /* Make sure it's in range */
2148         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2149         if (!size || (IOVA_START_ADDR + size > end))
2150                 return NULL;
2151
2152         piova = alloc_iova(&domain->iovad,
2153                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2154         return piova;
2155 }
2156
2157 static struct iova *
2158 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2159                    size_t size, u64 dma_mask)
2160 {
2161         struct pci_dev *pdev = to_pci_dev(dev);
2162         struct iova *iova = NULL;
2163
2164         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2165                 iova = iommu_alloc_iova(domain, size, dma_mask);
2166         else {
2167                 /*
2168                  * First try to allocate an I/O virtual address within
2169                  * DMA_32BIT_MASK; if that fails, try allocating from the
2170                  * higher range.
2171                  */
2172                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2173                 if (!iova)
2174                         iova = iommu_alloc_iova(domain, size, dma_mask);
2175         }
2176
2177         if (!iova) {
2178                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2179                 return NULL;
2180         }
2181
2182         return iova;
2183 }
2184
2185 static struct dmar_domain *
2186 get_valid_domain_for_dev(struct pci_dev *pdev)
2187 {
2188         struct dmar_domain *domain;
2189         int ret;
2190
2191         domain = get_domain_for_dev(pdev,
2192                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2193         if (!domain) {
2194                 printk(KERN_ERR
2195                         "Allocating domain for %s failed\n", pci_name(pdev));
2196                 return NULL;
2197         }
2198
2199         /* make sure context mapping is ok */
2200         if (unlikely(!domain_context_mapped(pdev))) {
2201                 ret = domain_context_mapping(domain, pdev);
2202                 if (ret) {
2203                         printk(KERN_ERR
2204                                 "Domain context map for %s failed\n",
2205                                 pci_name(pdev));
2206                         return NULL;
2207                 }
2208         }
2209
2210         return domain;
2211 }
2212
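/*
 * Core single-buffer map path: look up (or create) the device's domain,
 * allocate an iova big enough for the page-aligned request, pick the
 * read/write protection bits from the DMA direction, install the page
 * mappings, then flush the IOTLB for the new range (falling back to a
 * write-buffer flush when the page-selective flush reports it is needed).
 */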
2213 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2214                                      size_t size, int dir, u64 dma_mask)
2215 {
2216         struct pci_dev *pdev = to_pci_dev(hwdev);
2217         struct dmar_domain *domain;
2218         phys_addr_t start_paddr;
2219         struct iova *iova;
2220         int prot = 0;
2221         int ret;
2222         struct intel_iommu *iommu;
2223
2224         BUG_ON(dir == DMA_NONE);
2225         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2226                 return paddr;
2227
2228         domain = get_valid_domain_for_dev(pdev);
2229         if (!domain)
2230                 return 0;
2231
2232         iommu = domain_get_iommu(domain);
2233         size = aligned_size((u64)paddr, size);
2234
2235         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2236         if (!iova)
2237                 goto error;
2238
2239         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2240
2241         /*
2242          * Check if DMAR supports zero-length reads on write-only
2243          * mappings.
2244          */
2245         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2246                         !cap_zlr(iommu->cap))
2247                 prot |= DMA_PTE_READ;
2248         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2249                 prot |= DMA_PTE_WRITE;
2250         /*
2251          * paddr to (paddr + size) might cover only part of a page, so map
2252          * the whole page.  Note: if two parts of one page are mapped
2253          * separately, we might have two guest addresses mapping to the
2254          * same host paddr, but this is not a big problem
2255          */
2256         ret = domain_page_mapping(domain, start_paddr,
2257                 ((u64)paddr) & PAGE_MASK, size, prot);
2258         if (ret)
2259                 goto error;
2260
2261         /* it's a non-present to present mapping */
2262         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2263                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2264         if (ret)
2265                 iommu_flush_write_buffer(iommu);
2266
2267         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2268
2269 error:
2270         if (iova)
2271                 __free_iova(&domain->iovad, iova);
2272         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2273                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2274         return 0;
2275 }
2276
2277 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2278                             size_t size, int dir)
2279 {
2280         return __intel_map_single(hwdev, paddr, size, dir,
2281                                   to_pci_dev(hwdev)->dma_mask);
2282 }
2283
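/*
 * Deferred-unmap machinery: instead of flushing the IOTLB for every unmap,
 * freed iovas are queued per IOMMU in deferred_flush[] and released in
 * batches, either when the list reaches HIGH_WATER_MARK or when the 10ms
 * unmap_timer fires.  The trade-off is a short window during which stale
 * translations remain usable.
 */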
2284 static void flush_unmaps(void)
2285 {
2286         int i, j;
2287
2288         timer_on = 0;
2289
2290         /* just flush them all */
2291         for (i = 0; i < g_num_of_iommus; i++) {
2292                 struct intel_iommu *iommu = g_iommus[i];
2293                 if (!iommu)
2294                         continue;
2295
2296                 if (deferred_flush[i].next) {
2297                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2298                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2299                         for (j = 0; j < deferred_flush[i].next; j++) {
2300                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2301                                                 deferred_flush[i].iova[j]);
2302                         }
2303                         deferred_flush[i].next = 0;
2304                 }
2305         }
2306
2307         list_size = 0;
2308 }
2309
2310 static void flush_unmaps_timeout(unsigned long data)
2311 {
2312         unsigned long flags;
2313
2314         spin_lock_irqsave(&async_umap_flush_lock, flags);
2315         flush_unmaps();
2316         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2317 }
2318
2319 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2320 {
2321         unsigned long flags;
2322         int next, iommu_id;
2323         struct intel_iommu *iommu;
2324
2325         spin_lock_irqsave(&async_umap_flush_lock, flags);
2326         if (list_size == HIGH_WATER_MARK)
2327                 flush_unmaps();
2328
2329         iommu = domain_get_iommu(dom);
2330         iommu_id = iommu->seq_id;
2331
2332         next = deferred_flush[iommu_id].next;
2333         deferred_flush[iommu_id].domain[next] = dom;
2334         deferred_flush[iommu_id].iova[next] = iova;
2335         deferred_flush[iommu_id].next++;
2336
2337         if (!timer_on) {
2338                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2339                 timer_on = 1;
2340         }
2341         list_size++;
2342         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2343 }
2344
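/*
 * Unmap a single buffer: find the iova covering dev_addr, clear and free
 * its page-table entries, then either flush the IOTLB and release the iova
 * immediately (intel_iommu_strict) or queue it on the deferred-flush list
 * above.
 */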
2345 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2346                         int dir)
2347 {
2348         struct pci_dev *pdev = to_pci_dev(dev);
2349         struct dmar_domain *domain;
2350         unsigned long start_addr;
2351         struct iova *iova;
2352         struct intel_iommu *iommu;
2353
2354         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2355                 return;
2356         domain = find_domain(pdev);
2357         BUG_ON(!domain);
2358
2359         iommu = domain_get_iommu(domain);
2360
2361         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2362         if (!iova)
2363                 return;
2364
2365         start_addr = iova->pfn_lo << PAGE_SHIFT;
2366         size = aligned_size((u64)dev_addr, size);
2367
2368         pr_debug("Device %s unmapping: %lx@%llx\n",
2369                 pci_name(pdev), size, (unsigned long long)start_addr);
2370
2371         /*  clear the whole page */
2372         dma_pte_clear_range(domain, start_addr, start_addr + size);
2373         /* free page tables */
2374         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2375         if (intel_iommu_strict) {
2376                 if (iommu_flush_iotlb_psi(iommu,
2377                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2378                         iommu_flush_write_buffer(iommu);
2379                 /* free iova */
2380                 __free_iova(&domain->iovad, iova);
2381         } else {
2382                 add_unmap(domain, iova);
2383                 /*
2384                  * queue up the release of the unmap to save roughly 1/6th
2385                  * of the cpu time used up by the iotlb flush operation...
2386                  */
2387         }
2388 }
2389
2390 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2391                            dma_addr_t *dma_handle, gfp_t flags)
2392 {
2393         void *vaddr;
2394         int order;
2395
2396         size = PAGE_ALIGN(size);
2397         order = get_order(size);
2398         flags &= ~(GFP_DMA | GFP_DMA32);
2399
2400         vaddr = (void *)__get_free_pages(flags, order);
2401         if (!vaddr)
2402                 return NULL;
2403         memset(vaddr, 0, size);
2404
2405         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2406                                          DMA_BIDIRECTIONAL,
2407                                          hwdev->coherent_dma_mask);
2408         if (*dma_handle)
2409                 return vaddr;
2410         free_pages((unsigned long)vaddr, order);
2411         return NULL;
2412 }
2413
2414 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2415                          dma_addr_t dma_handle)
2416 {
2417         int order;
2418
2419         size = PAGE_ALIGN(size);
2420         order = get_order(size);
2421
2422         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2423         free_pages((unsigned long)vaddr, order);
2424 }
2425
2426 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2427
2428 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2429                     int nelems, int dir)
2430 {
2431         int i;
2432         struct pci_dev *pdev = to_pci_dev(hwdev);
2433         struct dmar_domain *domain;
2434         unsigned long start_addr;
2435         struct iova *iova;
2436         size_t size = 0;
2437         void *addr;
2438         struct scatterlist *sg;
2439         struct intel_iommu *iommu;
2440
2441         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2442                 return;
2443
2444         domain = find_domain(pdev);
2445         BUG_ON(!domain);
2446
2447         iommu = domain_get_iommu(domain);
2448
2449         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2450         if (!iova)
2451                 return;
2452         for_each_sg(sglist, sg, nelems, i) {
2453                 addr = SG_ENT_VIRT_ADDRESS(sg);
2454                 size += aligned_size((u64)addr, sg->length);
2455         }
2456
2457         start_addr = iova->pfn_lo << PAGE_SHIFT;
2458
2459         /*  clear the whole page */
2460         dma_pte_clear_range(domain, start_addr, start_addr + size);
2461         /* free page tables */
2462         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2463
2464         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2465                         size >> VTD_PAGE_SHIFT, 0))
2466                 iommu_flush_write_buffer(iommu);
2467
2468         /* free iova */
2469         __free_iova(&domain->iovad, iova);
2470 }
2471
2472 static int intel_nontranslate_map_sg(struct device *hddev,
2473         struct scatterlist *sglist, int nelems, int dir)
2474 {
2475         int i;
2476         struct scatterlist *sg;
2477
2478         for_each_sg(sglist, sg, nelems, i) {
2479                 BUG_ON(!sg_page(sg));
2480                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2481                 sg->dma_length = sg->length;
2482         }
2483         return nelems;
2484 }
2485
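/*
 * Scatter-gather map path: add up the page-aligned sizes of all segments,
 * allocate one contiguous iova range for the whole list, then map each
 * segment at increasing offsets inside that range, unwinding all mappings
 * if any segment fails.
 */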
2486 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2487                  int dir)
2488 {
2489         void *addr;
2490         int i;
2491         struct pci_dev *pdev = to_pci_dev(hwdev);
2492         struct dmar_domain *domain;
2493         size_t size = 0;
2494         int prot = 0;
2495         size_t offset = 0;
2496         struct iova *iova = NULL;
2497         int ret;
2498         struct scatterlist *sg;
2499         unsigned long start_addr;
2500         struct intel_iommu *iommu;
2501
2502         BUG_ON(dir == DMA_NONE);
2503         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2504                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2505
2506         domain = get_valid_domain_for_dev(pdev);
2507         if (!domain)
2508                 return 0;
2509
2510         iommu = domain_get_iommu(domain);
2511
2512         for_each_sg(sglist, sg, nelems, i) {
2513                 addr = SG_ENT_VIRT_ADDRESS(sg);
2514                 addr = (void *)virt_to_phys(addr);
2515                 size += aligned_size((u64)addr, sg->length);
2516         }
2517
2518         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2519         if (!iova) {
2520                 sglist->dma_length = 0;
2521                 return 0;
2522         }
2523
2524         /*
2525          * Check if DMAR supports zero-length reads on write-only
2526          * mappings.
2527          */
2528         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2529                         !cap_zlr(iommu->cap))
2530                 prot |= DMA_PTE_READ;
2531         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2532                 prot |= DMA_PTE_WRITE;
2533
2534         start_addr = iova->pfn_lo << PAGE_SHIFT;
2535         offset = 0;
2536         for_each_sg(sglist, sg, nelems, i) {
2537                 addr = SG_ENT_VIRT_ADDRESS(sg);
2538                 addr = (void *)virt_to_phys(addr);
2539                 size = aligned_size((u64)addr, sg->length);
2540                 ret = domain_page_mapping(domain, start_addr + offset,
2541                         ((u64)addr) & PAGE_MASK,
2542                         size, prot);
2543                 if (ret) {
2544                         /*  clear the page */
2545                         dma_pte_clear_range(domain, start_addr,
2546                                   start_addr + offset);
2547                         /* free page tables */
2548                         dma_pte_free_pagetable(domain, start_addr,
2549                                   start_addr + offset);
2550                         /* free iova */
2551                         __free_iova(&domain->iovad, iova);
2552                         return 0;
2553                 }
2554                 sg->dma_address = start_addr + offset +
2555                                 ((u64)addr & (~PAGE_MASK));
2556                 sg->dma_length = sg->length;
2557                 offset += size;
2558         }
2559
2560         /* it's a non-present to present mapping */
2561         if (iommu_flush_iotlb_psi(iommu, domain->id,
2562                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2563                 iommu_flush_write_buffer(iommu);
2564         return nelems;
2565 }
2566
2567 static struct dma_mapping_ops intel_dma_ops = {
2568         .alloc_coherent = intel_alloc_coherent,
2569         .free_coherent = intel_free_coherent,
2570         .map_single = intel_map_single,
2571         .unmap_single = intel_unmap_single,
2572         .map_sg = intel_map_sg,
2573         .unmap_sg = intel_unmap_sg,
2574 };
2575
2576 static inline int iommu_domain_cache_init(void)
2577 {
2578         int ret = 0;
2579
2580         iommu_domain_cache = kmem_cache_create("iommu_domain",
2581                                          sizeof(struct dmar_domain),
2582                                          0,
2583                                          SLAB_HWCACHE_ALIGN,
2585                                          NULL);
2586         if (!iommu_domain_cache) {
2587                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2588                 ret = -ENOMEM;
2589         }
2590
2591         return ret;
2592 }
2593
2594 static inline int iommu_devinfo_cache_init(void)
2595 {
2596         int ret = 0;
2597
2598         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2599                                          sizeof(struct device_domain_info),
2600                                          0,
2601                                          SLAB_HWCACHE_ALIGN,
2602                                          NULL);
2603         if (!iommu_devinfo_cache) {
2604                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2605                 ret = -ENOMEM;
2606         }
2607
2608         return ret;
2609 }
2610
2611 static inline int iommu_iova_cache_init(void)
2612 {
2613         int ret = 0;
2614
2615         iommu_iova_cache = kmem_cache_create("iommu_iova",
2616                                          sizeof(struct iova),
2617                                          0,
2618                                          SLAB_HWCACHE_ALIGN,
2619                                          NULL);
2620         if (!iommu_iova_cache) {
2621                 printk(KERN_ERR "Couldn't create iova cache\n");
2622                 ret = -ENOMEM;
2623         }
2624
2625         return ret;
2626 }
2627
2628 static int __init iommu_init_mempool(void)
2629 {
2630         int ret;
2631         ret = iommu_iova_cache_init();
2632         if (ret)
2633                 return ret;
2634
2635         ret = iommu_domain_cache_init();
2636         if (ret)
2637                 goto domain_error;
2638
2639         ret = iommu_devinfo_cache_init();
2640         if (!ret)
2641                 return ret;
2642
2643         kmem_cache_destroy(iommu_domain_cache);
2644 domain_error:
2645         kmem_cache_destroy(iommu_iova_cache);
2646
2647         return -ENOMEM;
2648 }
2649
2650 static void __init iommu_exit_mempool(void)
2651 {
2652         kmem_cache_destroy(iommu_devinfo_cache);
2653         kmem_cache_destroy(iommu_domain_cache);
2654         kmem_cache_destroy(iommu_iova_cache);
2655
2656 }
2657
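/*
 * Mark DRHD units that can be ignored: units whose device scope lists no
 * existing PCI devices and, unless dmar_map_gfx is set, units that only
 * cover graphics devices.  Devices under the latter are tagged with
 * DUMMY_DEVICE_DOMAIN_INFO so the DMA ops leave them untranslated.
 */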
2658 static void __init init_no_remapping_devices(void)
2659 {
2660         struct dmar_drhd_unit *drhd;
2661
2662         for_each_drhd_unit(drhd) {
2663                 if (!drhd->include_all) {
2664                         int i;
2665                         for (i = 0; i < drhd->devices_cnt; i++)
2666                                 if (drhd->devices[i] != NULL)
2667                                         break;
2668                         /* ignore DMAR unit if no pci devices exist */
2669                         if (i == drhd->devices_cnt)
2670                                 drhd->ignored = 1;
2671                 }
2672         }
2673
2674         if (dmar_map_gfx)
2675                 return;
2676
2677         for_each_drhd_unit(drhd) {
2678                 int i;
2679                 if (drhd->ignored || drhd->include_all)
2680                         continue;
2681
2682                 for (i = 0; i < drhd->devices_cnt; i++)
2683                         if (drhd->devices[i] &&
2684                                 !IS_GFX_DEVICE(drhd->devices[i]))
2685                                 break;
2686
2687                 if (i < drhd->devices_cnt)
2688                         continue;
2689
2690                 /* bypass IOMMU if it is just for gfx devices */
2691                 drhd->ignored = 1;
2692                 for (i = 0; i < drhd->devices_cnt; i++) {
2693                         if (!drhd->devices[i])
2694                                 continue;
2695                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2696                 }
2697         }
2698 }
2699
2700 int __init intel_iommu_init(void)
2701 {
2702         int ret = 0;
2703
2704         if (dmar_table_init())
2705                 return  -ENODEV;
2706
2707         if (dmar_dev_scope_init())
2708                 return  -ENODEV;
2709
2710         /*
2711          * Check the need for DMA-remapping initialization now.
2712          * The initialization above is also used by interrupt remapping.
2713          */
2714         if (no_iommu || swiotlb || dmar_disabled)
2715                 return -ENODEV;
2716
2717         iommu_init_mempool();
2718         dmar_init_reserved_ranges();
2719
2720         init_no_remapping_devices();
2721
2722         ret = init_dmars();
2723         if (ret) {
2724                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2725                 put_iova_domain(&reserved_iova_list);
2726                 iommu_exit_mempool();
2727                 return ret;
2728         }
2729         printk(KERN_INFO
2730         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2731
2732         init_timer(&unmap_timer);
2733         force_iommu = 1;
2734         dma_ops = &intel_dma_ops;
2735
2736         register_iommu(&intel_iommu_ops);
2737
2738         return 0;
2739 }
2740
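/*
 * The vm_domain_* helpers below back the generic iommu_ops interface
 * (e.g. KVM device assignment).  These domains take their ids from the
 * vm_domid counter rather than from a single IOMMU's domain_ids bitmap
 * and may span several IOMMUs.
 */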
2741 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2742                                   struct pci_dev *pdev)
2743 {
2744         struct device_domain_info *info;
2745         unsigned long flags;
2746
2747         info = alloc_devinfo_mem();
2748         if (!info)
2749                 return -ENOMEM;
2750
2751         info->bus = pdev->bus->number;
2752         info->devfn = pdev->devfn;
2753         info->dev = pdev;
2754         info->domain = domain;
2755
2756         spin_lock_irqsave(&device_domain_lock, flags);
2757         list_add(&info->link, &domain->devices);
2758         list_add(&info->global, &device_domain_list);
2759         pdev->dev.archdata.iommu = info;
2760         spin_unlock_irqrestore(&device_domain_lock, flags);
2761
2762         return 0;
2763 }
2764
2765 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2766                                           struct pci_dev *pdev)
2767 {
2768         struct device_domain_info *info;
2769         struct intel_iommu *iommu;
2770         unsigned long flags;
2771         int found = 0;
2772         struct list_head *entry, *tmp;
2773
2774         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2775         if (!iommu)
2776                 return;
2777
2778         spin_lock_irqsave(&device_domain_lock, flags);
2779         list_for_each_safe(entry, tmp, &domain->devices) {
2780                 info = list_entry(entry, struct device_domain_info, link);
2781                 if (info->bus == pdev->bus->number &&
2782                     info->devfn == pdev->devfn) {
2783                         list_del(&info->link);
2784                         list_del(&info->global);
2785                         if (info->dev)
2786                                 info->dev->dev.archdata.iommu = NULL;
2787                         spin_unlock_irqrestore(&device_domain_lock, flags);
2788
2789                         iommu_detach_dev(iommu, info->bus, info->devfn);
2790                         free_devinfo_mem(info);
2791
2792                         spin_lock_irqsave(&device_domain_lock, flags);
2793
2794                         if (found)
2795                                 break;
2796                         else
2797                                 continue;
2798                 }
2799
2800                 /* if there are no other devices under the same iommu
2801                  * owned by this domain, clear this iommu in iommu_bmp,
2802                  * update the iommu count and coherency
2803                  */
2804                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2805                         found = 1;
2806         }
2807
2808         if (found == 0) {
2809                 unsigned long tmp_flags;
2810                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2811                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2812                 domain->iommu_count--;
2813                 domain_update_iommu_coherency(domain);
2814                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2815         }
2816
2817         spin_unlock_irqrestore(&device_domain_lock, flags);
2818 }
2819
2820 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2821 {
2822         struct device_domain_info *info;
2823         struct intel_iommu *iommu;
2824         unsigned long flags1, flags2;
2825
2826         spin_lock_irqsave(&device_domain_lock, flags1);
2827         while (!list_empty(&domain->devices)) {
2828                 info = list_entry(domain->devices.next,
2829                         struct device_domain_info, link);
2830                 list_del(&info->link);
2831                 list_del(&info->global);
2832                 if (info->dev)
2833                         info->dev->dev.archdata.iommu = NULL;
2834
2835                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2836
2837                 iommu = device_to_iommu(info->bus, info->devfn);
2838                 iommu_detach_dev(iommu, info->bus, info->devfn);
2839
2840                 /* clear this iommu in iommu_bmp, update iommu count
2841                  * and coherency
2842                  */
2843                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2844                 if (test_and_clear_bit(iommu->seq_id,
2845                                        &domain->iommu_bmp)) {
2846                         domain->iommu_count--;
2847                         domain_update_iommu_coherency(domain);
2848                 }
2849                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2850
2851                 free_devinfo_mem(info);
2852                 spin_lock_irqsave(&device_domain_lock, flags1);
2853         }
2854         spin_unlock_irqrestore(&device_domain_lock, flags1);
2855 }
2856
2857 /* domain id allocator for virtual machine domains; it is never set in a context entry */
2858 static unsigned long vm_domid;
2859
2860 static int vm_domain_min_agaw(struct dmar_domain *domain)
2861 {
2862         int i;
2863         int min_agaw = domain->agaw;
2864
2865         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2866         for (; i < g_num_of_iommus; ) {
2867                 if (min_agaw > g_iommus[i]->agaw)
2868                         min_agaw = g_iommus[i]->agaw;
2869
2870                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2871         }
2872
2873         return min_agaw;
2874 }
2875
2876 static struct dmar_domain *iommu_alloc_vm_domain(void)
2877 {
2878         struct dmar_domain *domain;
2879
2880         domain = alloc_domain_mem();
2881         if (!domain)
2882                 return NULL;
2883
2884         domain->id = vm_domid++;
2885         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2886         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2887
2888         return domain;
2889 }
2890
2891 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2892 {
2893         int adjust_width;
2894
2895         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2896         spin_lock_init(&domain->mapping_lock);
2897         spin_lock_init(&domain->iommu_lock);
2898
2899         domain_reserve_special_ranges(domain);
2900
2901         /* calculate AGAW */
2902         domain->gaw = guest_width;
2903         adjust_width = guestwidth_to_adjustwidth(guest_width);
2904         domain->agaw = width_to_agaw(adjust_width);
2905
2906         INIT_LIST_HEAD(&domain->devices);
2907
2908         domain->iommu_count = 0;
2909         domain->iommu_coherency = 0;
2910         domain->max_addr = 0;
2911
2912         /* always allocate the top pgd */
2913         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2914         if (!domain->pgd)
2915                 return -ENOMEM;
2916         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2917         return 0;
2918 }
2919
2920 static void iommu_free_vm_domain(struct dmar_domain *domain)
2921 {
2922         unsigned long flags;
2923         struct dmar_drhd_unit *drhd;
2924         struct intel_iommu *iommu;
2925         unsigned long i;
2926         unsigned long ndomains;
2927
2928         for_each_drhd_unit(drhd) {
2929                 if (drhd->ignored)
2930                         continue;
2931                 iommu = drhd->iommu;
2932
2933                 ndomains = cap_ndoms(iommu->cap);
2934                 i = find_first_bit(iommu->domain_ids, ndomains);
2935                 for (; i < ndomains; ) {
2936                         if (iommu->domains[i] == domain) {
2937                                 spin_lock_irqsave(&iommu->lock, flags);
2938                                 clear_bit(i, iommu->domain_ids);
2939                                 iommu->domains[i] = NULL;
2940                                 spin_unlock_irqrestore(&iommu->lock, flags);
2941                                 break;
2942                         }
2943                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2944                 }
2945         }
2946 }
2947
2948 static void vm_domain_exit(struct dmar_domain *domain)
2949 {
2950         u64 end;
2951
2952         /* Domain 0 is reserved, so don't process it */
2953         if (!domain)
2954                 return;
2955
2956         vm_domain_remove_all_dev_info(domain);
2957         /* destroy iovas */
2958         put_iova_domain(&domain->iovad);
2959         end = DOMAIN_MAX_ADDR(domain->gaw);
2960         end = end & (~VTD_PAGE_MASK);
2961
2962         /* clear ptes */
2963         dma_pte_clear_range(domain, 0, end);
2964
2965         /* free page tables */
2966         dma_pte_free_pagetable(domain, 0, end);
2967
2968         iommu_free_vm_domain(domain);
2969         free_domain_mem(domain);
2970 }
2971
2972 static int intel_iommu_domain_init(struct iommu_domain *domain)
2973 {
2974         struct dmar_domain *dmar_domain;
2975
2976         dmar_domain = iommu_alloc_vm_domain();
2977         if (!dmar_domain) {
2978                 printk(KERN_ERR
2979                         "intel_iommu_domain_init: dmar_domain == NULL\n");
2980                 return -ENOMEM;
2981         }
2982         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2983                 printk(KERN_ERR
2984                         "intel_iommu_domain_init() failed\n");
2985                 vm_domain_exit(dmar_domain);
2986                 return -ENOMEM;
2987         }
2988         domain->priv = dmar_domain;
2989
2990         return 0;
2991 }
2992
2993 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
2994 {
2995         struct dmar_domain *dmar_domain = domain->priv;
2996
2997         domain->priv = NULL;
2998         vm_domain_exit(dmar_domain);
2999 }
3000
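/*
 * Attach @dev to an iommu-API domain: detach it from any previous domain
 * first, verify that the IOMMU behind the device supports an address width
 * large enough for everything already mapped in the domain, then set up
 * the context entries and record the device in the domain's device list.
 */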
3001 static int intel_iommu_attach_device(struct iommu_domain *domain,
3002                                      struct device *dev)
3003 {
3004         struct dmar_domain *dmar_domain = domain->priv;
3005         struct pci_dev *pdev = to_pci_dev(dev);
3006         struct intel_iommu *iommu;
3007         int addr_width;
3008         u64 end;
3009         int ret;
3010
3011         /* normally pdev is not mapped */
3012         if (unlikely(domain_context_mapped(pdev))) {
3013                 struct dmar_domain *old_domain;
3014
3015                 old_domain = find_domain(pdev);
3016                 if (old_domain) {
3017                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3018                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3019                         else
3020                                 domain_remove_dev_info(old_domain);
3021                 }
3022         }
3023
3024         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3025         if (!iommu)
3026                 return -ENODEV;
3027
3028         /* check if this iommu agaw is sufficient for max mapped address */
3029         addr_width = agaw_to_width(iommu->agaw);
3030         end = DOMAIN_MAX_ADDR(addr_width);
3031         end = end & VTD_PAGE_MASK;
3032         if (end < dmar_domain->max_addr) {
3033                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3034                        "sufficient for the mapped address (%llx)\n",
3035                        __func__, iommu->agaw, dmar_domain->max_addr);
3036                 return -EFAULT;
3037         }
3038
3039         ret = domain_context_mapping(dmar_domain, pdev);
3040         if (ret)
3041                 return ret;
3042
3043         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3044         return ret;
3045 }
3046
3047 static void intel_iommu_detach_device(struct iommu_domain *domain,
3048                                       struct device *dev)
3049 {
3050         struct dmar_domain *dmar_domain = domain->priv;
3051         struct pci_dev *pdev = to_pci_dev(dev);
3052
3053         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3054 }
3055
3056 static int intel_iommu_map_range(struct iommu_domain *domain,
3057                                  unsigned long iova, phys_addr_t hpa,
3058                                  size_t size, int iommu_prot)
3059 {
3060         struct dmar_domain *dmar_domain = domain->priv;
3061         u64 max_addr;
3062         int addr_width;
3063         int prot = 0;
3064         int ret;
3065
3066         if (iommu_prot & IOMMU_READ)
3067                 prot |= DMA_PTE_READ;
3068         if (iommu_prot & IOMMU_WRITE)
3069                 prot |= DMA_PTE_WRITE;
3070
3071         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3072         if (dmar_domain->max_addr < max_addr) {
3073                 int min_agaw;
3074                 u64 end;
3075
3076                 /* check if minimum agaw is sufficient for mapped address */
3077                 min_agaw = vm_domain_min_agaw(dmar_domain);
3078                 addr_width = agaw_to_width(min_agaw);
3079                 end = DOMAIN_MAX_ADDR(addr_width);
3080                 end = end & VTD_PAGE_MASK;
3081                 if (end < max_addr) {
3082                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3083                                "sufficient for the mapped address (%llx)\n",
3084                                __func__, min_agaw, max_addr);
3085                         return -EFAULT;
3086                 }
3087                 dmar_domain->max_addr = max_addr;
3088         }
3089
3090         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3091         return ret;
3092 }
3093
3094 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3095                                     unsigned long iova, size_t size)
3096 {
3097         struct dmar_domain *dmar_domain = domain->priv;
3098         dma_addr_t base;
3099
3100         /* The address might not be aligned */
3101         base = iova & VTD_PAGE_MASK;
3102         size = VTD_PAGE_ALIGN(size);
3103         dma_pte_clear_range(dmar_domain, base, base + size);
3104
3105         if (dmar_domain->max_addr == base + size)
3106                 dmar_domain->max_addr = base;
3107 }
3108
3109 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3110                                             unsigned long iova)
3111 {
3112         struct dmar_domain *dmar_domain = domain->priv;
3113         struct dma_pte *pte;
3114         u64 phys = 0;
3115
3116         pte = addr_to_dma_pte(dmar_domain, iova);
3117         if (pte)
3118                 phys = dma_pte_addr(pte);
3119
3120         return phys;
3121 }
3122
3123 static struct iommu_ops intel_iommu_ops = {
3124         .domain_init    = intel_iommu_domain_init,
3125         .domain_destroy = intel_iommu_domain_destroy,
3126         .attach_dev     = intel_iommu_attach_device,
3127         .detach_dev     = intel_iommu_detach_device,
3128         .map            = intel_iommu_map_range,
3129         .unmap          = intel_iommu_unmap_range,
3130         .iova_to_phys   = intel_iommu_iova_to_phys,
3131 };