/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
#define pr_fmt(fmt)     "DMAR: " fmt

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "irq_remapping.h"
#define ROOT_SIZE               VTD_PAGE_SIZE
#define CONTEXT_SIZE            VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START      (0xfee00000)
#define IOAPIC_RANGE_END        (0xfeefffff)
#define IOVA_START_ADDR         (0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN          (1)

#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE            (9)
#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size was a power-of-two multiple of 4KiB and
 * that the mapping had natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are a power-of-two multiple of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
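/*
 * Editor's illustration (not driver code): with VTD_PAGE_SHIFT == 12,
 * ~0xFFFUL has every bit from bit 12 upwards set, so it advertises 4KiB,
 * 8KiB, 16KiB, ... all the way up. Advertising only the real hardware
 * page sizes would instead look something like the hypothetical
 *
 *      #define INTEL_IOMMU_PGSIZES_REAL        (SZ_4K | SZ_2M | SZ_1G)
 *
 * at which point the IOMMU core would split regions on those
 * boundaries itself.
 */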
static inline int agaw_to_level(int agaw)
{
        return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
        return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
        return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
        return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
        return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
        return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
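/*
 * Worked example (editor's illustration): agaw 2 describes a 4-level
 * table, since agaw_to_level(2) = 4 and agaw_to_width(2) = 30 + 2*9 = 48
 * bits; width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2 round-trips. At
 * level 2, level_to_offset_bits() = 9, so pfn_level_offset() selects
 * bits 9..17 of the pfn, and level_size(2) = 512 pages, i.e. 2MiB.
 */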
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
        return page_to_dma_pfn(virt_to_page(p));
}
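/*
 * Worked example (editor's illustration): on x86 both PAGE_SHIFT and
 * VTD_PAGE_SHIFT are 12, so the conversions above shift by 0 and are
 * identity. With hypothetical 64KiB MM pages (PAGE_SHIFT == 16),
 * mm_to_dma_pfn() would multiply by 16: one MM page spans sixteen
 * 4KiB VT-d pages.
 */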
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT)
 */
static int force_on = 0;
int intel_iommu_tboot_noforce;
/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
        u64 lo;
        u64 hi;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/*
 * Take a root entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
        if (!(re->lo & 1))
                return 0;

        return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
        if (!(re->hi & 1))
                return 0;

        return re->hi & VTD_PAGE_MASK;
}
/*
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
        u64 lo;
        u64 hi;
};
static inline void context_clear_pasid_enable(struct context_entry *context)
{
        context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
        return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
        context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
        return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
        return (context->lo & 1);
}

static inline bool context_present(struct context_entry *context)
{
        return context_pasid_enabled(context) ?
                __context_present(context) :
                __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
        context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
        context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
                                                unsigned long value)
{
        context->lo &= (((u64)-1) << 4) | 3;
        context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
                                            unsigned long value)
{
        context->lo &= ~VTD_PAGE_MASK;
        context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
                                             unsigned long value)
{
        context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
                                         unsigned long value)
{
        context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
        return (c->hi >> 8) & 0xffff;
}

static inline void context_clear_entry(struct context_entry *context)
{
        context->lo = 0;
        context->hi = 0;
}
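/*
 * Editor's sketch (illustrative, not driver code): programming a context
 * entry chains the helpers above, in the same order used later by
 * domain_context_mapping_one():
 *
 *      context_clear_entry(context);
 *      context_set_domain_id(context, did);
 *      context_set_address_root(context, virt_to_phys(pgd));
 *      context_set_address_width(context, iommu->agaw);
 *      context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *      context_set_fault_enable(context);
 *      context_set_present(context);
 */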
/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
        u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
        pte->val = 0;
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
        return pte->val & VTD_PAGE_MASK;
#else
        /* Must have a full atomic 64-bit read */
        return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
        return (pte->val & 3) != 0;
}

static inline bool dma_pte_superpage(struct dma_pte *pte)
{
        return (pte->val & DMA_PTE_LARGE_PAGE);
}

static inline int first_pte_in_page(struct dma_pte *pte)
{
        return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
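/*
 * Editor's illustration: a leaf PTE is simply the host-physical page
 * address ORed with permission bits, e.g.
 *
 *      pte->val = ((u64)phys_pfn << VTD_PAGE_SHIFT) |
 *                 DMA_PTE_READ | DMA_PTE_WRITE;
 *
 * which is the pteval form that __domain_mapping() builds further down.
 */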
/*
 * This domain is a statically identity mapping domain.
 *      1. This domain creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;
/*
 * Domain represents a virtual machine; more than one device
 * across iommus may be owned by one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)

#define for_each_domain_iommu(idx, domain)                      \
        for (idx = 0; idx < g_num_of_iommus; idx++)             \
                if (domain->iommu_refcnt[idx])
struct dmar_domain {
        int     nid;                    /* node id */

        unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED];
                                        /* Refcount of devices per iommu */

        u16     iommu_did[DMAR_UNITS_SUPPORTED];
                                        /* Domain ids per IOMMU. Use u16 since
                                         * domain ids are 16 bit wide according
                                         * to VT-d spec, section 9.3 */

        bool has_iotlb_device;
        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */

        struct dma_pte  *pgd;           /* virtual address */
        int             gaw;            /* max guest address width */

        /* adjusted guest address width, 0 is level 2 30-bit */
        int             agaw;

        int             flags;          /* flags to find out type of domain */

        int             iommu_coherency;/* indicate coherency of iommu access */
        int             iommu_snooping; /* indicate snooping control feature */
        int             iommu_count;    /* reference count of iommu */
        int             iommu_superpage;/* Level of superpages supported:
                                           0 == 4KiB (no superpages), 1 == 2MiB,
                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
        u64             max_addr;       /* maximum mapped address */

        struct iommu_domain domain;     /* generic domain data structure for
                                           iommu core */
};
/* PCI domain-device relationship */
struct device_domain_info {
        struct list_head link;  /* link to domain siblings */
        struct list_head global; /* link to global list */
        u8 bus;                 /* PCI bus number */
        u8 devfn;               /* PCI devfn number */
        u8 pasid_supported:3;
        u8 pasid_enabled:1;
        u8 pri_supported:1;
        u8 pri_enabled:1;
        u8 ats_supported:1;
        u8 ats_enabled:1;
        u8 ats_qdep;
        struct device *dev;     /* it's NULL for PCIe-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
        struct dmar_domain *domain; /* pointer to domain */
};
struct dmar_rmrr_unit {
        struct list_head list;          /* list of rmrr units */
        struct acpi_dmar_header *hdr;   /* ACPI header */
        u64     base_address;           /* reserved base address */
        u64     end_address;            /* reserved end address */
        struct dmar_dev_scope *devices; /* target devices */
        int     devices_cnt;            /* target device count */
        struct iommu_resv_region *resv; /* reserved region handle */
};

struct dmar_atsr_unit {
        struct list_head list;          /* list of ATSR units */
        struct acpi_dmar_header *hdr;   /* ACPI header */
        struct dmar_dev_scope *devices; /* target devices */
        int devices_cnt;                /* target device count */
        u8 include_all:1;               /* include all ports */
};
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
        list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;
static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct dmar_domain *domain,
                                     struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static void domain_context_clear(struct intel_iommu *iommu,
                                 struct device *dev);
static int domain_detach_iommu(struct dmar_domain *domain,
                               struct intel_iommu *iommu);
#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int intel_iommu_ecs = 1;
static int iommu_identity_mapping;

#define IDENTMAP_ALL            1
#define IDENTMAP_GFX            2
#define IDENTMAP_AZALIA         4

#define ecs_enabled(iommu)      (intel_iommu_ecs && ecap_ecs(iommu->ecap))
#define pasid_enabled(iommu)    (ecs_enabled(iommu) && ecap_pasid(iommu->ecap))

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

const struct iommu_ops intel_iommu_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
        return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
        iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
        u32 gsts;

        gsts = readl(iommu->reg + DMAR_GSTS_REG);
        if (gsts & DMA_GSTS_TES)
                iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

/* Convert a generic struct iommu_domain to the private struct dmar_domain */
static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
        return container_of(dom, struct dmar_domain, domain);
}
static int __init intel_iommu_setup(char *str)
{
        if (!str)
                return -EINVAL;
        while (*str) {
                if (!strncmp(str, "on", 2)) {
                        dmar_disabled = 0;
                        pr_info("IOMMU enabled\n");
                } else if (!strncmp(str, "off", 3)) {
                        dmar_disabled = 1;
                        pr_info("IOMMU disabled\n");
                } else if (!strncmp(str, "igfx_off", 8)) {
                        dmar_map_gfx = 0;
                        pr_info("Disable GFX device mapping\n");
                } else if (!strncmp(str, "forcedac", 8)) {
                        pr_info("Forcing DAC for PCI devices\n");
                        dmar_forcedac = 1;
                } else if (!strncmp(str, "strict", 6)) {
                        pr_info("Disable batched IOTLB flush\n");
                        intel_iommu_strict = 1;
                } else if (!strncmp(str, "sp_off", 6)) {
                        pr_info("Disable supported super page\n");
                        intel_iommu_superpage = 0;
                } else if (!strncmp(str, "ecs_off", 7)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: disable extended context table support\n");
                        intel_iommu_ecs = 0;
                } else if (!strncmp(str, "tboot_noforce", 13)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
                        intel_iommu_tboot_noforce = 1;
                }

                str += strcspn(str, ",");
                while (*str == ',')
                        str++;
        }
        return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
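/*
 * Usage example (editor's note, implied by the parser above): options
 * are comma-separated on the kernel command line, e.g.
 *
 *      intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage use; the strcspn() walk above advances to the next option.
 */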
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
        struct dmar_domain **domains;
        int idx = did >> 8;

        domains = iommu->domains[idx];
        if (!domains)
                return NULL;

        return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
                             struct dmar_domain *domain)
{
        struct dmar_domain **domains;
        int idx = did >> 8;

        if (!iommu->domains[idx]) {
                size_t size = 256 * sizeof(struct dmar_domain *);

                iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
        }

        domains = iommu->domains[idx];
        if (WARN_ON(!domains))
                return;
        else
                domains[did & 0xff] = domain;
}
static inline void *alloc_pgtable_page(int node)
{
        struct page *page;
        void *vaddr = NULL;

        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
        if (page)
                vaddr = page_address(page);
        return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
        free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
        kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
        kmem_cache_free(iommu_devinfo_cache, vaddr);
}
static inline int domain_type_is_vm(struct dmar_domain *domain)
{
        return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
        return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
{
        return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
                                DOMAIN_FLAG_STATIC_IDENTITY);
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
                                       unsigned long pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

        return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
        unsigned long sagaw;
        int agaw = -1;

        sagaw = cap_sagaw(iommu->cap);
        for (agaw = width_to_agaw(max_gaw);
             agaw >= 0; agaw--) {
                if (test_bit(agaw, &sagaw))
                        break;
        }

        return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, so use a default agaw and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
/* This function only returns a single iommu in a domain */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
        int iommu_id;

        /* si_domain and vm domain should not get here. */
        BUG_ON(domain_type_is_vm_or_si(domain));
        for_each_domain_iommu(iommu_id, domain)
                break;

        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
                return NULL;

        return g_iommus[iommu_id];
}
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
        bool found = false;
        int i;

        domain->iommu_coherency = 1;

        for_each_domain_iommu(i, domain) {
                found = true;
                if (!ecap_coherent(g_iommus[i]->ecap)) {
                        domain->iommu_coherency = 0;
                        break;
                }
        }
        if (found)
                return;

        /* No hardware attached; use lowest common denominator */
        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (!ecap_coherent(iommu->ecap)) {
                        domain->iommu_coherency = 0;
                        break;
                }
        }
        rcu_read_unlock();
}
static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
        int ret = 1;

        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (iommu != skip) {
                        if (!ecap_sc_support(iommu->ecap)) {
                                ret = 0;
                                break;
                        }
                }
        }
        rcu_read_unlock();

        return ret;
}
static int domain_update_iommu_superpage(struct intel_iommu *skip)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
        int mask = 0xf;

        if (!intel_iommu_superpage) {
                return 0;
        }

        /* set iommu_superpage to the smallest common denominator */
        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (iommu != skip) {
                        mask &= cap_super_page_val(iommu->cap);
                        if (!mask)
                                break;
                }
        }
        rcu_read_unlock();

        return fls(mask);
}
788 static void domain_update_iommu_cap(struct dmar_domain *domain)
790 domain_update_iommu_coherency(domain);
791 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
792 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
                                                       u8 bus, u8 devfn, int alloc)
{
        struct root_entry *root = &iommu->root_entry[bus];
        struct context_entry *context;
        u64 *entry;

        entry = &root->lo;
        if (ecs_enabled(iommu)) {
                if (devfn >= 0x80) {
                        devfn -= 0x80;
                        entry = &root->hi;
                }
                devfn *= 2;
        }
        if (*entry & 1)
                context = phys_to_virt(*entry & VTD_PAGE_MASK);
        else {
                unsigned long phy_addr;

                if (!alloc)
                        return NULL;

                context = alloc_pgtable_page(iommu->node);
                if (!context)
                        return NULL;

                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
                phy_addr = virt_to_phys((void *)context);
                *entry = phy_addr | 1;
                __iommu_flush_cache(iommu, entry, sizeof(*entry));
        }
        return &context[devfn];
}
static int iommu_dummy(struct device *dev)
{
        return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
        struct dmar_drhd_unit *drhd = NULL;
        struct intel_iommu *iommu;
        struct device *tmp;
        struct pci_dev *ptmp, *pdev = NULL;
        u16 segment = 0;
        int i;

        if (iommu_dummy(dev))
                return NULL;

        if (dev_is_pci(dev)) {
                struct pci_dev *pf_pdev;

                pdev = to_pci_dev(dev);

#ifdef CONFIG_X86
                /* VMD child devices currently cannot be handled individually */
                if (is_vmd(pdev->bus))
                        return NULL;
#endif

                /* VFs aren't listed in scope tables; we need to look up
                 * the PF instead to find the IOMMU. */
                pf_pdev = pci_physfn(pdev);
                dev = &pf_pdev->dev;
                segment = pci_domain_nr(pdev->bus);
        } else if (has_acpi_companion(dev))
                dev = &ACPI_COMPANION(dev)->dev;

        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (pdev && segment != drhd->segment)
                        continue;

                for_each_active_dev_scope(drhd->devices,
                                          drhd->devices_cnt, i, tmp) {
                        if (tmp == dev) {
                                /* For a VF use its original BDF# not that of the PF
                                 * which we used for the IOMMU lookup. Strictly speaking
                                 * we could do this for all PCI devices; we only need to
                                 * get the BDF# from the scope table for ACPI matches. */
                                if (pdev && pdev->is_virtfn)
                                        goto got_pdev;

                                *bus = drhd->devices[i].bus;
                                *devfn = drhd->devices[i].devfn;
                                goto out;
                        }

                        if (!pdev || !dev_is_pci(tmp))
                                continue;

                        ptmp = to_pci_dev(tmp);
                        if (ptmp->subordinate &&
                            ptmp->subordinate->number <= pdev->bus->number &&
                            ptmp->subordinate->busn_res.end >= pdev->bus->number)
                                goto got_pdev;
                }

                if (pdev && drhd->include_all) {
                got_pdev:
                        *bus = pdev->bus->number;
                        *devfn = pdev->devfn;
                        goto out;
                }
        }
        iommu = NULL;
out:
        rcu_read_unlock();

        return iommu;
}
static void domain_flush_cache(struct dmar_domain *domain,
                               void *addr, int size)
{
        if (!domain->iommu_coherency)
                clflush_cache_range(addr, size);
}
static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
        struct context_entry *context;
        int ret = 0;
        unsigned long flags;

        spin_lock_irqsave(&iommu->lock, flags);
        context = iommu_context_addr(iommu, bus, devfn, 0);
        if (context)
                ret = context_present(context);
        spin_unlock_irqrestore(&iommu->lock, flags);
        return ret;
}
static void free_context_table(struct intel_iommu *iommu)
{
        int i;
        unsigned long flags;
        struct context_entry *context;

        spin_lock_irqsave(&iommu->lock, flags);
        if (!iommu->root_entry) {
                goto out;
        }
        for (i = 0; i < ROOT_ENTRY_NR; i++) {
                context = iommu_context_addr(iommu, i, 0, 0);
                if (context)
                        free_pgtable_page(context);

                if (!ecs_enabled(iommu))
                        continue;

                context = iommu_context_addr(iommu, i, 0x80, 0);
                if (context)
                        free_pgtable_page(context);
        }
        free_pgtable_page(iommu->root_entry);
        iommu->root_entry = NULL;
out:
        spin_unlock_irqrestore(&iommu->lock, flags);
}
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn, int *target_level)
{
        struct dma_pte *parent, *pte = NULL;
        int level = agaw_to_level(domain->agaw);
        int offset;

        BUG_ON(!domain->pgd);

        if (!domain_pfn_supported(domain, pfn))
                /* Address beyond IOMMU's addressing capabilities. */
                return NULL;

        parent = domain->pgd;

        while (1) {
                void *tmp_page;

                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
                        break;
                if (level == *target_level)
                        break;

                if (!dma_pte_present(pte)) {
                        uint64_t pteval;

                        tmp_page = alloc_pgtable_page(domain->nid);
                        if (!tmp_page)
                                return NULL;

                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                        if (cmpxchg64(&pte->val, 0ULL, pteval))
                                /* Someone else set it while we were thinking; use theirs. */
                                free_pgtable_page(tmp_page);
                        else
                                domain_flush_cache(domain, pte, sizeof(*pte));
                }
                if (level == 1)
                        break;

                parent = phys_to_virt(dma_pte_addr(pte));
                level--;
        }

        if (!*target_level)
                *target_level = level;

        return pte;
}
/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
                                         unsigned long pfn,
                                         int level, int *large_page)
{
        struct dma_pte *parent, *pte = NULL;
        int total = agaw_to_level(domain->agaw);
        int offset;

        parent = domain->pgd;
        while (level <= total) {
                offset = pfn_level_offset(pfn, total);
                pte = &parent[offset];
                if (level == total)
                        return pte;

                if (!dma_pte_present(pte)) {
                        *large_page = total;
                        break;
                }

                if (dma_pte_superpage(pte)) {
                        *large_page = total;
                        return pte;
                }

                parent = phys_to_virt(dma_pte_addr(pte));
                total--;
        }
        return NULL;
}
/* clear last level pte; should be followed by an IOTLB flush */
static void dma_pte_clear_range(struct dmar_domain *domain,
                                unsigned long start_pfn,
                                unsigned long last_pfn)
{
        unsigned int large_page = 1;
        struct dma_pte *first_pte, *pte;

        BUG_ON(!domain_pfn_supported(domain, start_pfn));
        BUG_ON(!domain_pfn_supported(domain, last_pfn));
        BUG_ON(start_pfn > last_pfn);

        /* we don't need lock here; nobody else touches the iova range */
        do {
                large_page = 1;
                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
                if (!pte) {
                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
                        continue;
                }
                do {
                        dma_clear_pte(pte);
                        start_pfn += lvl_to_nr_pages(large_page);
                        pte++;
                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));

                domain_flush_cache(domain, first_pte,
                                   (void *)pte - (void *)first_pte);

        } while (start_pfn && start_pfn <= last_pfn);
}
static void dma_pte_free_level(struct dmar_domain *domain, int level,
                               int retain_level, struct dma_pte *pte,
                               unsigned long pfn, unsigned long start_pfn,
                               unsigned long last_pfn)
{
        pfn = max(start_pfn, pfn);
        pte = &pte[pfn_level_offset(pfn, level)];

        do {
                unsigned long level_pfn;
                struct dma_pte *level_pte;

                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
                        goto next;

                level_pfn = pfn & level_mask(level);
                level_pte = phys_to_virt(dma_pte_addr(pte));

                if (level > 2) {
                        dma_pte_free_level(domain, level - 1, retain_level,
                                           level_pte, level_pfn, start_pfn,
                                           last_pfn);
                }

                /*
                 * Free the page table if we're below the level we want to
                 * retain and the range covers the entire table.
                 */
                if (level < retain_level && !(start_pfn > level_pfn ||
                      last_pfn < level_pfn + level_size(level) - 1)) {
                        dma_clear_pte(pte);
                        domain_flush_cache(domain, pte, sizeof(*pte));
                        free_pgtable_page(level_pte);
                }
next:
                pfn += level_size(level);
        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
                                   unsigned long start_pfn,
                                   unsigned long last_pfn,
                                   int retain_level)
{
        BUG_ON(!domain_pfn_supported(domain, start_pfn));
        BUG_ON(!domain_pfn_supported(domain, last_pfn));
        BUG_ON(start_pfn > last_pfn);

        dma_pte_clear_range(domain, start_pfn, last_pfn);

        /* We don't need lock here; nobody else touches the iova range */
        dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
                           domain->pgd, 0, start_pfn, last_pfn);

        /* free pgd */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                free_pgtable_page(domain->pgd);
                domain->pgd = NULL;
        }
}
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
                                            int level, struct dma_pte *pte,
                                            struct page *freelist)
{
        struct page *pg;

        pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
        pg->freelist = freelist;
        freelist = pg;

        if (level == 1)
                return freelist;

        pte = page_address(pg);
        do {
                if (dma_pte_present(pte) && !dma_pte_superpage(pte))
                        freelist = dma_pte_list_pagetables(domain, level - 1,
                                                           pte, freelist);
                pte++;
        } while (!first_pte_in_page(pte));

        return freelist;
}
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
                                        struct dma_pte *pte, unsigned long pfn,
                                        unsigned long start_pfn,
                                        unsigned long last_pfn,
                                        struct page *freelist)
{
        struct dma_pte *first_pte = NULL, *last_pte = NULL;

        pfn = max(start_pfn, pfn);
        pte = &pte[pfn_level_offset(pfn, level)];

        do {
                unsigned long level_pfn;

                if (!dma_pte_present(pte))
                        goto next;

                level_pfn = pfn & level_mask(level);

                /* If range covers entire pagetable, free it */
                if (start_pfn <= level_pfn &&
                    last_pfn >= level_pfn + level_size(level) - 1) {
                        /* These subordinate page tables are going away entirely. Don't
                           bother to clear them; we're just going to *free* them. */
                        if (level > 1 && !dma_pte_superpage(pte))
                                freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

                        dma_clear_pte(pte);
                        if (!first_pte)
                                first_pte = pte;
                        last_pte = pte;
                } else if (level > 1) {
                        /* Recurse down into a level that isn't *entirely* obsolete */
                        freelist = dma_pte_clear_level(domain, level - 1,
                                                       phys_to_virt(dma_pte_addr(pte)),
                                                       level_pfn, start_pfn, last_pfn,
                                                       freelist);
                }
next:
                pfn += level_size(level);
        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);

        if (first_pte)
                domain_flush_cache(domain, first_pte,
                                   (void *)++last_pte - (void *)first_pte);

        return freelist;
}
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
                                 unsigned long start_pfn,
                                 unsigned long last_pfn)
{
        struct page *freelist = NULL;

        BUG_ON(!domain_pfn_supported(domain, start_pfn));
        BUG_ON(!domain_pfn_supported(domain, last_pfn));
        BUG_ON(start_pfn > last_pfn);

        /* we don't need lock here; nobody else touches the iova range */
        freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
                                       domain->pgd, 0, start_pfn, last_pfn, NULL);

        /* free pgd */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                struct page *pgd_page = virt_to_page(domain->pgd);

                pgd_page->freelist = freelist;
                freelist = pgd_page;

                domain->pgd = NULL;
        }

        return freelist;
}
static void dma_free_pagelist(struct page *freelist)
{
        struct page *pg;

        while ((pg = freelist)) {
                freelist = pg->freelist;
                free_pgtable_page(page_address(pg));
        }
}

static void iova_entry_free(unsigned long data)
{
        struct page *freelist = (struct page *)data;

        dma_free_pagelist(freelist);
}
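/*
 * Editor's sketch (illustrative, not driver code): the deferred-free
 * protocol described above is always "unlink, flush, then free", e.g.
 *
 *      freelist = domain_unmap(domain, start_pfn, last_pfn);
 *      iommu_flush_iotlb_psi(iommu, domain, start_pfn, nrpages, 0, 0);
 *      dma_free_pagelist(freelist);
 *
 * Freeing before the IOTLB flush would let an in-flight hardware
 * page-walk touch pages that have already been reused.
 */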
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
        struct root_entry *root;
        unsigned long flags;

        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
        if (!root) {
                pr_err("Allocating root entry for %s failed\n",
                        iommu->name);
                return -ENOMEM;
        }

        __iommu_flush_cache(iommu, root, ROOT_SIZE);

        spin_lock_irqsave(&iommu->lock, flags);
        iommu->root_entry = root;
        spin_unlock_irqrestore(&iommu->lock, flags);

        return 0;
}
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
        u64 addr;
        u32 sts;
        unsigned long flag;

        addr = virt_to_phys(iommu->root_entry);
        if (ecs_enabled(iommu))
                addr |= DMA_RTADDR_RTT;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (sts & DMA_GSTS_RTPS), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
        u32 val;
        unsigned long flag;

        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
                return;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (!(val & DMA_GSTS_WBFS)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
                                  u16 did, u16 source_id, u8 function_mask,
                                  u64 type)
{
        u64 val = 0;
        unsigned long flag;

        switch (type) {
        case DMA_CCMD_GLOBAL_INVL:
                val = DMA_CCMD_GLOBAL_INVL;
                break;
        case DMA_CCMD_DOMAIN_INVL:
                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
                break;
        case DMA_CCMD_DEVICE_INVL:
                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
                break;
        default:
                BUG();
        }
        val |= DMA_CCMD_ICC;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
                dmar_readq, (!(val & DMA_CCMD_ICC)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
                                u64 addr, unsigned int size_order, u64 type)
{
        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
        u64 val = 0, val_iva = 0;
        unsigned long flag;

        switch (type) {
        case DMA_TLB_GLOBAL_FLUSH:
                /* global flush doesn't need to set IVA_REG */
                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
                break;
        case DMA_TLB_DSI_FLUSH:
                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                break;
        case DMA_TLB_PSI_FLUSH:
                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                /* IH bit is passed in as part of address */
                val_iva = size_order | addr;
                break;
        default:
                BUG();
        }
        /* Note: set drain read/write */
#if 0
        /*
         * This is probably to be super secure.. Looks like we can
         * ignore it without any impact.
         */
        if (cap_read_drain(iommu->cap))
                val |= DMA_TLB_READ_DRAIN;
#endif
        if (cap_write_drain(iommu->cap))
                val |= DMA_TLB_WRITE_DRAIN;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        /* Note: Only uses first TLB reg currently */
        if (val_iva)
                dmar_writeq(iommu->reg + tlb_offset, val_iva);
        dmar_writeq(iommu->reg + tlb_offset + 8, val);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
                dmar_readq, (!(val & DMA_TLB_IVT)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

        /* check IOTLB invalidation granularity */
        if (DMA_TLB_IAIG(val) == 0)
                pr_err("Flush IOTLB failed\n");
        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
                pr_debug("TLB flush request %Lx, actual %Lx\n",
                        (unsigned long long)DMA_TLB_IIRG(type),
                        (unsigned long long)DMA_TLB_IAIG(val));
}
static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
                        u8 bus, u8 devfn)
{
        struct device_domain_info *info;

        assert_spin_locked(&device_domain_lock);

        if (!iommu->qi)
                return NULL;

        list_for_each_entry(info, &domain->devices, link)
                if (info->iommu == iommu && info->bus == bus &&
                    info->devfn == devfn) {
                        if (info->ats_supported && info->dev)
                                return info;
                        break;
                }

        return NULL;
}
static void domain_update_iotlb(struct dmar_domain *domain)
{
        struct device_domain_info *info;
        bool has_iotlb_device = false;

        assert_spin_locked(&device_domain_lock);

        list_for_each_entry(info, &domain->devices, link) {
                struct pci_dev *pdev;

                if (!info->dev || !dev_is_pci(info->dev))
                        continue;

                pdev = to_pci_dev(info->dev);
                if (pdev->ats_enabled) {
                        has_iotlb_device = true;
                        break;
                }
        }

        domain->has_iotlb_device = has_iotlb_device;
}
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
        struct pci_dev *pdev;

        assert_spin_locked(&device_domain_lock);

        if (!info || !dev_is_pci(info->dev))
                return;

        pdev = to_pci_dev(info->dev);

#ifdef CONFIG_INTEL_IOMMU_SVM
        /* The PCIe spec, in its wisdom, declares that the behaviour of
           the device if you enable PASID support after ATS support is
           undefined. So always enable PASID support on devices which
           have it, even if we can't yet know if we're ever going to
           use it. */
        if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
                info->pasid_enabled = 1;

        if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
                info->pri_enabled = 1;
#endif
        if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
                info->ats_enabled = 1;
                domain_update_iotlb(info->domain);
                info->ats_qdep = pci_ats_queue_depth(pdev);
        }
}
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
        struct pci_dev *pdev;

        assert_spin_locked(&device_domain_lock);

        if (!dev_is_pci(info->dev))
                return;

        pdev = to_pci_dev(info->dev);

        if (info->ats_enabled) {
                pci_disable_ats(pdev);
                info->ats_enabled = 0;
                domain_update_iotlb(info->domain);
        }
#ifdef CONFIG_INTEL_IOMMU_SVM
        if (info->pri_enabled) {
                pci_disable_pri(pdev);
                info->pri_enabled = 0;
        }
        if (info->pasid_enabled) {
                pci_disable_pasid(pdev);
                info->pasid_enabled = 0;
        }
#endif
}
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
                                  u64 addr, unsigned mask)
{
        u16 sid, qdep;
        unsigned long flags;
        struct device_domain_info *info;

        if (!domain->has_iotlb_device)
                return;

        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link) {
                if (!info->ats_enabled)
                        continue;

                sid = info->bus << 8 | info->devfn;
                qdep = info->ats_qdep;
                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
                                  struct dmar_domain *domain,
                                  unsigned long pfn, unsigned int pages,
                                  int ih, int map)
{
        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
        u16 did = domain->iommu_did[iommu->seq_id];

        BUG_ON(pages == 0);

        if (ih)
                ih = 1 << 6;
        /*
         * Fallback to domain selective flush if no PSI support or the size is
         * too big.
         * PSI requires page size to be 2 ^ x, and the base address is naturally
         * aligned to the size.
         */
        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
                iommu->flush.flush_iotlb(iommu, did, 0, 0,
                                                DMA_TLB_DSI_FLUSH);
        else
                iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
                                                DMA_TLB_PSI_FLUSH);

        /*
         * In caching mode, changes of pages from non-present to present require
         * flush. However, device IOTLB doesn't need to be flushed in this case.
         */
        if (!cap_caching_mode(iommu->cap) || !map)
                iommu_flush_dev_iotlb(domain, addr, mask);
}
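/*
 * Worked example (editor's illustration): flushing 5 pages gives
 * mask = ilog2(__roundup_pow_of_two(5)) = ilog2(8) = 3, i.e. an
 * 8-page naturally aligned region, since PSI takes a 2^x page count
 * with the base address aligned to that size.
 */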
/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
                                        struct dmar_domain *domain,
                                        unsigned long pfn, unsigned int pages)
{
        /* It's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
                iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
}
static void iommu_flush_iova(struct iova_domain *iovad)
{
        struct dmar_domain *domain;
        int idx;

        domain = container_of(iovad, struct dmar_domain, iovad);

        for_each_domain_iommu(idx, domain) {
                struct intel_iommu *iommu = g_iommus[idx];
                u16 did = domain->iommu_did[iommu->seq_id];

                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);

                if (!cap_caching_mode(iommu->cap))
                        iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
                                              0, MAX_AGAW_PFN_WIDTH);
        }
}
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
        u32 pmen;
        unsigned long flags;

        raw_spin_lock_irqsave(&iommu->register_lock, flags);
        pmen = readl(iommu->reg + DMAR_PMEN_REG);
        pmen &= ~DMA_PMEN_EPM;
        writel(pmen, iommu->reg + DMAR_PMEN_REG);

        /* wait for the protected region status bit to clear */
        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
                readl, !(pmen & DMA_PMEN_PRS), pmen);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
static void iommu_enable_translation(struct intel_iommu *iommu)
{
        u32 sts;
        unsigned long flags;

        raw_spin_lock_irqsave(&iommu->register_lock, flags);
        iommu->gcmd |= DMA_GCMD_TE;
        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (sts & DMA_GSTS_TES), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_disable_translation(struct intel_iommu *iommu)
{
        u32 sts;
        unsigned long flag;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        iommu->gcmd &= ~DMA_GCMD_TE;
        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware completes it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (!(sts & DMA_GSTS_TES)), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static int iommu_init_domains(struct intel_iommu *iommu)
{
        u32 ndomains, nlongs;
        size_t size;

        ndomains = cap_ndoms(iommu->cap);
        pr_debug("%s: Number of Domains supported <%d>\n",
                 iommu->name, ndomains);
        nlongs = BITS_TO_LONGS(ndomains);

        spin_lock_init(&iommu->lock);

        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
        if (!iommu->domain_ids) {
                pr_err("%s: Allocating domain id array failed\n",
                       iommu->name);
                return -ENOMEM;
        }

        size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
        iommu->domains = kzalloc(size, GFP_KERNEL);

        if (iommu->domains) {
                size = 256 * sizeof(struct dmar_domain *);
                iommu->domains[0] = kzalloc(size, GFP_KERNEL);
        }

        if (!iommu->domains || !iommu->domains[0]) {
                pr_err("%s: Allocating domain array failed\n",
                       iommu->name);
                kfree(iommu->domain_ids);
                kfree(iommu->domains);
                iommu->domain_ids = NULL;
                iommu->domains = NULL;
                return -ENOMEM;
        }

        /*
         * If Caching mode is set, then invalid translations are tagged
         * with domain-id 0, hence we need to pre-allocate it. We also
         * use domain-id 0 as a marker for non-allocated domain-id, so
         * make sure it is not used for a real domain.
         */
        set_bit(0, iommu->domain_ids);

        return 0;
}
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
        struct device_domain_info *info, *tmp;
        unsigned long flags;

        if (!iommu->domains || !iommu->domain_ids)
                return;

again:
        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
                struct dmar_domain *domain;

                if (info->iommu != iommu)
                        continue;

                if (!info->dev || !info->domain)
                        continue;

                domain = info->domain;

                __dmar_remove_one_dev_info(info);

                if (!domain_type_is_vm_or_si(domain)) {
                        /*
                         * The domain_exit() function can't be called under
                         * device_domain_lock, as it takes this lock itself.
                         * So release the lock here and re-run the loop
                         * afterwards.
                         */
                        spin_unlock_irqrestore(&device_domain_lock, flags);
                        domain_exit(domain);
                        goto again;
                }
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);

        if (iommu->gcmd & DMA_GCMD_TE)
                iommu_disable_translation(iommu);
}
static void free_dmar_iommu(struct intel_iommu *iommu)
{
        if ((iommu->domains) && (iommu->domain_ids)) {
                int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
                int i;

                for (i = 0; i < elems; i++)
                        kfree(iommu->domains[i]);
                kfree(iommu->domains);
                kfree(iommu->domain_ids);
                iommu->domains = NULL;
                iommu->domain_ids = NULL;
        }

        g_iommus[iommu->seq_id] = NULL;

        /* free context mapping */
        free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
        if (pasid_enabled(iommu)) {
                if (ecap_prs(iommu->ecap))
                        intel_svm_finish_prq(iommu);
                intel_svm_free_pasid_tables(iommu);
        }
#endif
}
static struct dmar_domain *alloc_domain(int flags)
{
        struct dmar_domain *domain;

        domain = alloc_domain_mem();
        if (!domain)
                return NULL;

        memset(domain, 0, sizeof(*domain));
        domain->nid = -1;
        domain->flags = flags;
        domain->has_iotlb_device = false;
        INIT_LIST_HEAD(&domain->devices);

        return domain;
}
/* Must be called with iommu->lock */
static int domain_attach_iommu(struct dmar_domain *domain,
                               struct intel_iommu *iommu)
{
        unsigned long ndomains;
        int num;

        assert_spin_locked(&device_domain_lock);
        assert_spin_locked(&iommu->lock);

        domain->iommu_refcnt[iommu->seq_id] += 1;
        domain->iommu_count += 1;
        if (domain->iommu_refcnt[iommu->seq_id] == 1) {
                ndomains = cap_ndoms(iommu->cap);
                num = find_first_zero_bit(iommu->domain_ids, ndomains);

                if (num >= ndomains) {
                        pr_err("%s: No free domain ids\n", iommu->name);
                        domain->iommu_refcnt[iommu->seq_id] -= 1;
                        domain->iommu_count -= 1;
                        return -ENOSPC;
                }

                set_bit(num, iommu->domain_ids);
                set_iommu_domain(iommu, num, domain);

                domain->iommu_did[iommu->seq_id] = num;
                domain->nid = iommu->node;

                domain_update_iommu_cap(domain);
        }

        return 0;
}
static int domain_detach_iommu(struct dmar_domain *domain,
                               struct intel_iommu *iommu)
{
        int num, count = INT_MAX;

        assert_spin_locked(&device_domain_lock);
        assert_spin_locked(&iommu->lock);

        domain->iommu_refcnt[iommu->seq_id] -= 1;
        count = --domain->iommu_count;
        if (domain->iommu_refcnt[iommu->seq_id] == 0) {
                num = domain->iommu_did[iommu->seq_id];
                clear_bit(num, iommu->domain_ids);
                set_iommu_domain(iommu, num, NULL);

                domain_update_iommu_cap(domain);
                domain->iommu_did[iommu->seq_id] = 0;
        }

        return count;
}
static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static int dmar_init_reserved_ranges(void)
{
        struct pci_dev *pdev = NULL;
        struct iova *iova;
        int i;

        init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);

        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
                &reserved_rbtree_key);

        /* IOAPIC ranges shouldn't be accessed by DMA */
        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
                IOVA_PFN(IOAPIC_RANGE_END));
        if (!iova) {
                pr_err("Reserve IOAPIC range failed\n");
                return -ENODEV;
        }

        /* Reserve all PCI MMIO to avoid peer-to-peer access */
        for_each_pci_dev(pdev) {
                struct resource *r;

                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
                        r = &pdev->resource[i];
                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
                                continue;
                        iova = reserve_iova(&reserved_iova_list,
                                            IOVA_PFN(r->start),
                                            IOVA_PFN(r->end));
                        if (!iova) {
                                pr_err("Reserve iova failed\n");
                                return -ENODEV;
                        }
                }
        }
        return 0;
}
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
static inline int guestwidth_to_adjustwidth(int gaw)
{
        int agaw;
        int r = (gaw - 12) % 9;

        if (r == 0)
                agaw = gaw;
        else
                agaw = gaw + 9 - r;
        if (agaw > 64)
                agaw = 64;
        return agaw;
}
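/*
 * Worked example (editor's illustration): gaw = 48 gives r = (48-12) % 9
 * = 0, so the width already sits on a 9-bit level boundary above the
 * 12-bit page offset and is kept; gaw = 40 gives r = 1 and is rounded
 * up to the next boundary, 48.
 */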
static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
                       int guest_width)
{
        int adjust_width, agaw;
        unsigned long sagaw;
        int err;

        init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);

        err = init_iova_flush_queue(&domain->iovad,
                                    iommu_flush_iova, iova_entry_free);
        if (err)
                return err;

        domain_reserve_special_ranges(domain);

        /* calculate AGAW */
        if (guest_width > cap_mgaw(iommu->cap))
                guest_width = cap_mgaw(iommu->cap);
        domain->gaw = guest_width;
        adjust_width = guestwidth_to_adjustwidth(guest_width);
        agaw = width_to_agaw(adjust_width);
        sagaw = cap_sagaw(iommu->cap);
        if (!test_bit(agaw, &sagaw)) {
                /* hardware doesn't support it, choose a bigger one */
                pr_debug("Hardware doesn't support agaw %d\n", agaw);
                agaw = find_next_bit(&sagaw, 5, agaw);
                if (agaw >= 5)
                        return -ENODEV;
        }
        domain->agaw = agaw;

        if (ecap_coherent(iommu->ecap))
                domain->iommu_coherency = 1;
        else
                domain->iommu_coherency = 0;

        if (ecap_sc_support(iommu->ecap))
                domain->iommu_snooping = 1;
        else
                domain->iommu_snooping = 0;

        if (intel_iommu_superpage)
                domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
        else
                domain->iommu_superpage = 0;

        domain->nid = iommu->node;

        /* always allocate the top pgd */
        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
        if (!domain->pgd)
                return -ENOMEM;
        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
        return 0;
}
static void domain_exit(struct dmar_domain *domain)
{
        struct page *freelist = NULL;

        /* Domain 0 is reserved, so don't process it */
        if (!domain)
                return;

        /* Remove associated devices and clear attached or cached domains */
        rcu_read_lock();
        domain_remove_dev_info(domain);
        rcu_read_unlock();

        /* destroy iovas */
        put_iova_domain(&domain->iovad);

        freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

        dma_free_pagelist(freelist);

        free_domain_mem(domain);
}
static int domain_context_mapping_one(struct dmar_domain *domain,
                                      struct intel_iommu *iommu,
                                      u8 bus, u8 devfn)
{
        u16 did = domain->iommu_did[iommu->seq_id];
        int translation = CONTEXT_TT_MULTI_LEVEL;
        struct device_domain_info *info = NULL;
        struct context_entry *context;
        unsigned long flags;
        struct dma_pte *pgd;
        int ret, agaw;

        if (hw_pass_through && domain_type_is_si(domain))
                translation = CONTEXT_TT_PASS_THROUGH;

        pr_debug("Set context mapping for %02x:%02x.%d\n",
                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

        BUG_ON(!domain->pgd);

        spin_lock_irqsave(&device_domain_lock, flags);
        spin_lock(&iommu->lock);

        ret = -ENOMEM;
        context = iommu_context_addr(iommu, bus, devfn, 1);
        if (!context)
                goto out_unlock;

        ret = 0;
        if (context_present(context))
                goto out_unlock;

        /*
         * For kdump cases, old valid entries may be cached due to the
         * in-flight DMA and copied pgtable, but there is no unmapping
         * behaviour for them, thus we need an explicit cache flush for
         * the newly-mapped device. For kdump, at this point, the device
         * is supposed to finish reset at its driver probe stage, so no
         * in-flight DMA will exist, and we don't need to worry anymore
         * hereafter.
         */
        if (context_copied(context)) {
                u16 did_old = context_domain_id(context);

                if (did_old < cap_ndoms(iommu->cap)) {
                        iommu->flush.flush_context(iommu, did_old,
                                                   (((u16)bus) << 8) | devfn,
                                                   DMA_CCMD_MASK_NOBIT,
                                                   DMA_CCMD_DEVICE_INVL);
                        iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
                                                 DMA_TLB_DSI_FLUSH);
                }
        }

        pgd = domain->pgd;

        context_clear_entry(context);
        context_set_domain_id(context, did);

        /*
         * Skip top levels of page tables for iommu which has less agaw
         * than default. Unnecessary for PT mode.
         */
        if (translation != CONTEXT_TT_PASS_THROUGH) {
                for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
                        ret = -ENOMEM;
                        pgd = phys_to_virt(dma_pte_addr(pgd));
                        if (!dma_pte_present(pgd))
                                goto out_unlock;
                }

                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
                if (info && info->ats_supported)
                        translation = CONTEXT_TT_DEV_IOTLB;
                else
                        translation = CONTEXT_TT_MULTI_LEVEL;

                context_set_address_root(context, virt_to_phys(pgd));
                context_set_address_width(context, iommu->agaw);
        } else {
                /*
                 * In pass through mode, AW must be programmed to
                 * indicate the largest AGAW value supported by
                 * hardware. And ASR is ignored by hardware.
                 */
                context_set_address_width(context, iommu->msagaw);
        }

        context_set_translation_type(context, translation);
        context_set_fault_enable(context);
        context_set_present(context);
        domain_flush_cache(domain, context, sizeof(*context));

        /*
         * It's a non-present to present mapping. If hardware doesn't cache
         * non-present entries we only need to flush the write-buffer. If it
         * _does_ cache non-present entries, then it does so in the special
         * domain #0, which we have to flush:
         */
        if (cap_caching_mode(iommu->cap)) {
                iommu->flush.flush_context(iommu, 0,
                                           (((u16)bus) << 8) | devfn,
                                           DMA_CCMD_MASK_NOBIT,
                                           DMA_CCMD_DEVICE_INVL);
                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
        } else {
                iommu_flush_write_buffer(iommu);
        }
        iommu_enable_dev_iotlb(info);

        ret = 0;

out_unlock:
        spin_unlock(&iommu->lock);
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return ret;
}
struct domain_context_mapping_data {
        struct dmar_domain *domain;
        struct intel_iommu *iommu;
};

static int domain_context_mapping_cb(struct pci_dev *pdev,
                                     u16 alias, void *opaque)
{
        struct domain_context_mapping_data *data = opaque;

        return domain_context_mapping_one(data->domain, data->iommu,
                                          PCI_BUS_NUM(alias), alias & 0xff);
}
static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
{
        struct intel_iommu *iommu;
        u8 bus, devfn;
        struct domain_context_mapping_data data;

        iommu = device_to_iommu(dev, &bus, &devfn);
        if (!iommu)
                return -ENODEV;

        if (!dev_is_pci(dev))
                return domain_context_mapping_one(domain, iommu, bus, devfn);

        data.domain = domain;
        data.iommu = iommu;

        return pci_for_each_dma_alias(to_pci_dev(dev),
                                      &domain_context_mapping_cb, &data);
}
static int domain_context_mapped_cb(struct pci_dev *pdev,
                                    u16 alias, void *opaque)
{
        struct intel_iommu *iommu = opaque;

        return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
}

static int domain_context_mapped(struct device *dev)
{
        struct intel_iommu *iommu;
        u8 bus, devfn;

        iommu = device_to_iommu(dev, &bus, &devfn);
        if (!iommu)
                return -ENODEV;

        if (!dev_is_pci(dev))
                return device_context_mapped(iommu, bus, devfn);

        return !pci_for_each_dma_alias(to_pci_dev(dev),
                                       domain_context_mapped_cb, iommu);
}
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
                                            size_t size)
{
        host_addr &= ~PAGE_MASK;
        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
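/*
 * Worked example (editor's illustration): host_addr = 0x1234 and
 * size = 0x2000 with 4KiB pages leaves a 0x234 in-page offset, so
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000 and the result is 3 VT-d pages.
 */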
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
                                          unsigned long iov_pfn,
                                          unsigned long phy_pfn,
                                          unsigned long pages)
{
        int support, level = 1;
        unsigned long pfnmerge;

        support = domain->iommu_superpage;

        /* To use a large page, the virtual *and* physical addresses
           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
           of them will mean we have to use smaller pages. So just
           merge them and check both at once. */
        pfnmerge = iov_pfn | phy_pfn;

        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
                pages >>= VTD_STRIDE_SHIFT;
                if (!pages)
                        break;
                pfnmerge >>= VTD_STRIDE_SHIFT;
                level++;
                support--;
        }
        return level;
}
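/*
 * Worked example (editor's illustration): iov_pfn = phy_pfn = 0x200
 * (2MiB aligned) with pages = 512 yields level 2, i.e. one 2MiB
 * superpage, provided domain->iommu_superpage allows it; any low bit
 * set in iov_pfn | phy_pfn, or a count below 512, keeps level 1 (4KiB).
 */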
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                            struct scatterlist *sg, unsigned long phys_pfn,
                            unsigned long nr_pages, int prot)
{
        struct dma_pte *first_pte = NULL, *pte = NULL;
        phys_addr_t uninitialized_var(pteval);
        unsigned long sg_res = 0;
        unsigned int largepage_lvl = 0;
        unsigned long lvl_pages = 0;

        BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
                return -EINVAL;

        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

        if (!sg) {
                sg_res = nr_pages;
                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
        }

        while (nr_pages > 0) {
                uint64_t tmp;

                if (!sg_res) {
                        unsigned int pgoff = sg->offset & ~PAGE_MASK;

                        sg_res = aligned_nrpages(sg->offset, sg->length);
                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
                        sg->dma_length = sg->length;
                        pteval = (sg_phys(sg) - pgoff) | prot;
                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
                }

                if (!pte) {
                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
                        if (!pte)
                                return -ENOMEM;
                        /* It is a large page */
                        if (largepage_lvl > 1) {
                                unsigned long nr_superpages, end_pfn;

                                pteval |= DMA_PTE_LARGE_PAGE;
                                lvl_pages = lvl_to_nr_pages(largepage_lvl);

                                nr_superpages = sg_res / lvl_pages;
                                end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

                                /*
                                 * Ensure that old small page tables are
                                 * removed to make room for superpage(s).
                                 * We're adding new large pages, so make sure
                                 * we don't remove their parent tables.
                                 */
                                dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
                                                       largepage_lvl + 1);
                        } else {
                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
                        }
                }
                /* We don't need lock here, nobody else
                 * touches the iova range
                 */
                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
                if (tmp) {
                        static int dumps = 5;

                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
                                iov_pfn, tmp, (unsigned long long)pteval);
                        if (dumps) {
                                dumps--;
                                debug_dma_dump_mappings(NULL);
                        }
                        WARN_ON(1);
                }

                lvl_pages = lvl_to_nr_pages(largepage_lvl);

                BUG_ON(nr_pages < lvl_pages);
                BUG_ON(sg_res < lvl_pages);

                nr_pages -= lvl_pages;
                iov_pfn += lvl_pages;
                phys_pfn += lvl_pages;
                pteval += lvl_pages * VTD_PAGE_SIZE;
                sg_res -= lvl_pages;

                /* If the next PTE would be the first in a new page, then we
                   need to flush the cache on the entries we've just written.
                   And then we'll need to recalculate 'pte', so clear it and
                   let it get set again in the if (!pte) block above.

                   If we're done (!nr_pages) we need to flush the cache too.

                   Also if we've been setting superpages, we may need to
                   recalculate 'pte' and switch back to smaller pages for the
                   end of the mapping, if the trailing size is not enough to
                   use another superpage (i.e. sg_res < lvl_pages). */
                pte++;
                if (!nr_pages || first_pte_in_page(pte) ||
                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        pte = NULL;
                }

                if (!sg_res && nr_pages)
                        sg = sg_next(sg);
        }
        return 0;
}
static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                          struct scatterlist *sg, unsigned long phys_pfn,
                          unsigned long nr_pages, int prot)
{
        int ret;
        struct intel_iommu *iommu;

        /* Do the real mapping first */
        ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
        if (ret)
                return ret;

        /* Notify about the new mapping */
        if (domain_type_is_vm(domain)) {
                /* VM typed domains can have more than one IOMMU */
                int iommu_id;

                for_each_domain_iommu(iommu_id, domain) {
                        iommu = g_iommus[iommu_id];
                        __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
                }
        } else {
                /* General domains only have one IOMMU */
                iommu = domain_get_iommu(domain);
                __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
        }

        return 0;
}

static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                                    struct scatterlist *sg, unsigned long nr_pages,
                                    int prot)
{
        return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}

static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                                     unsigned long phys_pfn, unsigned long nr_pages,
                                     int prot)
{
        return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
2369 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2371 unsigned long flags;
2372 struct context_entry *context;
2378 spin_lock_irqsave(&iommu->lock, flags);
2379 context = iommu_context_addr(iommu, bus, devfn, 0);
2381 spin_unlock_irqrestore(&iommu->lock, flags);
2384 did_old = context_domain_id(context);
2385 context_clear_entry(context);
2386 __iommu_flush_cache(iommu, context, sizeof(*context));
2387 spin_unlock_irqrestore(&iommu->lock, flags);
2388 iommu->flush.flush_context(iommu,
2390 (((u16)bus) << 8) | devfn,
2391 DMA_CCMD_MASK_NOBIT,
2392 DMA_CCMD_DEVICE_INVL);
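/*
 * The source-id above is the 16-bit PCI requester id (bus in bits
 * 15:8, devfn in bits 7:0); the IOTLB flush that follows is scoped
 * to did_old so that only the old domain's translations go away.
 */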
2393 iommu->flush.flush_iotlb(iommu,
2400 static inline void unlink_domain_info(struct device_domain_info *info)
2402 assert_spin_locked(&device_domain_lock);
2403 list_del(&info->link);
2404 list_del(&info->global);
2406 info->dev->archdata.iommu = NULL;
2409 static void domain_remove_dev_info(struct dmar_domain *domain)
2411 struct device_domain_info *info, *tmp;
2412 unsigned long flags;
2414 spin_lock_irqsave(&device_domain_lock, flags);
2415 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2416 __dmar_remove_one_dev_info(info);
2417 spin_unlock_irqrestore(&device_domain_lock, flags);
2422 * Note: we use struct device->archdata.iommu to store the info
2424 static struct dmar_domain *find_domain(struct device *dev)
2426 struct device_domain_info *info;
2428 /* No lock here, assumes no domain exit in normal case */
2429 info = dev->archdata.iommu;
2431 return info->domain;
2435 static inline struct device_domain_info *
2436 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2438 struct device_domain_info *info;
2440 list_for_each_entry(info, &device_domain_list, global)
2441 if (info->iommu->segment == segment && info->bus == bus &&
2442 info->devfn == devfn)
2448 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2451 struct dmar_domain *domain)
2453 struct dmar_domain *found = NULL;
2454 struct device_domain_info *info;
2455 unsigned long flags;
2458 info = alloc_devinfo_mem();
2463 info->devfn = devfn;
2464 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2465 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2468 info->domain = domain;
2469 info->iommu = iommu;
2471 if (dev && dev_is_pci(dev)) {
2472 struct pci_dev *pdev = to_pci_dev(info->dev);
2474 if (!pci_ats_disabled() &&
2475 ecap_dev_iotlb_support(iommu->ecap) &&
2476 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2477 dmar_find_matched_atsr_unit(pdev))
2478 info->ats_supported = 1;
2480 if (ecs_enabled(iommu)) {
2481 if (pasid_enabled(iommu)) {
2482 int features = pci_pasid_features(pdev);
2484 info->pasid_supported = features | 1;
2487 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2488 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2489 info->pri_supported = 1;
2493 spin_lock_irqsave(&device_domain_lock, flags);
2495 found = find_domain(dev);
2498 struct device_domain_info *info2;
2499 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2501 found = info2->domain;
2507 spin_unlock_irqrestore(&device_domain_lock, flags);
2508 free_devinfo_mem(info);
2509 /* Caller must free the original domain */
2513 spin_lock(&iommu->lock);
2514 ret = domain_attach_iommu(domain, iommu);
2515 spin_unlock(&iommu->lock);
2518 spin_unlock_irqrestore(&device_domain_lock, flags);
2519 free_devinfo_mem(info);
2523 list_add(&info->link, &domain->devices);
2524 list_add(&info->global, &device_domain_list);
2526 dev->archdata.iommu = info;
2527 spin_unlock_irqrestore(&device_domain_lock, flags);
2529 if (dev && domain_context_mapping(domain, dev)) {
2530 pr_err("Domain context map for %s failed\n", dev_name(dev));
2531 dmar_remove_one_dev_info(domain, dev);
2538 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2540 *(u16 *)opaque = alias;
2544 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2546 struct device_domain_info *info = NULL;
2547 struct dmar_domain *domain = NULL;
2548 struct intel_iommu *iommu;
2550 unsigned long flags;
2553 iommu = device_to_iommu(dev, &bus, &devfn);
2557 if (dev_is_pci(dev)) {
2558 struct pci_dev *pdev = to_pci_dev(dev);
2560 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2562 spin_lock_irqsave(&device_domain_lock, flags);
2563 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2564 PCI_BUS_NUM(dma_alias),
2567 iommu = info->iommu;
2568 domain = info->domain;
2570 spin_unlock_irqrestore(&device_domain_lock, flags);
2572 /* DMA alias already has a domain, use it */
2577 /* Allocate and initialize new domain for the device */
2578 domain = alloc_domain(0);
2581 if (domain_init(domain, iommu, gaw)) {
2582 domain_exit(domain);
2591 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2592 struct dmar_domain *domain)
2594 struct intel_iommu *iommu;
2595 struct dmar_domain *tmp;
2596 u16 req_id, dma_alias;
2599 iommu = device_to_iommu(dev, &bus, &devfn);
2603 req_id = ((u16)bus << 8) | devfn;
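/*
 * e.g. bus 0x1a, device 2, function 1 gives devfn (2 << 3) | 1 ==
 * 0x11 and hence req_id 0x1a11 -- the same requester-id encoding
 * the hardware puts on DMA transactions.
 */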
2605 if (dev_is_pci(dev)) {
2606 struct pci_dev *pdev = to_pci_dev(dev);
2608 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2610 /* register PCI DMA alias device */
2611 if (req_id != dma_alias) {
2612 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2613 dma_alias & 0xff, NULL, domain);
2615 if (!tmp || tmp != domain)
2620 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2621 if (!tmp || tmp != domain)
2627 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2629 struct dmar_domain *domain, *tmp;
2631 domain = find_domain(dev);
2635 domain = find_or_alloc_domain(dev, gaw);
2639 tmp = set_domain_for_dev(dev, domain);
2640 if (!tmp || domain != tmp) {
2641 domain_exit(domain);
2650 static int iommu_domain_identity_map(struct dmar_domain *domain,
2651 unsigned long long start,
2652 unsigned long long end)
2654 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2655 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2657 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2658 dma_to_mm_pfn(last_vpfn))) {
2659 pr_err("Reserving iova failed\n");
2663 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2665 * RMRR range might overlap with the physical memory range, so clear it first.
2668 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2670 return __domain_mapping(domain, first_vpfn, NULL,
2671 first_vpfn, last_vpfn - first_vpfn + 1,
2672 DMA_PTE_READ|DMA_PTE_WRITE);
2675 static int domain_prepare_identity_map(struct device *dev,
2676 struct dmar_domain *domain,
2677 unsigned long long start,
2678 unsigned long long end)
2680 /* For _hardware_ passthrough, don't bother. But for software
2681 passthrough, we do it anyway -- it may indicate a memory
2682 range which is reserved in E820, so which didn't get set
2683 up to start with in si_domain */
2684 if (domain == si_domain && hw_pass_through) {
2685 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2686 dev_name(dev), start, end);
2690 pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2691 dev_name(dev), start, end);
2694 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2695 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2696 dmi_get_system_info(DMI_BIOS_VENDOR),
2697 dmi_get_system_info(DMI_BIOS_VERSION),
2698 dmi_get_system_info(DMI_PRODUCT_VERSION));
2702 if (end >> agaw_to_width(domain->agaw)) {
2703 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2704 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2705 agaw_to_width(domain->agaw),
2706 dmi_get_system_info(DMI_BIOS_VENDOR),
2707 dmi_get_system_info(DMI_BIOS_VERSION),
2708 dmi_get_system_info(DMI_PRODUCT_VERSION));
2712 return iommu_domain_identity_map(domain, start, end);
2715 static int iommu_prepare_identity_map(struct device *dev,
2716 unsigned long long start,
2717 unsigned long long end)
2719 struct dmar_domain *domain;
2722 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2726 ret = domain_prepare_identity_map(dev, domain, start, end);
2728 domain_exit(domain);
2733 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2736 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2738 return iommu_prepare_identity_map(dev, rmrr->base_address,
2742 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2743 static inline void iommu_prepare_isa(void)
2745 struct pci_dev *pdev;
2748 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2752 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2753 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2756 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2761 static inline void iommu_prepare_isa(void)
2765 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2767 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2769 static int __init si_domain_init(int hw)
2773 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2777 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2778 domain_exit(si_domain);
2782 pr_debug("Identity mapping domain allocated\n");
2787 for_each_online_node(nid) {
2788 unsigned long start_pfn, end_pfn;
2791 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2792 ret = iommu_domain_identity_map(si_domain,
2793 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2802 static int identity_mapping(struct device *dev)
2804 struct device_domain_info *info;
2806 if (likely(!iommu_identity_mapping))
2809 info = dev->archdata.iommu;
2810 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2811 return (info->domain == si_domain);
2816 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2818 struct dmar_domain *ndomain;
2819 struct intel_iommu *iommu;
2822 iommu = device_to_iommu(dev, &bus, &devfn);
2826 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2827 if (ndomain != domain)
2833 static bool device_has_rmrr(struct device *dev)
2835 struct dmar_rmrr_unit *rmrr;
2840 for_each_rmrr_units(rmrr) {
2842 * Return TRUE if this RMRR contains the device that is passed in.
2845 for_each_active_dev_scope(rmrr->devices,
2846 rmrr->devices_cnt, i, tmp)
2857 * There are a couple of cases where we need to restrict the functionality of
2858 * devices associated with RMRRs. The first is when evaluating a device for
2859 * identity mapping because problems exist when devices are moved in and out
2860 * of domains and their respective RMRR information is lost. This means that
2861 * a device with associated RMRRs will never be in a "passthrough" domain.
2862 * The second is use of the device through the IOMMU API. This interface
2863 * expects to have full control of the IOVA space for the device. We cannot
2864 * satisfy both the requirement that RMRR access is maintained and have an
2865 * unencumbered IOVA space. We also have no ability to quiesce the device's
2866 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2867 * We therefore prevent devices associated with an RMRR from participating in
2868 * the IOMMU API, which eliminates them from device assignment.
2870 * In both cases we assume that PCI USB devices with RMRRs have them largely
2871 * for historical reasons and that the RMRR space is not actively used post
2872 * boot. This exclusion may change if vendors begin to abuse it.
2874 * The same exception is made for graphics devices, with the requirement that
2875 * any use of the RMRR regions will be torn down before assigning the device
2878 static bool device_is_rmrr_locked(struct device *dev)
2880 if (!device_has_rmrr(dev))
2883 if (dev_is_pci(dev)) {
2884 struct pci_dev *pdev = to_pci_dev(dev);
2886 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2893 static int iommu_should_identity_map(struct device *dev, int startup)
2896 if (dev_is_pci(dev)) {
2897 struct pci_dev *pdev = to_pci_dev(dev);
2899 if (device_is_rmrr_locked(dev))
2902 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2905 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2908 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2912 * We want to start off with all devices in the 1:1 domain, and
2913 * take them out later if we find they can't access all of memory.
2915 * However, we can't do this for PCI devices behind bridges,
2916 * because all PCI devices behind the same bridge will end up
2917 * with the same source-id on their transactions.
2919 * Practically speaking, we can't change things around for these
2920 * devices at run-time, because we can't be sure there'll be no
2921 * DMA transactions in flight for any of their siblings.
2923 * So PCI devices (unless they're on the root bus) as well as
2924 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2925 * the 1:1 domain, just in _case_ one of their siblings turns out
2926 * not to be able to map all of memory.
2928 if (!pci_is_pcie(pdev)) {
2929 if (!pci_is_root_bus(pdev->bus))
2931 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2933 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2936 if (device_has_rmrr(dev))
2941 * At boot time, we don't yet know if devices will be 64-bit capable.
2942 * Assume that they will -- if they turn out not to be, then we can
2943 * take them out of the 1:1 domain later.
2947 * If the device's dma_mask is less than the system's memory
2948 * size then this is not a candidate for identity mapping.
2950 u64 dma_mask = *dev->dma_mask;
2952 if (dev->coherent_dma_mask &&
2953 dev->coherent_dma_mask < dma_mask)
2954 dma_mask = dev->coherent_dma_mask;
2956 return dma_mask >= dma_get_required_mask(dev);
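/*
 * e.g. a device limited to 32-bit DMA on a machine with RAM above
 * 4GiB has dma_mask < dma_get_required_mask() and is rejected here
 * as an identity-map candidate.
 */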
2962 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2966 if (!iommu_should_identity_map(dev, 1))
2969 ret = domain_add_dev_info(si_domain, dev);
2971 pr_info("%s identity mapping for device %s\n",
2972 hw ? "Hardware" : "Software", dev_name(dev));
2973 else if (ret == -ENODEV)
2974 /* device not associated with an iommu */
2981 static int __init iommu_prepare_static_identity_mapping(int hw)
2983 struct pci_dev *pdev = NULL;
2984 struct dmar_drhd_unit *drhd;
2985 struct intel_iommu *iommu;
2990 for_each_pci_dev(pdev) {
2991 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2996 for_each_active_iommu(iommu, drhd)
2997 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2998 struct acpi_device_physical_node *pn;
2999 struct acpi_device *adev;
3001 if (dev->bus != &acpi_bus_type)
3004 adev = to_acpi_device(dev);
3005 mutex_lock(&adev->physical_node_lock);
3006 list_for_each_entry(pn, &adev->physical_node_list, node) {
3007 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3011 mutex_unlock(&adev->physical_node_lock);
3019 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3022 * Start from a sane iommu hardware state.
3023 * If queued invalidation was already initialized by us
3024 * (for example, while enabling interrupt-remapping) then
3025 * things are already rolling from a sane state.
3029 * Clear any previous faults.
3031 dmar_fault(-1, iommu);
3033 * Disable queued invalidation if supported and already enabled
3034 * before OS handover.
3036 dmar_disable_qi(iommu);
3039 if (dmar_enable_qi(iommu)) {
3041 * Queued Invalidate not enabled, use Register Based Invalidate
3043 iommu->flush.flush_context = __iommu_flush_context;
3044 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3045 pr_info("%s: Using Register based invalidation\n",
3048 iommu->flush.flush_context = qi_flush_context;
3049 iommu->flush.flush_iotlb = qi_flush_iotlb;
3050 pr_info("%s: Using Queued invalidation\n", iommu->name);
3054 static int copy_context_table(struct intel_iommu *iommu,
3055 struct root_entry *old_re,
3056 struct context_entry **tbl,
3059 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3060 struct context_entry *new_ce = NULL, ce;
3061 struct context_entry *old_ce = NULL;
3062 struct root_entry re;
3063 phys_addr_t old_ce_phys;
3065 tbl_idx = ext ? bus * 2 : bus;
3066 memcpy(&re, old_re, sizeof(re));
3068 for (devfn = 0; devfn < 256; devfn++) {
3069 /* First calculate the correct index */
3070 idx = (ext ? devfn * 2 : devfn) % 256;
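/*
 * Legacy context entries are 128 bits, so one 4KiB page holds all
 * 256 entries of a bus.  Extended entries are 256 bits: each devfn
 * occupies two 128-bit slots, and a bus spills into a second page
 * once devfn reaches 128 -- hence devfn * 2 and the wrap at 256.
 */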
3073 /* First save what we may have and clean up */
3075 tbl[tbl_idx] = new_ce;
3076 __iommu_flush_cache(iommu, new_ce,
3086 old_ce_phys = root_entry_lctp(&re);
3088 old_ce_phys = root_entry_uctp(&re);
3091 if (ext && devfn == 0) {
3092 /* No LCTP, try UCTP */
3101 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3106 new_ce = alloc_pgtable_page(iommu->node);
3113 /* Now copy the context entry */
3114 memcpy(&ce, old_ce + idx, sizeof(ce));
3116 if (!__context_present(&ce))
3119 did = context_domain_id(&ce);
3120 if (did >= 0 && did < cap_ndoms(iommu->cap))
3121 set_bit(did, iommu->domain_ids);
3124 * We need a marker for copied context entries. This
3125 * marker needs to work for the old format as well as
3126 * for extended context entries.
3128 * Bit 67 of the context entry is used. In the old
3129 * format this bit is available to software, in the
3130 * extended format it is the PGE bit, but PGE is ignored
3131 * by HW if PASIDs are disabled (and thus still available).
3134 * So disable PASIDs first and then mark the entry
3135 * copied. This means that we don't copy PASID
3136 * translations from the old kernel, but this is fine as
3137 * faults there are not fatal.
3139 context_clear_pasid_enable(&ce);
3140 context_set_copied(&ce);
3145 tbl[tbl_idx + pos] = new_ce;
3147 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3156 static int copy_translation_tables(struct intel_iommu *iommu)
3158 struct context_entry **ctxt_tbls;
3159 struct root_entry *old_rt;
3160 phys_addr_t old_rt_phys;
3161 int ctxt_table_entries;
3162 unsigned long flags;
3167 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3168 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3169 new_ext = !!ecap_ecs(iommu->ecap);
3172 * The RTT bit can only be changed when translation is disabled,
3173 * but disabling translation means to open a window for data
3174 * corruption. So bail out and don't copy anything if we would
3175 * have to change the bit.
3180 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3184 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3188 /* This is too big for the stack - allocate it from slab */
3189 ctxt_table_entries = ext ? 512 : 256;
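/* 256 buses, and in extended mode two context pages per bus */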
3191 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3195 for (bus = 0; bus < 256; bus++) {
3196 ret = copy_context_table(iommu, &old_rt[bus],
3197 ctxt_tbls, bus, ext);
3199 pr_err("%s: Failed to copy context table for bus %d\n",
3205 spin_lock_irqsave(&iommu->lock, flags);
3207 /* Context tables are copied, now write them to the root_entry table */
3208 for (bus = 0; bus < 256; bus++) {
3209 int idx = ext ? bus * 2 : bus;
3212 if (ctxt_tbls[idx]) {
3213 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3214 iommu->root_entry[bus].lo = val;
3217 if (!ext || !ctxt_tbls[idx + 1])
3220 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3221 iommu->root_entry[bus].hi = val;
3224 spin_unlock_irqrestore(&iommu->lock, flags);
3228 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3238 static int __init init_dmars(void)
3240 struct dmar_drhd_unit *drhd;
3241 struct dmar_rmrr_unit *rmrr;
3242 bool copied_tables = false;
3244 struct intel_iommu *iommu;
3250 * initialize and program root entry to not present
3253 for_each_drhd_unit(drhd) {
3255 * lock not needed as this is only incremented in the single-
3256 * threaded kernel __init code path; all other accesses are read-only
3259 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3263 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3266 /* Preallocate enough resources for IOMMU hot-addition */
3267 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3268 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3270 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3273 pr_err("Allocating global iommu array failed\n");
3278 for_each_active_iommu(iommu, drhd) {
3279 g_iommus[iommu->seq_id] = iommu;
3281 intel_iommu_init_qi(iommu);
3283 ret = iommu_init_domains(iommu);
3287 init_translation_status(iommu);
3289 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3290 iommu_disable_translation(iommu);
3291 clear_translation_pre_enabled(iommu);
3292 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3298 * we could share the same root & context tables
3299 * among all IOMMUs. Needs to be split later.
3301 ret = iommu_alloc_root_entry(iommu);
3305 if (translation_pre_enabled(iommu)) {
3306 pr_info("Translation already enabled - trying to copy translation structures\n");
3308 ret = copy_translation_tables(iommu);
3311 * We found the IOMMU with translation
3312 * enabled - but failed to copy over the
3313 * old root-entry table. Try to proceed
3314 * by disabling translation now and
3315 * allocating a clean root-entry table.
3316 * This might cause DMAR faults, but
3317 * probably the dump will still succeed.
3319 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3321 iommu_disable_translation(iommu);
3322 clear_translation_pre_enabled(iommu);
3324 pr_info("Copied translation tables from previous kernel for %s\n",
3326 copied_tables = true;
3330 if (!ecap_pass_through(iommu->ecap))
3331 hw_pass_through = 0;
3332 #ifdef CONFIG_INTEL_IOMMU_SVM
3333 if (pasid_enabled(iommu))
3334 intel_svm_alloc_pasid_tables(iommu);
3339 * Now that qi is enabled on all iommus, set the root entry and flush
3340 * caches. This is required on some Intel X58 chipsets, otherwise the
3341 * flush_context function will loop forever and the boot hangs.
3343 for_each_active_iommu(iommu, drhd) {
3344 iommu_flush_write_buffer(iommu);
3345 iommu_set_root_entry(iommu);
3346 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3347 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3350 if (iommu_pass_through)
3351 iommu_identity_mapping |= IDENTMAP_ALL;
3353 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3354 iommu_identity_mapping |= IDENTMAP_GFX;
3357 check_tylersburg_isoch();
3359 if (iommu_identity_mapping) {
3360 ret = si_domain_init(hw_pass_through);
3367 * If we copied translations from a previous kernel in the kdump
3368 * case, we can not assign the devices to domains now, as that
3369 * would eliminate the old mappings. So skip this part and defer
3370 * the assignment to device driver initialization time.
3376 * If pass through is not set or not enabled, setup context entries for
3377 * identity mappings for rmrr, gfx, and isa and may fall back to static
3378 * identity mapping if iommu_identity_mapping is set.
3380 if (iommu_identity_mapping) {
3381 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3383 pr_crit("Failed to setup IOMMU pass-through\n");
3389 * for each dev attached to rmrr
3391 * locate drhd for dev, alloc domain for dev
3392 * allocate free domain
3393 * allocate page table entries for rmrr
3394 * if context not allocated for bus
3395 * allocate and init context
3396 * set present in root table for this bus
3397 * init context with domain, translation etc
3401 pr_info("Setting RMRR:\n");
3402 for_each_rmrr_units(rmrr) {
3403 /* some BIOSes list non-existent devices in the DMAR table. */
3404 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3406 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3408 pr_err("Mapping reserved region failed\n");
3412 iommu_prepare_isa();
3419 * global invalidate context cache
3420 * global invalidate iotlb
3421 * enable translation
3423 for_each_iommu(iommu, drhd) {
3424 if (drhd->ignored) {
3426 * we always have to disable PMRs or DMA may fail on this device
3430 iommu_disable_protect_mem_regions(iommu);
3434 iommu_flush_write_buffer(iommu);
3436 #ifdef CONFIG_INTEL_IOMMU_SVM
3437 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3438 ret = intel_svm_enable_prq(iommu);
3443 ret = dmar_set_interrupt(iommu);
3447 if (!translation_pre_enabled(iommu))
3448 iommu_enable_translation(iommu);
3450 iommu_disable_protect_mem_regions(iommu);
3456 for_each_active_iommu(iommu, drhd) {
3457 disable_dmar_iommu(iommu);
3458 free_dmar_iommu(iommu);
3467 /* This takes a number of _MM_ pages, not VTD pages */
3468 static unsigned long intel_alloc_iova(struct device *dev,
3469 struct dmar_domain *domain,
3470 unsigned long nrpages, uint64_t dma_mask)
3472 unsigned long iova_pfn = 0;
3474 /* Restrict dma_mask to the width that the iommu can handle */
3475 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3476 /* Ensure we reserve the whole size-aligned region */
3477 nrpages = __roundup_pow_of_two(nrpages);
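/*
 * The iova allocator aligns an allocation to its size, so rounding
 * up to a power of two (e.g. 5 pages -> 8) guarantees the whole
 * size-aligned region is reserved and cannot overlap a neighbour.
 */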
3479 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3481 * First try to allocate an io virtual address in
3482 * DMA_BIT_MASK(32) and if that fails then try allocating
3485 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3486 IOVA_PFN(DMA_BIT_MASK(32)), false);
3490 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3491 IOVA_PFN(dma_mask), true);
3492 if (unlikely(!iova_pfn)) {
3493 pr_err("Allocating %ld-page iova for %s failed\n",
3494 nrpages, dev_name(dev));
3501 static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3503 struct dmar_domain *domain, *tmp;
3504 struct dmar_rmrr_unit *rmrr;
3505 struct device *i_dev;
3508 domain = find_domain(dev);
3512 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3516 /* We have a new domain - setup possible RMRRs for the device */
3518 for_each_rmrr_units(rmrr) {
3519 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3524 ret = domain_prepare_identity_map(dev, domain,
3528 dev_err(dev, "Mapping reserved region failed\n");
3533 tmp = set_domain_for_dev(dev, domain);
3534 if (!tmp || domain != tmp) {
3535 domain_exit(domain);
3542 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3548 /* Check if the dev needs to go through non-identity map and unmap process.*/
3549 static int iommu_no_mapping(struct device *dev)
3553 if (iommu_dummy(dev))
3556 if (!iommu_identity_mapping)
3559 found = identity_mapping(dev);
3561 if (iommu_should_identity_map(dev, 0))
3565 * A 32-bit DMA device is removed from si_domain and falls
3566 * back to non-identity mapping.
3568 dmar_remove_one_dev_info(si_domain, dev);
3569 pr_info("32bit %s uses non-identity mapping\n",
3575 * In case a 64-bit DMA device was detached from a VM, the
3576 * device is put back into si_domain for identity mapping.
3578 if (iommu_should_identity_map(dev, 0)) {
3580 ret = domain_add_dev_info(si_domain, dev);
3582 pr_info("64bit %s uses identity mapping\n",
3592 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3593 size_t size, int dir, u64 dma_mask)
3595 struct dmar_domain *domain;
3596 phys_addr_t start_paddr;
3597 unsigned long iova_pfn;
3600 struct intel_iommu *iommu;
3601 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3603 BUG_ON(dir == DMA_NONE);
3605 if (iommu_no_mapping(dev))
3608 domain = get_valid_domain_for_dev(dev);
3612 iommu = domain_get_iommu(domain);
3613 size = aligned_nrpages(paddr, size);
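/*
 * aligned_nrpages() counts the VTD pages touched once the page
 * offset of paddr is included; roughly:
 *
 *	host_addr &= ~PAGE_MASK;
 *	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
 *
 * e.g. a buffer at page offset 0x800 with size 0x1000 touches two
 * 4KiB pages, so size becomes 2 even though 0x1000 is one page.
 */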
3615 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3620 * Check if DMAR supports zero-length reads on write only
3623 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3624 !cap_zlr(iommu->cap))
3625 prot |= DMA_PTE_READ;
3626 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3627 prot |= DMA_PTE_WRITE;
3629 * The range paddr .. paddr + size might start or end on a partial
3630 * page, so we map whole pages. Note: if two parts of one page are
3631 * mapped separately, we might have two guest addresses mapping to
3632 * the same host paddr, but this is not a big problem
3634 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3635 mm_to_dma_pfn(paddr_pfn), size, prot);
3639 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3640 start_paddr += paddr & ~PAGE_MASK;
3645 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3646 pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3647 dev_name(dev), size, (unsigned long long)paddr, dir);
3651 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3652 unsigned long offset, size_t size,
3653 enum dma_data_direction dir,
3654 unsigned long attrs)
3656 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3657 dir, *dev->dma_mask);
3660 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3662 struct dmar_domain *domain;
3663 unsigned long start_pfn, last_pfn;
3664 unsigned long nrpages;
3665 unsigned long iova_pfn;
3666 struct intel_iommu *iommu;
3667 struct page *freelist;
3669 if (iommu_no_mapping(dev))
3672 domain = find_domain(dev);
3675 iommu = domain_get_iommu(domain);
3677 iova_pfn = IOVA_PFN(dev_addr);
3679 nrpages = aligned_nrpages(dev_addr, size);
3680 start_pfn = mm_to_dma_pfn(iova_pfn);
3681 last_pfn = start_pfn + nrpages - 1;
3683 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3684 dev_name(dev), start_pfn, last_pfn);
3686 freelist = domain_unmap(domain, start_pfn, last_pfn);
3688 if (intel_iommu_strict) {
3689 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3690 nrpages, !freelist, 0);
3692 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3693 dma_free_pagelist(freelist);
3695 queue_iova(&domain->iovad, iova_pfn, nrpages,
3696 (unsigned long)freelist);
3698 * queue up the release of the unmap to save the roughly one sixth
3699 * of the cpu time used up by the iotlb flush operation...
3704 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3705 size_t size, enum dma_data_direction dir,
3706 unsigned long attrs)
3708 intel_unmap(dev, dev_addr, size);
3711 static void *intel_alloc_coherent(struct device *dev, size_t size,
3712 dma_addr_t *dma_handle, gfp_t flags,
3713 unsigned long attrs)
3715 struct page *page = NULL;
3718 size = PAGE_ALIGN(size);
3719 order = get_order(size);
3721 if (!iommu_no_mapping(dev))
3722 flags &= ~(GFP_DMA | GFP_DMA32);
3723 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3724 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3730 if (gfpflags_allow_blocking(flags)) {
3731 unsigned int count = size >> PAGE_SHIFT;
3733 page = dma_alloc_from_contiguous(dev, count, order, flags);
3734 if (page && iommu_no_mapping(dev) &&
3735 page_to_phys(page) + size > dev->coherent_dma_mask) {
3736 dma_release_from_contiguous(dev, page, count);
3742 page = alloc_pages(flags, order);
3745 memset(page_address(page), 0, size);
3747 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3749 dev->coherent_dma_mask);
3751 return page_address(page);
3752 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3753 __free_pages(page, order);
3758 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3759 dma_addr_t dma_handle, unsigned long attrs)
3762 struct page *page = virt_to_page(vaddr);
3764 size = PAGE_ALIGN(size);
3765 order = get_order(size);
3767 intel_unmap(dev, dma_handle, size);
3768 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3769 __free_pages(page, order);
3772 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3773 int nelems, enum dma_data_direction dir,
3774 unsigned long attrs)
3776 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3777 unsigned long nrpages = 0;
3778 struct scatterlist *sg;
3781 for_each_sg(sglist, sg, nelems, i) {
3782 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3785 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3788 static int intel_nontranslate_map_sg(struct device *hddev,
3789 struct scatterlist *sglist, int nelems, int dir)
3792 struct scatterlist *sg;
3794 for_each_sg(sglist, sg, nelems, i) {
3795 BUG_ON(!sg_page(sg));
3796 sg->dma_address = sg_phys(sg);
3797 sg->dma_length = sg->length;
3802 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3803 enum dma_data_direction dir, unsigned long attrs)
3806 struct dmar_domain *domain;
3809 unsigned long iova_pfn;
3811 struct scatterlist *sg;
3812 unsigned long start_vpfn;
3813 struct intel_iommu *iommu;
3815 BUG_ON(dir == DMA_NONE);
3816 if (iommu_no_mapping(dev))
3817 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3819 domain = get_valid_domain_for_dev(dev);
3823 iommu = domain_get_iommu(domain);
3825 for_each_sg(sglist, sg, nelems, i)
3826 size += aligned_nrpages(sg->offset, sg->length);
3828 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3831 sglist->dma_length = 0;
3836 * Check if DMAR supports zero-length reads on write only
3839 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3840 !cap_zlr(iommu->cap))
3841 prot |= DMA_PTE_READ;
3842 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3843 prot |= DMA_PTE_WRITE;
3845 start_vpfn = mm_to_dma_pfn(iova_pfn);
3847 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3848 if (unlikely(ret)) {
3849 dma_pte_free_pagetable(domain, start_vpfn,
3850 start_vpfn + size - 1,
3851 agaw_to_level(domain->agaw) + 1);
3852 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3859 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3864 const struct dma_map_ops intel_dma_ops = {
3865 .alloc = intel_alloc_coherent,
3866 .free = intel_free_coherent,
3867 .map_sg = intel_map_sg,
3868 .unmap_sg = intel_unmap_sg,
3869 .map_page = intel_map_page,
3870 .unmap_page = intel_unmap_page,
3871 .mapping_error = intel_mapping_error,
3873 .dma_supported = dma_direct_supported,
3877 static inline int iommu_domain_cache_init(void)
3881 iommu_domain_cache = kmem_cache_create("iommu_domain",
3882 sizeof(struct dmar_domain),
3887 if (!iommu_domain_cache) {
3888 pr_err("Couldn't create iommu_domain cache\n");
3895 static inline int iommu_devinfo_cache_init(void)
3899 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3900 sizeof(struct device_domain_info),
3904 if (!iommu_devinfo_cache) {
3905 pr_err("Couldn't create devinfo cache\n");
3912 static int __init iommu_init_mempool(void)
3915 ret = iova_cache_get();
3919 ret = iommu_domain_cache_init();
3923 ret = iommu_devinfo_cache_init();
3927 kmem_cache_destroy(iommu_domain_cache);
3934 static void __init iommu_exit_mempool(void)
3936 kmem_cache_destroy(iommu_devinfo_cache);
3937 kmem_cache_destroy(iommu_domain_cache);
3941 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3943 struct dmar_drhd_unit *drhd;
3947 /* We know that this device on this chipset has its own IOMMU.
3948 * If we find it under a different IOMMU, then the BIOS is lying
3949 * to us. Hope that the IOMMU for this device is actually
3950 * disabled, and it needs no translation...
3952 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3954 /* "can't" happen */
3955 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3958 vtbar &= 0xffff0000;
3960 /* we know that this iommu should be at offset 0xa000 from vtbar */
3961 drhd = dmar_find_matched_drhd_unit(pdev);
3962 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3963 TAINT_FIRMWARE_WORKAROUND,
3964 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3965 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3967 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3969 static void __init init_no_remapping_devices(void)
3971 struct dmar_drhd_unit *drhd;
3975 for_each_drhd_unit(drhd) {
3976 if (!drhd->include_all) {
3977 for_each_active_dev_scope(drhd->devices,
3978 drhd->devices_cnt, i, dev)
3980 /* ignore DMAR unit if no devices exist */
3981 if (i == drhd->devices_cnt)
3986 for_each_active_drhd_unit(drhd) {
3987 if (drhd->include_all)
3990 for_each_active_dev_scope(drhd->devices,
3991 drhd->devices_cnt, i, dev)
3992 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3994 if (i < drhd->devices_cnt)
3997 /* This IOMMU has *only* gfx devices. Either bypass it or
3998 set the gfx_mapped flag, as appropriate */
4000 intel_iommu_gfx_mapped = 1;
4003 for_each_active_dev_scope(drhd->devices,
4004 drhd->devices_cnt, i, dev)
4005 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4010 #ifdef CONFIG_SUSPEND
4011 static int init_iommu_hw(void)
4013 struct dmar_drhd_unit *drhd;
4014 struct intel_iommu *iommu = NULL;
4016 for_each_active_iommu(iommu, drhd)
4018 dmar_reenable_qi(iommu);
4020 for_each_iommu(iommu, drhd) {
4021 if (drhd->ignored) {
4023 * we always have to disable PMRs or DMA may fail on this device
4027 iommu_disable_protect_mem_regions(iommu);
4031 iommu_flush_write_buffer(iommu);
4033 iommu_set_root_entry(iommu);
4035 iommu->flush.flush_context(iommu, 0, 0, 0,
4036 DMA_CCMD_GLOBAL_INVL);
4037 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4038 iommu_enable_translation(iommu);
4039 iommu_disable_protect_mem_regions(iommu);
4045 static void iommu_flush_all(void)
4047 struct dmar_drhd_unit *drhd;
4048 struct intel_iommu *iommu;
4050 for_each_active_iommu(iommu, drhd) {
4051 iommu->flush.flush_context(iommu, 0, 0, 0,
4052 DMA_CCMD_GLOBAL_INVL);
4053 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4054 DMA_TLB_GLOBAL_FLUSH);
4058 static int iommu_suspend(void)
4060 struct dmar_drhd_unit *drhd;
4061 struct intel_iommu *iommu = NULL;
4064 for_each_active_iommu(iommu, drhd) {
4065 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4067 if (!iommu->iommu_state)
4073 for_each_active_iommu(iommu, drhd) {
4074 iommu_disable_translation(iommu);
4076 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4078 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4079 readl(iommu->reg + DMAR_FECTL_REG);
4080 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4081 readl(iommu->reg + DMAR_FEDATA_REG);
4082 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4083 readl(iommu->reg + DMAR_FEADDR_REG);
4084 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4085 readl(iommu->reg + DMAR_FEUADDR_REG);
4087 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4092 for_each_active_iommu(iommu, drhd)
4093 kfree(iommu->iommu_state);
4098 static void iommu_resume(void)
4100 struct dmar_drhd_unit *drhd;
4101 struct intel_iommu *iommu = NULL;
4104 if (init_iommu_hw()) {
4106 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4108 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4112 for_each_active_iommu(iommu, drhd) {
4114 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4116 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4117 iommu->reg + DMAR_FECTL_REG);
4118 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4119 iommu->reg + DMAR_FEDATA_REG);
4120 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4121 iommu->reg + DMAR_FEADDR_REG);
4122 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4123 iommu->reg + DMAR_FEUADDR_REG);
4125 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4128 for_each_active_iommu(iommu, drhd)
4129 kfree(iommu->iommu_state);
4132 static struct syscore_ops iommu_syscore_ops = {
4133 .resume = iommu_resume,
4134 .suspend = iommu_suspend,
4137 static void __init init_iommu_pm_ops(void)
4139 register_syscore_ops(&iommu_syscore_ops);
4143 static inline void init_iommu_pm_ops(void) {}
4144 #endif /* CONFIG_PM */
4147 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4149 struct acpi_dmar_reserved_memory *rmrr;
4150 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4151 struct dmar_rmrr_unit *rmrru;
4154 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4158 rmrru->hdr = header;
4159 rmrr = (struct acpi_dmar_reserved_memory *)header;
4160 rmrru->base_address = rmrr->base_address;
4161 rmrru->end_address = rmrr->end_address;
4163 length = rmrr->end_address - rmrr->base_address + 1;
4164 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4169 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4170 ((void *)rmrr) + rmrr->header.length,
4171 &rmrru->devices_cnt);
4172 if (rmrru->devices_cnt && rmrru->devices == NULL)
4175 list_add(&rmrru->list, &dmar_rmrr_units);
4186 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4188 struct dmar_atsr_unit *atsru;
4189 struct acpi_dmar_atsr *tmp;
4191 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4192 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4193 if (atsr->segment != tmp->segment)
4195 if (atsr->header.length != tmp->header.length)
4197 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4204 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4206 struct acpi_dmar_atsr *atsr;
4207 struct dmar_atsr_unit *atsru;
4209 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4212 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4213 atsru = dmar_find_atsr(atsr);
4217 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4222 * If memory is allocated from slab by an ACPI _DSM method, we need to
4223 * copy the content because the memory buffer will be freed on exit.
4226 atsru->hdr = (void *)(atsru + 1);
4227 memcpy(atsru->hdr, hdr, hdr->length);
4228 atsru->include_all = atsr->flags & 0x1;
4229 if (!atsru->include_all) {
4230 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4231 (void *)atsr + atsr->header.length,
4232 &atsru->devices_cnt);
4233 if (atsru->devices_cnt && atsru->devices == NULL) {
4239 list_add_rcu(&atsru->list, &dmar_atsr_units);
4244 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4246 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4250 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4252 struct acpi_dmar_atsr *atsr;
4253 struct dmar_atsr_unit *atsru;
4255 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4256 atsru = dmar_find_atsr(atsr);
4258 list_del_rcu(&atsru->list);
4260 intel_iommu_free_atsr(atsru);
4266 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4270 struct acpi_dmar_atsr *atsr;
4271 struct dmar_atsr_unit *atsru;
4273 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4274 atsru = dmar_find_atsr(atsr);
4278 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4279 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4287 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4290 struct intel_iommu *iommu = dmaru->iommu;
4292 if (g_iommus[iommu->seq_id])
4295 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4296 pr_warn("%s: Doesn't support hardware pass through.\n",
4300 if (!ecap_sc_support(iommu->ecap) &&
4301 domain_update_iommu_snooping(iommu)) {
4302 pr_warn("%s: Doesn't support snooping.\n",
4306 sp = domain_update_iommu_superpage(iommu) - 1;
4307 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4308 pr_warn("%s: Doesn't support large page.\n",
4314 * Disable translation if already enabled prior to OS handover.
4316 if (iommu->gcmd & DMA_GCMD_TE)
4317 iommu_disable_translation(iommu);
4319 g_iommus[iommu->seq_id] = iommu;
4320 ret = iommu_init_domains(iommu);
4322 ret = iommu_alloc_root_entry(iommu);
4326 #ifdef CONFIG_INTEL_IOMMU_SVM
4327 if (pasid_enabled(iommu))
4328 intel_svm_alloc_pasid_tables(iommu);
4331 if (dmaru->ignored) {
4333 * we always have to disable PMRs or DMA may fail on this device
4336 iommu_disable_protect_mem_regions(iommu);
4340 intel_iommu_init_qi(iommu);
4341 iommu_flush_write_buffer(iommu);
4343 #ifdef CONFIG_INTEL_IOMMU_SVM
4344 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4345 ret = intel_svm_enable_prq(iommu);
4350 ret = dmar_set_interrupt(iommu);
4354 iommu_set_root_entry(iommu);
4355 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4356 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4357 iommu_enable_translation(iommu);
4359 iommu_disable_protect_mem_regions(iommu);
4363 disable_dmar_iommu(iommu);
4365 free_dmar_iommu(iommu);
4369 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4372 struct intel_iommu *iommu = dmaru->iommu;
4374 if (!intel_iommu_enabled)
4380 ret = intel_iommu_add(dmaru);
4382 disable_dmar_iommu(iommu);
4383 free_dmar_iommu(iommu);
4389 static void intel_iommu_free_dmars(void)
4391 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4392 struct dmar_atsr_unit *atsru, *atsr_n;
4394 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4395 list_del(&rmrru->list);
4396 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4401 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4402 list_del(&atsru->list);
4403 intel_iommu_free_atsr(atsru);
4407 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4410 struct pci_bus *bus;
4411 struct pci_dev *bridge = NULL;
4413 struct acpi_dmar_atsr *atsr;
4414 struct dmar_atsr_unit *atsru;
4416 dev = pci_physfn(dev);
4417 for (bus = dev->bus; bus; bus = bus->parent) {
4419 /* If it's an integrated device, allow ATS */
4422 /* Connected via non-PCIe: no ATS */
4423 if (!pci_is_pcie(bridge) ||
4424 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4426 /* If we found the root port, look it up in the ATSR */
4427 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4432 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4433 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4434 if (atsr->segment != pci_domain_nr(dev->bus))
4437 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4438 if (tmp == &bridge->dev)
4441 if (atsru->include_all)
4451 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4454 struct dmar_rmrr_unit *rmrru;
4455 struct dmar_atsr_unit *atsru;
4456 struct acpi_dmar_atsr *atsr;
4457 struct acpi_dmar_reserved_memory *rmrr;
4459 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4462 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4463 rmrr = container_of(rmrru->hdr,
4464 struct acpi_dmar_reserved_memory, header);
4465 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4466 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4467 ((void *)rmrr) + rmrr->header.length,
4468 rmrr->segment, rmrru->devices,
4469 rmrru->devices_cnt);
4472 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4473 dmar_remove_dev_scope(info, rmrr->segment,
4474 rmrru->devices, rmrru->devices_cnt);
4478 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4479 if (atsru->include_all)
4482 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4483 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4484 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4485 (void *)atsr + atsr->header.length,
4486 atsr->segment, atsru->devices,
4487 atsru->devices_cnt);
4492 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4493 if (dmar_remove_dev_scope(info, atsr->segment,
4494 atsru->devices, atsru->devices_cnt))
4503 * Here we only respond to the action of a device being unbound from its driver.
4505 * An added device is not yet attached to its DMAR domain here. That will happen
4506 * when the device is mapped to an iova.
4508 static int device_notifier(struct notifier_block *nb,
4509 unsigned long action, void *data)
4511 struct device *dev = data;
4512 struct dmar_domain *domain;
4514 if (iommu_dummy(dev))
4517 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4520 domain = find_domain(dev);
4524 dmar_remove_one_dev_info(domain, dev);
4525 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4526 domain_exit(domain);
4531 static struct notifier_block device_nb = {
4532 .notifier_call = device_notifier,
4535 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4536 unsigned long val, void *v)
4538 struct memory_notify *mhp = v;
4539 unsigned long long start, end;
4540 unsigned long start_vpfn, last_vpfn;
4543 case MEM_GOING_ONLINE:
4544 start = mhp->start_pfn << PAGE_SHIFT;
4545 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4546 if (iommu_domain_identity_map(si_domain, start, end)) {
4547 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4554 case MEM_CANCEL_ONLINE:
4555 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4556 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4557 while (start_vpfn <= last_vpfn) {
4559 struct dmar_drhd_unit *drhd;
4560 struct intel_iommu *iommu;
4561 struct page *freelist;
4563 iova = find_iova(&si_domain->iovad, start_vpfn);
4565 pr_debug("Failed get IOVA for PFN %lx\n",
4570 iova = split_and_remove_iova(&si_domain->iovad, iova,
4571 start_vpfn, last_vpfn);
4573 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4574 start_vpfn, last_vpfn);
4578 freelist = domain_unmap(si_domain, iova->pfn_lo,
4582 for_each_active_iommu(iommu, drhd)
4583 iommu_flush_iotlb_psi(iommu, si_domain,
4584 iova->pfn_lo, iova_size(iova),
4587 dma_free_pagelist(freelist);
4589 start_vpfn = iova->pfn_hi + 1;
4590 free_iova_mem(iova);
4598 static struct notifier_block intel_iommu_memory_nb = {
4599 .notifier_call = intel_iommu_memory_notifier,
4603 static void free_all_cpu_cached_iovas(unsigned int cpu)
4607 for (i = 0; i < g_num_of_iommus; i++) {
4608 struct intel_iommu *iommu = g_iommus[i];
4609 struct dmar_domain *domain;
4615 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4616 domain = get_iommu_domain(iommu, (u16)did);
4620 free_cpu_cached_iovas(cpu, &domain->iovad);
4625 static int intel_iommu_cpu_dead(unsigned int cpu)
4627 free_all_cpu_cached_iovas(cpu);
4631 static void intel_disable_iommus(void)
4633 struct intel_iommu *iommu = NULL;
4634 struct dmar_drhd_unit *drhd;
4636 for_each_iommu(iommu, drhd)
4637 iommu_disable_translation(iommu);
4640 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4642 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4644 return container_of(iommu_dev, struct intel_iommu, iommu);
4647 static ssize_t intel_iommu_show_version(struct device *dev,
4648 struct device_attribute *attr,
4651 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4652 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4653 return sprintf(buf, "%d:%d\n",
4654 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4656 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4658 static ssize_t intel_iommu_show_address(struct device *dev,
4659 struct device_attribute *attr,
4662 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4663 return sprintf(buf, "%llx\n", iommu->reg_phys);
4665 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4667 static ssize_t intel_iommu_show_cap(struct device *dev,
4668 struct device_attribute *attr,
4671 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4672 return sprintf(buf, "%llx\n", iommu->cap);
4674 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4676 static ssize_t intel_iommu_show_ecap(struct device *dev,
4677 struct device_attribute *attr,
4680 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4681 return sprintf(buf, "%llx\n", iommu->ecap);
4683 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4685 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4686 struct device_attribute *attr,
4689 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4690 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4692 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4694 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4695 struct device_attribute *attr,
4698 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4699 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4700 cap_ndoms(iommu->cap)));
4702 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4704 static struct attribute *intel_iommu_attrs[] = {
4705 &dev_attr_version.attr,
4706 &dev_attr_address.attr,
4708 &dev_attr_ecap.attr,
4709 &dev_attr_domains_supported.attr,
4710 &dev_attr_domains_used.attr,
4714 static struct attribute_group intel_iommu_group = {
4715 .name = "intel-iommu",
4716 .attrs = intel_iommu_attrs,
4719 const struct attribute_group *intel_iommu_groups[] = {
4724 int __init intel_iommu_init(void)
4727 struct dmar_drhd_unit *drhd;
4728 struct intel_iommu *iommu;
4730 /* VT-d is required for a TXT/tboot launch, so enforce that */
4731 force_on = tboot_force_iommu();
4733 if (iommu_init_mempool()) {
4735 panic("tboot: Failed to initialize iommu memory\n");
4739 down_write(&dmar_global_lock);
4740 if (dmar_table_init()) {
4742 panic("tboot: Failed to initialize DMAR table\n");
4746 if (dmar_dev_scope_init() < 0) {
4748 panic("tboot: Failed to initialize DMAR device scope\n");
4752 up_write(&dmar_global_lock);
4755 * The bus notifier takes the dmar_global_lock, so lockdep will
4756 * complain later when we register it under the lock.
4758 dmar_register_bus_notifier();
4760 down_write(&dmar_global_lock);
4762 if (no_iommu || dmar_disabled) {
4764 * We exit the function here to ensure IOMMU's remapping and
4765 * mempool aren't setup, which means that the IOMMU's PMRs
4766 * won't be disabled via the call to init_dmars(). So disable
4767 * it explicitly here. The PMRs were setup by tboot prior to
4768 * calling SENTER, but the kernel is expected to reset/tear
4771 if (intel_iommu_tboot_noforce) {
4772 for_each_iommu(iommu, drhd)
4773 iommu_disable_protect_mem_regions(iommu);
4777 * Make sure the IOMMUs are switched off, even when we
4778 * boot into a kexec kernel and the previous kernel left
4781 intel_disable_iommus();
4785 if (list_empty(&dmar_rmrr_units))
4786 pr_info("No RMRR found\n");
4788 if (list_empty(&dmar_atsr_units))
4789 pr_info("No ATSR found\n");
4791 if (dmar_init_reserved_ranges()) {
4793 panic("tboot: Failed to reserve iommu ranges\n");
4794 goto out_free_reserved_range;
4797 init_no_remapping_devices();
4802 panic("tboot: Failed to initialize DMARs\n");
4803 pr_err("Initialization failed\n");
4804 goto out_free_reserved_range;
4806 up_write(&dmar_global_lock);
4807 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4809 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4812 dma_ops = &intel_dma_ops;
4814 init_iommu_pm_ops();
4816 for_each_active_iommu(iommu, drhd) {
4817 iommu_device_sysfs_add(&iommu->iommu, NULL,
4820 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4821 iommu_device_register(&iommu->iommu);
4824 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4825 bus_register_notifier(&pci_bus_type, &device_nb);
4826 if (si_domain && !hw_pass_through)
4827 register_memory_notifier(&intel_iommu_memory_nb);
4828 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4829 intel_iommu_cpu_dead);
4830 intel_iommu_enabled = 1;
4834 out_free_reserved_range:
4835 put_iova_domain(&reserved_iova_list);
4837 intel_iommu_free_dmars();
4838 up_write(&dmar_global_lock);
4839 iommu_exit_mempool();
4843 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4845 struct intel_iommu *iommu = opaque;
4847 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4852 * NB - intel-iommu lacks any sort of reference counting for the users of
4853 * dependent devices. If multiple endpoints have intersecting dependent
4854 * devices, unbinding the driver from any one of them will possibly leave
4855 * the others unable to operate.
4857 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4859 if (!iommu || !dev || !dev_is_pci(dev))
4862 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4865 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4867 struct intel_iommu *iommu;
4868 unsigned long flags;
4870 assert_spin_locked(&device_domain_lock);
4875 iommu = info->iommu;
4878 iommu_disable_dev_iotlb(info);
4879 domain_context_clear(iommu, info->dev);
4882 unlink_domain_info(info);
4884 spin_lock_irqsave(&iommu->lock, flags);
4885 domain_detach_iommu(info->domain, iommu);
4886 spin_unlock_irqrestore(&iommu->lock, flags);
4888 free_devinfo_mem(info);
4891 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4894 struct device_domain_info *info;
4895 unsigned long flags;
4897 spin_lock_irqsave(&device_domain_lock, flags);
4898 info = dev->archdata.iommu;
4899 __dmar_remove_one_dev_info(info);
4900 spin_unlock_irqrestore(&device_domain_lock, flags);
4903 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4907 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4908 domain_reserve_special_ranges(domain);
4910 /* calculate AGAW */
4911 domain->gaw = guest_width;
4912 adjust_width = guestwidth_to_adjustwidth(guest_width);
4913 domain->agaw = width_to_agaw(adjust_width);
4915 domain->iommu_coherency = 0;
4916 domain->iommu_snooping = 0;
4917 domain->iommu_superpage = 0;
4918 domain->max_addr = 0;
4920 /* always allocate the top pgd */
4921 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4924 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4928 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4930 struct dmar_domain *dmar_domain;
4931 struct iommu_domain *domain;
4933 if (type != IOMMU_DOMAIN_UNMANAGED)
4936 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4938 pr_err("Can't allocate dmar_domain\n");
4941 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4942 pr_err("Domain initialization failed\n");
4943 domain_exit(dmar_domain);
4946 domain_update_iommu_cap(dmar_domain);
4948 domain = &dmar_domain->domain;
4949 domain->geometry.aperture_start = 0;
4950 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4951 domain->geometry.force_aperture = true;
4956 static void intel_iommu_domain_free(struct iommu_domain *domain)
4958 domain_exit(to_dmar_domain(domain));
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}
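
/*
 * Illustration of the level knock-down above (not from the original
 * source): a domain created with DEFAULT_DOMAIN_ADDRESS_WIDTH (57,
 * agaw 3, five levels) attached to hardware that only supports agaw 2
 * sheds its top-level table, leaving the 4-level, 48-bit page table
 * the IOMMU can actually walk.
 */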
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
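
/*
 * Example of the rounding above (illustrative numbers): hpa = 0x1ff0
 * with size = 0x20 straddles a page boundary, so aligned_nrpages()
 * returns 2 and two 4KiB PFNs get mapped even though size itself is
 * far below VTD_PAGE_SIZE.
 */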
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
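
/*
 * Example of the size rounding above (illustrative): a 4KiB unmap
 * request that lands inside a 2MiB superpage mapping finds level 2,
 * so size is bumped to 2MiB and the whole superpage is torn down;
 * the return value tells the caller how much was really unmapped.
 */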
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}
static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);

	iommu_device_unlink(&iommu->iommu, dev);
}
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != device)
				continue;

			list_add_tail(&rmrr->resv->list, head);
		}
	}
	rcu_read_unlock();

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
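
/*
 * Example of what a caller of iommu_get_resv_regions() ends up with
 * (illustrative): a device covered by a BIOS RMRR sees that range as
 * a direct-mapped reservation, and every device gets the common MSI
 * window IOAPIC_RANGE_START..IOAPIC_RANGE_END reported as
 * IOMMU_RESV_MSI so the IOVA allocator stays out of it.
 */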
static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list) {
		if (entry->type == IOMMU_RESV_RESERVED)
			kfree(entry);
	}
}
#ifdef CONFIG_INTEL_IOMMU_SVM
#define MAX_NR_PASID_BITS (20)
static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
{
	/*
	 * Convert ecap_pss to extend context entry pts encoding, also
	 * respect the soft pasid_max value set by the iommu.
	 * - number of PASID bits = ecap_pss + 1
	 * - number of PASID table entries = 2^(pts + 5)
	 * Therefore, pts = ecap_pss - 4
	 * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
	 */
	if (ecap_pss(iommu->ecap) < 5)
		return 0;

	/* pasid_max is encoded as actual number of entries not the bits */
	return find_first_bit((unsigned long *)&iommu->pasid_max,
			      MAX_NR_PASID_BITS) - 5;
}
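
/*
 * Worked example for the encoding above (illustrative): on KBL,
 * ecap_pss = 0x13 advertises 20 PASID bits. With pasid_max = 1 << 20,
 * find_first_bit() yields 20, so pts = 15 and the PASID table holds
 * 2^(15 + 5) = 2^20 entries.
 */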
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		if (iommu->pasid_state_table)
			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
		context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
			intel_iommu_get_pts(iommu);

		/* Make the PASID table pointer visible before enabling it */
		wmb();
		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
		 * extended to permit requests-with-PASID if the PASIDE bit
		 * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
		 * however, the PASIDE bit is ignored and requests-with-PASID
		 * are unconditionally blocked, which makes less sense.
		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
		 * "guest mode" translation types depending on whether ATS
		 * is available or not. Annoyingly, we can't use the new
		 * modes *unless* PASIDE is set. */
		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
			ctx_lo &= ~CONTEXT_TT_MASK;
			if (info->ats_supported)
				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
			else
				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
		}
		ctx_lo |= CONTEXT_PASIDE;
		if (iommu->pasid_state_table)
			ctx_lo |= CONTEXT_DINVE;
		if (info->pri_supported)
			ctx_lo |= CONTEXT_PRS;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	if (!iommu->pasid_table) {
		dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.map_sg			= default_iommu_map_sg,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
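
/*
 * Illustrative sketch (deliberately not compiled into the driver): how
 * a kernel consumer such as VFIO would exercise the ops table above
 * through the generic IOMMU API. The function name, device, and
 * addresses are assumptions made up for the example only.
 */
#if 0
static int example_use_intel_iommu(struct device *dev)
{
	struct iommu_domain *domain;
	int ret;

	/* Ends up in intel_iommu_domain_alloc(IOMMU_DOMAIN_UNMANAGED) */
	domain = iommu_domain_alloc(&pci_bus_type);
	if (!domain)
		return -ENOMEM;

	/* Ends up in intel_iommu_attach_device() */
	ret = iommu_attach_device(domain, dev);
	if (ret)
		goto out_free;

	/* Ends up in intel_iommu_map(): identity-map one 4KiB page */
	ret = iommu_map(domain, 0x100000, 0x100000, VTD_PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	/* Ends up in intel_iommu_unmap() */
	iommu_unmap(domain, 0x100000, VTD_PAGE_SIZE);
out_detach:
	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}
#endif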
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
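
/*
 * Example decode (illustrative): a GGC value carrying 0x0900 in bits
 * 11:8 matches GGC_MEMORY_SIZE_2M_VT, i.e. 2M of stolen graphics
 * memory with the VT bit (GGC_MEMORY_VT_ENABLED) set; the quirk below
 * only cares whether that VT bit is present at all.
 */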
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pr_info("Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}