Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg...
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 2 Nov 2023 02:44:56 +0000 (16:44 -1000)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 2 Nov 2023 02:44:56 +0000 (16:44 -1000)
Pull iommufd updates from Jason Gunthorpe:
 "This brings three new iommufd capabilities:

   - Dirty tracking for DMA.

     AMD/ARM/Intel IOMMUs can now record, in the IOPTEs of the IO page
     table, whether a DMA has written to a page. This can be used to
     generate a record of what memory is being dirtied by DMA activities
     during a VM migration process. A VMM like QEMU will combine the
     IOMMU dirty bits with the CPU's dirty log to determine what memory
     to transfer.

     VFIO already has a DMA dirty tracking framework that requires PCI
     devices to implement the tracking HW internally. The iommufd version
     provides an alternative that the VMM can select, if available. The
     two are designed to have very similar APIs (a rough usage sketch
     follows this list).

   - Userspace controlled attributes for hardware page tables
     (HWPT/iommu_domain). There are currently a few generic attributes
     for HWPTs (enable dirty tracking, and act as the parent of a nest).
     This is an entry point for the userspace IOMMU driver to control
     the HW in detail.

   - Nested translation support for HWPTs. This is a 2D translation
     scheme, similar to what the CPU uses, where a DMA goes through a
     first stage to determine an intermediate address, which is then
     translated through a second stage to a physical address.

     As with CPU translation, the first stage table exists in VM
     controlled memory, while the second stage lives in the kernel and
     matches the VM's guest-to-physical map.

     As every IOMMU has a unique set of parameters to describe its S1 IO
     page table, the userspace IOMMU driver has to marshal the
     information into the correct format.

     This is 1/3 of the feature: it allows creating the nested
     translation and binding it to VFIO devices (a rough allocation
     sketch follows the summary below). The APIs to support IOTLB and
     ATC invalidation of the stage 1 IO page table, and to forward IO
     faults, are still in progress.
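
  As a rough sketch of the dirty tracking uAPI from the first item above
  (illustrative only; the exact struct layouts live in
  include/uapi/linux/iommufd.h and error handling is elided), a VMM could
  drive it roughly like this:

        #include <stdint.h>
        #include <sys/ioctl.h>
        #include <linux/iommufd.h>

        /* Enable dirty tracking on a HWPT that was allocated with
         * IOMMU_HWPT_ALLOC_DIRTY_TRACKING, then read back the dirty
         * bitmap for an IOVA range in each migration pre-copy pass.
         */
        static int vmm_track_dirty(int iommufd, __u32 hwpt_id, __u64 iova,
                                   __u64 length, __u64 *bitmap)
        {
                struct iommu_hwpt_set_dirty_tracking set = {
                        .size = sizeof(set),
                        .flags = IOMMU_HWPT_DIRTY_TRACKING_ENABLE,
                        .hwpt_id = hwpt_id,
                };
                struct iommu_hwpt_get_dirty_bitmap get = {
                        .size = sizeof(get),
                        .hwpt_id = hwpt_id,
                        .iova = iova,
                        .length = length,
                        .page_size = 4096,
                        /* one bit per page_size unit of IOVA */
                        .data = (uintptr_t)bitmap,
                };

                if (ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set))
                        return -1;
                /* The VMM then ORs this bitmap with the CPU dirty log. */
                return ioctl(iommufd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get);
        }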

  The series includes AMD and Intel support for dirty tracking, and
  Intel support for nested translation.
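
  To make the nesting flow concrete, an illustrative VT-d stage-1
  allocation could look like the following (struct and flag names come
  from this series; check include/uapi/linux/iommufd.h for the exact
  field layout, error handling is elided):

        #include <stdint.h>
        #include <sys/ioctl.h>
        #include <linux/iommufd.h>

        /* 1) Allocate the stage-2 (GPA) HWPT as a nesting parent on top
         *    of the IOAS, 2) allocate the stage-1 HWPT nested on it,
         *    passing the guest's first-stage page table marshalled as
         *    struct iommu_hwpt_vtd_s1.
         */
        static int alloc_nested_vtd(int iommufd, __u32 dev_id, __u32 ioas_id,
                                    __u64 guest_s1_pgtbl, __u32 *out_s1_hwpt)
        {
                struct iommu_hwpt_alloc parent = {
                        .size = sizeof(parent),
                        .flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
                        .dev_id = dev_id,
                        .pt_id = ioas_id,
                };
                struct iommu_hwpt_vtd_s1 s1_data = {
                        .pgtbl_addr = guest_s1_pgtbl, /* GPA of the guest PGD */
                        .addr_width = 48,             /* 4-level stage-1 table */
                };
                struct iommu_hwpt_alloc nested = {
                        .size = sizeof(nested),
                        .dev_id = dev_id,
                        .data_type = IOMMU_HWPT_DATA_VTD_S1,
                        .data_len = sizeof(s1_data),
                        .data_uptr = (uintptr_t)&s1_data,
                };

                if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &parent))
                        return -1;
                /* Nest the stage-1 HWPT on the parent HWPT just created */
                nested.pt_id = parent.out_hwpt_id;
                if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &nested))
                        return -1;
                *out_s1_hwpt = nested.out_hwpt_id;
                return 0;
        }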

  Along the way are a number of internal items:

   - New iommu core items: ops->domain_alloc_user(),
     ops->set_dirty_tracking(), ops->read_and_clear_dirty(),
     IOMMU_DOMAIN_NESTED, and iommu_copy_struct_from_user() (a short
     usage sketch follows the message)

   - UAF fix in iopt_area_split()

   - Spelling fixes and some test suite improvements"
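
  A minimal driver-side sketch of the new iommu_copy_struct_from_user()
  helper mentioned above (the "mydrv" name is hypothetical; the real use
  is in drivers/iommu/intel/nested.c further down):

        /* Copy the vendor-specific user data for a nested domain.  The
         * last-field argument ("__reserved") marks the minimum layout
         * userspace must provide, which keeps the uAPI struct extensible.
         */
        static int mydrv_parse_user_data(const struct iommu_user_data *user_data,
                                         struct iommu_hwpt_vtd_s1 *s1)
        {
                if (user_data->type != IOMMU_HWPT_DATA_VTD_S1)
                        return -EOPNOTSUPP;

                return iommu_copy_struct_from_user(s1, user_data,
                                                   IOMMU_HWPT_DATA_VTD_S1,
                                                   __reserved);
        }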

* tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (52 commits)
  iommufd: Organize the mock domain alloc functions closer to Joerg's tree
  iommufd/selftest: Fix page-size check in iommufd_test_dirty()
  iommufd: Add iopt_area_alloc()
  iommufd: Fix missing update of domains_itree after splitting iopt_area
  iommu/vt-d: Disallow read-only mappings to nest parent domain
  iommu/vt-d: Add nested domain allocation
  iommu/vt-d: Set the nested domain to a device
  iommu/vt-d: Make domain attach helpers to be extern
  iommu/vt-d: Add helper to setup pasid nested translation
  iommu/vt-d: Add helper for nested domain allocation
  iommu/vt-d: Extend dmar_domain to support nested domain
  iommufd: Add data structure for Intel VT-d stage-1 domain allocation
  iommu/vt-d: Enhance capability check for nested parent domain allocation
  iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs
  iommufd/selftest: Add nested domain allocation for mock domain
  iommu: Add iommu_copy_struct_from_user helper
  iommufd: Add a nested HW pagetable object
  iommu: Pass in parent domain with user_data to domain_alloc_user op
  iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED
  iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable
  ...

36 files changed:
drivers/iommu/Kconfig
drivers/iommu/amd/Kconfig
drivers/iommu/amd/amd_iommu_types.h
drivers/iommu/amd/io_pgtable.c
drivers/iommu/amd/iommu.c
drivers/iommu/intel/Kconfig
drivers/iommu/intel/Makefile
drivers/iommu/intel/iommu.c
drivers/iommu/intel/iommu.h
drivers/iommu/intel/nested.c [new file with mode: 0644]
drivers/iommu/intel/pasid.c
drivers/iommu/intel/pasid.h
drivers/iommu/iommufd/Makefile
drivers/iommu/iommufd/device.c
drivers/iommu/iommufd/hw_pagetable.c
drivers/iommu/iommufd/io_pagetable.c
drivers/iommu/iommufd/iommufd_private.h
drivers/iommu/iommufd/iommufd_test.h
drivers/iommu/iommufd/iova_bitmap.c [moved from drivers/vfio/iova_bitmap.c with 98% similarity]
drivers/iommu/iommufd/main.c
drivers/iommu/iommufd/pages.c
drivers/iommu/iommufd/selftest.c
drivers/iommu/iommufd/vfio_compat.c
drivers/vfio/Makefile
drivers/vfio/pci/mlx5/Kconfig
drivers/vfio/pci/mlx5/main.c
drivers/vfio/pci/pds/Kconfig
drivers/vfio/pci/pds/pci_drv.c
drivers/vfio/vfio_main.c
include/linux/io-pgtable.h
include/linux/iommu.h
include/linux/iova_bitmap.h
include/uapi/linux/iommufd.h
tools/testing/selftests/iommu/iommufd.c
tools/testing/selftests/iommu/iommufd_fail_nth.c
tools/testing/selftests/iommu/iommufd_utils.h

index 7f04491ca5f01fddd258e93faf3c27db69507457..ee9e2a2edbf563efa22ad2902ac9d5d23077c073 100644 (file)
@@ -7,6 +7,10 @@ config IOMMU_IOVA
 config IOMMU_API
        bool
 
+config IOMMUFD_DRIVER
+       bool
+       default n
+
 menuconfig IOMMU_SUPPORT
        bool "IOMMU Hardware Support"
        depends on MMU
index 9b5fc3356bf2d8ac1ca9e3a8398200ae8ccd2940..8bd4c3b183ec6e475b58a1990d7b5c33ab141120 100644 (file)
@@ -10,6 +10,7 @@ config AMD_IOMMU
        select IOMMU_API
        select IOMMU_IOVA
        select IOMMU_IO_PGTABLE
+       select IOMMUFD_DRIVER if IOMMUFD
        depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
        help
          With this option you can enable support for AMD IOMMU hardware in
index 7dc30c2b56b302d8bd3cd129f410610d536059e6..dec4e5c2b66b8236fcd6faeb8497fdc9b42dfe20 100644 (file)
@@ -97,7 +97,9 @@
 #define FEATURE_GATS_MASK      (3ULL)
 #define FEATURE_GAM_VAPIC      BIT_ULL(21)
 #define FEATURE_GIOSUP         BIT_ULL(48)
+#define FEATURE_HASUP          BIT_ULL(49)
 #define FEATURE_EPHSUP         BIT_ULL(50)
+#define FEATURE_HDSUP          BIT_ULL(52)
 #define FEATURE_SNP            BIT_ULL(63)
 
 #define FEATURE_PASID_SHIFT    32
 /* macros and definitions for device table entries */
 #define DEV_ENTRY_VALID         0x00
 #define DEV_ENTRY_TRANSLATION   0x01
+#define DEV_ENTRY_HAD           0x07
 #define DEV_ENTRY_PPR           0x34
 #define DEV_ENTRY_IR            0x3d
 #define DEV_ENTRY_IW            0x3e
 #define PTE_LEVEL_PAGE_SIZE(level)                     \
        (1ULL << (12 + (9 * (level))))
 
+/*
+ * The IOPTE dirty bit
+ */
+#define IOMMU_PTE_HD_BIT (6)
+
 /*
  * Bit value definition for I/O PTE fields
  */
 #define IOMMU_PTE_PR   BIT_ULL(0)
+#define IOMMU_PTE_HD   BIT_ULL(IOMMU_PTE_HD_BIT)
 #define IOMMU_PTE_U    BIT_ULL(59)
 #define IOMMU_PTE_FC   BIT_ULL(60)
 #define IOMMU_PTE_IR   BIT_ULL(61)
  */
 #define DTE_FLAG_V     BIT_ULL(0)
 #define DTE_FLAG_TV    BIT_ULL(1)
+#define DTE_FLAG_HAD   (3ULL << 7)
 #define DTE_FLAG_GIOV  BIT_ULL(54)
 #define DTE_FLAG_GV    BIT_ULL(55)
 #define DTE_GLX_SHIFT  (56)
 
 #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
 #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
+#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
 #define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
 #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
 
@@ -563,6 +574,7 @@ struct protection_domain {
        int nid;                /* Node ID */
        u64 *gcr3_tbl;          /* Guest CR3 table */
        unsigned long flags;    /* flags to find out type of domain */
+       bool dirty_tracking;    /* dirty tracking is enabled in the domain */
        unsigned dev_cnt;       /* devices assigned to this domain */
        unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
 };
index 2892aa1b4dc1db1771b9ebe5d418a14da9e5f456..6c0621f6f572a4c4c0fb72ea1bdb5abe9d504311 100644 (file)
@@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
        return (__pte & ~offset_mask) | (iova & offset_mask);
 }
 
+static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
+                                    unsigned long flags)
+{
+       bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
+       bool dirty = false;
+       int i, count;
+
+       /*
+        * When a non-default page size is used, software must OR the
+        * When a non-default page size is used , software must OR the
+        * Dirty bits in all of the replicated host PTEs used to map
+        * the page. The IOMMU does not guarantee the Dirty bits are
+        * set in all of the replicated PTEs. Any portion of the page
+        * may have been written even if the Dirty bit is set in only
+        * one of the replicated PTEs.
+        */
+       count = PAGE_SIZE_PTE_COUNT(size);
+       for (i = 0; i < count && test_only; i++) {
+               if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
+                       dirty = true;
+                       break;
+               }
+       }
+
+       for (i = 0; i < count && !test_only; i++) {
+               if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
+                                      (unsigned long *)&ptep[i])) {
+                       dirty = true;
+               }
+       }
+
+       return dirty;
+}
+
+static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
+                                        unsigned long iova, size_t size,
+                                        unsigned long flags,
+                                        struct iommu_dirty_bitmap *dirty)
+{
+       struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
+       unsigned long end = iova + size - 1;
+
+       do {
+               unsigned long pgsize = 0;
+               u64 *ptep, pte;
+
+               ptep = fetch_pte(pgtable, iova, &pgsize);
+               if (ptep)
+                       pte = READ_ONCE(*ptep);
+               if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
+                       pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
+                       iova += pgsize;
+                       continue;
+               }
+
+               /*
+                * Mark the whole IOVA range as dirty even if only one of
+                * the replicated PTEs was marked dirty.
+                */
+               if (pte_test_and_clear_dirty(ptep, pgsize, flags))
+                       iommu_dirty_bitmap_record(dirty, iova, pgsize);
+               iova += pgsize;
+       } while (iova < end);
+
+       return 0;
+}
+
 /*
  * ----------------------------------------------------
  */
@@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
        pgtable->iop.ops.map_pages    = iommu_v1_map_pages;
        pgtable->iop.ops.unmap_pages  = iommu_v1_unmap_pages;
        pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
+       pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
 
        return &pgtable->iop;
 }
index 95bd7c25ba6f366b5db2582e8cb5318491cbb523..b399c57413784688e69beaa54f2fed5b23d0462c 100644 (file)
@@ -37,6 +37,7 @@
 #include <asm/iommu.h>
 #include <asm/gart.h>
 #include <asm/dma.h>
+#include <uapi/linux/iommufd.h>
 
 #include "amd_iommu.h"
 #include "../dma-iommu.h"
@@ -65,6 +66,7 @@ LIST_HEAD(hpet_map);
 LIST_HEAD(acpihid_map);
 
 const struct iommu_ops amd_iommu_ops;
+const struct iommu_dirty_ops amd_dirty_ops;
 
 static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
 int amd_iommu_max_glx_val = -1;
@@ -1610,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
                        pte_root |= 1ULL << DEV_ENTRY_PPR;
        }
 
+       if (domain->dirty_tracking)
+               pte_root |= DTE_FLAG_HAD;
+
        if (domain->flags & PD_IOMMUV2_MASK) {
                u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
                u64 glx  = domain->glx;
@@ -2155,28 +2160,79 @@ static inline u64 dma_max_address(void)
        return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
 }
 
-static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
+static bool amd_iommu_hd_support(struct amd_iommu *iommu)
 {
+       return iommu && (iommu->features & FEATURE_HDSUP);
+}
+
+static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
+                                                 struct device *dev, u32 flags)
+{
+       bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
        struct protection_domain *domain;
+       struct amd_iommu *iommu = NULL;
+
+       if (dev) {
+               iommu = rlookup_amd_iommu(dev);
+               if (!iommu)
+                       return ERR_PTR(-ENODEV);
+       }
 
        /*
         * Since DTE[Mode]=0 is prohibited on SNP-enabled system,
         * default to use IOMMU_DOMAIN_DMA[_FQ].
         */
        if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
-               return NULL;
+               return ERR_PTR(-EINVAL);
+
+       if (dirty_tracking && !amd_iommu_hd_support(iommu))
+               return ERR_PTR(-EOPNOTSUPP);
 
        domain = protection_domain_alloc(type);
        if (!domain)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        domain->domain.geometry.aperture_start = 0;
        domain->domain.geometry.aperture_end   = dma_max_address();
        domain->domain.geometry.force_aperture = true;
 
+       if (iommu) {
+               domain->domain.type = type;
+               domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
+               domain->domain.ops = iommu->iommu.ops->default_domain_ops;
+
+               if (dirty_tracking)
+                       domain->domain.dirty_ops = &amd_dirty_ops;
+       }
+
        return &domain->domain;
 }
 
+static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
+{
+       struct iommu_domain *domain;
+
+       domain = do_iommu_domain_alloc(type, NULL, 0);
+       if (IS_ERR(domain))
+               return NULL;
+
+       return domain;
+}
+
+static struct iommu_domain *
+amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
+                           struct iommu_domain *parent,
+                           const struct iommu_user_data *user_data)
+
+{
+       unsigned int type = IOMMU_DOMAIN_UNMANAGED;
+
+       if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
+               return ERR_PTR(-EOPNOTSUPP);
+
+       return do_iommu_domain_alloc(type, dev, flags);
+}
+
 static void amd_iommu_domain_free(struct iommu_domain *dom)
 {
        struct protection_domain *domain;
@@ -2214,6 +2270,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 
        dev_data->defer_attach = false;
 
+       /*
+        * Restrict to devices with compatible IOMMU hardware support
+        * when enforcement of dirty tracking is enabled.
+        */
+       if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
+               return -EINVAL;
+
        if (dev_data->domain)
                detach_device(dev);
 
@@ -2332,6 +2395,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
                return true;
        case IOMMU_CAP_DEFERRED_FLUSH:
                return true;
+       case IOMMU_CAP_DIRTY_TRACKING: {
+               struct amd_iommu *iommu = rlookup_amd_iommu(dev);
+
+               return amd_iommu_hd_support(iommu);
+       }
        default:
                break;
        }
@@ -2339,6 +2407,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
        return false;
 }
 
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+                                       bool enable)
+{
+       struct protection_domain *pdomain = to_pdomain(domain);
+       struct dev_table_entry *dev_table;
+       struct iommu_dev_data *dev_data;
+       bool domain_flush = false;
+       struct amd_iommu *iommu;
+       unsigned long flags;
+       u64 pte_root;
+
+       spin_lock_irqsave(&pdomain->lock, flags);
+       if (!(pdomain->dirty_tracking ^ enable)) {
+               spin_unlock_irqrestore(&pdomain->lock, flags);
+               return 0;
+       }
+
+       list_for_each_entry(dev_data, &pdomain->dev_list, list) {
+               iommu = rlookup_amd_iommu(dev_data->dev);
+               if (!iommu)
+                       continue;
+
+               dev_table = get_dev_table(iommu);
+               pte_root = dev_table[dev_data->devid].data[0];
+
+               pte_root = (enable ? pte_root | DTE_FLAG_HAD :
+                                    pte_root & ~DTE_FLAG_HAD);
+
+               /* Flush device DTE */
+               dev_table[dev_data->devid].data[0] = pte_root;
+               device_flush_dte(dev_data);
+               domain_flush = true;
+       }
+
+       /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
+       if (domain_flush) {
+               amd_iommu_domain_flush_tlb_pde(pdomain);
+               amd_iommu_domain_flush_complete(pdomain);
+       }
+       pdomain->dirty_tracking = enable;
+       spin_unlock_irqrestore(&pdomain->lock, flags);
+
+       return 0;
+}
+
+static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+                                         unsigned long iova, size_t size,
+                                         unsigned long flags,
+                                         struct iommu_dirty_bitmap *dirty)
+{
+       struct protection_domain *pdomain = to_pdomain(domain);
+       struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
+       unsigned long lflags;
+
+       if (!ops || !ops->read_and_clear_dirty)
+               return -EOPNOTSUPP;
+
+       spin_lock_irqsave(&pdomain->lock, lflags);
+       if (!pdomain->dirty_tracking && dirty->bitmap) {
+               spin_unlock_irqrestore(&pdomain->lock, lflags);
+               return -EINVAL;
+       }
+       spin_unlock_irqrestore(&pdomain->lock, lflags);
+
+       return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
+}
+
 static void amd_iommu_get_resv_regions(struct device *dev,
                                       struct list_head *head)
 {
@@ -2461,9 +2596,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
        return true;
 }
 
+const struct iommu_dirty_ops amd_dirty_ops = {
+       .set_dirty_tracking = amd_iommu_set_dirty_tracking,
+       .read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
+};
+
 const struct iommu_ops amd_iommu_ops = {
        .capable = amd_iommu_capable,
        .domain_alloc = amd_iommu_domain_alloc,
+       .domain_alloc_user = amd_iommu_domain_alloc_user,
        .probe_device = amd_iommu_probe_device,
        .release_device = amd_iommu_release_device,
        .probe_finalize = amd_iommu_probe_finalize,
index 119d2c57a48ed5203af0e28f1330a0b86ac071aa..012cd2541a68a62b8360591f2c680fe14104eec1 100644 (file)
@@ -15,6 +15,7 @@ config INTEL_IOMMU
        select DMA_OPS
        select IOMMU_API
        select IOMMU_IOVA
+       select IOMMUFD_DRIVER if IOMMUFD
        select NEED_DMA_MAP_STATE
        select DMAR_TABLE
        select SWIOTLB
index 7af3b8a4f2a0054031accd8be92d753192b6485c..5dabf081a779353b2efead1db02918f3a7c9f91a 100644 (file)
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
-obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o
+obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o
 obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o
 obj-$(CONFIG_DMAR_PERF) += perf.o
 obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
index 3685ba90ec88e81baac849f1693f507e005f4a21..d1037280abf7a2bc4fd51d5e6de5ce0932c66424 100644 (file)
@@ -282,7 +282,6 @@ static LIST_HEAD(dmar_satc_units);
 #define for_each_rmrr_units(rmrr) \
        list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 
-static void device_block_translation(struct device *dev);
 static void intel_iommu_domain_free(struct iommu_domain *domain);
 
 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
@@ -300,6 +299,7 @@ static int iommu_skip_te_disable;
 #define IDENTMAP_AZALIA                4
 
 const struct iommu_ops intel_iommu_ops;
+const struct iommu_dirty_ops intel_dirty_ops;
 
 static bool translation_pre_enabled(struct intel_iommu *iommu)
 {
@@ -560,7 +560,7 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
 }
 
 /* Some capabilities may be different across iommus */
-static void domain_update_iommu_cap(struct dmar_domain *domain)
+void domain_update_iommu_cap(struct dmar_domain *domain)
 {
        domain_update_iommu_coherency(domain);
        domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
@@ -1778,8 +1778,7 @@ static struct dmar_domain *alloc_domain(unsigned int type)
        return domain;
 }
 
-static int domain_attach_iommu(struct dmar_domain *domain,
-                              struct intel_iommu *iommu)
+int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
 {
        struct iommu_domain_info *info, *curr;
        unsigned long ndomains;
@@ -1828,8 +1827,7 @@ err_unlock:
        return ret;
 }
 
-static void domain_detach_iommu(struct dmar_domain *domain,
-                               struct intel_iommu *iommu)
+void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
 {
        struct iommu_domain_info *info;
 
@@ -2196,6 +2194,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
                return -EINVAL;
 
+       if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
+               pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+               return -EINVAL;
+       }
+
        attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
        attr |= DMA_FL_PTE_PRESENT;
        if (domain->use_first_level) {
@@ -3958,7 +3961,7 @@ static void dmar_remove_one_dev_info(struct device *dev)
  * all DMA requests without PASID from the device are blocked. If the page
  * table has been set, clean up the data structures.
  */
-static void device_block_translation(struct device *dev)
+void device_block_translation(struct device *dev)
 {
        struct device_domain_info *info = dev_iommu_priv_get(dev);
        struct intel_iommu *iommu = info->iommu;
@@ -4058,14 +4061,62 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
        return NULL;
 }
 
+static struct iommu_domain *
+intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
+                             struct iommu_domain *parent,
+                             const struct iommu_user_data *user_data)
+{
+       struct device_domain_info *info = dev_iommu_priv_get(dev);
+       bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+       bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
+       struct intel_iommu *iommu = info->iommu;
+       struct iommu_domain *domain;
+
+       /* Must be NESTING domain */
+       if (parent) {
+               if (!nested_supported(iommu) || flags)
+                       return ERR_PTR(-EOPNOTSUPP);
+               return intel_nested_domain_alloc(parent, user_data);
+       }
+
+       if (flags &
+           (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+               return ERR_PTR(-EOPNOTSUPP);
+       if (nested_parent && !nested_supported(iommu))
+               return ERR_PTR(-EOPNOTSUPP);
+       if (user_data || (dirty_tracking && !ssads_supported(iommu)))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       /*
+        * The domain_alloc_user op needs to fully initialize a domain
+        * before returning, so use iommu_domain_alloc() here for
+        * simplicity.
+        */
+       domain = iommu_domain_alloc(dev->bus);
+       if (!domain)
+               return ERR_PTR(-ENOMEM);
+
+       if (nested_parent)
+               to_dmar_domain(domain)->nested_parent = true;
+
+       if (dirty_tracking) {
+               if (to_dmar_domain(domain)->use_first_level) {
+                       iommu_domain_free(domain);
+                       return ERR_PTR(-EOPNOTSUPP);
+               }
+               domain->dirty_ops = &intel_dirty_ops;
+       }
+
+       return domain;
+}
+
 static void intel_iommu_domain_free(struct iommu_domain *domain)
 {
        if (domain != &si_domain->domain && domain != &blocking_domain)
                domain_exit(to_dmar_domain(domain));
 }
 
-static int prepare_domain_attach_device(struct iommu_domain *domain,
-                                       struct device *dev)
+int prepare_domain_attach_device(struct iommu_domain *domain,
+                                struct device *dev)
 {
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        struct intel_iommu *iommu;
@@ -4078,6 +4129,9 @@ static int prepare_domain_attach_device(struct iommu_domain *domain,
        if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
                return -EINVAL;
 
+       if (domain->dirty_ops && !ssads_supported(iommu))
+               return -EINVAL;
+
        /* check if this iommu agaw is sufficient for max mapped address */
        addr_width = agaw_to_width(iommu->agaw);
        if (addr_width > cap_mgaw(iommu->cap))
@@ -4332,6 +4386,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
                return dmar_platform_optin();
        case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
                return ecap_sc_support(info->iommu->ecap);
+       case IOMMU_CAP_DIRTY_TRACKING:
+               return ssads_supported(info->iommu);
        default:
                return false;
        }
@@ -4729,6 +4785,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
        if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
                return -EOPNOTSUPP;
 
+       if (domain->dirty_ops)
+               return -EINVAL;
+
        if (context_copied(iommu, info->bus, info->devfn))
                return -EBUSY;
 
@@ -4780,6 +4839,7 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
        if (!vtd)
                return ERR_PTR(-ENOMEM);
 
+       vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
        vtd->cap_reg = iommu->cap;
        vtd->ecap_reg = iommu->ecap;
        *length = sizeof(*vtd);
@@ -4787,10 +4847,88 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
        return vtd;
 }
 
+static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+                                         bool enable)
+{
+       struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+       struct device_domain_info *info;
+       int ret;
+
+       spin_lock(&dmar_domain->lock);
+       if (dmar_domain->dirty_tracking == enable)
+               goto out_unlock;
+
+       list_for_each_entry(info, &dmar_domain->devices, link) {
+               ret = intel_pasid_setup_dirty_tracking(info->iommu,
+                                                      info->domain, info->dev,
+                                                      IOMMU_NO_PASID, enable);
+               if (ret)
+                       goto err_unwind;
+       }
+
+       dmar_domain->dirty_tracking = enable;
+out_unlock:
+       spin_unlock(&dmar_domain->lock);
+
+       return 0;
+
+err_unwind:
+       list_for_each_entry(info, &dmar_domain->devices, link)
+               intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
+                                                info->dev, IOMMU_NO_PASID,
+                                                dmar_domain->dirty_tracking);
+       spin_unlock(&dmar_domain->lock);
+       return ret;
+}
+
+static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+                                           unsigned long iova, size_t size,
+                                           unsigned long flags,
+                                           struct iommu_dirty_bitmap *dirty)
+{
+       struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+       unsigned long end = iova + size - 1;
+       unsigned long pgsize;
+
+       /*
+        * IOMMUFD core calls into a dirty tracking disabled domain without an
+        * IOVA bitmap set in order to clean dirty bits in all PTEs that might
+        * have occurred when we stopped dirty tracking. This ensures that we
+        * never inherit dirtied bits from a previous cycle.
+        */
+       if (!dmar_domain->dirty_tracking && dirty->bitmap)
+               return -EINVAL;
+
+       do {
+               struct dma_pte *pte;
+               int lvl = 0;
+
+               pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
+                                    GFP_ATOMIC);
+               pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
+               if (!pte || !dma_pte_present(pte)) {
+                       iova += pgsize;
+                       continue;
+               }
+
+               if (dma_sl_pte_test_and_clear_dirty(pte, flags))
+                       iommu_dirty_bitmap_record(dirty, iova, pgsize);
+               iova += pgsize;
+       } while (iova < end);
+
+       return 0;
+}
+
+const struct iommu_dirty_ops intel_dirty_ops = {
+       .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+       .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
+};
+
 const struct iommu_ops intel_iommu_ops = {
        .capable                = intel_iommu_capable,
        .hw_info                = intel_iommu_hw_info,
        .domain_alloc           = intel_iommu_domain_alloc,
+       .domain_alloc_user      = intel_iommu_domain_alloc_user,
        .probe_device           = intel_iommu_probe_device,
        .probe_finalize         = intel_iommu_probe_finalize,
        .release_device         = intel_iommu_release_device,
index 7dac94f62b4ec661af7030b475103ef4ac184fee..d796d0d9b114a4cf29bda9202636a05df091421d 100644 (file)
@@ -25,6 +25,7 @@
 
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
+#include <uapi/linux/iommufd.h>
 
 /*
  * VT-d hardware uses 4KiB page size regardless of host page size.
@@ -48,6 +49,9 @@
 #define DMA_FL_PTE_DIRTY       BIT_ULL(6)
 #define DMA_FL_PTE_XD          BIT_ULL(63)
 
+#define DMA_SL_PTE_DIRTY_BIT   9
+#define DMA_SL_PTE_DIRTY       BIT_ULL(DMA_SL_PTE_DIRTY_BIT)
+
 #define ADDR_WIDTH_5LEVEL      (57)
 #define ADDR_WIDTH_4LEVEL      (48)
 
@@ -539,6 +543,10 @@ enum {
 #define sm_supported(iommu)    (intel_iommu_sm && ecap_smts((iommu)->ecap))
 #define pasid_supported(iommu) (sm_supported(iommu) &&                 \
                                 ecap_pasid((iommu)->ecap))
+#define ssads_supported(iommu) (sm_supported(iommu) &&                 \
+                               ecap_slads((iommu)->ecap))
+#define nested_supported(iommu)        (sm_supported(iommu) &&                 \
+                                ecap_nest((iommu)->ecap))
 
 struct pasid_entry;
 struct pasid_state_entry;
@@ -592,20 +600,45 @@ struct dmar_domain {
                                         * otherwise, goes through the second
                                         * level.
                                         */
+       u8 dirty_tracking:1;            /* Dirty tracking is enabled */
+       u8 nested_parent:1;             /* Has other domains nested on it */
 
        spinlock_t lock;                /* Protect device tracking lists */
        struct list_head devices;       /* all devices' list */
        struct list_head dev_pasids;    /* all attached pasids */
 
-       struct dma_pte  *pgd;           /* virtual address */
-       int             gaw;            /* max guest address width */
-
-       /* adjusted guest address width, 0 is level 2 30-bit */
-       int             agaw;
        int             iommu_superpage;/* Level of superpages supported:
                                           0 == 4KiB (no superpages), 1 == 2MiB,
                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
-       u64             max_addr;       /* maximum mapped address */
+       union {
+               /* DMA remapping domain */
+               struct {
+                       /* virtual address */
+                       struct dma_pte  *pgd;
+                       /* max guest address width */
+                       int             gaw;
+                       /*
+                        * adjusted guest address width:
+                        *   0: level 2 30-bit
+                        *   1: level 3 39-bit
+                        *   2: level 4 48-bit
+                        *   3: level 5 57-bit
+                        */
+                       int             agaw;
+                       /* maximum mapped address */
+                       u64             max_addr;
+               };
+
+               /* Nested user domain */
+               struct {
+                       /* parent page table which the user domain is nested on */
+                       struct dmar_domain *s2_domain;
+                       /* user page table pointer (in GPA) */
+                       unsigned long s1_pgtbl;
+                       /* page table attributes */
+                       struct iommu_hwpt_vtd_s1 s1_cfg;
+               };
+       };
 
        struct iommu_domain domain;     /* generic domain data structure for
                                           iommu core */
@@ -781,6 +814,16 @@ static inline bool dma_pte_present(struct dma_pte *pte)
        return (pte->val & 3) != 0;
 }
 
+static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
+                                                  unsigned long flags)
+{
+       if (flags & IOMMU_DIRTY_NO_CLEAR)
+               return (pte->val & DMA_SL_PTE_DIRTY) != 0;
+
+       return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
+                                 (unsigned long *)&pte->val);
+}
+
 static inline bool dma_pte_superpage(struct dma_pte *pte)
 {
        return (pte->val & DMA_PTE_LARGE_PAGE);
@@ -836,12 +879,21 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
  */
 #define QI_OPT_WAIT_DRAIN              BIT(0)
 
+int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+void device_block_translation(struct device *dev);
+int prepare_domain_attach_device(struct iommu_domain *domain,
+                                struct device *dev);
+void domain_update_iommu_cap(struct dmar_domain *domain);
+
 int dmar_ir_support(void);
 
 void *alloc_pgtable_page(int node, gfp_t gfp);
 void free_pgtable_page(void *vaddr);
 void iommu_flush_write_buffer(struct intel_iommu *iommu);
 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
+struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
+                                              const struct iommu_user_data *user_data);
 
 #ifdef CONFIG_INTEL_IOMMU_SVM
 void intel_svm_check(struct intel_iommu *iommu);
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
new file mode 100644 (file)
index 0000000..b5a5563
--- /dev/null
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * nested.c - nested mode translation support
+ *
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ *         Jacob Pan <jacob.jun.pan@linux.intel.com>
+ *         Yi Liu <yi.l.liu@intel.com>
+ */
+
+#define pr_fmt(fmt)    "DMAR: " fmt
+
+#include <linux/iommu.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+
+#include "iommu.h"
+#include "pasid.h"
+
+static int intel_nested_attach_dev(struct iommu_domain *domain,
+                                  struct device *dev)
+{
+       struct device_domain_info *info = dev_iommu_priv_get(dev);
+       struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+       struct intel_iommu *iommu = info->iommu;
+       unsigned long flags;
+       int ret = 0;
+
+       if (info->domain)
+               device_block_translation(dev);
+
+       if (iommu->agaw < dmar_domain->s2_domain->agaw) {
+               dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n");
+               return -ENODEV;
+       }
+
+       /*
+        * A stage-1 domain cannot work alone; it is nested on an
+        * s2_domain. The s2_domain will be used in nested translation,
+        * hence we need to ensure that it is compatible with this IOMMU.
+        */
+       ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev);
+       if (ret) {
+               dev_err_ratelimited(dev, "s2 domain is not compatible\n");
+               return ret;
+       }
+
+       ret = domain_attach_iommu(dmar_domain, iommu);
+       if (ret) {
+               dev_err_ratelimited(dev, "Failed to attach domain to iommu\n");
+               return ret;
+       }
+
+       ret = intel_pasid_setup_nested(iommu, dev,
+                                      IOMMU_NO_PASID, dmar_domain);
+       if (ret) {
+               domain_detach_iommu(dmar_domain, iommu);
+               dev_err_ratelimited(dev, "Failed to setup pasid entry\n");
+               return ret;
+       }
+
+       info->domain = dmar_domain;
+       spin_lock_irqsave(&dmar_domain->lock, flags);
+       list_add(&info->link, &dmar_domain->devices);
+       spin_unlock_irqrestore(&dmar_domain->lock, flags);
+
+       return 0;
+}
+
+static void intel_nested_domain_free(struct iommu_domain *domain)
+{
+       kfree(to_dmar_domain(domain));
+}
+
+static const struct iommu_domain_ops intel_nested_domain_ops = {
+       .attach_dev             = intel_nested_attach_dev,
+       .free                   = intel_nested_domain_free,
+};
+
+struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
+                                              const struct iommu_user_data *user_data)
+{
+       struct dmar_domain *s2_domain = to_dmar_domain(parent);
+       struct iommu_hwpt_vtd_s1 vtd;
+       struct dmar_domain *domain;
+       int ret;
+
+       /* Must be nested domain */
+       if (user_data->type != IOMMU_HWPT_DATA_VTD_S1)
+               return ERR_PTR(-EOPNOTSUPP);
+       if (parent->ops != intel_iommu_ops.default_domain_ops ||
+           !s2_domain->nested_parent)
+               return ERR_PTR(-EINVAL);
+
+       ret = iommu_copy_struct_from_user(&vtd, user_data,
+                                         IOMMU_HWPT_DATA_VTD_S1, __reserved);
+       if (ret)
+               return ERR_PTR(ret);
+
+       domain = kzalloc(sizeof(*domain), GFP_KERNEL_ACCOUNT);
+       if (!domain)
+               return ERR_PTR(-ENOMEM);
+
+       domain->use_first_level = true;
+       domain->s2_domain = s2_domain;
+       domain->s1_pgtbl = vtd.pgtbl_addr;
+       domain->s1_cfg = vtd;
+       domain->domain.ops = &intel_nested_domain_ops;
+       domain->domain.type = IOMMU_DOMAIN_NESTED;
+       INIT_LIST_HEAD(&domain->devices);
+       INIT_LIST_HEAD(&domain->dev_pasids);
+       spin_lock_init(&domain->lock);
+       xa_init(&domain->iommu_array);
+
+       return &domain->domain;
+}
index 8f92b92f3d2aba5ce2455fcf8d3604ea4eeee4ae..74e8e4c17e81430f216fa88a9575d591de203e3a 100644 (file)
@@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
        WRITE_ONCE(*ptr, (old & ~mask) | bits);
 }
 
+static inline u64 pasid_get_bits(u64 *ptr)
+{
+       return READ_ONCE(*ptr);
+}
+
 /*
  * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode
  * PASID entry.
@@ -335,6 +340,45 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe)
        pasid_set_bits(&pe->val[0], 1 << 1, 0);
 }
 
+/*
+ * Enable second level A/D bits by setting the SLADE (Second Level
+ * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_set_ssade(struct pasid_entry *pe)
+{
+       pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9);
+}
+
+/*
+ * Disable second level A/D bits by clearing the SLADE (Second Level
+ * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_clear_ssade(struct pasid_entry *pe)
+{
+       pasid_set_bits(&pe->val[0], 1 << 9, 0);
+}
+
+/*
+ * Checks whether second level A/D bits are enabled, i.e. whether the
+ * SLADE (Second Level Access Dirty Enable) field (Bit 9) of a scalable
+ * mode PASID entry is set.
+ */
+static inline bool pasid_get_ssade(struct pasid_entry *pe)
+{
+       return pasid_get_bits(&pe->val[0]) & (1 << 9);
+}
+
+/*
+ * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a
+ * scalable mode PASID entry.
+ */
+static inline void pasid_set_sre(struct pasid_entry *pe)
+{
+       pasid_set_bits(&pe->val[2], 1 << 0, 1);
+}
+
 /*
  * Setup the WPE(Write Protect Enable) field (Bit 132) of a
  * scalable mode PASID entry.
@@ -402,6 +446,15 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value)
        pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
 }
 
+/*
+ * Setup the Extended Access Flag Enable (EAFE) field (Bit 135)
+ * of a scalable mode PASID entry.
+ */
+static inline void pasid_set_eafe(struct pasid_entry *pe)
+{
+       pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
+}
+
 static void
 pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
                                    u16 did, u32 pasid)
@@ -627,6 +680,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
        pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
        pasid_set_fault_enable(pte);
        pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+       if (domain->dirty_tracking)
+               pasid_set_ssade(pte);
 
        pasid_set_present(pte);
        spin_unlock(&iommu->lock);
@@ -636,6 +691,78 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
        return 0;
 }
 
+/*
+ * Set up dirty tracking on a second-level-only or nested translation type.
+ */
+int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
+                                    struct dmar_domain *domain,
+                                    struct device *dev, u32 pasid,
+                                    bool enabled)
+{
+       struct pasid_entry *pte;
+       u16 did, pgtt;
+
+       spin_lock(&iommu->lock);
+
+       pte = intel_pasid_get_entry(dev, pasid);
+       if (!pte) {
+               spin_unlock(&iommu->lock);
+               dev_err_ratelimited(
+                       dev, "Failed to get pasid entry of PASID %d\n", pasid);
+               return -ENODEV;
+       }
+
+       did = domain_id_iommu(domain, iommu);
+       pgtt = pasid_pte_get_pgtt(pte);
+       if (pgtt != PASID_ENTRY_PGTT_SL_ONLY &&
+           pgtt != PASID_ENTRY_PGTT_NESTED) {
+               spin_unlock(&iommu->lock);
+               dev_err_ratelimited(
+                       dev,
+                       "Dirty tracking not supported on translation type %d\n",
+                       pgtt);
+               return -EOPNOTSUPP;
+       }
+
+       if (pasid_get_ssade(pte) == enabled) {
+               spin_unlock(&iommu->lock);
+               return 0;
+       }
+
+       if (enabled)
+               pasid_set_ssade(pte);
+       else
+               pasid_clear_ssade(pte);
+       spin_unlock(&iommu->lock);
+
+       if (!ecap_coherent(iommu->ecap))
+               clflush_cache_range(pte, sizeof(*pte));
+
+       /*
+        * From VT-d spec table 25 "Guidance to Software for Invalidations":
+        *
+        * - PASID-selective-within-Domain PASID-cache invalidation
+        *   If (PGTT=SS or Nested)
+        *    - Domain-selective IOTLB invalidation
+        *   Else
+        *    - PASID-selective PASID-based IOTLB invalidation
+        * - If (pasid is RID_PASID)
+        *    - Global Device-TLB invalidation to affected functions
+        *   Else
+        *    - PASID-based Device-TLB invalidation (with S=1 and
+        *      Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
+        */
+       pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+
+       iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+
+       /* Device IOTLB doesn't need to be flushed in caching mode. */
+       if (!cap_caching_mode(iommu->cap))
+               devtlb_invalidation_with_pasid(iommu, dev, pasid);
+
+       return 0;
+}
+
 /*
  * Set up the scalable mode pasid entry for passthrough translation type.
  */
@@ -713,3 +840,97 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
        if (!cap_caching_mode(iommu->cap))
                devtlb_invalidation_with_pasid(iommu, dev, pasid);
 }
+
+/**
+ * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
+ * @iommu:      IOMMU which the device belongs to
+ * @dev:        Device to be set up for translation
+ * @pasid:      PASID to be programmed in the device PASID table
+ * @domain:     User stage-1 domain nested on a stage-2 domain
+ *
+ * This is used for nested translation. The input domain should be a
+ * nested type and nested on a parent with the 'nested_parent' flag
+ * set.
+ */
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+                            u32 pasid, struct dmar_domain *domain)
+{
+       struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
+       pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl;
+       struct dmar_domain *s2_domain = domain->s2_domain;
+       u16 did = domain_id_iommu(domain, iommu);
+       struct dma_pte *pgd = s2_domain->pgd;
+       struct pasid_entry *pte;
+
+       /* Address width should match the address width supported by hardware */
+       switch (s1_cfg->addr_width) {
+       case ADDR_WIDTH_4LEVEL:
+               break;
+       case ADDR_WIDTH_5LEVEL:
+               if (!cap_fl5lp_support(iommu->cap)) {
+                       dev_err_ratelimited(dev,
+                                           "5-level paging not supported\n");
+                       return -EINVAL;
+               }
+               break;
+       default:
+               dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n",
+                                   s1_cfg->addr_width);
+               return -EINVAL;
+       }
+
+       if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) {
+               pr_err_ratelimited("No supervisor request support on %s\n",
+                                  iommu->name);
+               return -EINVAL;
+       }
+
+       if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) {
+               pr_err_ratelimited("No extended access flag support on %s\n",
+                                  iommu->name);
+               return -EINVAL;
+       }
+
+       spin_lock(&iommu->lock);
+       pte = intel_pasid_get_entry(dev, pasid);
+       if (!pte) {
+               spin_unlock(&iommu->lock);
+               return -ENODEV;
+       }
+       if (pasid_pte_is_present(pte)) {
+               spin_unlock(&iommu->lock);
+               return -EBUSY;
+       }
+
+       pasid_clear_entry(pte);
+
+       if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
+               pasid_set_flpm(pte, 1);
+
+       pasid_set_flptr(pte, (uintptr_t)s1_gpgd);
+
+       if (s1_cfg->flags & IOMMU_VTD_S1_SRE) {
+               pasid_set_sre(pte);
+               if (s1_cfg->flags & IOMMU_VTD_S1_WPE)
+                       pasid_set_wpe(pte);
+       }
+
+       if (s1_cfg->flags & IOMMU_VTD_S1_EAFE)
+               pasid_set_eafe(pte);
+
+       if (s2_domain->force_snooping)
+               pasid_set_pgsnp(pte);
+
+       pasid_set_slptr(pte, virt_to_phys(pgd));
+       pasid_set_fault_enable(pte);
+       pasid_set_domain_id(pte, did);
+       pasid_set_address_width(pte, s2_domain->agaw);
+       pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+       pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
+       pasid_set_present(pte);
+       spin_unlock(&iommu->lock);
+
+       pasid_flush_caches(iommu, pte, pasid, did);
+
+       return 0;
+}
index 4e9e68c3c3888f6acd4c3ecff8ebc90f1db39955..dd37611175cc1b9e4009aad7d0c09522147128eb 100644 (file)
@@ -106,9 +106,15 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
 int intel_pasid_setup_second_level(struct intel_iommu *iommu,
                                   struct dmar_domain *domain,
                                   struct device *dev, u32 pasid);
+int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
+                                    struct dmar_domain *domain,
+                                    struct device *dev, u32 pasid,
+                                    bool enabled);
 int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
                                   struct dmar_domain *domain,
                                   struct device *dev, u32 pasid);
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+                            u32 pasid, struct dmar_domain *domain);
 void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
                                 struct device *dev, u32 pasid,
                                 bool fault_ignore);
index 8aeba81800c512dc9e9eb1c32b3240080b503db1..34b446146961c29e7b24dc5cc890a5aa557a6ce8 100644 (file)
@@ -11,3 +11,4 @@ iommufd-y := \
 iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
 
 obj-$(CONFIG_IOMMUFD) += iommufd.o
+obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o
index ce78c3671539c77d27059d3aa11c7367f493eeff..59d3a07300d934484e11cf3a15f68543e11e168a 100644 (file)
@@ -293,7 +293,7 @@ u32 iommufd_device_to_id(struct iommufd_device *idev)
 EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD);
 
 static int iommufd_group_setup_msi(struct iommufd_group *igroup,
-                                  struct iommufd_hw_pagetable *hwpt)
+                                  struct iommufd_hwpt_paging *hwpt_paging)
 {
        phys_addr_t sw_msi_start = igroup->sw_msi_start;
        int rc;
@@ -311,8 +311,9 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup,
         * matches what the IRQ layer actually expects in a newly created
         * domain.
         */
-       if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) {
-               rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
+       if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) {
+               rc = iommu_get_msi_cookie(hwpt_paging->common.domain,
+                                         sw_msi_start);
                if (rc)
                        return rc;
 
@@ -320,7 +321,31 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup,
                 * iommu_get_msi_cookie() can only be called once per domain,
                 * it returns -EBUSY on later calls.
                 */
-               hwpt->msi_cookie = true;
+               hwpt_paging->msi_cookie = true;
+       }
+       return 0;
+}
+
+static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging,
+                                     struct iommufd_device *idev)
+{
+       int rc;
+
+       lockdep_assert_held(&idev->igroup->lock);
+
+       rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
+                                                idev->dev,
+                                                &idev->igroup->sw_msi_start);
+       if (rc)
+               return rc;
+
+       if (list_empty(&idev->igroup->device_list)) {
+               rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging);
+               if (rc) {
+                       iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt,
+                                                 idev->dev);
+                       return rc;
+               }
        }
        return 0;
 }
@@ -337,18 +362,12 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
                goto err_unlock;
        }
 
-       /* Try to upgrade the domain we have */
-       if (idev->enforce_cache_coherency) {
-               rc = iommufd_hw_pagetable_enforce_cc(hwpt);
+       if (hwpt_is_paging(hwpt)) {
+               rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev);
                if (rc)
                        goto err_unlock;
        }
 
-       rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev,
-                                                &idev->igroup->sw_msi_start);
-       if (rc)
-               goto err_unlock;
-
        /*
         * Only attach to the group once for the first device that is in the
         * group. All the other devices will follow this attachment. The user
@@ -357,10 +376,6 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
         * attachment.
         */
        if (list_empty(&idev->igroup->device_list)) {
-               rc = iommufd_group_setup_msi(idev->igroup, hwpt);
-               if (rc)
-                       goto err_unresv;
-
                rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
                if (rc)
                        goto err_unresv;
@@ -371,7 +386,9 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
        mutex_unlock(&idev->igroup->lock);
        return 0;
 err_unresv:
-       iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+       if (hwpt_is_paging(hwpt))
+               iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt,
+                                         idev->dev);
 err_unlock:
        mutex_unlock(&idev->igroup->lock);
        return rc;
@@ -388,7 +405,9 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev)
                iommu_detach_group(hwpt->domain, idev->igroup->group);
                idev->igroup->hwpt = NULL;
        }
-       iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+       if (hwpt_is_paging(hwpt))
+               iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt,
+                                         idev->dev);
        mutex_unlock(&idev->igroup->lock);
 
        /* Caller must destroy hwpt */
@@ -407,14 +426,55 @@ iommufd_device_do_attach(struct iommufd_device *idev,
        return NULL;
 }
 
+static void
+iommufd_group_remove_reserved_iova(struct iommufd_group *igroup,
+                                  struct iommufd_hwpt_paging *hwpt_paging)
+{
+       struct iommufd_device *cur;
+
+       lockdep_assert_held(&igroup->lock);
+
+       list_for_each_entry(cur, &igroup->device_list, group_item)
+               iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev);
+}
+
+static int
+iommufd_group_do_replace_paging(struct iommufd_group *igroup,
+                               struct iommufd_hwpt_paging *hwpt_paging)
+{
+       struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt;
+       struct iommufd_device *cur;
+       int rc;
+
+       lockdep_assert_held(&igroup->lock);
+
+       if (!hwpt_is_paging(old_hwpt) ||
+           hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) {
+               list_for_each_entry(cur, &igroup->device_list, group_item) {
+                       rc = iopt_table_enforce_dev_resv_regions(
+                               &hwpt_paging->ioas->iopt, cur->dev, NULL);
+                       if (rc)
+                               goto err_unresv;
+               }
+       }
+
+       rc = iommufd_group_setup_msi(igroup, hwpt_paging);
+       if (rc)
+               goto err_unresv;
+       return 0;
+
+err_unresv:
+       iommufd_group_remove_reserved_iova(igroup, hwpt_paging);
+       return rc;
+}
+
 static struct iommufd_hw_pagetable *
 iommufd_device_do_replace(struct iommufd_device *idev,
                          struct iommufd_hw_pagetable *hwpt)
 {
        struct iommufd_group *igroup = idev->igroup;
        struct iommufd_hw_pagetable *old_hwpt;
-       unsigned int num_devices = 0;
-       struct iommufd_device *cur;
+       unsigned int num_devices;
        int rc;
 
        mutex_lock(&idev->igroup->lock);
@@ -429,42 +489,27 @@ iommufd_device_do_replace(struct iommufd_device *idev,
                return NULL;
        }
 
-       /* Try to upgrade the domain we have */
-       list_for_each_entry(cur, &igroup->device_list, group_item) {
-               num_devices++;
-               if (cur->enforce_cache_coherency) {
-                       rc = iommufd_hw_pagetable_enforce_cc(hwpt);
-                       if (rc)
-                               goto err_unlock;
-               }
-       }
-
        old_hwpt = igroup->hwpt;
-       if (hwpt->ioas != old_hwpt->ioas) {
-               list_for_each_entry(cur, &igroup->device_list, group_item) {
-                       rc = iopt_table_enforce_dev_resv_regions(
-                               &hwpt->ioas->iopt, cur->dev, NULL);
-                       if (rc)
-                               goto err_unresv;
-               }
+       if (hwpt_is_paging(hwpt)) {
+               rc = iommufd_group_do_replace_paging(igroup,
+                                                    to_hwpt_paging(hwpt));
+               if (rc)
+                       goto err_unlock;
        }
 
-       rc = iommufd_group_setup_msi(idev->igroup, hwpt);
-       if (rc)
-               goto err_unresv;
-
        rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
        if (rc)
                goto err_unresv;
 
-       if (hwpt->ioas != old_hwpt->ioas) {
-               list_for_each_entry(cur, &igroup->device_list, group_item)
-                       iopt_remove_reserved_iova(&old_hwpt->ioas->iopt,
-                                                 cur->dev);
-       }
+       if (hwpt_is_paging(old_hwpt) &&
+           (!hwpt_is_paging(hwpt) ||
+            to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas))
+               iommufd_group_remove_reserved_iova(igroup,
+                                                  to_hwpt_paging(old_hwpt));
 
        igroup->hwpt = hwpt;
 
+       num_devices = list_count_nodes(&igroup->device_list);
        /*
         * Move the refcounts held by the device_list to the new hwpt. Retain a
         * refcount for this thread as the caller will free it.
@@ -478,8 +523,9 @@ iommufd_device_do_replace(struct iommufd_device *idev,
        /* Caller must destroy old_hwpt */
        return old_hwpt;
 err_unresv:
-       list_for_each_entry(cur, &igroup->device_list, group_item)
-               iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev);
+       if (hwpt_is_paging(hwpt))
+               iommufd_group_remove_reserved_iova(igroup,
+                                                  to_hwpt_paging(old_hwpt));
 err_unlock:
        mutex_unlock(&idev->igroup->lock);
        return ERR_PTR(rc);
@@ -507,6 +553,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
         */
        bool immediate_attach = do_attach == iommufd_device_do_attach;
        struct iommufd_hw_pagetable *destroy_hwpt;
+       struct iommufd_hwpt_paging *hwpt_paging;
        struct iommufd_hw_pagetable *hwpt;
 
        /*
@@ -515,10 +562,11 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
         * other.
         */
        mutex_lock(&ioas->mutex);
-       list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
-               if (!hwpt->auto_domain)
+       list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
+               if (!hwpt_paging->auto_domain)
                        continue;
 
+               hwpt = &hwpt_paging->common;
                if (!iommufd_lock_obj(&hwpt->obj))
                        continue;
                destroy_hwpt = (*do_attach)(idev, hwpt);
@@ -539,12 +587,13 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
                goto out_unlock;
        }
 
-       hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev,
-                                         immediate_attach);
-       if (IS_ERR(hwpt)) {
-               destroy_hwpt = ERR_CAST(hwpt);
+       hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0,
+                                               immediate_attach, NULL);
+       if (IS_ERR(hwpt_paging)) {
+               destroy_hwpt = ERR_CAST(hwpt_paging);
                goto out_unlock;
        }
+       hwpt = &hwpt_paging->common;
 
        if (!immediate_attach) {
                destroy_hwpt = (*do_attach)(idev, hwpt);
@@ -554,7 +603,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
                destroy_hwpt = NULL;
        }
 
-       hwpt->auto_domain = true;
+       hwpt_paging->auto_domain = true;
        *pt_id = hwpt->obj.id;
 
        iommufd_object_finalize(idev->ictx, &hwpt->obj);
@@ -579,7 +628,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id,
                return PTR_ERR(pt_obj);
 
        switch (pt_obj->type) {
-       case IOMMUFD_OBJ_HW_PAGETABLE: {
+       case IOMMUFD_OBJ_HWPT_NESTED:
+       case IOMMUFD_OBJ_HWPT_PAGING: {
                struct iommufd_hw_pagetable *hwpt =
                        container_of(pt_obj, struct iommufd_hw_pagetable, obj);
 
@@ -617,8 +667,8 @@ out_put_pt_obj:
 /**
  * iommufd_device_attach - Connect a device to an iommu_domain
  * @idev: device to attach
- * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
- *         Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
+ *         Output the IOMMUFD_OBJ_HWPT_PAGING ID
  *
  * This connects the device to an iommu_domain, either automatically or manually
  * selected. Once this completes the device could do DMA.
@@ -646,8 +696,8 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
 /**
  * iommufd_device_replace - Change the device's iommu_domain
  * @idev: device to change
- * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
- *         Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
+ *         Output the IOMMUFD_OBJ_HWPT_PAGING ID
  *
  * This is the same as::
  *
@@ -1185,6 +1235,10 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
         */
        cmd->data_len = data_len;
 
+       cmd->out_capabilities = 0;
+       if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING))
+               cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING;
+
        rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
 out_free:
        kfree(data);
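
The out_capabilities field filled in above is how userspace learns, per device, whether IOMMU-side dirty tracking is available before requesting it at HWPT allocation time. Below is a minimal userspace sketch of that probe, assuming the IOMMU_GET_HW_INFO ioctl and the struct iommu_hw_info layout from this series (error handling trimmed):

	#include <stdbool.h>
	#include <sys/ioctl.h>
	#include <linux/iommufd.h>

	/* Hedged sketch: probe IOMMU_HW_CAP_DIRTY_TRACKING for one device */
	static bool dev_supports_dirty_tracking(int iommufd, __u32 dev_id)
	{
		struct iommu_hw_info info = {
			.size = sizeof(info),
			.dev_id = dev_id,
			/* data_len/data_uptr left at 0: only capabilities are wanted */
		};

		if (ioctl(iommufd, IOMMU_GET_HW_INFO, &info))
			return false;
		return info.out_capabilities & IOMMU_HW_CAP_DIRTY_TRACKING;
	}

A VMM would typically gate IOMMU_HWPT_ALLOC_DIRTY_TRACKING on this bit rather than attempting the allocation and handling the failure.
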
index cf2c1504e20d843a6c00741f52c4fa6f66b01a90..2abbeafdbd22d86019f665662f30fb367c40bd2e 100644 (file)
@@ -5,62 +5,87 @@
 #include <linux/iommu.h>
 #include <uapi/linux/iommufd.h>
 
+#include "../iommu-priv.h"
 #include "iommufd_private.h"
 
-void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
+void iommufd_hwpt_paging_destroy(struct iommufd_object *obj)
 {
-       struct iommufd_hw_pagetable *hwpt =
-               container_of(obj, struct iommufd_hw_pagetable, obj);
+       struct iommufd_hwpt_paging *hwpt_paging =
+               container_of(obj, struct iommufd_hwpt_paging, common.obj);
 
-       if (!list_empty(&hwpt->hwpt_item)) {
-               mutex_lock(&hwpt->ioas->mutex);
-               list_del(&hwpt->hwpt_item);
-               mutex_unlock(&hwpt->ioas->mutex);
+       if (!list_empty(&hwpt_paging->hwpt_item)) {
+               mutex_lock(&hwpt_paging->ioas->mutex);
+               list_del(&hwpt_paging->hwpt_item);
+               mutex_unlock(&hwpt_paging->ioas->mutex);
 
-               iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
+               iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
+                                        hwpt_paging->common.domain);
        }
 
-       if (hwpt->domain)
-               iommu_domain_free(hwpt->domain);
+       if (hwpt_paging->common.domain)
+               iommu_domain_free(hwpt_paging->common.domain);
 
-       refcount_dec(&hwpt->ioas->obj.users);
+       refcount_dec(&hwpt_paging->ioas->obj.users);
 }
 
-void iommufd_hw_pagetable_abort(struct iommufd_object *obj)
+void iommufd_hwpt_paging_abort(struct iommufd_object *obj)
 {
-       struct iommufd_hw_pagetable *hwpt =
-               container_of(obj, struct iommufd_hw_pagetable, obj);
+       struct iommufd_hwpt_paging *hwpt_paging =
+               container_of(obj, struct iommufd_hwpt_paging, common.obj);
 
        /* The ioas->mutex must be held until finalize is called. */
-       lockdep_assert_held(&hwpt->ioas->mutex);
+       lockdep_assert_held(&hwpt_paging->ioas->mutex);
 
-       if (!list_empty(&hwpt->hwpt_item)) {
-               list_del_init(&hwpt->hwpt_item);
-               iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
+       if (!list_empty(&hwpt_paging->hwpt_item)) {
+               list_del_init(&hwpt_paging->hwpt_item);
+               iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
+                                        hwpt_paging->common.domain);
        }
-       iommufd_hw_pagetable_destroy(obj);
+       iommufd_hwpt_paging_destroy(obj);
 }
 
-int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt)
+void iommufd_hwpt_nested_destroy(struct iommufd_object *obj)
 {
-       if (hwpt->enforce_cache_coherency)
+       struct iommufd_hwpt_nested *hwpt_nested =
+               container_of(obj, struct iommufd_hwpt_nested, common.obj);
+
+       if (hwpt_nested->common.domain)
+               iommu_domain_free(hwpt_nested->common.domain);
+
+       refcount_dec(&hwpt_nested->parent->common.obj.users);
+}
+
+void iommufd_hwpt_nested_abort(struct iommufd_object *obj)
+{
+       iommufd_hwpt_nested_destroy(obj);
+}
+
+static int
+iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging)
+{
+       struct iommu_domain *paging_domain = hwpt_paging->common.domain;
+
+       if (hwpt_paging->enforce_cache_coherency)
                return 0;
 
-       if (hwpt->domain->ops->enforce_cache_coherency)
-               hwpt->enforce_cache_coherency =
-                       hwpt->domain->ops->enforce_cache_coherency(
-                               hwpt->domain);
-       if (!hwpt->enforce_cache_coherency)
+       if (paging_domain->ops->enforce_cache_coherency)
+               hwpt_paging->enforce_cache_coherency =
+                       paging_domain->ops->enforce_cache_coherency(
+                               paging_domain);
+       if (!hwpt_paging->enforce_cache_coherency)
                return -EINVAL;
        return 0;
 }
 
 /**
- * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device
+ * iommufd_hwpt_paging_alloc() - Get a PAGING iommu_domain for a device
  * @ictx: iommufd context
  * @ioas: IOAS to associate the domain with
  * @idev: Device to get an iommu_domain for
+ * @flags: Flags from userspace
  * @immediate_attach: True if idev should be attached to the hwpt
+ * @user_data: The user provided driver specific data describing the domain to
+ *             create
  *
  * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT
  * will be linked to the given ioas and upon return the underlying iommu_domain
@@ -70,28 +95,52 @@ int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt)
  * iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on
  * the returned hwpt.
  */
-struct iommufd_hw_pagetable *
-iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
-                          struct iommufd_device *idev, bool immediate_attach)
+struct iommufd_hwpt_paging *
+iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+                         struct iommufd_device *idev, u32 flags,
+                         bool immediate_attach,
+                         const struct iommu_user_data *user_data)
 {
+       const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT |
+                               IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+       const struct iommu_ops *ops = dev_iommu_ops(idev->dev);
+       struct iommufd_hwpt_paging *hwpt_paging;
        struct iommufd_hw_pagetable *hwpt;
        int rc;
 
        lockdep_assert_held(&ioas->mutex);
 
-       hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE);
-       if (IS_ERR(hwpt))
-               return hwpt;
+       if ((flags || user_data) && !ops->domain_alloc_user)
+               return ERR_PTR(-EOPNOTSUPP);
+       if (flags & ~valid_flags)
+               return ERR_PTR(-EOPNOTSUPP);
+
+       hwpt_paging = __iommufd_object_alloc(
+               ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj);
+       if (IS_ERR(hwpt_paging))
+               return ERR_CAST(hwpt_paging);
+       hwpt = &hwpt_paging->common;
 
-       INIT_LIST_HEAD(&hwpt->hwpt_item);
+       INIT_LIST_HEAD(&hwpt_paging->hwpt_item);
        /* Pairs with iommufd_hw_pagetable_destroy() */
        refcount_inc(&ioas->obj.users);
-       hwpt->ioas = ioas;
+       hwpt_paging->ioas = ioas;
+       hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
 
-       hwpt->domain = iommu_domain_alloc(idev->dev->bus);
-       if (!hwpt->domain) {
-               rc = -ENOMEM;
-               goto out_abort;
+       if (ops->domain_alloc_user) {
+               hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL,
+                                                     user_data);
+               if (IS_ERR(hwpt->domain)) {
+                       rc = PTR_ERR(hwpt->domain);
+                       hwpt->domain = NULL;
+                       goto out_abort;
+               }
+       } else {
+               hwpt->domain = iommu_domain_alloc(idev->dev->bus);
+               if (!hwpt->domain) {
+                       rc = -ENOMEM;
+                       goto out_abort;
+               }
        }
 
        /*
@@ -100,9 +149,16 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
         * doing any maps. It is an iommu driver bug to report
         * IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on
         * a new domain.
+        *
+        * The cache coherency mode must be configured here and unchanged later.
+        * Note that a HWPT (non-CC) created for a device (non-CC) can be later
+        * reused by another device (either non-CC or CC). However, a HWPT (CC)
+        * created for a device (CC) cannot be reused by a non-CC device, only
+        * by other CC devices. In that case user space would instead need to
+        * allocate a separate HWPT (non-CC) for the non-CC device.
         */
        if (idev->enforce_cache_coherency) {
-               rc = iommufd_hw_pagetable_enforce_cc(hwpt);
+               rc = iommufd_hwpt_paging_enforce_cc(hwpt_paging);
                if (WARN_ON(rc))
                        goto out_abort;
        }
@@ -119,11 +175,11 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
                        goto out_abort;
        }
 
-       rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain);
+       rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain);
        if (rc)
                goto out_detach;
-       list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);
-       return hwpt;
+       list_add_tail(&hwpt_paging->hwpt_item, &ioas->hwpt_list);
+       return hwpt_paging;
 
 out_detach:
        if (immediate_attach)
@@ -133,32 +189,120 @@ out_abort:
        return ERR_PTR(rc);
 }
 
+/**
+ * iommufd_hwpt_nested_alloc() - Get a NESTED iommu_domain for a device
+ * @ictx: iommufd context
+ * @parent: Parent PAGING-type hwpt to associate the domain with
+ * @idev: Device to get an iommu_domain for
+ * @flags: Flags from userspace
+ * @user_data: user_data pointer. Must be valid
+ *
+ * Allocate a new iommu_domain (must be IOMMU_DOMAIN_NESTED) and return it as
+ * a NESTED hw_pagetable. The given parent PAGING-type hwpt must be capable of
+ * being a parent.
+ */
+static struct iommufd_hwpt_nested *
+iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
+                         struct iommufd_hwpt_paging *parent,
+                         struct iommufd_device *idev, u32 flags,
+                         const struct iommu_user_data *user_data)
+{
+       const struct iommu_ops *ops = dev_iommu_ops(idev->dev);
+       struct iommufd_hwpt_nested *hwpt_nested;
+       struct iommufd_hw_pagetable *hwpt;
+       int rc;
+
+       if (flags || !user_data->len || !ops->domain_alloc_user)
+               return ERR_PTR(-EOPNOTSUPP);
+       if (parent->auto_domain || !parent->nest_parent)
+               return ERR_PTR(-EINVAL);
+
+       hwpt_nested = __iommufd_object_alloc(
+               ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj);
+       if (IS_ERR(hwpt_nested))
+               return ERR_CAST(hwpt_nested);
+       hwpt = &hwpt_nested->common;
+
+       refcount_inc(&parent->common.obj.users);
+       hwpt_nested->parent = parent;
+
+       hwpt->domain = ops->domain_alloc_user(idev->dev, flags,
+                                             parent->common.domain, user_data);
+       if (IS_ERR(hwpt->domain)) {
+               rc = PTR_ERR(hwpt->domain);
+               hwpt->domain = NULL;
+               goto out_abort;
+       }
+
+       if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
+               rc = -EINVAL;
+               goto out_abort;
+       }
+       return hwpt_nested;
+
+out_abort:
+       iommufd_object_abort_and_destroy(ictx, &hwpt->obj);
+       return ERR_PTR(rc);
+}
+
 int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 {
        struct iommu_hwpt_alloc *cmd = ucmd->cmd;
+       const struct iommu_user_data user_data = {
+               .type = cmd->data_type,
+               .uptr = u64_to_user_ptr(cmd->data_uptr),
+               .len = cmd->data_len,
+       };
        struct iommufd_hw_pagetable *hwpt;
+       struct iommufd_ioas *ioas = NULL;
+       struct iommufd_object *pt_obj;
        struct iommufd_device *idev;
-       struct iommufd_ioas *ioas;
        int rc;
 
-       if (cmd->flags || cmd->__reserved)
+       if (cmd->__reserved)
                return -EOPNOTSUPP;
                return -EOPNOTSUPP;
+       if (cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len)
+               return -EINVAL;
 
        idev = iommufd_get_device(ucmd, cmd->dev_id);
        if (IS_ERR(idev))
                return PTR_ERR(idev);
 
-       ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id);
-       if (IS_ERR(ioas)) {
-               rc = PTR_ERR(ioas);
+       pt_obj = iommufd_get_object(ucmd->ictx, cmd->pt_id, IOMMUFD_OBJ_ANY);
+       if (IS_ERR(pt_obj)) {
+               rc = -EINVAL;
                goto out_put_idev;
        }
 
-       mutex_lock(&ioas->mutex);
-       hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false);
-       if (IS_ERR(hwpt)) {
-               rc = PTR_ERR(hwpt);
-               goto out_unlock;
+       if (pt_obj->type == IOMMUFD_OBJ_IOAS) {
+               struct iommufd_hwpt_paging *hwpt_paging;
+
+               ioas = container_of(pt_obj, struct iommufd_ioas, obj);
+               mutex_lock(&ioas->mutex);
+               hwpt_paging = iommufd_hwpt_paging_alloc(
+                       ucmd->ictx, ioas, idev, cmd->flags, false,
+                       user_data.len ? &user_data : NULL);
+               if (IS_ERR(hwpt_paging)) {
+                       rc = PTR_ERR(hwpt_paging);
+                       goto out_unlock;
+               }
+               hwpt = &hwpt_paging->common;
+       } else if (pt_obj->type == IOMMUFD_OBJ_HWPT_PAGING) {
+               struct iommufd_hwpt_nested *hwpt_nested;
+
+               hwpt_nested = iommufd_hwpt_nested_alloc(
+                       ucmd->ictx,
+                       container_of(pt_obj, struct iommufd_hwpt_paging,
+                                    common.obj),
+                       idev, cmd->flags, &user_data);
+               if (IS_ERR(hwpt_nested)) {
+                       rc = PTR_ERR(hwpt_nested);
+                       goto out_unlock;
+               }
+               hwpt = &hwpt_nested->common;
+       } else {
+               rc = -EINVAL;
+               goto out_put_pt;
        }
 
        cmd->out_hwpt_id = hwpt->obj.id;
@@ -171,9 +315,59 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 out_hwpt:
        iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj);
 out_unlock:
-       mutex_unlock(&ioas->mutex);
-       iommufd_put_object(&ioas->obj);
+       if (ioas)
+               mutex_unlock(&ioas->mutex);
+out_put_pt:
+       iommufd_put_object(pt_obj);
 out_put_idev:
        iommufd_put_object(&idev->obj);
        return rc;
 }
+
+int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd)
+{
+       struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd;
+       struct iommufd_hwpt_paging *hwpt_paging;
+       struct iommufd_ioas *ioas;
+       int rc = -EOPNOTSUPP;
+       bool enable;
+
+       if (cmd->flags & ~IOMMU_HWPT_DIRTY_TRACKING_ENABLE)
+               return rc;
+
+       hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+       if (IS_ERR(hwpt_paging))
+               return PTR_ERR(hwpt_paging);
+
+       ioas = hwpt_paging->ioas;
+       enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE;
+
+       rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain,
+                                    enable);
+
+       iommufd_put_object(&hwpt_paging->common.obj);
+       return rc;
+}
+
+int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd)
+{
+       struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd;
+       struct iommufd_hwpt_paging *hwpt_paging;
+       struct iommufd_ioas *ioas;
+       int rc = -EOPNOTSUPP;
+
+       if ((cmd->flags & ~(IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR)) ||
+           cmd->__reserved)
+               return -EOPNOTSUPP;
+
+       hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+       if (IS_ERR(hwpt_paging))
+               return PTR_ERR(hwpt_paging);
+
+       ioas = hwpt_paging->ioas;
+       rc = iopt_read_and_clear_dirty_data(
+               &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd);
+
+       iommufd_put_object(&hwpt_paging->common.obj);
+       return rc;
+}
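
These two handlers implement the new IOMMU_HWPT_SET_DIRTY_TRACKING and IOMMU_HWPT_GET_DIRTY_BITMAP ioctls on a PAGING hwpt. A hedged userspace sketch of the intended flow follows, assuming the uapi struct layouts added by this series; the caller-supplied bitmap carries one bit per page_size unit of the requested IOVA range, and a live VMM would loop the harvesting step during pre-copy:

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/iommufd.h>

	/* Sketch: enable tracking, let DMA run, then harvest one IOVA range */
	static int harvest_dirty(int iommufd, uint32_t hwpt_id, uint64_t iova,
				 uint64_t length, uint64_t page_size,
				 uint64_t *bitmap_words)
	{
		struct iommu_hwpt_set_dirty_tracking set = {
			.size = sizeof(set),
			.flags = IOMMU_HWPT_DIRTY_TRACKING_ENABLE,
			.hwpt_id = hwpt_id,
		};
		struct iommu_hwpt_get_dirty_bitmap get = {
			.size = sizeof(get),
			.hwpt_id = hwpt_id,
			.iova = iova,
			.length = length,
			.page_size = page_size,
			.data = (uintptr_t)bitmap_words,
		};

		if (ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set))
			return -1;

		/* ... guest/device DMA runs and dirties pages here ... */

		return ioctl(iommufd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get);
	}
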
index 3a598182b76191377ad92d903a720c3b799fe082..504ac1b01b2d2ab45fbc22fde2bdcf324ce2d973 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
+#include <uapi/linux/iommufd.h>
 
 #include "io_pagetable.h"
 #include "double_span.h"
@@ -221,6 +222,18 @@ static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
        return 0;
 }
 
+static struct iopt_area *iopt_area_alloc(void)
+{
+       struct iopt_area *area;
+
+       area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+       if (!area)
+               return NULL;
+       RB_CLEAR_NODE(&area->node.rb);
+       RB_CLEAR_NODE(&area->pages_node.rb);
+       return area;
+}
+
 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
                                 struct list_head *pages_list,
                                 unsigned long length, unsigned long *dst_iova,
@@ -231,7 +244,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
        int rc = 0;
 
        list_for_each_entry(elm, pages_list, next) {
-               elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT);
+               elm->area = iopt_area_alloc();
                if (!elm->area)
                        return -ENOMEM;
        }
@@ -412,6 +425,177 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
        return 0;
 }
 
+struct iova_bitmap_fn_arg {
+       unsigned long flags;
+       struct io_pagetable *iopt;
+       struct iommu_domain *domain;
+       struct iommu_dirty_bitmap *dirty;
+};
+
+static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
+                                       unsigned long iova, size_t length,
+                                       void *opaque)
+{
+       struct iopt_area *area;
+       struct iopt_area_contig_iter iter;
+       struct iova_bitmap_fn_arg *arg = opaque;
+       struct iommu_domain *domain = arg->domain;
+       struct iommu_dirty_bitmap *dirty = arg->dirty;
+       const struct iommu_dirty_ops *ops = domain->dirty_ops;
+       unsigned long last_iova = iova + length - 1;
+       unsigned long flags = arg->flags;
+       int ret;
+
+       iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
+               unsigned long last = min(last_iova, iopt_area_last_iova(area));
+
+               ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
+                                               last - iter.cur_iova + 1, flags,
+                                               dirty);
+               if (ret)
+                       return ret;
+       }
+
+       if (!iopt_area_contig_done(&iter))
+               return -EINVAL;
+       return 0;
+}
+
+static int
+iommu_read_and_clear_dirty(struct iommu_domain *domain,
+                          struct io_pagetable *iopt, unsigned long flags,
+                          struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+       const struct iommu_dirty_ops *ops = domain->dirty_ops;
+       struct iommu_iotlb_gather gather;
+       struct iommu_dirty_bitmap dirty;
+       struct iova_bitmap_fn_arg arg;
+       struct iova_bitmap *iter;
+       int ret = 0;
+
+       if (!ops || !ops->read_and_clear_dirty)
+               return -EOPNOTSUPP;
+
+       iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
+                                bitmap->page_size,
+                                u64_to_user_ptr(bitmap->data));
+       if (IS_ERR(iter))
+               return -ENOMEM;
+
+       iommu_dirty_bitmap_init(&dirty, iter, &gather);
+
+       arg.flags = flags;
+       arg.iopt = iopt;
+       arg.domain = domain;
+       arg.dirty = &dirty;
+       iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
+
+       if (!(flags & IOMMU_DIRTY_NO_CLEAR))
+               iommu_iotlb_sync(domain, &gather);
+
+       iova_bitmap_free(iter);
+
+       return ret;
+}
+
+int iommufd_check_iova_range(struct io_pagetable *iopt,
+                            struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+       size_t iommu_pgsize = iopt->iova_alignment;
+       u64 last_iova;
+
+       if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
+               return -EOVERFLOW;
+
+       if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
+               return -EOVERFLOW;
+
+       if ((bitmap->iova & (iommu_pgsize - 1)) ||
+           ((last_iova + 1) & (iommu_pgsize - 1)))
+               return -EINVAL;
+
+       if (!bitmap->page_size)
+               return -EINVAL;
+
+       if ((bitmap->iova & (bitmap->page_size - 1)) ||
+           ((last_iova + 1) & (bitmap->page_size - 1)))
+               return -EINVAL;
+
+       return 0;
+}
+
+int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
+                                  struct iommu_domain *domain,
+                                  unsigned long flags,
+                                  struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+       int ret;
+
+       ret = iommufd_check_iova_range(iopt, bitmap);
+       if (ret)
+               return ret;
+
+       down_read(&iopt->iova_rwsem);
+       ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
+       up_read(&iopt->iova_rwsem);
+
+       return ret;
+}
+
+static int iopt_clear_dirty_data(struct io_pagetable *iopt,
+                                struct iommu_domain *domain)
+{
+       const struct iommu_dirty_ops *ops = domain->dirty_ops;
+       struct iommu_iotlb_gather gather;
+       struct iommu_dirty_bitmap dirty;
+       struct iopt_area *area;
+       int ret = 0;
+
+       lockdep_assert_held_read(&iopt->iova_rwsem);
+
+       iommu_dirty_bitmap_init(&dirty, NULL, &gather);
+
+       for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+            area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+               if (!area->pages)
+                       continue;
+
+               ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
+                                               iopt_area_length(area), 0,
+                                               &dirty);
+               if (ret)
+                       break;
+       }
+
+       iommu_iotlb_sync(domain, &gather);
+       return ret;
+}
+
+int iopt_set_dirty_tracking(struct io_pagetable *iopt,
+                           struct iommu_domain *domain, bool enable)
+{
+       const struct iommu_dirty_ops *ops = domain->dirty_ops;
+       int ret = 0;
+
+       if (!ops)
+               return -EOPNOTSUPP;
+
+       down_read(&iopt->iova_rwsem);
+
+       /* Clear dirty bits from PTEs to ensure a clean snapshot */
+       if (enable) {
+               ret = iopt_clear_dirty_data(iopt, domain);
+               if (ret)
+                       goto out_unlock;
+       }
+
+       ret = ops->set_dirty_tracking(domain, enable);
+
+out_unlock:
+       up_read(&iopt->iova_rwsem);
+       return ret;
+}
+
 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
                   unsigned long length, struct list_head *pages_list)
 {
@@ -1005,11 +1189,11 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
            iopt_area_start_byte(area, new_start) & (alignment - 1))
                return -EINVAL;
 
-       lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+       lhs = iopt_area_alloc();
        if (!lhs)
                return -ENOMEM;
 
-       rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+       rhs = iopt_area_alloc();
        if (!rhs) {
                rc = -ENOMEM;
                goto err_free_lhs;
@@ -1048,6 +1232,16 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
        if (WARN_ON(rc))
                goto err_remove_lhs;
 
+       /*
+        * If the original area has filled a domain, domains_itree has to be
+        * updated.
+        */
+       if (area->storage_domain) {
+               interval_tree_remove(&area->pages_node, &pages->domains_itree);
+               interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
+               interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
+       }
+
        lhs->storage_domain = area->storage_domain;
        lhs->pages = area->pages;
        rhs->storage_domain = area->storage_domain;
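
iopt_set_dirty_tracking() and iopt_read_and_clear_dirty_data() only orchestrate; the per-IOMMU work lives behind the domain's iommu_dirty_ops. Here is a hedged sketch of the contract a driver callback is expected to honor, modeled on the mock implementation later in this merge: report dirty pages through iommu_dirty_bitmap_record() and clear the hardware dirty bits only when IOMMU_DIRTY_NO_CLEAR is not set. The example_iopte_*() helpers are hypothetical stand-ins for a driver's real IOPTE accessors:

	/* Sketch of a driver read_and_clear_dirty callback (not a real driver) */
	static int example_read_and_clear_dirty(struct iommu_domain *domain,
						unsigned long iova, size_t size,
						unsigned long flags,
						struct iommu_dirty_bitmap *dirty)
	{
		unsigned long end = iova + size;
		unsigned long cur;

		for (cur = iova; cur < end; cur += PAGE_SIZE) {
			/* hypothetical helpers standing in for real IOPTE accessors */
			if (!example_iopte_test_dirty(domain, cur))
				continue;
			if (!(flags & IOMMU_DIRTY_NO_CLEAR))
				example_iopte_clear_dirty(domain, cur);
			iommu_dirty_bitmap_record(dirty, cur, PAGE_SIZE);
		}
		return 0;
	}
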
index 2c58670011fe979b6da6687a9904408bba5f8a9a..a74cfefffbc6c5045c7b22063f66978f1f275e59 100644 (file)
@@ -8,6 +8,9 @@
 #include <linux/xarray.h>
 #include <linux/refcount.h>
 #include <linux/uaccess.h>
+#include <linux/iommu.h>
+#include <linux/iova_bitmap.h>
+#include <uapi/linux/iommufd.h>
 
 struct iommu_domain;
 struct iommu_group;
@@ -70,6 +73,13 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
                    unsigned long length, unsigned long *unmapped);
 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped);
 
+int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
+                                  struct iommu_domain *domain,
+                                  unsigned long flags,
+                                  struct iommu_hwpt_get_dirty_bitmap *bitmap);
+int iopt_set_dirty_tracking(struct io_pagetable *iopt,
+                           struct iommu_domain *domain, bool enable);
+
 void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
                                 unsigned long length);
 int iopt_table_add_domain(struct io_pagetable *iopt,
@@ -113,7 +123,8 @@ enum iommufd_object_type {
        IOMMUFD_OBJ_NONE,
        IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
        IOMMUFD_OBJ_DEVICE,
-       IOMMUFD_OBJ_HW_PAGETABLE,
+       IOMMUFD_OBJ_HWPT_PAGING,
+       IOMMUFD_OBJ_HWPT_NESTED,
        IOMMUFD_OBJ_IOAS,
        IOMMUFD_OBJ_ACCESS,
 #ifdef CONFIG_IOMMUFD_TEST
@@ -171,7 +182,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
                                             size_t size,
                                             enum iommufd_object_type type);
 
-#define iommufd_object_alloc(ictx, ptr, type)                                  \
+#define __iommufd_object_alloc(ictx, ptr, type, obj)                           \
        container_of(_iommufd_object_alloc(                                    \
                             ictx,                                             \
                             sizeof(*(ptr)) + BUILD_BUG_ON_ZERO(               \
@@ -180,6 +191,9 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
                             type),                                            \
                     typeof(*(ptr)), obj)
 
+#define iommufd_object_alloc(ictx, ptr, type) \
+       __iommufd_object_alloc(ictx, ptr, type, obj)
+
 /*
  * The IO Address Space (IOAS) pagetable is a virtual page table backed by the
  * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
@@ -222,6 +236,8 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
                               struct iommufd_ctx *ictx);
 
 int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
+int iommufd_check_iova_range(struct io_pagetable *iopt,
+                            struct iommu_hwpt_get_dirty_bitmap *bitmap);
 
 /*
  * A HW pagetable is called an iommu_domain inside the kernel. This user object
@@ -231,35 +247,75 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
  */
 struct iommufd_hw_pagetable {
        struct iommufd_object obj;
-       struct iommufd_ioas *ioas;
        struct iommu_domain *domain;
+};
+
+struct iommufd_hwpt_paging {
+       struct iommufd_hw_pagetable common;
+       struct iommufd_ioas *ioas;
        bool auto_domain : 1;
        bool enforce_cache_coherency : 1;
        bool msi_cookie : 1;
+       bool nest_parent : 1;
        /* Head at iommufd_ioas::hwpt_list */
        struct list_head hwpt_item;
 };
 
-struct iommufd_hw_pagetable *
-iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
-                          struct iommufd_device *idev, bool immediate_attach);
-int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt);
+struct iommufd_hwpt_nested {
+       struct iommufd_hw_pagetable common;
+       struct iommufd_hwpt_paging *parent;
+};
+
+static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt)
+{
+       return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING;
+}
+
+static inline struct iommufd_hwpt_paging *
+to_hwpt_paging(struct iommufd_hw_pagetable *hwpt)
+{
+       return container_of(hwpt, struct iommufd_hwpt_paging, common);
+}
+
+static inline struct iommufd_hwpt_paging *
+iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id)
+{
+       return container_of(iommufd_get_object(ucmd->ictx, id,
+                                              IOMMUFD_OBJ_HWPT_PAGING),
+                           struct iommufd_hwpt_paging, common.obj);
+}
+int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd);
+int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd);
+
+struct iommufd_hwpt_paging *
+iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+                         struct iommufd_device *idev, u32 flags,
+                         bool immediate_attach,
+                         const struct iommu_user_data *user_data);
 int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
                                struct iommufd_device *idev);
 struct iommufd_hw_pagetable *
 iommufd_hw_pagetable_detach(struct iommufd_device *idev);
-void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
-void iommufd_hw_pagetable_abort(struct iommufd_object *obj);
+void iommufd_hwpt_paging_destroy(struct iommufd_object *obj);
+void iommufd_hwpt_paging_abort(struct iommufd_object *obj);
+void iommufd_hwpt_nested_destroy(struct iommufd_object *obj);
+void iommufd_hwpt_nested_abort(struct iommufd_object *obj);
 int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd);
 
 static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
                                            struct iommufd_hw_pagetable *hwpt)
 {
-       lockdep_assert_not_held(&hwpt->ioas->mutex);
-       if (hwpt->auto_domain)
-               iommufd_object_deref_user(ictx, &hwpt->obj);
-       else
-               refcount_dec(&hwpt->obj.users);
+       if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) {
+               struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt);
+
+               lockdep_assert_not_held(&hwpt_paging->ioas->mutex);
+
+               if (hwpt_paging->auto_domain) {
+                       iommufd_object_deref_user(ictx, &hwpt->obj);
+                       return;
+               }
+       }
+       refcount_dec(&hwpt->obj.users);
 }
 
 struct iommufd_group {
index 3f3644375bf13c8fa78600f1f9e15d893195af65..7910fbe1962d78b9c8b65726fad12e75c0fd4a22 100644 (file)
@@ -19,6 +19,8 @@ enum {
        IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT,
        IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE,
        IOMMU_TEST_OP_ACCESS_REPLACE_IOAS,
+       IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS,
+       IOMMU_TEST_OP_DIRTY,
 };
 
 enum {
@@ -40,6 +42,15 @@ enum {
        MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0,
 };
 
+enum {
+       MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0,
+};
+
+enum {
+       MOCK_NESTED_DOMAIN_IOTLB_ID_MAX = 3,
+       MOCK_NESTED_DOMAIN_IOTLB_NUM = 4,
+};
+
 struct iommu_test_cmd {
        __u32 size;
        __u32 op;
@@ -56,6 +67,13 @@ struct iommu_test_cmd {
                        /* out_idev_id is the standard iommufd_bind object */
                        __u32 out_idev_id;
                } mock_domain;
+               struct {
+                       __u32 out_stdev_id;
+                       __u32 out_hwpt_id;
+                       __u32 out_idev_id;
+                       /* Expand mock_domain to set mock device flags */
+                       __u32 dev_flags;
+               } mock_domain_flags;
                struct {
                        __u32 pt_id;
                } mock_domain_replace;
@@ -95,6 +113,14 @@ struct iommu_test_cmd {
                struct {
                        __u32 ioas_id;
                } access_replace_ioas;
+               struct {
+                       __u32 flags;
+                       __aligned_u64 iova;
+                       __aligned_u64 length;
+                       __aligned_u64 page_size;
+                       __aligned_u64 uptr;
+                       __aligned_u64 out_nr_dirty;
+               } dirty;
        };
        __u32 last;
 };
@@ -109,4 +135,17 @@ struct iommu_test_hw_info {
        __u32 test_reg;
 };
 
+/* Should not be equal to any defined value in enum iommu_hwpt_data_type */
+#define IOMMU_HWPT_DATA_SELFTEST 0xdead
+#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef
+
+/**
+ * struct iommu_hwpt_selftest
+ *
+ * @iotlb: default mock iotlb value, IOMMU_TEST_IOTLB_DEFAULT
+ */
+struct iommu_hwpt_selftest {
+       __u32 iotlb;
+};
+
 #endif
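
struct iommu_hwpt_selftest is the driver-specific blob the tests feed through IOMMU_HWPT_ALLOC to exercise the nested path against the mock driver. A hedged sketch of that allocation as seen from userspace, assuming the iommu_hwpt_alloc layout from this series; the parent must itself have been allocated as a PAGING hwpt with IOMMU_HWPT_ALLOC_NEST_PARENT, and the selftest-only definitions come from this test header rather than <linux/iommufd.h>:

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/iommufd.h>
	#include "iommufd_test.h"	/* IOMMU_HWPT_DATA_SELFTEST et al. */

	/* Sketch: allocate a NESTED hwpt on top of an existing nest-parent hwpt */
	static int alloc_selftest_nested(int iommufd, uint32_t dev_id,
					 uint32_t parent_hwpt_id, uint32_t *out_id)
	{
		struct iommu_hwpt_selftest data = {
			.iotlb = IOMMU_TEST_IOTLB_DEFAULT,
		};
		struct iommu_hwpt_alloc cmd = {
			.size = sizeof(cmd),
			.dev_id = dev_id,
			.pt_id = parent_hwpt_id,
			.data_type = IOMMU_HWPT_DATA_SELFTEST,
			.data_len = sizeof(data),
			.data_uptr = (uintptr_t)&data,
		};
		int rc = ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd);

		if (!rc)
			*out_id = cmd.out_hwpt_id;
		return rc;
	}
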
similarity index 98%
rename from drivers/vfio/iova_bitmap.c
rename to drivers/iommu/iommufd/iova_bitmap.c
index 0848f920efb7c1c13521cc9d59ae96231cbf4481..0a92c9eeaf7f50a6fe05c266b9ec39d1021844a9 100644 (file)
@@ -268,6 +268,7 @@ err:
        iova_bitmap_free(bitmap);
        return ERR_PTR(rc);
 }
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, IOMMUFD);
 
 /**
  * iova_bitmap_free() - Frees an IOVA bitmap object
@@ -289,6 +290,7 @@ void iova_bitmap_free(struct iova_bitmap *bitmap)
 
        kfree(bitmap);
 }
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, IOMMUFD);
 
 /*
  * Returns the remaining bitmap indexes from mapped_total_index to process for
@@ -387,6 +389,7 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
 
        return ret;
 }
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD);
 
 /**
  * iova_bitmap_set() - Records an IOVA range in bitmap
@@ -420,4 +423,4 @@ void iova_bitmap_set(struct iova_bitmap *bitmap,
                cur_bit += nbits;
        } while (cur_bit <= last_bit);
 }
-EXPORT_SYMBOL_GPL(iova_bitmap_set);
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD);
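
With the file move and the EXPORT_SYMBOL_NS_GPL changes, the iova_bitmap API (allocate over a user buffer, iterate in chunks, record ranges, free) becomes usable by any consumer importing the IOMMUFD namespace. A condensed kernel-side sketch of such a consumer, with the call signatures taken from the iommufd dirty-harvesting code earlier in this merge:

	#include <linux/err.h>
	#include <linux/iova_bitmap.h>

	/* Sketch of an iova_bitmap consumer, mirroring iommufd's dirty harvesting */
	static int record_whole_chunk(struct iova_bitmap *bitmap, unsigned long iova,
				      size_t length, void *opaque)
	{
		/*
		 * A real consumer would ask its HW which pages inside
		 * [iova, iova + length) are dirty and record only those;
		 * marking the whole chunk keeps the sketch short.
		 */
		iova_bitmap_set(bitmap, iova, length);
		return 0;
	}

	static int harvest_to_user(unsigned long iova, size_t length,
				   unsigned long page_size, u64 __user *data)
	{
		struct iova_bitmap *bitmap;
		int rc;

		bitmap = iova_bitmap_alloc(iova, length, page_size, data);
		if (IS_ERR(bitmap))
			return PTR_ERR(bitmap);

		rc = iova_bitmap_for_each(bitmap, NULL, record_whole_chunk);
		iova_bitmap_free(bitmap);
		return rc;
	}
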
index e71523cbd0de4352479aadeb8f33dd6d2ba87df8..45b9d40773b13a4255c3d6114240fd7e2a469eef 100644 (file)
@@ -307,6 +307,8 @@ union ucmd_buffer {
        struct iommu_destroy destroy;
        struct iommu_hw_info info;
        struct iommu_hwpt_alloc hwpt;
+       struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
+       struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
        struct iommu_ioas_alloc alloc;
        struct iommu_ioas_allow_iovas allow_iovas;
        struct iommu_ioas_copy ioas_copy;
@@ -342,6 +344,10 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
                 __reserved),
        IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
                 __reserved),
+       IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
+                struct iommu_hwpt_get_dirty_bitmap, data),
+       IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking,
+                struct iommu_hwpt_set_dirty_tracking, __reserved),
        IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
                 struct iommu_ioas_alloc, out_ioas_id),
        IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
@@ -482,9 +488,13 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
        [IOMMUFD_OBJ_IOAS] = {
                .destroy = iommufd_ioas_destroy,
        },
-       [IOMMUFD_OBJ_HW_PAGETABLE] = {
-               .destroy = iommufd_hw_pagetable_destroy,
-               .abort = iommufd_hw_pagetable_abort,
+       [IOMMUFD_OBJ_HWPT_PAGING] = {
+               .destroy = iommufd_hwpt_paging_destroy,
+               .abort = iommufd_hwpt_paging_abort,
+       },
+       [IOMMUFD_OBJ_HWPT_NESTED] = {
+               .destroy = iommufd_hwpt_nested_destroy,
+               .abort = iommufd_hwpt_nested_abort,
        },
 #ifdef CONFIG_IOMMUFD_TEST
        [IOMMUFD_OBJ_SELFTEST] = {
@@ -552,5 +562,6 @@ MODULE_ALIAS_MISCDEV(VFIO_MINOR);
 MODULE_ALIAS("devname:vfio/vfio");
 #endif
 MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
+MODULE_IMPORT_NS(IOMMUFD);
 MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
 MODULE_LICENSE("GPL");
index 8d9aa297c117e4cfa6e238a68d157757df144b0c..528f356238b343a72fefa609334062c9c91219f1 100644 (file)
@@ -1507,6 +1507,8 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
                                area, domain, iopt_area_index(area),
                                iopt_area_last_index(area));
 
+       if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+               WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb));
        interval_tree_remove(&area->pages_node, &pages->domains_itree);
        iopt_area_unfill_domain(area, pages, area->storage_domain);
        area->storage_domain = NULL;
index 56506d5753f15c9f7079a661773404636d975c9e..d43a87737c1e88bd8b2eff67822f5cc8b2b6be6e 100644 (file)
 static DECLARE_FAULT_ATTR(fail_iommufd);
 static struct dentry *dbgfs_root;
 static struct platform_device *selftest_iommu_dev;
+static const struct iommu_ops mock_ops;
+static struct iommu_domain_ops domain_nested_ops;
 
 size_t iommufd_test_memory_limit = 65536;
 
 enum {
+       MOCK_DIRTY_TRACK = 1,
        MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2,
 
        /*
@@ -36,6 +39,7 @@ enum {
        _MOCK_PFN_START = MOCK_PFN_MASK + 1,
        MOCK_PFN_START_IOVA = _MOCK_PFN_START,
        MOCK_PFN_LAST_IOVA = _MOCK_PFN_START,
+       MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1,
 };
 
 /*
@@ -86,16 +90,24 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
 }
 
 struct mock_iommu_domain {
+       unsigned long flags;
        struct iommu_domain domain;
        struct xarray pfns;
 };
 
+struct mock_iommu_domain_nested {
+       struct iommu_domain domain;
+       struct mock_iommu_domain *parent;
+       u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM];
+};
+
 enum selftest_obj_type {
        TYPE_IDEV,
 };
 
 struct mock_dev {
        struct device dev;
+       unsigned long flags;
 };
 
 struct selftest_obj {
@@ -118,6 +130,11 @@ static void mock_domain_blocking_free(struct iommu_domain *domain)
 static int mock_domain_nop_attach(struct iommu_domain *domain,
                                  struct device *dev)
 {
+       struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+
+       if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY))
+               return -EINVAL;
+
        return 0;
 }
 
@@ -146,15 +163,70 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type)
        return info;
 }
 
-static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
+static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
+                                         bool enable)
 {
-       struct mock_iommu_domain *mock;
+       struct mock_iommu_domain *mock =
+               container_of(domain, struct mock_iommu_domain, domain);
+       unsigned long flags = mock->flags;
 
-       if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED)
-               return &mock_blocking_domain;
+       if (enable && !domain->dirty_ops)
+               return -EINVAL;
 
-       if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED)
-               return NULL;
+       /* No change? */
+       if (!(enable ^ !!(flags & MOCK_DIRTY_TRACK)))
+               return 0;
+
+       flags = (enable ? flags | MOCK_DIRTY_TRACK : flags & ~MOCK_DIRTY_TRACK);
+
+       mock->flags = flags;
+       return 0;
+}
+
+static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
+                                           unsigned long iova, size_t size,
+                                           unsigned long flags,
+                                           struct iommu_dirty_bitmap *dirty)
+{
+       struct mock_iommu_domain *mock =
+               container_of(domain, struct mock_iommu_domain, domain);
+       unsigned long i, max = size / MOCK_IO_PAGE_SIZE;
+       void *ent, *old;
+
+       if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap)
+               return -EINVAL;
+
+       for (i = 0; i < max; i++) {
+               unsigned long cur = iova + i * MOCK_IO_PAGE_SIZE;
+
+               ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE);
+               if (ent && (xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) {
+                       /* Clear dirty */
+                       if (!(flags & IOMMU_DIRTY_NO_CLEAR)) {
+                               unsigned long val;
+
+                               val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA;
+                               old = xa_store(&mock->pfns,
+                                              cur / MOCK_IO_PAGE_SIZE,
+                                              xa_mk_value(val), GFP_KERNEL);
+                               WARN_ON_ONCE(ent != old);
+                       }
+                       iommu_dirty_bitmap_record(dirty, cur,
+                                                 MOCK_IO_PAGE_SIZE);
+               }
+       }
+
+       return 0;
+}
+
+const struct iommu_dirty_ops dirty_ops = {
+       .set_dirty_tracking = mock_domain_set_dirty_tracking,
+       .read_and_clear_dirty = mock_domain_read_and_clear_dirty,
+};
+
+static struct iommu_domain *mock_domain_alloc_paging(struct device *dev)
+{
+       struct mock_iommu_domain *mock;
 
        mock = kzalloc(sizeof(*mock), GFP_KERNEL);
        if (!mock)
@@ -162,10 +234,87 @@ static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
        mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
        mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
        mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
+       mock->domain.ops = mock_ops.default_domain_ops;
+       mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
        xa_init(&mock->pfns);
        return &mock->domain;
 }
 
+static struct iommu_domain *
+__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent,
+                          const struct iommu_hwpt_selftest *user_cfg)
+{
+       struct mock_iommu_domain_nested *mock_nested;
+       int i;
+
+       mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL);
+       if (!mock_nested)
+               return ERR_PTR(-ENOMEM);
+       mock_nested->parent = mock_parent;
+       mock_nested->domain.ops = &domain_nested_ops;
+       mock_nested->domain.type = IOMMU_DOMAIN_NESTED;
+       for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++)
+               mock_nested->iotlb[i] = user_cfg->iotlb;
+       return &mock_nested->domain;
+}
+
+static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
+{
+       if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED)
+               return &mock_blocking_domain;
+       if (iommu_domain_type == IOMMU_DOMAIN_UNMANAGED)
+               return mock_domain_alloc_paging(NULL);
+       return NULL;
+}
+
+static struct iommu_domain *
+mock_domain_alloc_user(struct device *dev, u32 flags,
+                      struct iommu_domain *parent,
+                      const struct iommu_user_data *user_data)
+{
+       struct mock_iommu_domain *mock_parent;
+       struct iommu_hwpt_selftest user_cfg;
+       int rc;
+
+       /* must be mock_domain */
+       if (!parent) {
+               struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+               bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+               bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
+               struct iommu_domain *domain;
+
+               if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT |
+                              IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+                       return ERR_PTR(-EOPNOTSUPP);
+               if (user_data || (has_dirty_flag && no_dirty_ops))
+                       return ERR_PTR(-EOPNOTSUPP);
+               domain = mock_domain_alloc_paging(NULL);
+               if (!domain)
+                       return ERR_PTR(-ENOMEM);
+               if (has_dirty_flag)
+                       container_of(domain, struct mock_iommu_domain, domain)
+                               ->domain.dirty_ops = &dirty_ops;
+               return domain;
+       }
+
+       /* must be mock_domain_nested */
+       if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags)
+               return ERR_PTR(-EOPNOTSUPP);
+       if (!parent || parent->ops != mock_ops.default_domain_ops)
+               return ERR_PTR(-EINVAL);
+
+       mock_parent = container_of(parent, struct mock_iommu_domain, domain);
+       if (!mock_parent)
+               return ERR_PTR(-EINVAL);
+
+       rc = iommu_copy_struct_from_user(&user_cfg, user_data,
+                                        IOMMU_HWPT_DATA_SELFTEST, iotlb);
+       if (rc)
+               return ERR_PTR(rc);
+
+       return __mock_domain_alloc_nested(mock_parent, &user_cfg);
+}
+
 static void mock_domain_free(struct iommu_domain *domain)
 {
        struct mock_iommu_domain *mock =
@@ -243,7 +392,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
 
                for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
                        ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
-                       WARN_ON(!ent);
+
                        /*
                         * iommufd generates unmaps that must be a strict
                         * superset of the maps performed, so every starting
                         * IOVA should have been an iova passed to map, and the
                         * full range up to the last iova should have been
                         * passed to map_pages
                         */
                        if (first) {
-                               WARN_ON(!(xa_to_value(ent) &
-                                         MOCK_PFN_START_IOVA));
+                               WARN_ON(ent && !(xa_to_value(ent) &
+                                                MOCK_PFN_START_IOVA));
                                first = false;
                        }
                        if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
-                               WARN_ON(!(xa_to_value(ent) &
-                                         MOCK_PFN_LAST_IOVA));
+                               WARN_ON(ent && !(xa_to_value(ent) &
+                                                MOCK_PFN_LAST_IOVA));
 
                        iova += MOCK_IO_PAGE_SIZE;
                        ret += MOCK_IO_PAGE_SIZE;
@@ -283,7 +432,18 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
 
 static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
 {
-       return cap == IOMMU_CAP_CACHE_COHERENCY;
+       struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+
+       switch (cap) {
+       case IOMMU_CAP_CACHE_COHERENCY:
+               return true;
+       case IOMMU_CAP_DIRTY_TRACKING:
+               return !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY);
+       default:
+               break;
+       }
+
+       return false;
 }
 
 static void mock_domain_set_plaform_dma_ops(struct device *dev)
@@ -307,6 +467,7 @@ static const struct iommu_ops mock_ops = {
        .pgsize_bitmap = MOCK_IO_PAGE_SIZE,
        .hw_info = mock_domain_hw_info,
        .domain_alloc = mock_domain_alloc,
+       .domain_alloc_user = mock_domain_alloc_user,
        .capable = mock_domain_capable,
        .set_platform_dma_ops = mock_domain_set_plaform_dma_ops,
        .device_group = generic_device_group,
@@ -321,19 +482,41 @@ static const struct iommu_ops mock_ops = {
                },
 };
 
+static void mock_domain_free_nested(struct iommu_domain *domain)
+{
+       struct mock_iommu_domain_nested *mock_nested =
+               container_of(domain, struct mock_iommu_domain_nested, domain);
+
+       kfree(mock_nested);
+}
+
+static struct iommu_domain_ops domain_nested_ops = {
+       .free = mock_domain_free_nested,
+       .attach_dev = mock_domain_nop_attach,
+};
+
 static inline struct iommufd_hw_pagetable *
-get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
-                struct mock_iommu_domain **mock)
+__get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, u32 hwpt_type)
 {
-       struct iommufd_hw_pagetable *hwpt;
        struct iommufd_object *obj;
 
-       obj = iommufd_get_object(ucmd->ictx, mockpt_id,
-                                IOMMUFD_OBJ_HW_PAGETABLE);
+       obj = iommufd_get_object(ucmd->ictx, mockpt_id, hwpt_type);
        if (IS_ERR(obj))
                return ERR_CAST(obj);
-       hwpt = container_of(obj, struct iommufd_hw_pagetable, obj);
-       if (hwpt->domain->ops != mock_ops.default_domain_ops) {
+       return container_of(obj, struct iommufd_hw_pagetable, obj);
+}
+
+static inline struct iommufd_hw_pagetable *
+get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+                struct mock_iommu_domain **mock)
+{
+       struct iommufd_hw_pagetable *hwpt;
+
+       hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_PAGING);
+       if (IS_ERR(hwpt))
+               return hwpt;
+       if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED ||
+           hwpt->domain->ops != mock_ops.default_domain_ops) {
                iommufd_put_object(&hwpt->obj);
                return ERR_PTR(-EINVAL);
        }
@@ -341,6 +524,25 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
        return hwpt;
 }
 
+static inline struct iommufd_hw_pagetable *
+get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+                       struct mock_iommu_domain_nested **mock_nested)
+{
+       struct iommufd_hw_pagetable *hwpt;
+
+       hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_NESTED);
+       if (IS_ERR(hwpt))
+               return hwpt;
+       if (hwpt->domain->type != IOMMU_DOMAIN_NESTED ||
+           hwpt->domain->ops != &domain_nested_ops) {
+               iommufd_put_object(&hwpt->obj);
+               return ERR_PTR(-EINVAL);
+       }
+       *mock_nested = container_of(hwpt->domain,
+                                   struct mock_iommu_domain_nested, domain);
+       return hwpt;
+}
+
 struct mock_bus_type {
        struct bus_type bus;
        struct notifier_block nb;
@@ -362,16 +564,20 @@ static void mock_dev_release(struct device *dev)
        kfree(mdev);
 }
 
-static struct mock_dev *mock_dev_create(void)
+static struct mock_dev *mock_dev_create(unsigned long dev_flags)
 {
        struct mock_dev *mdev;
        int rc;
 
+       if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY))
+               return ERR_PTR(-EINVAL);
+
        mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
        if (!mdev)
                return ERR_PTR(-ENOMEM);
 
        device_initialize(&mdev->dev);
+       mdev->flags = dev_flags;
        mdev->dev.release = mock_dev_release;
        mdev->dev.bus = &iommufd_mock_bus_type.bus;
 
@@ -407,6 +613,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
        struct iommufd_device *idev;
        struct selftest_obj *sobj;
        u32 pt_id = cmd->id;
+       u32 dev_flags = 0;
        u32 idev_id;
        int rc;
 
@@ -417,7 +624,10 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
        sobj->idev.ictx = ucmd->ictx;
        sobj->type = TYPE_IDEV;
 
-       sobj->idev.mock_dev = mock_dev_create();
+       if (cmd->op == IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS)
+               dev_flags = cmd->mock_domain_flags.dev_flags;
+
+       sobj->idev.mock_dev = mock_dev_create(dev_flags);
        if (IS_ERR(sobj->idev.mock_dev)) {
                rc = PTR_ERR(sobj->idev.mock_dev);
                goto out_sobj;
@@ -977,6 +1187,73 @@ static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE);
 static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH ==
              __IOMMUFD_ACCESS_RW_SLOW_PATH);
 
+static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
+                             unsigned long iova, size_t length,
+                             unsigned long page_size, void __user *uptr,
+                             u32 flags)
+{
+       unsigned long bitmap_size, i, max;
+       struct iommu_test_cmd *cmd = ucmd->cmd;
+       struct iommufd_hw_pagetable *hwpt;
+       struct mock_iommu_domain *mock;
+       int rc, count = 0;
+       void *tmp;
+
+       if (!page_size || !length || iova % page_size || length % page_size ||
+           !uptr)
+               return -EINVAL;
+
+       hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
+       if (IS_ERR(hwpt))
+               return PTR_ERR(hwpt);
+
+       if (!(mock->flags & MOCK_DIRTY_TRACK)) {
+               rc = -EINVAL;
+               goto out_put;
+       }
+
+       max = length / page_size;
+       bitmap_size = max / BITS_PER_BYTE;
+
+       tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT);
+       if (!tmp) {
+               rc = -ENOMEM;
+               goto out_put;
+       }
+
+       if (copy_from_user(tmp, uptr, bitmap_size)) {
+               rc = -EFAULT;
+               goto out_free;
+       }
+
+       for (i = 0; i < max; i++) {
+               unsigned long cur = iova + i * page_size;
+               void *ent, *old;
+
+               if (!test_bit(i, (unsigned long *)tmp))
+                       continue;
+
+               ent = xa_load(&mock->pfns, cur / page_size);
+               if (ent) {
+                       unsigned long val;
+
+                       val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA;
+                       old = xa_store(&mock->pfns, cur / page_size,
+                                      xa_mk_value(val), GFP_KERNEL);
+                       WARN_ON_ONCE(ent != old);
+                       count++;
+               }
+       }
+
+       cmd->dirty.out_nr_dirty = count;
+       rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_free:
+       kvfree(tmp);
+out_put:
+       iommufd_put_object(&hwpt->obj);
+       return rc;
+}
+
 void iommufd_selftest_destroy(struct iommufd_object *obj)
 {
        struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj);
@@ -1000,6 +1277,7 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
                                                 cmd->add_reserved.start,
                                                 cmd->add_reserved.length);
        case IOMMU_TEST_OP_MOCK_DOMAIN:
+       case IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS:
                return iommufd_test_mock_domain(ucmd, cmd);
        case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE:
                return iommufd_test_mock_domain_replace(
@@ -1041,6 +1319,12 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
                        return -EINVAL;
                iommufd_test_memory_limit = cmd->memory_limit.limit;
                return 0;
+       case IOMMU_TEST_OP_DIRTY:
+               return iommufd_test_dirty(ucmd, cmd->id, cmd->dirty.iova,
+                                         cmd->dirty.length,
+                                         cmd->dirty.page_size,
+                                         u64_to_user_ptr(cmd->dirty.uptr),
+                                         cmd->dirty.flags);
        default:
                return -EOPNOTSUPP;
        }
index 6c810bf80f99a71305f9c371441e1a33c8f25925..538fbf76354d13d5b7f6478a82dd40e2daf67add 100644 (file)
@@ -255,7 +255,7 @@ err_put:
 
 static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
 {
-       struct iommufd_hw_pagetable *hwpt;
+       struct iommufd_hwpt_paging *hwpt_paging;
        struct iommufd_ioas *ioas;
        int rc = 1;
 
@@ -264,8 +264,8 @@ static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
                return PTR_ERR(ioas);
 
        mutex_lock(&ioas->mutex);
-       list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
-               if (!hwpt->enforce_cache_coherency) {
+       list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
+               if (!hwpt_paging->enforce_cache_coherency) {
                        rc = 0;
                        break;
                }
index c82ea032d3521268138811a1cc1b718755c90c26..68c05705200fce8fc9824a8521bbe554e5c130f7 100644 (file)
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_VFIO) += vfio.o
 
-vfio-y += vfio_main.o \
-         iova_bitmap.o
+vfio-y += vfio_main.o
 vfio-$(CONFIG_VFIO_DEVICE_CDEV) += device_cdev.o
 vfio-$(CONFIG_VFIO_GROUP) += group.o
 vfio-$(CONFIG_IOMMUFD) += iommufd.o
index 7088edc4fb28d88f5603e8f68462993123eece46..c3ced56b7787650ce8b82039b419413e81deedfa 100644 (file)
@@ -3,6 +3,7 @@ config MLX5_VFIO_PCI
        tristate "VFIO support for MLX5 PCI devices"
        depends on MLX5_CORE
        select VFIO_PCI_CORE
+       select IOMMUFD_DRIVER
        help
          This provides migration support for MLX5 devices using the VFIO
          framework.
index b6ac66c5008d970a664389eb9cf9eb9101cadd76..fe09a8c8af95e8dedac6e08a4fba74379d1c4b5d 100644 (file)
@@ -1517,6 +1517,7 @@ static struct pci_driver mlx5vf_pci_driver = {
 
 module_pci_driver(mlx5vf_pci_driver);
 
+MODULE_IMPORT_NS(IOMMUFD);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
index 6eceef7b028aae9b8b7a8cb49614e88525f4bade..fec9b167c7b9ac98ae24dddd9265e30d95942e7d 100644 (file)
@@ -5,6 +5,7 @@ config PDS_VFIO_PCI
        tristate "VFIO support for PDS PCI devices"
        depends on PDS_CORE && PCI_IOV
        select VFIO_PCI_CORE
+       select IOMMUFD_DRIVER
        help
          This provides generic PCI support for PDS devices using the VFIO
          framework.
index ab4b5958e4131c08eaba5f013983534a55203373..dd8c00c895a2eed661d5e5264fc50f96ae9307ec 100644 (file)
@@ -204,6 +204,7 @@ static struct pci_driver pds_vfio_pci_driver = {
 
 module_pci_driver(pds_vfio_pci_driver);
 
+MODULE_IMPORT_NS(IOMMUFD);
 MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION);
 MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>");
 MODULE_LICENSE("GPL");
index e31e1952d7b8f1a49a27f005e05653e3b6f23976..8d4995ada74a01848ce8e7becf61120cc10ec33a 100644 (file)
@@ -1703,6 +1703,7 @@ static void __exit vfio_cleanup(void)
 module_init(vfio_init);
 module_exit(vfio_cleanup);
 
+MODULE_IMPORT_NS(IOMMUFD);
 MODULE_VERSION(DRIVER_VERSION);
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR(DRIVER_AUTHOR);
index 1b7a44b35616c7d00cb383425c72fe10ee079ff1..25142a0e2fc2c51d4c7807a1fb87cc21b16a163b 100644 (file)
@@ -166,6 +166,10 @@ struct io_pgtable_ops {
                              struct iommu_iotlb_gather *gather);
        phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
                                    unsigned long iova);
+       int (*read_and_clear_dirty)(struct io_pgtable_ops *ops,
+                                   unsigned long iova, size_t size,
+                                   unsigned long flags,
+                                   struct iommu_dirty_bitmap *dirty);
 };
 
 /**
index c50a769d569a60f2922a40ad7b813de4b29513d0..8fb1b41b4d1580a4c5c3f45c4fb7fb557dca464d 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/of.h>
+#include <linux/iova_bitmap.h>
 #include <uapi/linux/iommu.h>
 
 #define IOMMU_READ     (1 << 0)
@@ -37,6 +38,7 @@ struct bus_type;
 struct device;
 struct iommu_domain;
 struct iommu_domain_ops;
+struct iommu_dirty_ops;
 struct notifier_block;
 struct iommu_sva;
 struct iommu_fault_event;
@@ -65,6 +67,9 @@ struct iommu_domain_geometry {
 
 #define __IOMMU_DOMAIN_SVA     (1U << 4)  /* Shared process address space */
 
+#define __IOMMU_DOMAIN_NESTED  (1U << 6)  /* User-managed address space nested
+                                             on a stage-2 translation        */
+
 #define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ
 /*
  * These are the possible domain-types
@@ -91,10 +96,13 @@ struct iommu_domain_geometry {
                                 __IOMMU_DOMAIN_DMA_API |       \
                                 __IOMMU_DOMAIN_DMA_FQ)
 #define IOMMU_DOMAIN_SVA       (__IOMMU_DOMAIN_SVA)
+#define IOMMU_DOMAIN_NESTED    (__IOMMU_DOMAIN_NESTED)
 
 struct iommu_domain {
        unsigned type;
        const struct iommu_domain_ops *ops;
+       const struct iommu_dirty_ops *dirty_ops;
+
        unsigned long pgsize_bitmap;    /* Bitmap of page sizes in use */
        struct iommu_domain_geometry geometry;
        struct iommu_dma_cookie *iova_cookie;
@@ -133,6 +141,7 @@ enum iommu_cap {
         * usefully support the non-strict DMA flush queue.
         */
        IOMMU_CAP_DEFERRED_FLUSH,
+       IOMMU_CAP_DIRTY_TRACKING,       /* IOMMU supports dirty tracking */
 };
 
 /* These are the possible reserved region types */
@@ -227,6 +236,90 @@ struct iommu_iotlb_gather {
        bool                    queued;
 };
 
+/**
+ * struct iommu_dirty_bitmap - Dirty IOVA bitmap state
+ * @bitmap: IOVA bitmap
+ * @gather: Range information for a pending IOTLB flush
+ */
+struct iommu_dirty_bitmap {
+       struct iova_bitmap *bitmap;
+       struct iommu_iotlb_gather *gather;
+};
+
+/* Read but do not clear any dirty bits */
+#define IOMMU_DIRTY_NO_CLEAR (1 << 0)
+
+/**
+ * struct iommu_dirty_ops - domain specific dirty tracking operations
+ * @set_dirty_tracking: Enable or disable dirty tracking on the iommu domain
+ * @read_and_clear_dirty: Walk IOMMU page tables for dirtied PTEs and marshal
+ *                        them into a bitmap, with each bit representing a
+ *                        page. Reads the dirty PTE bits and clears them from
+ *                        the IO page tables.
+ */
+struct iommu_dirty_ops {
+       int (*set_dirty_tracking)(struct iommu_domain *domain, bool enabled);
+       int (*read_and_clear_dirty)(struct iommu_domain *domain,
+                                   unsigned long iova, size_t size,
+                                   unsigned long flags,
+                                   struct iommu_dirty_bitmap *dirty);
+};
+
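As a rough sketch of how an IOMMU driver might wire up these ops (not taken from any in-tree driver; the my_* names are invented and a fixed 4K page walk is assumed):

/* Hypothetical driver sketch; all my_* names are assumptions. */
static int my_set_dirty_tracking(struct iommu_domain *domain, bool enabled)
{
	struct my_domain *md = container_of(domain, struct my_domain, domain);

	return my_hw_set_dirty(md, enabled);
}

static int my_read_and_clear_dirty(struct iommu_domain *domain,
				   unsigned long iova, size_t size,
				   unsigned long flags,
				   struct iommu_dirty_bitmap *dirty)
{
	bool clear = !(flags & IOMMU_DIRTY_NO_CLEAR);
	unsigned long cur;

	for (cur = iova; cur < iova + size; cur += SZ_4K) {
		if (!my_iopte_test_and_clear_dirty(domain, cur, clear))
			continue;
		/* Record the dirty page and gather its IOVA for a flush */
		iommu_dirty_bitmap_record(dirty, cur, SZ_4K);
	}
	return 0;
}

static const struct iommu_dirty_ops my_dirty_ops = {
	.set_dirty_tracking = my_set_dirty_tracking,
	.read_and_clear_dirty = my_read_and_clear_dirty,
};

iommu_dirty_bitmap_record() is the helper added further down in this header; the intent is that a driver points domain->dirty_ops at such a table when dirty tracking is requested at domain allocation time.
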
+/**
+ * struct iommu_user_data - iommu driver specific user space data info
+ * @type: The data type of the user buffer
+ * @uptr: Pointer to the user buffer for copy_from_user()
+ * @len: The length of the user buffer in bytes
+ *
+ * The user data is a uAPI structure defined in include/uapi/linux/iommufd.h;
+ * @type, @uptr and @len should be copied directly from an iommufd core uAPI
+ * struct.
+ */
+struct iommu_user_data {
+       unsigned int type;
+       void __user *uptr;
+       size_t len;
+};
+
+/**
+ * __iommu_copy_struct_from_user - Copy iommu driver specific user space data
+ * @dst_data: Pointer to an iommu driver specific user data that is defined in
+ *            include/uapi/linux/iommufd.h
+ * @src_data: Pointer to a struct iommu_user_data for user space data info
+ * @data_type: The data type of the @dst_data. Must match with @src_data.type
+ * @data_len: Length of current user data structure, i.e. sizeof(struct _dst)
+ * @min_len: Initial length of user data structure for backward compatibility.
+ *           This should be offsetofend using the last member in the user data
+ *           struct that was initially added to include/uapi/linux/iommufd.h
+ */
+static inline int __iommu_copy_struct_from_user(
+       void *dst_data, const struct iommu_user_data *src_data,
+       unsigned int data_type, size_t data_len, size_t min_len)
+{
+       if (src_data->type != data_type)
+               return -EINVAL;
+       if (WARN_ON(!dst_data || !src_data))
+               return -EINVAL;
+       if (src_data->len < min_len || data_len < src_data->len)
+               return -EINVAL;
+       return copy_struct_from_user(dst_data, data_len, src_data->uptr,
+                                    src_data->len);
+}
+
+/**
+ * iommu_copy_struct_from_user - Copy iommu driver specific user space data
+ * @kdst: Pointer to an iommu driver specific user data that is defined in
+ *        include/uapi/linux/iommufd.h
+ * @user_data: Pointer to a struct iommu_user_data for user space data info
+ * @data_type: The data type of the @kdst. Must match with @user_data->type
+ * @min_last: The last member of the data structure @kdst points to in the
+ *            initial version.
+ * Return 0 for success, otherwise -error.
+ */
+#define iommu_copy_struct_from_user(kdst, user_data, data_type, min_last) \
+       __iommu_copy_struct_from_user(kdst, user_data, data_type,         \
+                                     sizeof(*kdst),                      \
+                                     offsetofend(typeof(*kdst), min_last))
+
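A sketch of how a driver's ->domain_alloc_user() might consume the user data with this helper; struct iommu_hwpt_mydrv_s1, IOMMU_HWPT_DATA_MYDRV_S1 and the mydrv_alloc_*() calls are invented for illustration only:

/* Hypothetical uAPI struct; a real one would live in include/uapi/linux/iommufd.h */
struct iommu_hwpt_mydrv_s1 {
	__aligned_u64 pgtbl_addr;
	__u32 addr_width;
	__u32 __reserved;
};

static struct iommu_domain *
mydrv_domain_alloc_user(struct device *dev, u32 flags,
			struct iommu_domain *parent,
			const struct iommu_user_data *user_data)
{
	struct iommu_hwpt_mydrv_s1 s1;
	int ret;

	if (!parent)
		return mydrv_alloc_paging_domain(dev, flags);
	if (!user_data)
		return ERR_PTR(-EINVAL);

	/* Nested case: type check, min_len check and copy in one go */
	ret = iommu_copy_struct_from_user(&s1, user_data,
					  IOMMU_HWPT_DATA_MYDRV_S1,
					  addr_width);
	if (ret)
		return ERR_PTR(ret);
	return mydrv_alloc_nested_domain(dev, parent, &s1);
}

Passing addr_width as @min_last means older, shorter user structs up to and including that member are still accepted, while trailing bytes the kernel does not know about must be zero.
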
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
@@ -234,7 +327,19 @@ struct iommu_iotlb_gather {
  *           op is allocated in the iommu driver and freed by the caller after
  *           use. The information type is one of enum iommu_hw_info_type defined
  *           in include/uapi/linux/iommufd.h.
- * @domain_alloc: allocate iommu domain
+ * @domain_alloc: allocate and return an iommu domain on success, otherwise
+ *                return NULL. The domain is not fully initialized until
+ *                the caller, iommu_domain_alloc(), returns.
+ * @domain_alloc_user: Allocate an iommu domain corresponding to the input
+ *                     parameters as defined in include/uapi/linux/iommufd.h.
+ *                     Unlike @domain_alloc, it is called only by IOMMUFD and
+ *                     must fully initialize the new domain before return.
+ *                     Upon success, if the @user_data is valid and the @parent
+ *                     points to a kernel-managed domain, the new domain must be
+ *                     IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be
+ *                     NULL, the @user_data is optional, and the new domain must
+ *                     support __IOMMU_DOMAIN_PAGING.
+ *                     Upon failure, ERR_PTR must be returned.
  * @probe_device: Add device to iommu driver handling
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
@@ -267,6 +372,9 @@ struct iommu_ops {
 
        /* Domain allocation and freeing by the iommu driver */
        struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
+       struct iommu_domain *(*domain_alloc_user)(
+               struct device *dev, u32 flags, struct iommu_domain *parent,
+               const struct iommu_user_data *user_data);
 
        struct iommu_device *(*probe_device)(struct device *dev);
        void (*release_device)(struct device *dev);
@@ -632,6 +740,28 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
        return gather && gather->queued;
 }
 
+static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty,
+                                          struct iova_bitmap *bitmap,
+                                          struct iommu_iotlb_gather *gather)
+{
+       if (gather)
+               iommu_iotlb_gather_init(gather);
+
+       dirty->bitmap = bitmap;
+       dirty->gather = gather;
+}
+
+static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
+                                            unsigned long iova,
+                                            unsigned long length)
+{
+       if (dirty->bitmap)
+               iova_bitmap_set(dirty->bitmap, iova, length);
+
+       if (dirty->gather)
+               iommu_iotlb_gather_add_range(dirty->gather, iova, length);
+}
+
 /* PCI device grouping function */
 extern struct iommu_group *pci_device_group(struct device *dev);
 /* Generic device grouping function */
@@ -737,6 +867,8 @@ struct iommu_fwspec {};
 struct iommu_device {};
 struct iommu_fault_param {};
 struct iommu_iotlb_gather {};
+struct iommu_dirty_bitmap {};
+struct iommu_dirty_ops {};
 
 static inline bool iommu_present(const struct bus_type *bus)
 {
@@ -969,6 +1101,18 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
        return false;
 }
 
+static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty,
+                                          struct iova_bitmap *bitmap,
+                                          struct iommu_iotlb_gather *gather)
+{
+}
+
+static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
+                                            unsigned long iova,
+                                            unsigned long length)
+{
+}
+
 static inline void iommu_device_unregister(struct iommu_device *iommu)
 {
 }
index c006cf0a25f3daac2ccc39c67c9a3193245a4077..1c338f5e5b7a62027290b44ad47c4a74d84706ac 100644 (file)
@@ -7,6 +7,7 @@
 #define _IOVA_BITMAP_H_
 
 #include <linux/types.h>
+#include <linux/errno.h>
 
 struct iova_bitmap;
 
@@ -14,6 +15,7 @@ typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap,
                                unsigned long iova, size_t length,
                                void *opaque);
 
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER)
 struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
                                      unsigned long page_size,
                                      u64 __user *data);
@@ -22,5 +24,29 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
                         iova_bitmap_fn_t fn);
 void iova_bitmap_set(struct iova_bitmap *bitmap,
                     unsigned long iova, size_t length);
+#else
+static inline struct iova_bitmap *iova_bitmap_alloc(unsigned long iova,
+                                                   size_t length,
+                                                   unsigned long page_size,
+                                                   u64 __user *data)
+{
+       return NULL;
+}
+
+static inline void iova_bitmap_free(struct iova_bitmap *bitmap)
+{
+}
+
+static inline int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
+                                      iova_bitmap_fn_t fn)
+{
+       return -EOPNOTSUPP;
+}
+
+static inline void iova_bitmap_set(struct iova_bitmap *bitmap,
+                                  unsigned long iova, size_t length)
+{
+}
+#endif
 
 #endif
index b4ba0c0cbab6b8b1562fa359d34f9835a9cde757..0b2bc6252e2ca2840b556ee6dd858ae123f22c9b 100644 (file)
@@ -47,6 +47,8 @@ enum {
        IOMMUFD_CMD_VFIO_IOAS,
        IOMMUFD_CMD_HWPT_ALLOC,
        IOMMUFD_CMD_GET_HW_INFO,
+       IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
+       IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
 };
 
 /**
@@ -347,20 +349,86 @@ struct iommu_vfio_ioas {
 };
 #define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS)
 
+/**
+ * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation
+ * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as
+ *                                the parent HWPT in a nesting configuration.
+ * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is
+ *                                   enforced on device attachment
+ */
+enum iommufd_hwpt_alloc_flags {
+       IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0,
+       IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1,
+};
+
+/**
+ * enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table
+ *                                entry attributes
+ * @IOMMU_VTD_S1_SRE: Supervisor request
+ * @IOMMU_VTD_S1_EAFE: Extended access enable
+ * @IOMMU_VTD_S1_WPE: Write protect enable
+ */
+enum iommu_hwpt_vtd_s1_flags {
+       IOMMU_VTD_S1_SRE = 1 << 0,
+       IOMMU_VTD_S1_EAFE = 1 << 1,
+       IOMMU_VTD_S1_WPE = 1 << 2,
+};
+
+/**
+ * struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table
+ *                            info (IOMMU_HWPT_DATA_VTD_S1)
+ * @flags: Combination of enum iommu_hwpt_vtd_s1_flags
+ * @pgtbl_addr: The base address of the stage-1 page table.
+ * @addr_width: The address width of the stage-1 page table
+ * @__reserved: Must be 0
+ */
+struct iommu_hwpt_vtd_s1 {
+       __aligned_u64 flags;
+       __aligned_u64 pgtbl_addr;
+       __u32 addr_width;
+       __u32 __reserved;
+};
+
+/**
+ * enum iommu_hwpt_data_type - IOMMU HWPT Data Type
+ * @IOMMU_HWPT_DATA_NONE: no data
+ * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
+ */
+enum iommu_hwpt_data_type {
+       IOMMU_HWPT_DATA_NONE,
+       IOMMU_HWPT_DATA_VTD_S1,
+};
+
 /**
  * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC)
  * @size: sizeof(struct iommu_hwpt_alloc)
- * @flags: Must be 0
+ * @flags: Combination of enum iommufd_hwpt_alloc_flags
  * @dev_id: The device to allocate this HWPT for
- * @pt_id: The IOAS to connect this HWPT to
+ * @pt_id: The IOAS or HWPT to connect this HWPT to
  * @out_hwpt_id: The ID of the new HWPT
  * @__reserved: Must be 0
+ * @data_type: One of enum iommu_hwpt_data_type
+ * @data_len: Length of the type specific data
+ * @data_uptr: User pointer to the type specific data
  *
  * Explicitly allocate a hardware page table object. This is the same object
  * type that is returned by iommufd_device_attach() and represents the
  * underlying iommu driver's iommu_domain kernel object.
  *
- * A HWPT will be created with the IOVA mappings from the given IOAS.
+ * A kernel-managed HWPT will be created with the mappings from the given
+ * IOAS via the @pt_id. The @data_type for this allocation must be set to
+ * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a
+ * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags.
+ *
+ * A user-managed nested HWPT will be created from a given parent HWPT via
+ * @pt_id; that parent HWPT must have been allocated previously via the
+ * same ioctl from a given IOAS (@pt_id). In this case, the @data_type
+ * must be set to a pre-defined type corresponding to an I/O page table
+ * type supported by the underlying IOMMU hardware.
+ *
+ * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and
+ * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr
+ * must be given.
  */
 struct iommu_hwpt_alloc {
        __u32 size;
@@ -369,13 +437,26 @@ struct iommu_hwpt_alloc {
        __u32 pt_id;
        __u32 out_hwpt_id;
        __u32 __reserved;
+       __u32 data_type;
+       __u32 data_len;
+       __aligned_u64 data_uptr;
 };
 #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC)
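As a user-space sketch of the two-step flow described above, assuming a VT-d system, an already-opened /dev/iommu fd, a device id from VFIO cdev binding, and <sys/ioctl.h>, <errno.h>, <stdint.h> and <linux/iommufd.h> included; the stage-1 values are placeholders:

/* Sketch only: allocates a nest-parent HWPT, then a VT-d S1 nested HWPT. */
static int alloc_vtd_nested_hwpt(int fd, __u32 dev_id, __u32 ioas_id,
				 __u64 guest_s1_pgtbl_addr, __u32 addr_width,
				 __u32 *out_nested_id)
{
	struct iommu_hwpt_vtd_s1 vtd = {
		.pgtbl_addr = guest_s1_pgtbl_addr,
		.addr_width = addr_width,
	};
	struct iommu_hwpt_alloc parent = {
		.size = sizeof(parent),
		.flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
		.dev_id = dev_id,
		.pt_id = ioas_id,
		.data_type = IOMMU_HWPT_DATA_NONE,
	};
	struct iommu_hwpt_alloc nested = {
		.size = sizeof(nested),
		.dev_id = dev_id,
		.data_type = IOMMU_HWPT_DATA_VTD_S1,
		.data_len = sizeof(vtd),
		.data_uptr = (__u64)(uintptr_t)&vtd,
	};

	if (ioctl(fd, IOMMU_HWPT_ALLOC, &parent))
		return -errno;
	nested.pt_id = parent.out_hwpt_id;
	if (ioctl(fd, IOMMU_HWPT_ALLOC, &nested))
		return -errno;
	*out_nested_id = nested.out_hwpt_id;
	return 0;
}
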
 
+/**
+ * enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info
+ * @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings
+ *                                         on a nested_parent domain.
+ *                                         https://www.intel.com/content/www/us/en/content-details/772415/content-details.html
+ */
+enum iommu_hw_info_vtd_flags {
+       IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0,
+};
+
 /**
  * struct iommu_hw_info_vtd - Intel VT-d hardware information
  *
- * @flags: Must be 0
+ * @flags: Combination of enum iommu_hw_info_vtd_flags
  * @__reserved: Must be 0
  *
  * @cap_reg: Value of Intel VT-d capability register defined in VT-d spec
@@ -404,6 +485,20 @@ enum iommu_hw_info_type {
        IOMMU_HW_INFO_TYPE_INTEL_VTD,
 };
 
+/**
+ * enum iommufd_hw_capabilities
+ * @IOMMU_HW_CAP_DIRTY_TRACKING: IOMMU hardware support for dirty tracking.
+ *                               If available, it means the following APIs
+ *                               are supported:
+ *
+ *                                   IOMMU_HWPT_GET_DIRTY_BITMAP
+ *                                   IOMMU_HWPT_SET_DIRTY_TRACKING
+ *
+ */
+enum iommufd_hw_capabilities {
+       IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0,
+};
+
 /**
  * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO)
  * @size: sizeof(struct iommu_hw_info)
@@ -415,6 +510,8 @@ enum iommu_hw_info_type {
  *             the iommu type specific hardware information data
  * @out_data_type: Output the iommu hardware info type as defined in the enum
  *                 iommu_hw_info_type.
+ * @out_capabilities: Output the generic iommu capability info type as defined
+ *                    in the enum iommu_hw_capabilities.
  * @__reserved: Must be 0
  *
  * Query an iommu type specific hardware information data from an iommu behind
@@ -439,6 +536,81 @@ struct iommu_hw_info {
        __aligned_u64 data_uptr;
        __u32 out_data_type;
        __u32 __reserved;
+       __aligned_u64 out_capabilities;
 };
 #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO)
+
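A small user-space sketch for probing this capability before using the dirty-tracking ioctls defined below; fd and dev_id are assumed to exist and no driver-specific data buffer is requested:

/* Sketch only: returns true if the device's IOMMU reports dirty tracking. */
static bool hwpt_dirty_tracking_supported(int fd, __u32 dev_id)
{
	struct iommu_hw_info info = {
		.size = sizeof(info),
		.dev_id = dev_id,
	};

	if (ioctl(fd, IOMMU_GET_HW_INFO, &info))
		return false;
	return info.out_capabilities & IOMMU_HW_CAP_DIRTY_TRACKING;
}
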
+/*
+ * enum iommufd_hwpt_set_dirty_tracking_flags - Flags for steering dirty
+ *                                              tracking
+ * @IOMMU_HWPT_DIRTY_TRACKING_ENABLE: Enable dirty tracking
+ */
+enum iommufd_hwpt_set_dirty_tracking_flags {
+       IOMMU_HWPT_DIRTY_TRACKING_ENABLE = 1,
+};
+
+/**
+ * struct iommu_hwpt_set_dirty_tracking - ioctl(IOMMU_HWPT_SET_DIRTY_TRACKING)
+ * @size: sizeof(struct iommu_hwpt_set_dirty_tracking)
+ * @flags: Combination of enum iommufd_hwpt_set_dirty_tracking_flags
+ * @hwpt_id: HW pagetable ID that represents the IOMMU domain
+ * @__reserved: Must be 0
+ *
+ * Toggle dirty tracking on an HW pagetable.
+ */
+struct iommu_hwpt_set_dirty_tracking {
+       __u32 size;
+       __u32 flags;
+       __u32 hwpt_id;
+       __u32 __reserved;
+};
+#define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \
+                                         IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING)
+
+/**
+ * enum iommufd_hwpt_get_dirty_bitmap_flags - Flags for getting dirty bits
+ * @IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR: Just read the PTEs without clearing
+ *                                        any dirty bit metadata. This flag
+ *                                        can be passed when the next
+ *                                        operation is expected to be an unmap
+ *                                        of the same IOVA range.
+ *
+ */
+enum iommufd_hwpt_get_dirty_bitmap_flags {
+       IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR = 1,
+};
+
+/**
+ * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP)
+ * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap)
+ * @hwpt_id: HW pagetable ID that represents the IOMMU domain
+ * @flags: Combination of enum iommufd_hwpt_get_dirty_bitmap_flags
+ * @__reserved: Must be 0
+ * @iova: base IOVA of the bitmap first bit
+ * @length: IOVA range size
+ * @page_size: page size granularity of each bit in the bitmap
+ * @data: bitmap in which to set the dirty bits. Each bit in the bitmap
+ *        represents one page_size region, offset from @iova.
+ *
+ * Checking a given IOVA is dirty:
+ *
+ *  data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64))
+ *
+ * Walk the IOMMU pagetables for a given IOVA range to return a bitmap
+ * with the dirty IOVAs. In doing so it will also by default clear any
+ * dirty bit metadata set in the IOPTE.
+ */
+struct iommu_hwpt_get_dirty_bitmap {
+       __u32 size;
+       __u32 hwpt_id;
+       __u32 flags;
+       __u32 __reserved;
+       __aligned_u64 iova;
+       __aligned_u64 length;
+       __aligned_u64 page_size;
+       __aligned_u64 data;
+};
+#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
+                                       IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
+
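Putting the two ioctls together, a user-space sketch that enables tracking, reads back the bitmap, and tests a single IOVA using the formula documented above; the caller is assumed to provide a bitmap of at least length/page_size bits, and the function names are illustrative:

/* Sketch only: enable tracking, then harvest dirty bits for one range. */
static int track_and_read_dirty(int fd, __u32 hwpt_id, __u64 iova,
				__u64 length, __u64 page_size, __u64 *bitmap)
{
	struct iommu_hwpt_set_dirty_tracking set = {
		.size = sizeof(set),
		.flags = IOMMU_HWPT_DIRTY_TRACKING_ENABLE,
		.hwpt_id = hwpt_id,
	};
	struct iommu_hwpt_get_dirty_bitmap get = {
		.size = sizeof(get),
		.hwpt_id = hwpt_id,
		.iova = iova,
		.length = length,
		.page_size = page_size,
		.data = (__u64)(uintptr_t)bitmap,
	};

	if (ioctl(fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set))
		return -errno;
	/* ... let the device DMA for a while, then read the dirty bits ... */
	if (ioctl(fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get))
		return -errno;
	return 0;
}

/* Bit 0 of @bitmap corresponds to @iova_base, mirroring the formula above. */
static bool iova_is_dirty(const __u64 *bitmap, __u64 iova_base, __u64 iova,
			  __u64 page_size)
{
	__u64 idx = (iova - iova_base) / page_size;

	return bitmap[idx / 64] & (1ULL << (idx % 64));
}
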
 #endif
index 33d08600be13d6c23cf7b36ef6ef8ab6145b4ecc..6ed328c863c4f13ccc9db5bcf51092bd4ce183c4 100644 (file)
@@ -86,12 +86,13 @@ TEST_F(iommufd, cmd_fail)
 
 TEST_F(iommufd, cmd_length)
 {
-#define TEST_LENGTH(_struct, _ioctl)                                     \
+#define TEST_LENGTH(_struct, _ioctl, _last)                              \
        {                                                                \
+               size_t min_size = offsetofend(struct _struct, _last);    \
                struct {                                                 \
                        struct _struct cmd;                              \
                        uint8_t extra;                                   \
-               } cmd = { .cmd = { .size = sizeof(struct _struct) - 1 }, \
+               } cmd = { .cmd = { .size = min_size - 1 },               \
                          .extra = UINT8_MAX };                          \
                int old_errno;                                           \
                int rc;                                                  \
@@ -112,16 +113,19 @@ TEST_F(iommufd, cmd_length)
                }                                                        \
        }
 
-       TEST_LENGTH(iommu_destroy, IOMMU_DESTROY);
-       TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO);
-       TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC);
-       TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES);
-       TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS);
-       TEST_LENGTH(iommu_ioas_map, IOMMU_IOAS_MAP);
-       TEST_LENGTH(iommu_ioas_copy, IOMMU_IOAS_COPY);
-       TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP);
-       TEST_LENGTH(iommu_option, IOMMU_OPTION);
-       TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS);
+       TEST_LENGTH(iommu_destroy, IOMMU_DESTROY, id);
+       TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO, __reserved);
+       TEST_LENGTH(iommu_hwpt_alloc, IOMMU_HWPT_ALLOC, __reserved);
+       TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC, out_ioas_id);
+       TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES,
+                   out_iova_alignment);
+       TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS,
+                   allowed_iovas);
+       TEST_LENGTH(iommu_ioas_map, IOMMU_IOAS_MAP, iova);
+       TEST_LENGTH(iommu_ioas_copy, IOMMU_IOAS_COPY, src_iova);
+       TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP, length);
+       TEST_LENGTH(iommu_option, IOMMU_OPTION, val64);
+       TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved);
 #undef TEST_LENGTH
 }
 
@@ -260,6 +264,121 @@ TEST_F(iommufd_ioas, ioas_destroy)
        }
 }
 
+TEST_F(iommufd_ioas, alloc_hwpt_nested)
+{
+       const uint32_t min_data_len =
+               offsetofend(struct iommu_hwpt_selftest, iotlb);
+       struct iommu_hwpt_selftest data = {
+               .iotlb = IOMMU_TEST_IOTLB_DEFAULT,
+       };
+       uint32_t nested_hwpt_id[2] = {};
+       uint32_t parent_hwpt_id = 0;
+       uint32_t parent_hwpt_id_not_work = 0;
+       uint32_t test_hwpt_id = 0;
+
+       if (self->device_id) {
+               /* Negative tests */
+               test_err_hwpt_alloc(ENOENT, self->ioas_id, self->device_id, 0,
+                                   &test_hwpt_id);
+               test_err_hwpt_alloc(EINVAL, self->device_id, self->device_id, 0,
+                                   &test_hwpt_id);
+
+               test_cmd_hwpt_alloc(self->device_id, self->ioas_id,
+                                   IOMMU_HWPT_ALLOC_NEST_PARENT,
+                                   &parent_hwpt_id);
+
+               test_cmd_hwpt_alloc(self->device_id, self->ioas_id, 0,
+                                   &parent_hwpt_id_not_work);
+
+               /* Negative nested tests */
+               test_err_hwpt_alloc_nested(EINVAL, self->device_id,
+                                          parent_hwpt_id, 0,
+                                          &nested_hwpt_id[0],
+                                          IOMMU_HWPT_DATA_NONE, &data,
+                                          sizeof(data));
+               test_err_hwpt_alloc_nested(EOPNOTSUPP, self->device_id,
+                                          parent_hwpt_id, 0,
+                                          &nested_hwpt_id[0],
+                                          IOMMU_HWPT_DATA_SELFTEST + 1, &data,
+                                          sizeof(data));
+               test_err_hwpt_alloc_nested(EINVAL, self->device_id,
+                                          parent_hwpt_id, 0,
+                                          &nested_hwpt_id[0],
+                                          IOMMU_HWPT_DATA_SELFTEST, &data,
+                                          min_data_len - 1);
+               test_err_hwpt_alloc_nested(EFAULT, self->device_id,
+                                          parent_hwpt_id, 0,
+                                          &nested_hwpt_id[0],
+                                          IOMMU_HWPT_DATA_SELFTEST, NULL,
+                                          sizeof(data));
+               test_err_hwpt_alloc_nested(
+                       EOPNOTSUPP, self->device_id, parent_hwpt_id,
+                       IOMMU_HWPT_ALLOC_NEST_PARENT, &nested_hwpt_id[0],
+                       IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data));
+               test_err_hwpt_alloc_nested(EINVAL, self->device_id,
+                                          parent_hwpt_id_not_work, 0,
+                                          &nested_hwpt_id[0],
+                                          IOMMU_HWPT_DATA_SELFTEST, &data,
+                                          sizeof(data));
+
+               /* Allocate two nested hwpts sharing one common parent hwpt */
+               test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0,
+                                          &nested_hwpt_id[0],
+                                          IOMMU_HWPT_DATA_SELFTEST, &data,
+                                          sizeof(data));
+               test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0,
+                                          &nested_hwpt_id[1],
+                                          IOMMU_HWPT_DATA_SELFTEST, &data,
+                                          sizeof(data));
+
+               /* Negative test: a nested hwpt on top of a nested hwpt */
+               test_err_hwpt_alloc_nested(EINVAL, self->device_id,
+                                          nested_hwpt_id[0], 0, &test_hwpt_id,
+                                          IOMMU_HWPT_DATA_SELFTEST, &data,
+                                          sizeof(data));
+               /* Negative test: parent hwpt now cannot be freed */
+               EXPECT_ERRNO(EBUSY,
+                            _test_ioctl_destroy(self->fd, parent_hwpt_id));
+
+               /* Attach device to nested_hwpt_id[0] that then will be busy */
+               test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[0]);
+               EXPECT_ERRNO(EBUSY,
+                            _test_ioctl_destroy(self->fd, nested_hwpt_id[0]));
+
+               /* Switch from nested_hwpt_id[0] to nested_hwpt_id[1] */
+               test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[1]);
+               EXPECT_ERRNO(EBUSY,
+                            _test_ioctl_destroy(self->fd, nested_hwpt_id[1]));
+               test_ioctl_destroy(nested_hwpt_id[0]);
+
+               /* Detach from nested_hwpt_id[1] and destroy it */
+               test_cmd_mock_domain_replace(self->stdev_id, parent_hwpt_id);
+               test_ioctl_destroy(nested_hwpt_id[1]);
+
+               /* Detach from the parent hw_pagetable and destroy it */
+               test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id);
+               test_ioctl_destroy(parent_hwpt_id);
+               test_ioctl_destroy(parent_hwpt_id_not_work);
+       } else {
+               test_err_hwpt_alloc(ENOENT, self->device_id, self->ioas_id, 0,
+                                   &parent_hwpt_id);
+               test_err_hwpt_alloc_nested(ENOENT, self->device_id,
+                                          parent_hwpt_id, 0,
+                                          &nested_hwpt_id[0],
+                                          IOMMU_HWPT_DATA_SELFTEST, &data,
+                                          sizeof(data));
+               test_err_hwpt_alloc_nested(ENOENT, self->device_id,
+                                          parent_hwpt_id, 0,
+                                          &nested_hwpt_id[1],
+                                          IOMMU_HWPT_DATA_SELFTEST, &data,
+                                          sizeof(data));
+               test_err_mock_domain_replace(ENOENT, self->stdev_id,
+                                            nested_hwpt_id[0]);
+               test_err_mock_domain_replace(ENOENT, self->stdev_id,
+                                            nested_hwpt_id[1]);
+       }
+}
+
 TEST_F(iommufd_ioas, hwpt_attach)
 {
        /* Create a device attached directly to a hwpt */
@@ -1404,16 +1523,242 @@ TEST_F(iommufd_mock_domain, alloc_hwpt)
        int i;
 
        for (i = 0; i != variant->mock_domains; i++) {
+               uint32_t hwpt_id[2];
                uint32_t stddev_id;
-               uint32_t hwpt_id;
 
-               test_cmd_hwpt_alloc(self->idev_ids[0], self->ioas_id, &hwpt_id);
-               test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+               test_err_hwpt_alloc(EOPNOTSUPP,
+                                   self->idev_ids[i], self->ioas_id,
+                                   ~IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id[0]);
+               test_cmd_hwpt_alloc(self->idev_ids[i], self->ioas_id,
+                                   0, &hwpt_id[0]);
+               test_cmd_hwpt_alloc(self->idev_ids[i], self->ioas_id,
+                                   IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id[1]);
+
+               /* Do a hw_pagetable rotation test */
+               test_cmd_mock_domain_replace(self->stdev_ids[i], hwpt_id[0]);
+               EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, hwpt_id[0]));
+               test_cmd_mock_domain_replace(self->stdev_ids[i], hwpt_id[1]);
+               EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, hwpt_id[1]));
+               test_cmd_mock_domain_replace(self->stdev_ids[i], self->ioas_id);
+               test_ioctl_destroy(hwpt_id[1]);
+
+               test_cmd_mock_domain(hwpt_id[0], &stddev_id, NULL, NULL);
                test_ioctl_destroy(stddev_id);
-               test_ioctl_destroy(hwpt_id);
+               test_ioctl_destroy(hwpt_id[0]);
        }
 }
 
+FIXTURE(iommufd_dirty_tracking)
+{
+       int fd;
+       uint32_t ioas_id;
+       uint32_t hwpt_id;
+       uint32_t stdev_id;
+       uint32_t idev_id;
+       unsigned long page_size;
+       unsigned long bitmap_size;
+       void *bitmap;
+       void *buffer;
+};
+
+FIXTURE_VARIANT(iommufd_dirty_tracking)
+{
+       unsigned long buffer_size;
+};
+
+FIXTURE_SETUP(iommufd_dirty_tracking)
+{
+       void *vrc;
+       int rc;
+
+       self->fd = open("/dev/iommu", O_RDWR);
+       ASSERT_NE(-1, self->fd);
+
+       rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
+       if (rc || !self->buffer) {
+               SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
+                          variant->buffer_size, rc);
+       }
+
+       assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
+       vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
+                  MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+       assert(vrc == self->buffer);
+
+       self->page_size = MOCK_PAGE_SIZE;
+       self->bitmap_size =
+               variant->buffer_size / self->page_size / BITS_PER_BYTE;
+
+       /* Provision with an extra (MOCK_PAGE_SIZE) for the unaligned case */
+       rc = posix_memalign(&self->bitmap, PAGE_SIZE,
+                           self->bitmap_size + MOCK_PAGE_SIZE);
+       assert(!rc);
+       assert(self->bitmap);
+       assert((uintptr_t)self->bitmap % PAGE_SIZE == 0);
+
+       test_ioctl_ioas_alloc(&self->ioas_id);
+       test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id,
+                            &self->idev_id);
+}
+
+FIXTURE_TEARDOWN(iommufd_dirty_tracking)
+{
+       munmap(self->buffer, variant->buffer_size);
+       munmap(self->bitmap, self->bitmap_size);
+       teardown_iommufd(self->fd, _metadata);
+}
+
+FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128k)
+{
+       /* one u32 index bitmap */
+       .buffer_size = 128UL * 1024UL,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256k)
+{
+       /* one u64 index bitmap */
+       .buffer_size = 256UL * 1024UL,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty640k)
+{
+       /* two u64 index and trailing end bitmap */
+       .buffer_size = 640UL * 1024UL,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M)
+{
+       /* 4K bitmap (128M IOVA range) */
+       .buffer_size = 128UL * 1024UL * 1024UL,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M)
+{
+       /* 8K bitmap (256M IOVA range) */
+       .buffer_size = 256UL * 1024UL * 1024UL,
+};
+
+TEST_F(iommufd_dirty_tracking, enforce_dirty)
+{
+       uint32_t ioas_id, stddev_id, idev_id;
+       uint32_t hwpt_id, _hwpt_id;
+       uint32_t dev_flags;
+
+       /* Regular case */
+       dev_flags = MOCK_FLAGS_DEVICE_NO_DIRTY;
+       test_cmd_hwpt_alloc(self->idev_id, self->ioas_id,
+                           IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+       test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+       test_err_mock_domain_flags(EINVAL, hwpt_id, dev_flags, &stddev_id,
+                                  NULL);
+       test_ioctl_destroy(stddev_id);
+       test_ioctl_destroy(hwpt_id);
+
+       /* IOMMU device does not support dirty tracking */
+       test_ioctl_ioas_alloc(&ioas_id);
+       test_cmd_mock_domain_flags(ioas_id, dev_flags, &stddev_id, &_hwpt_id,
+                                  &idev_id);
+       test_err_hwpt_alloc(EOPNOTSUPP, idev_id, ioas_id,
+                           IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+       test_ioctl_destroy(stddev_id);
+}
+
+TEST_F(iommufd_dirty_tracking, set_dirty_tracking)
+{
+       uint32_t stddev_id;
+       uint32_t hwpt_id;
+
+       test_cmd_hwpt_alloc(self->idev_id, self->ioas_id,
+                           IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+       test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+       test_cmd_set_dirty_tracking(hwpt_id, true);
+       test_cmd_set_dirty_tracking(hwpt_id, false);
+
+       test_ioctl_destroy(stddev_id);
+       test_ioctl_destroy(hwpt_id);
+}
+
+TEST_F(iommufd_dirty_tracking, device_dirty_capability)
+{
+       uint32_t caps = 0;
+       uint32_t stddev_id;
+       uint32_t hwpt_id;
+
+       test_cmd_hwpt_alloc(self->idev_id, self->ioas_id, 0, &hwpt_id);
+       test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+       test_cmd_get_hw_capabilities(self->idev_id, caps,
+                                    IOMMU_HW_CAP_DIRTY_TRACKING);
+       ASSERT_EQ(IOMMU_HW_CAP_DIRTY_TRACKING,
+                 caps & IOMMU_HW_CAP_DIRTY_TRACKING);
+
+       test_ioctl_destroy(stddev_id);
+       test_ioctl_destroy(hwpt_id);
+}
+
+TEST_F(iommufd_dirty_tracking, get_dirty_bitmap)
+{
+       uint32_t stddev_id;
+       uint32_t hwpt_id;
+       uint32_t ioas_id;
+
+       test_ioctl_ioas_alloc(&ioas_id);
+       test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer,
+                                    variant->buffer_size, MOCK_APERTURE_START);
+
+       test_cmd_hwpt_alloc(self->idev_id, ioas_id,
+                           IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+       test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+
+       test_cmd_set_dirty_tracking(hwpt_id, true);
+
+       test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size,
+                               MOCK_APERTURE_START, self->page_size,
+                               self->bitmap, self->bitmap_size, 0, _metadata);
+
+       /* PAGE_SIZE unaligned bitmap */
+       test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size,
+                               MOCK_APERTURE_START, self->page_size,
+                               self->bitmap + MOCK_PAGE_SIZE,
+                               self->bitmap_size, 0, _metadata);
+
+       test_ioctl_destroy(stddev_id);
+       test_ioctl_destroy(hwpt_id);
+}
+
+TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear)
+{
+       uint32_t stddev_id;
+       uint32_t hwpt_id;
+       uint32_t ioas_id;
+
+       test_ioctl_ioas_alloc(&ioas_id);
+       test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer,
+                                    variant->buffer_size, MOCK_APERTURE_START);
+
+       test_cmd_hwpt_alloc(self->idev_id, ioas_id,
+                           IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+       test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+
+       test_cmd_set_dirty_tracking(hwpt_id, true);
+
+       test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size,
+                               MOCK_APERTURE_START, self->page_size,
+                               self->bitmap, self->bitmap_size,
+                               IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR,
+                               _metadata);
+
+       /* Unaligned bitmap */
+       test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size,
+                               MOCK_APERTURE_START, self->page_size,
+                               self->bitmap + MOCK_PAGE_SIZE,
+                               self->bitmap_size,
+                               IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR,
+                               _metadata);
+
+       test_ioctl_destroy(stddev_id);
+       test_ioctl_destroy(hwpt_id);
+}
+
 /* VFIO compatibility IOCTLs */
 
 TEST_F(iommufd, simple_ioctls)
@@ -1729,7 +2074,7 @@ TEST_F(vfio_compat_mock_domain, map)
        ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
        ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size);
 
-       /* UNMAP_FLAG_ALL requres 0 iova/size */
+       /* UNMAP_FLAG_ALL requires 0 iova/size */
        ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd));
        unmap_cmd.flags = VFIO_DMA_UNMAP_FLAG_ALL;
        EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
index a220ca2a689d160c95129d0bc281ed63dfb38c73..f590417cd67a95bf31065f9b7682bc7c627ebf3a 100644 (file)
@@ -105,7 +105,7 @@ static bool fail_nth_next(struct __test_metadata *_metadata,
 
        /*
         * This is just an arbitrary limit based on the current kernel
-        * situation. Changes in the kernel can dramtically change the number of
+        * situation. Changes in the kernel can dramatically change the number of
         * required fault injection sites, so if this hits it doesn't
         * necessarily mean a test failure, just that the limit has to be made
         * bigger.
@@ -612,10 +612,11 @@ TEST_FAIL_NTH(basic_fail_nth, device)
                                  &idev_id))
                return -1;
 
-       if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info)))
+       if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL))
                return -1;
 
-       if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, &hwpt_id))
+       if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id,
+                                IOMMU_HWPT_DATA_NONE, 0, 0))
                return -1;
 
        if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL))
index e0753d03ecaa8576005120cf86dd47a529df1f94..050e9751321cff1015474c3023df28c2194d5a52 100644 (file)
 /* Hack to make assertions more readable */
 #define _IOMMU_TEST_CMD(x) IOMMU_TEST_CMD
 
+/* Imported from include/asm-generic/bitops/generic-non-atomic.h */
+#define BITS_PER_BYTE 8
+#define BITS_PER_LONG __BITS_PER_LONG
+#define BIT_MASK(nr) (1UL << ((nr) % __BITS_PER_LONG))
+#define BIT_WORD(nr) ((nr) / __BITS_PER_LONG)
+
+static inline void set_bit(unsigned int nr, unsigned long *addr)
+{
+       unsigned long mask = BIT_MASK(nr);
+       unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+       *p |= mask;
+}
+
+static inline bool test_bit(unsigned int nr, unsigned long *addr)
+{
+       return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG - 1)));
+}
+
 static void *buffer;
 static unsigned long BUFFER_SIZE;
 
@@ -74,6 +93,38 @@ static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id,
        EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \
                                                   stdev_id, hwpt_id, NULL))
 
+static int _test_cmd_mock_domain_flags(int fd, unsigned int ioas_id,
+                                      __u32 stdev_flags, __u32 *stdev_id,
+                                      __u32 *hwpt_id, __u32 *idev_id)
+{
+       struct iommu_test_cmd cmd = {
+               .size = sizeof(cmd),
+               .op = IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS,
+               .id = ioas_id,
+               .mock_domain_flags = { .dev_flags = stdev_flags },
+       };
+       int ret;
+
+       ret = ioctl(fd, IOMMU_TEST_CMD, &cmd);
+       if (ret)
+               return ret;
+       if (stdev_id)
+               *stdev_id = cmd.mock_domain_flags.out_stdev_id;
+       assert(cmd.id != 0);
+       if (hwpt_id)
+               *hwpt_id = cmd.mock_domain_flags.out_hwpt_id;
+       if (idev_id)
+               *idev_id = cmd.mock_domain_flags.out_idev_id;
+       return 0;
+}
+#define test_cmd_mock_domain_flags(ioas_id, flags, stdev_id, hwpt_id, idev_id) \
+       ASSERT_EQ(0, _test_cmd_mock_domain_flags(self->fd, ioas_id, flags,     \
+                                                stdev_id, hwpt_id, idev_id))
+#define test_err_mock_domain_flags(_errno, ioas_id, flags, stdev_id, hwpt_id) \
+       EXPECT_ERRNO(_errno,                                                  \
+                    _test_cmd_mock_domain_flags(self->fd, ioas_id, flags,    \
+                                                stdev_id, hwpt_id, NULL))
+
 static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id,
                                         __u32 *hwpt_id)
 {
@@ -103,12 +154,17 @@ static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id,
                                                           pt_id, NULL))
 
 static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id,
-                                        __u32 *hwpt_id)
+                               __u32 flags, __u32 *hwpt_id, __u32 data_type,
+                               void *data, size_t data_len)
 {
        struct iommu_hwpt_alloc cmd = {
                .size = sizeof(cmd),
+               .flags = flags,
                .dev_id = device_id,
                .pt_id = pt_id,
+               .data_type = data_type,
+               .data_len = data_len,
+               .data_uptr = (uint64_t)data,
        };
        int ret;
 
@@ -120,8 +176,24 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id,
        return 0;
 }
 
-#define test_cmd_hwpt_alloc(device_id, pt_id, hwpt_id) \
-       ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, hwpt_id))
+#define test_cmd_hwpt_alloc(device_id, pt_id, flags, hwpt_id)                  \
+       ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags,   \
+                                         hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \
+                                         0))
+#define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id)   \
+       EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc(                      \
+                                    self->fd, device_id, pt_id, flags, \
+                                    hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, 0))
+
+#define test_cmd_hwpt_alloc_nested(device_id, pt_id, flags, hwpt_id,         \
+                                  data_type, data, data_len)                \
+       ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \
+                                         hwpt_id, data_type, data, data_len))
+#define test_err_hwpt_alloc_nested(_errno, device_id, pt_id, flags, hwpt_id, \
+                                  data_type, data, data_len)                \
+       EXPECT_ERRNO(_errno,                                                 \
+                    _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \
+                                         hwpt_id, data_type, data, data_len))
 
 static int _test_cmd_access_replace_ioas(int fd, __u32 access_id,
                                         unsigned int ioas_id)
@@ -142,6 +214,126 @@ static int _test_cmd_access_replace_ioas(int fd, __u32 access_id,
 #define test_cmd_access_replace_ioas(access_id, ioas_id) \
        ASSERT_EQ(0, _test_cmd_access_replace_ioas(self->fd, access_id, ioas_id))
 
+static int _test_cmd_set_dirty_tracking(int fd, __u32 hwpt_id, bool enabled)
+{
+       struct iommu_hwpt_set_dirty_tracking cmd = {
+               .size = sizeof(cmd),
+               .flags = enabled ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0,
+               .hwpt_id = hwpt_id,
+       };
+       int ret;
+
+       ret = ioctl(fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &cmd);
+       if (ret)
+               return -errno;
+       return 0;
+}
+#define test_cmd_set_dirty_tracking(hwpt_id, enabled) \
+       ASSERT_EQ(0, _test_cmd_set_dirty_tracking(self->fd, hwpt_id, enabled))
+
+static int _test_cmd_get_dirty_bitmap(int fd, __u32 hwpt_id, size_t length,
+                                     __u64 iova, size_t page_size,
+                                     __u64 *bitmap, __u32 flags)
+{
+       struct iommu_hwpt_get_dirty_bitmap cmd = {
+               .size = sizeof(cmd),
+               .hwpt_id = hwpt_id,
+               .flags = flags,
+               .iova = iova,
+               .length = length,
+               .page_size = page_size,
+               .data = (uintptr_t)bitmap,
+       };
+       int ret;
+
+       ret = ioctl(fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &cmd);
+       if (ret)
+               return ret;
+       return 0;
+}
+
+#define test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size,    \
+                                 bitmap, flags)                           \
+       ASSERT_EQ(0, _test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, \
+                                               page_size, bitmap, flags))
+
+static int _test_cmd_mock_domain_set_dirty(int fd, __u32 hwpt_id, size_t length,
+                                          __u64 iova, size_t page_size,
+                                          __u64 *bitmap, __u64 *dirty)
+{
+       struct iommu_test_cmd cmd = {
+               .size = sizeof(cmd),
+               .op = IOMMU_TEST_OP_DIRTY,
+               .id = hwpt_id,
+               .dirty = {
+                       .iova = iova,
+                       .length = length,
+                       .page_size = page_size,
+                       .uptr = (uintptr_t)bitmap,
+               }
+       };
+       int ret;
+
+       ret = ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_DIRTY), &cmd);
+       if (ret)
+               return -ret;
+       if (dirty)
+               *dirty = cmd.dirty.out_nr_dirty;
+       return 0;
+}
+
+#define test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size, \
+                                      bitmap, nr)                           \
+       ASSERT_EQ(0,                                                         \
+                 _test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, \
+                                                 page_size, bitmap, nr))
+
+static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length,
+                                   __u64 iova, size_t page_size, __u64 *bitmap,
+                                   __u64 bitmap_size, __u32 flags,
+                                   struct __test_metadata *_metadata)
+{
+       unsigned long i, count, nbits = bitmap_size * BITS_PER_BYTE;
+       unsigned long nr = nbits / 2;
+       __u64 out_dirty = 0;
+
+       /* Mark all even bits as dirty in the mock domain */
+       for (count = 0, i = 0; i < nbits; count += !(i % 2), i++)
+               if (!(i % 2))
+                       set_bit(i, (unsigned long *)bitmap);
+       ASSERT_EQ(nr, count);
+
+       test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size,
+                                      bitmap, &out_dirty);
+       ASSERT_EQ(nr, out_dirty);
+
+       /* Expect all even bits as dirty in the user bitmap */
+       memset(bitmap, 0, bitmap_size);
+       test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap,
+                                 flags);
+       for (count = 0, i = 0; i < nbits; count += !(i % 2), i++)
+               ASSERT_EQ(!(i % 2), test_bit(i, (unsigned long *)bitmap));
+       ASSERT_EQ(count, out_dirty);
+
+       memset(bitmap, 0, bitmap_size);
+       test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap,
+                                 flags);
+
+       /* Already read once -- expect all zeroes unless NO_CLEAR was set */
+       for (i = 0; i < nbits; i++) {
+               ASSERT_EQ(!(i % 2) && (flags &
+                                      IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR),
+                         test_bit(i, (unsigned long *)bitmap));
+       }
+
+       return 0;
+}
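+/*
+ * Example call (illustrative only; the argument names are placeholders for
+ * whatever buffer and range the test has mapped and allocated):
+ *
+ *     test_mock_dirty_bitmaps(hwpt_id, buffer_size, iova, page_size,
+ *                             bitmap, bitmap_size, 0, _metadata);
+ */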
+#define test_mock_dirty_bitmaps(hwpt_id, length, iova, page_size, bitmap,      \
+                               bitmap_size, flags, _metadata)                 \
+       ASSERT_EQ(0, _test_mock_dirty_bitmaps(self->fd, hwpt_id, length, iova, \
+                                             page_size, bitmap, bitmap_size,  \
+                                             flags, _metadata))
+
 static int _test_cmd_create_access(int fd, unsigned int ioas_id,
                                   __u32 *access_id, unsigned int flags)
 {
@@ -266,6 +458,17 @@ static int _test_ioctl_ioas_map(int fd, unsigned int ioas_id, void *buffer,
                                             IOMMU_IOAS_MAP_READABLE));       \
        })
 
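+/* Map @buffer at the caller-chosen @iova into the IOAS identified by @ioas_id */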
+#define test_ioctl_ioas_map_fixed_id(ioas_id, buffer, length, iova)           \
+       ({                                                                    \
+               __u64 __iova = iova;                                          \
+               ASSERT_EQ(0,                                                  \
+                         _test_ioctl_ioas_map(                               \
+                                 self->fd, ioas_id, buffer, length, &__iova, \
+                                 IOMMU_IOAS_MAP_FIXED_IOVA |                 \
+                                         IOMMU_IOAS_MAP_WRITEABLE |          \
+                                         IOMMU_IOAS_MAP_READABLE));          \
+       })
+
 #define test_err_ioctl_ioas_map_fixed(_errno, buffer, length, iova)           \
        ({                                                                    \
                __u64 __iova = iova;                                          \
@@ -354,8 +557,8 @@ static void teardown_iommufd(int fd, struct __test_metadata *_metadata)
 #endif
 
 /* @data can be NULL */
-static int _test_cmd_get_hw_info(int fd, __u32 device_id,
-                                void *data, size_t data_len)
+static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data,
+                                size_t data_len, uint32_t *capabilities)
 {
        struct iommu_test_hw_info *info = (struct iommu_test_hw_info *)data;
        struct iommu_hw_info cmd = {
@@ -363,6 +566,7 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id,
                .dev_id = device_id,
                .data_len = data_len,
                .data_uptr = (uint64_t)data,
+               .out_capabilities = 0,
        };
        int ret;
 
@@ -399,14 +603,19 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id,
                        assert(!info->flags);
        }
 
+       if (capabilities)
+               *capabilities = cmd.out_capabilities;
+
        return 0;
 }
 
-#define test_cmd_get_hw_info(device_id, data, data_len)         \
-       ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, \
-                                          data, data_len))
+#define test_cmd_get_hw_info(device_id, data, data_len)               \
+       ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, data, \
+                                          data_len, NULL))
+
+#define test_err_get_hw_info(_errno, device_id, data, data_len)               \
+       EXPECT_ERRNO(_errno, _test_cmd_get_hw_info(self->fd, device_id, data, \
+                                                  data_len, NULL))
 
-#define test_err_get_hw_info(_errno, device_id, data, data_len) \
-       EXPECT_ERRNO(_errno,                                    \
-                    _test_cmd_get_hw_info(self->fd, device_id, \
-                                          data, data_len))
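+/* Retrieve only the out_capabilities flags for @device_id, without any per-driver data */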
+#define test_cmd_get_hw_capabilities(device_id, caps, mask) \
+       ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, &caps))