#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
+#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
update.start = nrange->start;
update.end = nrange->end;
update.event = HMM_UPDATE_INVALIDATE;
- update.blockable = nrange->blockable;
+ update.blockable = mmu_notifier_range_blockable(nrange);
- if (nrange->blockable)
+ if (mmu_notifier_range_blockable(nrange))
mutex_lock(&hmm->lock);
else if (!mutex_trylock(&hmm->lock)) {
ret = -EAGAIN;
}
mutex_unlock(&hmm->lock);
- if (nrange->blockable)
+ if (mmu_notifier_range_blockable(nrange))
down_read(&hmm->mirrors_sem);
else if (!down_read_trylock(&hmm->mirrors_sem)) {
ret = -EAGAIN;
struct hmm_vma_walk {
struct hmm_range *range;
+ struct dev_pagemap *pgmap;
unsigned long last;
bool fault;
bool block;
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
uint64_t *pfns = range->pfns;
- unsigned long i;
+ unsigned long i, page_size;
hmm_vma_walk->last = addr;
- i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++) {
+ page_size = hmm_range_page_size(range);
+ i = (addr - range->start) >> range->page_shift;
+
+ for (; addr < end; addr += page_size, i++) {
pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) {
int ret;
if (!hmm_vma_walk->fault)
return;
+ /*
+ * We consider not only the individual per-page request but also the
+ * default flags requested for the range. The API can be used in two
+ * ways: the first, where the HMM user coalesces multiple page faults
+ * into one request and sets flags per pfn for those faults; the
+ * second, where the HMM user wants to pre-fault a range with specific
+ * flags. For the latter it would be a waste to have the user pre-fill
+ * the pfn array with a default flags value.
+ */
+ pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+
/* We aren't ask to do anything ... */
if (!(pfns & range->flags[HMM_PFN_VALID]))
return;
range->flags[HMM_PFN_VALID];
}
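(Illustration, not part of the patch: a minimal driver-side sketch of the two usage modes described in the comment above. The my_flags table and the range variable are hypothetical; only the flags, default_flags and pfn_flags_mask fields and the HMM_PFN_* indices come from the HMM API.)

	static const uint64_t my_flags[HMM_PFN_FLAG_MAX] = {
		[HMM_PFN_VALID]		 = 1UL << 0,
		[HMM_PFN_WRITE]		 = 1UL << 1,
		[HMM_PFN_DEVICE_PRIVATE] = 1UL << 2,
	};
	struct hmm_range range = { .flags = my_flags, /* ... */ };

	/* Mode 1: per-pfn flags were filled by the caller, honor them as-is. */
	range.default_flags = 0;
	range.pfn_flags_mask = -1ULL;

	/* Mode 2: pre-fault the whole range read/write, ignore pfns[] content. */
	range.default_flags = my_flags[HMM_PFN_VALID] | my_flags[HMM_PFN_WRITE];
	range.pfn_flags_mask = 0;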
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+ if (!pud_present(pud))
+ return 0;
+ return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_WRITE] :
+ range->flags[HMM_PFN_VALID];
+}
+
static int hmm_vma_handle_pmd(struct mm_walk *walk,
unsigned long addr,
unsigned long end,
uint64_t *pfns,
pmd_t pmd)
{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long pfn, npages, i;
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
pfn = pmd_pfn(pmd) + pte_index(addr);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ if (pmd_devmap(pmd)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ }
+ pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
hmm_vma_walk->last = end;
return 0;
+#else
+ /* If THP is not enabled then we should never reach this code! */
+ return -EINVAL;
+#endif
}
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
&fault, &write_fault);
if (fault || write_fault)
goto fault;
- *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
+ *pfn = hmm_device_entry_from_pfn(range,
+ swp_offset(entry));
*pfn |= cpu_flags;
return 0;
}
if (fault || write_fault)
goto fault;
- *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
+ if (pte_devmap(pte)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
+ *pfn = range->values[HMM_PFN_SPECIAL];
+ return -EFAULT;
+ }
+
+ *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
return 0;
fault:
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
return r;
}
}
+ if (hmm_vma_walk->pgmap) {
+ /*
+ * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
+ * so that we can leverage the get_dev_pagemap() optimization,
+ * which will not re-take a reference on a pgmap if we already
+ * have one.
+ */
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep - 1);
hmm_vma_walk->last = addr;
return 0;
}
+static int hmm_vma_walk_pud(pud_t *pudp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long addr = start, next;
+ pmd_t *pmdp;
+ pud_t pud;
+ int ret;
+
+again:
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ if (pud_huge(pud) && pud_devmap(pud)) {
+ unsigned long i, npages, pfn;
+ uint64_t *pfns, cpu_flags;
+ bool fault, write_fault;
+
+ if (!pud_present(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+
+ cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ cpu_flags, &fault, &write_fault);
+ if (fault || write_fault)
+ return hmm_vma_walk_hole_(addr, end, fault,
+ write_fault, walk);
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+ cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
+ hmm_vma_walk->last = end;
+ return 0;
+#else
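+ /* Without huge PUD support we should never reach this code. */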
+ return -EINVAL;
+#endif
+ }
+
+ split_huge_pud(walk->vma, pudp, addr);
+ if (pud_none(*pudp))
+ goto again;
+
+ pmdp = pmd_offset(pudp, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ struct hstate *h = hstate_vma(vma);
+ uint64_t orig_pfn, cpu_flags;
+ bool fault, write_fault;
+ spinlock_t *ptl;
+ pte_t entry;
+ int ret = 0;
+
+ size = 1UL << huge_page_shift(h);
+ mask = size - 1;
+ if (range->page_shift != PAGE_SHIFT) {
+ /* Make sure we are looking at a full page. */
+ if (start & mask)
+ return -EINVAL;
+ if (end < (start + size))
+ return -EINVAL;
+ pfn_inc = size >> PAGE_SHIFT;
+ } else {
+ pfn_inc = 1;
+ size = PAGE_SIZE;
+ }
+
+ ptl = huge_pte_lock(h, walk->mm, pte);
+ entry = huge_ptep_get(pte);
+
+ i = (start - range->start) >> range->page_shift;
+ orig_pfn = range->pfns[i];
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+ cpu_flags = pte_to_hmm_pfn_flags(range, entry);
+ fault = write_fault = false;
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
+ if (fault || write_fault) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
+ for (; addr < end; addr += size, i++, pfn += pfn_inc)
+ range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+ cpu_flags;
+ hmm_vma_walk->last = end;
+
+unlock:
+ spin_unlock(ptl);
+
+ if (ret == -ENOENT)
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+
+ return ret;
+#else /* CONFIG_HUGETLB_PAGE */
+ return -EINVAL;
+#endif
+}
+
static void hmm_pfns_clear(struct hmm_range *range,
uint64_t *pfns,
unsigned long addr,
*pfns = range->values[HMM_PFN_NONE];
}
-static void hmm_pfns_special(struct hmm_range *range)
-{
- unsigned long addr = range->start, i = 0;
-
- for (; addr < range->end; addr += PAGE_SIZE, i++)
- range->pfns[i] = range->values[HMM_PFN_SPECIAL];
-}
-
/*
* hmm_range_register() - start tracking change to CPU page table over a range
* @range: range
* @mm: the mm struct for the range of virtual address
* @start: start virtual address (inclusive)
* @end: end virtual address (exclusive)
+ * @page_shift: expected page shift for the range
* Returns 0 on success, -EFAULT if the address space is no longer valid
*
* Track updates to the CPU page table see include/linux/hmm.h
int hmm_range_register(struct hmm_range *range,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ unsigned page_shift)
{
- range->start = start & PAGE_MASK;
- range->end = end & PAGE_MASK;
+ unsigned long mask = ((1UL << page_shift) - 1UL);
+
range->valid = false;
range->hmm = NULL;
- if (range->start >= range->end)
+ if ((start & mask) || (end & mask))
+ return -EINVAL;
+ if (start >= end)
return -EINVAL;
+ range->page_shift = page_shift;
range->start = start;
range->end = end;
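(Illustration only, not part of the patch: existing callers simply pass PAGE_SHIFT to keep the previous behaviour, while a hugetlbfs-aware mirror may pass the backing hstate's shift. The surrounding driver variables are hypothetical.)

	/* Default granularity, same behaviour as before this change. */
	ret = hmm_range_register(&range, mm, start, end, PAGE_SHIFT);

	/* Or, for a range known to be backed by hugetlbfs pages: */
	ret = hmm_range_register(&range, mm, start, end,
				 huge_page_shift(hstate_vma(vma)));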
*/
long hmm_range_snapshot(struct hmm_range *range)
{
+ const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
unsigned long start = range->start, end;
struct hmm_vma_walk hmm_vma_walk;
struct hmm *hmm = range->hmm;
return -EAGAIN;
vma = find_vma(hmm->mm, start);
- if (vma == NULL || (vma->vm_flags & VM_SPECIAL))
+ if (vma == NULL || (vma->vm_flags & device_vma))
return -EFAULT;
- /* FIXME support hugetlb fs/dax */
- if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h = hstate_vma(vma);
+
+ if (huge_page_shift(h) != range->page_shift &&
+ range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ } else {
+ if (range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
}
if (!(vma->vm_flags & VM_READ)) {
}
range->vma = vma;
+ hmm_vma_walk.pgmap = NULL;
hmm_vma_walk.last = start;
hmm_vma_walk.fault = false;
hmm_vma_walk.range = range;
mm_walk.pte_entry = NULL;
mm_walk.test_walk = NULL;
mm_walk.hugetlb_entry = NULL;
+ mm_walk.pud_entry = hmm_vma_walk_pud;
mm_walk.pmd_entry = hmm_vma_walk_pmd;
mm_walk.pte_hole = hmm_vma_walk_hole;
+ mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
walk_page_range(start, end, &mm_walk);
start = end;
* then one of the following values may be returned:
*
* -EINVAL invalid arguments or mm or virtual address are in an
- * invalid vma (ie either hugetlbfs or device file vma).
+ * invalid vma (for instance, a device file vma).
* -ENOMEM: Out of memory.
* -EPERM: Invalid permission (for instance asking for write and
* range is read only).
*/
long hmm_range_fault(struct hmm_range *range, bool block)
{
+ const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
unsigned long start = range->start, end;
struct hmm_vma_walk hmm_vma_walk;
struct hmm *hmm = range->hmm;
}
vma = find_vma(hmm->mm, start);
- if (vma == NULL || (vma->vm_flags & VM_SPECIAL))
+ if (vma == NULL || (vma->vm_flags & device_vma))
return -EFAULT;
- /* FIXME support hugetlb fs/dax */
- if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
+ if (is_vm_hugetlb_page(vma)) {
+ if (huge_page_shift(hstate_vma(vma)) !=
+ range->page_shift &&
+ range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ } else {
+ if (range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
}
if (!(vma->vm_flags & VM_READ)) {
}
range->vma = vma;
+ hmm_vma_walk.pgmap = NULL;
hmm_vma_walk.last = start;
hmm_vma_walk.fault = true;
hmm_vma_walk.block = block;
mm_walk.pte_entry = NULL;
mm_walk.test_walk = NULL;
mm_walk.hugetlb_entry = NULL;
+ mm_walk.pud_entry = hmm_vma_walk_pud;
mm_walk.pmd_entry = hmm_vma_walk_pmd;
mm_walk.pte_hole = hmm_vma_walk_hole;
+ mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
do {
ret = walk_page_range(start, end, &mm_walk);
return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_range_dma_map() - hmm_range_fault() and dma map pages all in one.
+ * @range: range being faulted
+ * @device: device against which to dma map pages
+ * @daddrs: dma address of mapped pages
+ * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
+ * Returns: number of pages mapped on success, -EAGAIN if mmap_sem has been
+ * dropped and you need to try again, some other error value otherwise
+ *
+ * Note same usage pattern as hmm_range_fault().
+ */
+long hmm_range_dma_map(struct hmm_range *range,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool block)
+{
+ unsigned long i, npages, mapped;
+ long ret;
+
+ ret = hmm_range_fault(range, block);
+ if (ret <= 0)
+ return ret ? ret : -EBUSY;
+
+ npages = (range->end - range->start) >> PAGE_SHIFT;
+ for (i = 0, mapped = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
+
+ /*
+ * FIXME: the DMA API should provide an invalid DMA address value
+ * instead of a function to test a dma address. That would remove a
+ * lot of duplicated code across many architectures.
+ *
+ * For now setting it to 0 here is good enough as the pfns[] value
+ * is what is used to check what is valid and what isn't.
+ */
+ daddrs[i] = 0;
+
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ /* Check if range is being invalidated */
+ if (!range->valid) {
+ ret = -EBUSY;
+ goto unmap;
+ }
+
+ /* If it is read and write then map bi-directionally. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
+
+ daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
+ if (dma_mapping_error(device, daddrs[i])) {
+ ret = -EFAULT;
+ goto unmap;
+ }
+
+ mapped++;
+ }
+
+ return mapped;
+
+unmap:
+ for (npages = i, i = 0; (i < npages) && mapped; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
+
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ if (dma_mapping_error(device, daddrs[i]))
+ continue;
+
+ /* If it was read and write it was mapped bi-directionally. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
+
+ dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+ mapped--;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(hmm_range_dma_map);
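(Hedged usage sketch, not part of the patch; my_map_once and its context are hypothetical. The caller has already registered the range and allocated one dma_addr_t per page, e.g. with kvmalloc_array(npages, sizeof(*daddrs), GFP_KERNEL).)

	static long my_map_once(struct hmm_range *range, struct mm_struct *mm,
				struct device *device, dma_addr_t *daddrs)
	{
		long mapped;

		down_read(&mm->mmap_sem);
		mapped = hmm_range_dma_map(range, device, daddrs, true);
		if (mapped == -EAGAIN)
			return -EAGAIN; /* mmap_sem was dropped, caller retries */
		up_read(&mm->mmap_sem);
		return mapped; /* number of pages mapped, or a negative error */
	}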
+
+/**
+ * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
+ * @range: range being unmapped
+ * @vma: the vma against which the range was mapped (optional)
+ * @device: device against which dma map was done
+ * @daddrs: dma address of mapped pages
+ * @dirty: dirty the page if it had the write flag set
+ * Returns: number of pages unmapped on success, -EINVAL otherwise
+ *
+ * Note that the caller MUST abide by mmu notifiers (or use an HMM mirror and
+ * its sync_cpu_device_pagetables() callback) so that it is safe here to call
+ * set_page_dirty(). The caller must also take appropriate locks to prevent a
+ * concurrent mmu notifier or sync_cpu_device_pagetables() from making progress.
+ */
+long hmm_range_dma_unmap(struct hmm_range *range,
+ struct vm_area_struct *vma,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool dirty)
+{
+ unsigned long i, npages;
+ long cpages = 0;
+
+ /* Sanity check. */
+ if (range->end <= range->start)
+ return -EINVAL;
+ if (!daddrs)
+ return -EINVAL;
+ if (!range->pfns)
+ return -EINVAL;
+
+ npages = (range->end - range->start) >> PAGE_SHIFT;
+ for (i = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
+
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ /* If it was read and write it was mapped bi-directionally. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
+ dir = DMA_BIDIRECTIONAL;
+
+ /*
+ * See comments in function description on why it is
+ * safe here to call set_page_dirty()
+ */
+ if (dirty)
+ set_page_dirty(page);
+ }
+
+ /* Unmap and clear pfns/dma address */
+ dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+ /* FIXME see comments in hmm_range_dma_map() */
+ daddrs[i] = 0;
+ cpages++;
+ }
+
+ return cpages;
+}
+EXPORT_SYMBOL(hmm_range_dma_unmap);
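(Matching teardown sketch, also hypothetical and not part of the patch: once the device is done with the range, and with the caller serialized against its own sync_cpu_device_pagetables() callback as required above, unmap and dirty the pages that were mapped for write. NULL is passed for the optional vma argument.)

	long cpages;

	cpages = hmm_range_dma_unmap(&range, NULL, device, daddrs, true);
	if (cpages < 0)
		pr_warn("hmm unmap failed: %ld\n", cpages);
	kvfree(daddrs);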
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */