drivers/infiniband/core/umem_odp.c

   1 /*
   2  * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
   3  *
   4  * This software is available to you under a choice of one of two
   5  * licenses.  You may choose to be licensed under the terms of the GNU
   6  * General Public License (GPL) Version 2, available from the file
   7  * COPYING in the main directory of this source tree, or the
   8  * OpenIB.org BSD license below:
   9  *
  10  *     Redistribution and use in source and binary forms, with or
  11  *     without modification, are permitted provided that the following
  12  *     conditions are met:
  13  *
  14  *      - Redistributions of source code must retain the above
  15  *        copyright notice, this list of conditions and the following
  16  *        disclaimer.
  17  *
  18  *      - Redistributions in binary form must reproduce the above
  19  *        copyright notice, this list of conditions and the following
  20  *        disclaimer in the documentation and/or other materials
  21  *        provided with the distribution.
  22  *
  23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30  * SOFTWARE.
  31  */
  32
  33 #include <linux/types.h>
  34 #include <linux/sched.h>
  35 #include <linux/sched/mm.h>
  36 #include <linux/sched/task.h>
  37 #include <linux/pid.h>
  38 #include <linux/slab.h>
  39 #include <linux/export.h>
  40 #include <linux/vmalloc.h>
  41 #include <linux/hugetlb.h>
  42 #include <linux/interval_tree.h>
  43 #include <linux/pagemap.h>
  44
  45 #include <rdma/ib_verbs.h>
  46 #include <rdma/ib_umem.h>
  47 #include <rdma/ib_umem_odp.h>
  48
  49 #include "uverbs.h"
  50
  51 static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
  52 {
  53         mutex_lock(&umem_odp->umem_mutex);
  54         if (umem_odp->notifiers_count++ == 0)
  55                 /*
  56                  * Initialize the completion object for waiting on
  57                  * notifiers. Since notifier_count is zero, no one should be
  58                  * waiting right now.
  59                  */
  60                 reinit_completion(&umem_odp->notifier_completion);
  61         mutex_unlock(&umem_odp->umem_mutex);
  62 }
  63
  64 static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
  65 {
  66         mutex_lock(&umem_odp->umem_mutex);
  67         /*
  68          * This sequence increase will notify the QP page fault that the page
  69          * that is going to be mapped in the spte could have been freed.
  70          */
  71         ++umem_odp->notifiers_seq;
  72         if (--umem_odp->notifiers_count == 0)
  73                 complete_all(&umem_odp->notifier_completion);
  74         mutex_unlock(&umem_odp->umem_mutex);
  75 }
  76
  77 static void ib_umem_notifier_release(struct mmu_notifier *mn,
  78                                      struct mm_struct *mm)
  79 {
  80         struct ib_ucontext_per_mm *per_mm =
  81                 container_of(mn, struct ib_ucontext_per_mm, mn);
  82         struct rb_node *node;
  83
  84         down_read(&per_mm->umem_rwsem);
  85         if (!per_mm->mn.users)
  86                 goto out;
  87
  88         for (node = rb_first_cached(&per_mm->umem_tree); node;
  89              node = rb_next(node)) {
  90                 struct ib_umem_odp *umem_odp =
  91                         rb_entry(node, struct ib_umem_odp, interval_tree.rb);
  92
  93                 /*
  94                  * Increase the number of notifiers running, to prevent any
  95                  * further fault handling on this MR.
  96                  */
  97                 ib_umem_notifier_start_account(umem_odp);
  98                 complete_all(&umem_odp->notifier_completion);
  99                 umem_odp->umem.ibdev->ops.invalidate_range(
 100                         umem_odp, ib_umem_start(umem_odp),
 101                         ib_umem_end(umem_odp));
 102         }
 103
 104 out:
 105         up_read(&per_mm->umem_rwsem);
 106 }
 107
 108 static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
 109                                              u64 start, u64 end, void *cookie)
 110 {
 111         ib_umem_notifier_start_account(item);
 112         item->umem.ibdev->ops.invalidate_range(item, start, end);
 113         return 0;
 114 }
 115
 116 static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
 117                                 const struct mmu_notifier_range *range)
 118 {
 119         struct ib_ucontext_per_mm *per_mm =
 120                 container_of(mn, struct ib_ucontext_per_mm, mn);
 121         int rc;
 122
 123         if (mmu_notifier_range_blockable(range))
 124                 down_read(&per_mm->umem_rwsem);
 125         else if (!down_read_trylock(&per_mm->umem_rwsem))
 126                 return -EAGAIN;
 127
 128         if (!per_mm->mn.users) {
 129                 up_read(&per_mm->umem_rwsem);
 130                 /*
 131                  * At this point users is permanently zero and visible to this
 132                  * CPU without a lock, that fact is relied on to skip the unlock
 133                  * in range_end.
 134                  */
 135                 return 0;
 136         }
 137
 138         rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
 139                                            range->end,
 140                                            invalidate_range_start_trampoline,
 141                                            mmu_notifier_range_blockable(range),
 142                                            NULL);
 143         if (rc)
 144                 up_read(&per_mm->umem_rwsem);
 145         return rc;
 146 }
 147
 148 static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
 149                                            u64 end, void *cookie)
 150 {
 151         ib_umem_notifier_end_account(item);
 152         return 0;
 153 }
 154
 155 static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
 156                                 const struct mmu_notifier_range *range)
 157 {
 158         struct ib_ucontext_per_mm *per_mm =
 159                 container_of(mn, struct ib_ucontext_per_mm, mn);
 160
 161         if (unlikely(!per_mm->mn.users))
 162                 return;
 163
 164         rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
 165                                       range->end,
 166                                       invalidate_range_end_trampoline, true, NULL);
 167         up_read(&per_mm->umem_rwsem);
 168 }
 169
 170 static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm)
 171 {
 172         struct ib_ucontext_per_mm *per_mm;
 173
 174         per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
 175         if (!per_mm)
 176                 return ERR_PTR(-ENOMEM);
 177
 178         per_mm->umem_tree = RB_ROOT_CACHED;
 179         init_rwsem(&per_mm->umem_rwsem);
 180
 181         WARN_ON(mm != current->mm);
 182         rcu_read_lock();
 183         per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
 184         rcu_read_unlock();
 185         return &per_mm->mn;
 186 }
 187
 188 static void ib_umem_free_notifier(struct mmu_notifier *mn)
 189 {
 190         struct ib_ucontext_per_mm *per_mm =
 191                 container_of(mn, struct ib_ucontext_per_mm, mn);
 192
 193         WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
 194
 195         put_pid(per_mm->tgid);
 196         kfree(per_mm);
 197 }
 198
 199 static const struct mmu_notifier_ops ib_umem_notifiers = {
 200         .release                    = ib_umem_notifier_release,
 201         .invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
 202         .invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
 203         .alloc_notifier             = ib_umem_alloc_notifier,
 204         .free_notifier              = ib_umem_free_notifier,
 205 };
 206
 207 static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
 208 {
 209         struct ib_ucontext_per_mm *per_mm;
 210         struct mmu_notifier *mn;
 211         int ret;
 212
 213         umem_odp->umem.is_odp = 1;
 214         if (!umem_odp->is_implicit_odp) {
 215                 size_t page_size = 1UL << umem_odp->page_shift;
 216                 size_t pages;
 217
 218                 umem_odp->interval_tree.start =
 219                         ALIGN_DOWN(umem_odp->umem.address, page_size);
 220                 if (check_add_overflow(umem_odp->umem.address,
 221                                        (unsigned long)umem_odp->umem.length,
 222                                        &umem_odp->interval_tree.last))
 223                         return -EOVERFLOW;
 224                 umem_odp->interval_tree.last =
 225                         ALIGN(umem_odp->interval_tree.last, page_size);
 226                 if (unlikely(umem_odp->interval_tree.last < page_size))
 227                         return -EOVERFLOW;
 228
 229                 pages = (umem_odp->interval_tree.last -
 230                          umem_odp->interval_tree.start) >>
 231                         umem_odp->page_shift;
 232                 if (!pages)
 233                         return -EINVAL;
 234
 235                 /*
 236                  * Note that the representation of the intervals in the
 237                  * interval tree considers the ending point as contained in
 238                  * the interval.
 239                  */
 240                 umem_odp->interval_tree.last--;
 241
 242                 umem_odp->page_list = kvcalloc(
 243                         pages, sizeof(*umem_odp->page_list), GFP_KERNEL);
 244                 if (!umem_odp->page_list)
 245                         return -ENOMEM;
 246
 247                 umem_odp->dma_list = kvcalloc(
 248                         pages, sizeof(*umem_odp->dma_list), GFP_KERNEL);
 249                 if (!umem_odp->dma_list) {
 250                         ret = -ENOMEM;
 251                         goto out_page_list;
 252                 }
 253         }
 254
 255         mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm);
 256         if (IS_ERR(mn)) {
 257                 ret = PTR_ERR(mn);
 258                 goto out_dma_list;
 259         }
 260         umem_odp->per_mm = per_mm =
 261                 container_of(mn, struct ib_ucontext_per_mm, mn);
 262
 263         mutex_init(&umem_odp->umem_mutex);
 264         init_completion(&umem_odp->notifier_completion);
 265
 266         if (!umem_odp->is_implicit_odp) {
 267                 down_write(&per_mm->umem_rwsem);
 268                 interval_tree_insert(&umem_odp->interval_tree,
 269                                      &per_mm->umem_tree);
 270                 up_write(&per_mm->umem_rwsem);
 271         }
 272         mmgrab(umem_odp->umem.owning_mm);
 273
 274         return 0;
 275
 276 out_dma_list:
 277         kvfree(umem_odp->dma_list);
 278 out_page_list:
 279         kvfree(umem_odp->page_list);
 280         return ret;
 281 }
 282
 283 /**
 284  * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
 285  *
 286  * Implicit ODP umems do not have a VA range and do not have any page lists.
 287  * They exist only to hold the per_mm reference to help the driver create
 288  * children umems.
 289  *
 290  * @udata: udata from the syscall being used to create the umem
 291  * @access: ib_reg_mr access flags
 292  */
 293 struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
 294                                                int access)
 295 {
 296         struct ib_ucontext *context =
 297                 container_of(udata, struct uverbs_attr_bundle, driver_udata)
 298                         ->context;
 299         struct ib_umem *umem;
 300         struct ib_umem_odp *umem_odp;
 301         int ret;
 302
 303         if (access & IB_ACCESS_HUGETLB)
 304                 return ERR_PTR(-EINVAL);
 305
 306         if (!context)
 307                 return ERR_PTR(-EIO);
 308         if (WARN_ON_ONCE(!context->device->ops.invalidate_range))
 309                 return ERR_PTR(-EINVAL);
 310
 311         umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
 312         if (!umem_odp)
 313                 return ERR_PTR(-ENOMEM);
 314         umem = &umem_odp->umem;
 315         umem->ibdev = context->device;
 316         umem->writable = ib_access_writable(access);
 317         umem->owning_mm = current->mm;
 318         umem_odp->is_implicit_odp = 1;
 319         umem_odp->page_shift = PAGE_SHIFT;
 320
 321         ret = ib_init_umem_odp(umem_odp);
 322         if (ret) {
 323                 kfree(umem_odp);
 324                 return ERR_PTR(ret);
 325         }
 326         return umem_odp;
 327 }
 328 EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
 329
 330 /**
 331  * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
 332  *                           parent ODP umem
 333  *
 334  * @root: The parent umem enclosing the child. This must be allocated using
 335  *        ib_alloc_implicit_odp_umem()
 336  * @addr: The starting userspace VA
 337  * @size: The length of the userspace VA
 338  */
 339 struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
 340                                             unsigned long addr, size_t size)
 341 {
 342         /*
 343          * Caller must ensure that root cannot be freed during the call to
 344          * ib_alloc_odp_umem.
 345          */
 346         struct ib_umem_odp *odp_data;
 347         struct ib_umem *umem;
 348         int ret;
 349
 350         if (WARN_ON(!root->is_implicit_odp))
 351                 return ERR_PTR(-EINVAL);
 352
 353         odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
 354         if (!odp_data)
 355                 return ERR_PTR(-ENOMEM);
 356         umem = &odp_data->umem;
 357         umem->ibdev = root->umem.ibdev;
 358         umem->length     = size;
 359         umem->address    = addr;
 360         umem->writable   = root->umem.writable;
 361         umem->owning_mm  = root->umem.owning_mm;
 362         odp_data->page_shift = PAGE_SHIFT;
 363
 364         ret = ib_init_umem_odp(odp_data);
 365         if (ret) {
 366                 kfree(odp_data);
 367                 return ERR_PTR(ret);
 368         }
 369         return odp_data;
 370 }
 371 EXPORT_SYMBOL(ib_umem_odp_alloc_child);
 372
 373 /**
 374  * ib_umem_odp_get - Create a umem_odp for a userspace va
 375  *
 376  * @udata: userspace context to pin memory for
 377  * @addr: userspace virtual address to start at
 378  * @size: length of region to pin
 379  * @access: IB_ACCESS_xxx flags for memory being pinned
 380  *
 381  * The driver should use when the access flags indicate ODP memory. It avoids
 382  * pinning, instead, stores the mm for future page fault handling in
 383  * conjunction with MMU notifiers.
 384  */
 385 struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
 386                                     size_t size, int access)
 387 {
 388         struct ib_umem_odp *umem_odp;
 389         struct ib_ucontext *context;
 390         struct mm_struct *mm;
 391         int ret;
 392
 393         if (!udata)
 394                 return ERR_PTR(-EIO);
 395
 396         context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
 397                           ->context;
 398         if (!context)
 399                 return ERR_PTR(-EIO);
 400
 401         if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) ||
 402             WARN_ON_ONCE(!context->device->ops.invalidate_range))
 403                 return ERR_PTR(-EINVAL);
 404
 405         umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
 406         if (!umem_odp)
 407                 return ERR_PTR(-ENOMEM);
 408
 409         umem_odp->umem.ibdev = context->device;
 410         umem_odp->umem.length = size;
 411         umem_odp->umem.address = addr;
 412         umem_odp->umem.writable = ib_access_writable(access);
 413         umem_odp->umem.owning_mm = mm = current->mm;
 414
 415         umem_odp->page_shift = PAGE_SHIFT;
 416         if (access & IB_ACCESS_HUGETLB) {
 417                 struct vm_area_struct *vma;
 418                 struct hstate *h;
 419
 420                 down_read(&mm->mmap_sem);
 421                 vma = find_vma(mm, ib_umem_start(umem_odp));
 422                 if (!vma || !is_vm_hugetlb_page(vma)) {
 423                         up_read(&mm->mmap_sem);
 424                         ret = -EINVAL;
 425                         goto err_free;
 426                 }
 427                 h = hstate_vma(vma);
 428                 umem_odp->page_shift = huge_page_shift(h);
 429                 up_read(&mm->mmap_sem);
 430         }
 431
 432         ret = ib_init_umem_odp(umem_odp);
 433         if (ret)
 434                 goto err_free;
 435         return umem_odp;
 436
 437 err_free:
 438         kfree(umem_odp);
 439         return ERR_PTR(ret);
 440 }
 441 EXPORT_SYMBOL(ib_umem_odp_get);
 442
 443 void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 444 {
 445         struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
 446
 447         /*
 448          * Ensure that no more pages are mapped in the umem.
 449          *
 450          * It is the driver's responsibility to ensure, before calling us,
 451          * that the hardware will not attempt to access the MR any more.
 452          */
 453         if (!umem_odp->is_implicit_odp) {
 454                 mutex_lock(&umem_odp->umem_mutex);
 455                 ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
 456                                             ib_umem_end(umem_odp));
 457                 mutex_unlock(&umem_odp->umem_mutex);
 458                 kvfree(umem_odp->dma_list);
 459                 kvfree(umem_odp->page_list);
 460         }
 461
 462         down_write(&per_mm->umem_rwsem);
 463         if (!umem_odp->is_implicit_odp) {
 464                 interval_tree_remove(&umem_odp->interval_tree,
 465                                      &per_mm->umem_tree);
 466                 complete_all(&umem_odp->notifier_completion);
 467         }
 468         /*
 469          * NOTE! mmu_notifier_unregister() can happen between a start/end
 470          * callback, resulting in a missing end, and thus an unbalanced
 471          * lock. This doesn't really matter to us since we are about to kfree
 472          * the memory that holds the lock, however LOCKDEP doesn't like this.
 473          * Thus we call the mmu_notifier_put under the rwsem and test the
 474          * internal users count to reliably see if we are past this point.
 475          */
 476         mmu_notifier_put(&per_mm->mn);
 477         up_write(&per_mm->umem_rwsem);
 478
 479         mmdrop(umem_odp->umem.owning_mm);
 480         kfree(umem_odp);
 481 }
 482 EXPORT_SYMBOL(ib_umem_odp_release);
 483
 484 /*
 485  * Map for DMA and insert a single page into the on-demand paging page tables.
 486  *
 487  * @umem: the umem to insert the page to.
 488  * @page_index: index in the umem to add the page to.
 489  * @page: the page struct to map and add.
 490  * @access_mask: access permissions needed for this page.
 491  * @current_seq: sequence number for synchronization with invalidations.
 492  *               the sequence number is taken from
 493  *               umem_odp->notifiers_seq.
 494  *
 495  * The function returns -EFAULT if the DMA mapping operation fails. It returns
 496  * -EAGAIN if a concurrent invalidation prevents us from updating the page.
 497  *
 498  * The page is released via put_user_page even if the operation failed. For
 499  * on-demand pinning, the page is released whenever it isn't stored in the
 500  * umem.
 501  */
 502 static int ib_umem_odp_map_dma_single_page(
 503                 struct ib_umem_odp *umem_odp,
 504                 int page_index,
 505                 struct page *page,
 506                 u64 access_mask,
 507                 unsigned long current_seq)
 508 {
 509         struct ib_device *dev = umem_odp->umem.ibdev;
 510         dma_addr_t dma_addr;
 511         int ret = 0;
 512
 513         /*
 514          * Note: we avoid writing if seq is different from the initial seq, to
 515          * handle case of a racing notifier. This check also allows us to bail
 516          * early if we have a notifier running in parallel with us.
 517          */
 518         if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) {
 519                 ret = -EAGAIN;
 520                 goto out;
 521         }
 522         if (!(umem_odp->dma_list[page_index])) {
 523                 dma_addr =
 524                         ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift),
 525                                         DMA_BIDIRECTIONAL);
 526                 if (ib_dma_mapping_error(dev, dma_addr)) {
 527                         ret = -EFAULT;
 528                         goto out;
 529                 }
 530                 umem_odp->dma_list[page_index] = dma_addr | access_mask;
 531                 umem_odp->page_list[page_index] = page;
 532                 umem_odp->npages++;
 533         } else if (umem_odp->page_list[page_index] == page) {
 534                 umem_odp->dma_list[page_index] |= access_mask;
 535         } else {
 536                 /*
 537                  * This is a race here where we could have done:
 538                  *
 539                  *         CPU0                             CPU1
 540                  *   get_user_pages()
 541                  *                                       invalidate()
 542                  *                                       page_fault()
 543                  *   mutex_lock(umem_mutex)
 544                  *    page from GUP != page in ODP
 545                  *
 546                  * It should be prevented by the retry test above as reading
 547                  * the seq number should be reliable under the
 548                  * umem_mutex. Thus something is really not working right if
 549                  * things get here.
 550                  */
 551                 WARN(true,
 552                      "Got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
 553                      umem_odp->page_list[page_index], page);
 554                 ret = -EAGAIN;
 555         }
 556
 557 out:
 558         put_user_page(page);
 559         return ret;
 560 }
 561
 562 /**
 563  * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 564  *
 565  * Pins the range of pages passed in the argument, and maps them to
 566  * DMA addresses. The DMA addresses of the mapped pages is updated in
 567  * umem_odp->dma_list.
 568  *
 569  * Returns the number of pages mapped in success, negative error code
 570  * for failure.
 571  * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 572  * the function from completing its task.
 573  * An -ENOENT error code indicates that userspace process is being terminated
 574  * and mm was already destroyed.
 575  * @umem_odp: the umem to map and pin
 576  * @user_virt: the address from which we need to map.
 577  * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 578  *        bigger due to alignment, and may also be smaller in case of an error
 579  *        pinning or mapping a page. The actual pages mapped is returned in
 580  *        the return value.
 581  * @access_mask: bit mask of the requested access permissions for the given
 582  *               range.
 583  * @current_seq: the MMU notifiers sequance value for synchronization with
 584  *               invalidations. the sequance number is read from
 585  *               umem_odp->notifiers_seq before calling this function
 586  */
 587 int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 588                               u64 bcnt, u64 access_mask,
 589                               unsigned long current_seq)
 590 {
 591         struct task_struct *owning_process  = NULL;
 592         struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
 593         struct page       **local_page_list = NULL;
 594         u64 page_mask, off;
 595         int j, k, ret = 0, start_idx, npages = 0;
 596         unsigned int flags = 0, page_shift;
 597         phys_addr_t p = 0;
 598
 599         if (access_mask == 0)
 600                 return -EINVAL;
 601
 602         if (user_virt < ib_umem_start(umem_odp) ||
 603             user_virt + bcnt > ib_umem_end(umem_odp))
 604                 return -EFAULT;
 605
 606         local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
 607         if (!local_page_list)
 608                 return -ENOMEM;
 609
 610         page_shift = umem_odp->page_shift;
 611         page_mask = ~(BIT(page_shift) - 1);
 612         off = user_virt & (~page_mask);
 613         user_virt = user_virt & page_mask;
 614         bcnt += off; /* Charge for the first page offset as well. */
 615
 616         /*
 617          * owning_process is allowed to be NULL, this means somehow the mm is
 618          * existing beyond the lifetime of the originating process.. Presumably
 619          * mmget_not_zero will fail in this case.
 620          */
 621         owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
 622         if (!owning_process || !mmget_not_zero(owning_mm)) {
 623                 ret = -EINVAL;
 624                 goto out_put_task;
 625         }
 626
 627         if (access_mask & ODP_WRITE_ALLOWED_BIT)
 628                 flags |= FOLL_WRITE;
 629
 630         start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift;
 631         k = start_idx;
 632
 633         while (bcnt > 0) {
 634                 const size_t gup_num_pages = min_t(size_t,
 635                                 (bcnt + BIT(page_shift) - 1) >> page_shift,
 636                                 PAGE_SIZE / sizeof(struct page *));
 637
 638                 down_read(&owning_mm->mmap_sem);
 639                 /*
 640                  * Note: this might result in redundent page getting. We can
 641                  * avoid this by checking dma_list to be 0 before calling
 642                  * get_user_pages. However, this make the code much more
 643                  * complex (and doesn't gain us much performance in most use
 644                  * cases).
 645                  */
 646                 npages = get_user_pages_remote(owning_process, owning_mm,
 647                                 user_virt, gup_num_pages,
 648                                 flags, local_page_list, NULL, NULL);
 649                 up_read(&owning_mm->mmap_sem);
 650
 651                 if (npages < 0) {
 652                         if (npages != -EAGAIN)
 653                                 pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
 654                         else
 655                                 pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
 656                         break;
 657                 }
 658
 659                 bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
 660                 mutex_lock(&umem_odp->umem_mutex);
 661                 for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) {
 662                         if (user_virt & ~page_mask) {
 663                                 p += PAGE_SIZE;
 664                                 if (page_to_phys(local_page_list[j]) != p) {
 665                                         ret = -EFAULT;
 666                                         break;
 667                                 }
 668                                 put_user_page(local_page_list[j]);
 669                                 continue;
 670                         }
 671
 672                         ret = ib_umem_odp_map_dma_single_page(
 673                                         umem_odp, k, local_page_list[j],
 674                                         access_mask, current_seq);
 675                         if (ret < 0) {
 676                                 if (ret != -EAGAIN)
 677                                         pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
 678                                 else
 679                                         pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
 680                                 break;
 681                         }
 682
 683                         p = page_to_phys(local_page_list[j]);
 684                         k++;
 685                 }
 686                 mutex_unlock(&umem_odp->umem_mutex);
 687
 688                 if (ret < 0) {
 689                         /*
 690                          * Release pages, remembering that the first page
 691                          * to hit an error was already released by
 692                          * ib_umem_odp_map_dma_single_page().
 693                          */
 694                         if (npages - (j + 1) > 0)
 695                                 put_user_pages(&local_page_list[j+1],
 696                                                npages - (j + 1));
 697                         break;
 698                 }
 699         }
 700
 701         if (ret >= 0) {
 702                 if (npages < 0 && k == start_idx)
 703                         ret = npages;
 704                 else
 705                         ret = k - start_idx;
 706         }
 707
 708         mmput(owning_mm);
 709 out_put_task:
 710         if (owning_process)
 711                 put_task_struct(owning_process);
 712         free_page((unsigned long)local_page_list);
 713         return ret;
 714 }
 715 EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
 716
 717 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 718                                  u64 bound)
 719 {
 720         int idx;
 721         u64 addr;
 722         struct ib_device *dev = umem_odp->umem.ibdev;
 723
 724         lockdep_assert_held(&umem_odp->umem_mutex);
 725
 726         virt = max_t(u64, virt, ib_umem_start(umem_odp));
 727         bound = min_t(u64, bound, ib_umem_end(umem_odp));
 728         /* Note that during the run of this function, the
 729          * notifiers_count of the MR is > 0, preventing any racing
 730          * faults from completion. We might be racing with other
 731          * invalidations, so we must make sure we free each page only
 732          * once. */
 733         for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
 734                 idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 735                 if (umem_odp->page_list[idx]) {
 736                         struct page *page = umem_odp->page_list[idx];
 737                         dma_addr_t dma = umem_odp->dma_list[idx];
 738                         dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
 739
 740                         WARN_ON(!dma_addr);
 741
 742                         ib_dma_unmap_page(dev, dma_addr,
 743                                           BIT(umem_odp->page_shift),
 744                                           DMA_BIDIRECTIONAL);
 745                         if (dma & ODP_WRITE_ALLOWED_BIT) {
 746                                 struct page *head_page = compound_head(page);
 747                                 /*
 748                                  * set_page_dirty prefers being called with
 749                                  * the page lock. However, MMU notifiers are
 750                                  * called sometimes with and sometimes without
 751                                  * the lock. We rely on the umem_mutex instead
 752                                  * to prevent other mmu notifiers from
 753                                  * continuing and allowing the page mapping to
 754                                  * be removed.
 755                                  */
 756                                 set_page_dirty(head_page);
 757                         }
 758                         umem_odp->page_list[idx] = NULL;
 759                         umem_odp->dma_list[idx] = 0;
 760                         umem_odp->npages--;
 761                 }
 762         }
 763 }
 764 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
 765
 766 /* @last is not a part of the interval. See comment for function
 767  * node_last.
 768  */
 769 int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 770                                   u64 start, u64 last,
 771                                   umem_call_back cb,
 772                                   bool blockable,
 773                                   void *cookie)
 774 {
 775         int ret_val = 0;
 776         struct interval_tree_node *node, *next;
 777         struct ib_umem_odp *umem;
 778
 779         if (unlikely(start == last))
 780                 return ret_val;
 781
 782         for (node = interval_tree_iter_first(root, start, last - 1);
 783                         node; node = next) {
 784                 /* TODO move the blockable decision up to the callback */
 785                 if (!blockable)
 786                         return -EAGAIN;
 787                 next = interval_tree_iter_next(node, start, last - 1);
 788                 umem = container_of(node, struct ib_umem_odp, interval_tree);
 789                 ret_val = cb(umem, start, last, cookie) || ret_val;
 790         }
 791
 792         return ret_val;
 793 }