drivers/gpu/drm/i915/i915_gem.c

   1 /*
   2  * Copyright © 2008-2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include <drm/drmP.h>
  29 #include <drm/drm_vma_manager.h>
  30 #include <drm/i915_drm.h>
  31 #include "i915_drv.h"
  32 #include "i915_gem_clflush.h"
  33 #include "i915_vgpu.h"
  34 #include "i915_trace.h"
  35 #include "intel_drv.h"
  36 #include "intel_frontbuffer.h"
  37 #include "intel_mocs.h"
  38 #include "i915_gemfs.h"
  39 #include <linux/dma-fence-array.h>
  40 #include <linux/kthread.h>
  41 #include <linux/reservation.h>
  42 #include <linux/shmem_fs.h>
  43 #include <linux/slab.h>
  44 #include <linux/stop_machine.h>
  45 #include <linux/swap.h>
  46 #include <linux/pci.h>
  47 #include <linux/dma-buf.h>
  48
  49 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
  50
  51 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
  52 {
  53         if (obj->cache_dirty)
  54                 return false;
  55
  56         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
  57                 return true;
  58
  59         return obj->pin_global; /* currently in use by HW, keep flushed */
  60 }
  61
  62 static int
  63 insert_mappable_node(struct i915_ggtt *ggtt,
  64                      struct drm_mm_node *node, u32 size)
  65 {
  66         memset(node, 0, sizeof(*node));
  67         return drm_mm_insert_node_in_range(&ggtt->base.mm, node,
  68                                            size, 0, I915_COLOR_UNEVICTABLE,
  69                                            0, ggtt->mappable_end,
  70                                            DRM_MM_INSERT_LOW);
  71 }
  72
  73 static void
  74 remove_mappable_node(struct drm_mm_node *node)
  75 {
  76         drm_mm_remove_node(node);
  77 }
  78
  79 /* some bookkeeping */
  80 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
  81                                   u64 size)
  82 {
  83         spin_lock(&dev_priv->mm.object_stat_lock);
  84         dev_priv->mm.object_count++;
  85         dev_priv->mm.object_memory += size;
  86         spin_unlock(&dev_priv->mm.object_stat_lock);
  87 }
  88
  89 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
  90                                      u64 size)
  91 {
  92         spin_lock(&dev_priv->mm.object_stat_lock);
  93         dev_priv->mm.object_count--;
  94         dev_priv->mm.object_memory -= size;
  95         spin_unlock(&dev_priv->mm.object_stat_lock);
  96 }
  97
  98 static int
  99 i915_gem_wait_for_error(struct i915_gpu_error *error)
 100 {
 101         int ret;
 102
 103         might_sleep();
 104
 105         /*
 106          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 107          * userspace. If it takes that long something really bad is going on and
 108          * we should simply try to bail out and fail as gracefully as possible.
 109          */
 110         ret = wait_event_interruptible_timeout(error->reset_queue,
 111                                                !i915_reset_backoff(error),
 112                                                I915_RESET_TIMEOUT);
 113         if (ret == 0) {
 114                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 115                 return -EIO;
 116         } else if (ret < 0) {
 117                 return ret;
 118         } else {
 119                 return 0;
 120         }
 121 }
 122
 123 int i915_mutex_lock_interruptible(struct drm_device *dev)
 124 {
 125         struct drm_i915_private *dev_priv = to_i915(dev);
 126         int ret;
 127
 128         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
 129         if (ret)
 130                 return ret;
 131
 132         ret = mutex_lock_interruptible(&dev->struct_mutex);
 133         if (ret)
 134                 return ret;
 135
 136         return 0;
 137 }
 138
 139 int
 140 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 141                             struct drm_file *file)
 142 {
 143         struct drm_i915_private *dev_priv = to_i915(dev);
 144         struct i915_ggtt *ggtt = &dev_priv->ggtt;
 145         struct drm_i915_gem_get_aperture *args = data;
 146         struct i915_vma *vma;
 147         u64 pinned;
 148
 149         pinned = ggtt->base.reserved;
 150         mutex_lock(&dev->struct_mutex);
 151         list_for_each_entry(vma, &ggtt->base.active_list, vm_link)
 152                 if (i915_vma_is_pinned(vma))
 153                         pinned += vma->node.size;
 154         list_for_each_entry(vma, &ggtt->base.inactive_list, vm_link)
 155                 if (i915_vma_is_pinned(vma))
 156                         pinned += vma->node.size;
 157         mutex_unlock(&dev->struct_mutex);
 158
 159         args->aper_size = ggtt->base.total;
 160         args->aper_available_size = args->aper_size - pinned;
 161
 162         return 0;
 163 }
 164
 165 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 166 {
 167         struct address_space *mapping = obj->base.filp->f_mapping;
 168         drm_dma_handle_t *phys;
 169         struct sg_table *st;
 170         struct scatterlist *sg;
 171         char *vaddr;
 172         int i;
 173         int err;
 174
 175         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
 176                 return -EINVAL;
 177
 178         /* Always aligning to the object size, allows a single allocation
 179          * to handle all possible callers, and given typical object sizes,
 180          * the alignment of the buddy allocation will naturally match.
 181          */
 182         phys = drm_pci_alloc(obj->base.dev,
 183                              roundup_pow_of_two(obj->base.size),
 184                              roundup_pow_of_two(obj->base.size));
 185         if (!phys)
 186                 return -ENOMEM;
 187
 188         vaddr = phys->vaddr;
 189         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 190                 struct page *page;
 191                 char *src;
 192
 193                 page = shmem_read_mapping_page(mapping, i);
 194                 if (IS_ERR(page)) {
 195                         err = PTR_ERR(page);
 196                         goto err_phys;
 197                 }
 198
 199                 src = kmap_atomic(page);
 200                 memcpy(vaddr, src, PAGE_SIZE);
 201                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
 202                 kunmap_atomic(src);
 203
 204                 put_page(page);
 205                 vaddr += PAGE_SIZE;
 206         }
 207
 208         i915_gem_chipset_flush(to_i915(obj->base.dev));
 209
 210         st = kmalloc(sizeof(*st), GFP_KERNEL);
 211         if (!st) {
 212                 err = -ENOMEM;
 213                 goto err_phys;
 214         }
 215
 216         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
 217                 kfree(st);
 218                 err = -ENOMEM;
 219                 goto err_phys;
 220         }
 221
 222         sg = st->sgl;
 223         sg->offset = 0;
 224         sg->length = obj->base.size;
 225
 226         sg_dma_address(sg) = phys->busaddr;
 227         sg_dma_len(sg) = obj->base.size;
 228
 229         obj->phys_handle = phys;
 230
 231         __i915_gem_object_set_pages(obj, st, sg->length);
 232
 233         return 0;
 234
 235 err_phys:
 236         drm_pci_free(obj->base.dev, phys);
 237
 238         return err;
 239 }
 240
 241 static void __start_cpu_write(struct drm_i915_gem_object *obj)
 242 {
 243         obj->read_domains = I915_GEM_DOMAIN_CPU;
 244         obj->write_domain = I915_GEM_DOMAIN_CPU;
 245         if (cpu_write_needs_clflush(obj))
 246                 obj->cache_dirty = true;
 247 }
 248
 249 static void
 250 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 251                                 struct sg_table *pages,
 252                                 bool needs_clflush)
 253 {
 254         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
 255
 256         if (obj->mm.madv == I915_MADV_DONTNEED)
 257                 obj->mm.dirty = false;
 258
 259         if (needs_clflush &&
 260             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
 261             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
 262                 drm_clflush_sg(pages);
 263
 264         __start_cpu_write(obj);
 265 }
 266
 267 static void
 268 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
 269                                struct sg_table *pages)
 270 {
 271         __i915_gem_object_release_shmem(obj, pages, false);
 272
 273         if (obj->mm.dirty) {
 274                 struct address_space *mapping = obj->base.filp->f_mapping;
 275                 char *vaddr = obj->phys_handle->vaddr;
 276                 int i;
 277
 278                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 279                         struct page *page;
 280                         char *dst;
 281
 282                         page = shmem_read_mapping_page(mapping, i);
 283                         if (IS_ERR(page))
 284                                 continue;
 285
 286                         dst = kmap_atomic(page);
 287                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
 288                         memcpy(dst, vaddr, PAGE_SIZE);
 289                         kunmap_atomic(dst);
 290
 291                         set_page_dirty(page);
 292                         if (obj->mm.madv == I915_MADV_WILLNEED)
 293                                 mark_page_accessed(page);
 294                         put_page(page);
 295                         vaddr += PAGE_SIZE;
 296                 }
 297                 obj->mm.dirty = false;
 298         }
 299
 300         sg_free_table(pages);
 301         kfree(pages);
 302
 303         drm_pci_free(obj->base.dev, obj->phys_handle);
 304 }
 305
 306 static void
 307 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
 308 {
 309         i915_gem_object_unpin_pages(obj);
 310 }
 311
 312 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
 313         .get_pages = i915_gem_object_get_pages_phys,
 314         .put_pages = i915_gem_object_put_pages_phys,
 315         .release = i915_gem_object_release_phys,
 316 };
 317
 318 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
 319
 320 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 321 {
 322         struct i915_vma *vma;
 323         LIST_HEAD(still_in_list);
 324         int ret;
 325
 326         lockdep_assert_held(&obj->base.dev->struct_mutex);
 327
 328         /* Closed vma are removed from the obj->vma_list - but they may
 329          * still have an active binding on the object. To remove those we
 330          * must wait for all rendering to complete to the object (as unbinding
 331          * must anyway), and retire the requests.
 332          */
 333         ret = i915_gem_object_set_to_cpu_domain(obj, false);
 334         if (ret)
 335                 return ret;
 336
 337         while ((vma = list_first_entry_or_null(&obj->vma_list,
 338                                                struct i915_vma,
 339                                                obj_link))) {
 340                 list_move_tail(&vma->obj_link, &still_in_list);
 341                 ret = i915_vma_unbind(vma);
 342                 if (ret)
 343                         break;
 344         }
 345         list_splice(&still_in_list, &obj->vma_list);
 346
 347         return ret;
 348 }
 349
 350 static long
 351 i915_gem_object_wait_fence(struct dma_fence *fence,
 352                            unsigned int flags,
 353                            long timeout,
 354                            struct intel_rps_client *rps_client)
 355 {
 356         struct drm_i915_gem_request *rq;
 357
 358         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
 359
 360         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 361                 return timeout;
 362
 363         if (!dma_fence_is_i915(fence))
 364                 return dma_fence_wait_timeout(fence,
 365                                               flags & I915_WAIT_INTERRUPTIBLE,
 366                                               timeout);
 367
 368         rq = to_request(fence);
 369         if (i915_gem_request_completed(rq))
 370                 goto out;
 371
 372         /*
 373          * This client is about to stall waiting for the GPU. In many cases
 374          * this is undesirable and limits the throughput of the system, as
 375          * many clients cannot continue processing user input/output whilst
 376          * blocked. RPS autotuning may take tens of milliseconds to respond
 377          * to the GPU load and thus incurs additional latency for the client.
 378          * We can circumvent that by promoting the GPU frequency to maximum
 379          * before we wait. This makes the GPU throttle up much more quickly
 380          * (good for benchmarks and user experience, e.g. window animations),
 381          * but at a cost of spending more power processing the workload
 382          * (bad for battery). Not all clients even want their results
 383          * immediately and for them we should just let the GPU select its own
 384          * frequency to maximise efficiency. To prevent a single client from
 385          * forcing the clocks too high for the whole system, we only allow
 386          * each client to waitboost once in a busy period.
 387          */
 388         if (rps_client && !i915_gem_request_started(rq)) {
 389                 if (INTEL_GEN(rq->i915) >= 6)
 390                         gen6_rps_boost(rq, rps_client);
 391         }
 392
 393         timeout = i915_wait_request(rq, flags, timeout);
 394
 395 out:
 396         if (flags & I915_WAIT_LOCKED && i915_gem_request_completed(rq))
 397                 i915_gem_request_retire_upto(rq);
 398
 399         return timeout;
 400 }
 401
 402 static long
 403 i915_gem_object_wait_reservation(struct reservation_object *resv,
 404                                  unsigned int flags,
 405                                  long timeout,
 406                                  struct intel_rps_client *rps_client)
 407 {
 408         unsigned int seq = __read_seqcount_begin(&resv->seq);
 409         struct dma_fence *excl;
 410         bool prune_fences = false;
 411
 412         if (flags & I915_WAIT_ALL) {
 413                 struct dma_fence **shared;
 414                 unsigned int count, i;
 415                 int ret;
 416
 417                 ret = reservation_object_get_fences_rcu(resv,
 418                                                         &excl, &count, &shared);
 419                 if (ret)
 420                         return ret;
 421
 422                 for (i = 0; i < count; i++) {
 423                         timeout = i915_gem_object_wait_fence(shared[i],
 424                                                              flags, timeout,
 425                                                              rps_client);
 426                         if (timeout < 0)
 427                                 break;
 428
 429                         dma_fence_put(shared[i]);
 430                 }
 431
 432                 for (; i < count; i++)
 433                         dma_fence_put(shared[i]);
 434                 kfree(shared);
 435
 436                 prune_fences = count && timeout >= 0;
 437         } else {
 438                 excl = reservation_object_get_excl_rcu(resv);
 439         }
 440
 441         if (excl && timeout >= 0) {
 442                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
 443                                                      rps_client);
 444                 prune_fences = timeout >= 0;
 445         }
 446
 447         dma_fence_put(excl);
 448
 449         /* Oportunistically prune the fences iff we know they have *all* been
 450          * signaled and that the reservation object has not been changed (i.e.
 451          * no new fences have been added).
 452          */
 453         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
 454                 if (reservation_object_trylock(resv)) {
 455                         if (!__read_seqcount_retry(&resv->seq, seq))
 456                                 reservation_object_add_excl_fence(resv, NULL);
 457                         reservation_object_unlock(resv);
 458                 }
 459         }
 460
 461         return timeout;
 462 }
 463
 464 static void __fence_set_priority(struct dma_fence *fence, int prio)
 465 {
 466         struct drm_i915_gem_request *rq;
 467         struct intel_engine_cs *engine;
 468
 469         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
 470                 return;
 471
 472         rq = to_request(fence);
 473         engine = rq->engine;
 474         if (!engine->schedule)
 475                 return;
 476
 477         engine->schedule(rq, prio);
 478 }
 479
 480 static void fence_set_priority(struct dma_fence *fence, int prio)
 481 {
 482         /* Recurse once into a fence-array */
 483         if (dma_fence_is_array(fence)) {
 484                 struct dma_fence_array *array = to_dma_fence_array(fence);
 485                 int i;
 486
 487                 for (i = 0; i < array->num_fences; i++)
 488                         __fence_set_priority(array->fences[i], prio);
 489         } else {
 490                 __fence_set_priority(fence, prio);
 491         }
 492 }
 493
 494 int
 495 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
 496                               unsigned int flags,
 497                               int prio)
 498 {
 499         struct dma_fence *excl;
 500
 501         if (flags & I915_WAIT_ALL) {
 502                 struct dma_fence **shared;
 503                 unsigned int count, i;
 504                 int ret;
 505
 506                 ret = reservation_object_get_fences_rcu(obj->resv,
 507                                                         &excl, &count, &shared);
 508                 if (ret)
 509                         return ret;
 510
 511                 for (i = 0; i < count; i++) {
 512                         fence_set_priority(shared[i], prio);
 513                         dma_fence_put(shared[i]);
 514                 }
 515
 516                 kfree(shared);
 517         } else {
 518                 excl = reservation_object_get_excl_rcu(obj->resv);
 519         }
 520
 521         if (excl) {
 522                 fence_set_priority(excl, prio);
 523                 dma_fence_put(excl);
 524         }
 525         return 0;
 526 }
 527
 528 /**
 529  * Waits for rendering to the object to be completed
 530  * @obj: i915 gem object
 531  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
 532  * @timeout: how long to wait
 533  * @rps_client: client (user process) to charge for any waitboosting
 534  */
 535 int
 536 i915_gem_object_wait(struct drm_i915_gem_object *obj,
 537                      unsigned int flags,
 538                      long timeout,
 539                      struct intel_rps_client *rps_client)
 540 {
 541         might_sleep();
 542 #if IS_ENABLED(CONFIG_LOCKDEP)
 543         GEM_BUG_ON(debug_locks &&
 544                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
 545                    !!(flags & I915_WAIT_LOCKED));
 546 #endif
 547         GEM_BUG_ON(timeout < 0);
 548
 549         timeout = i915_gem_object_wait_reservation(obj->resv,
 550                                                    flags, timeout,
 551                                                    rps_client);
 552         return timeout < 0 ? timeout : 0;
 553 }
 554
 555 static struct intel_rps_client *to_rps_client(struct drm_file *file)
 556 {
 557         struct drm_i915_file_private *fpriv = file->driver_priv;
 558
 559         return &fpriv->rps_client;
 560 }
 561
 562 static int
 563 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
 564                      struct drm_i915_gem_pwrite *args,
 565                      struct drm_file *file)
 566 {
 567         void *vaddr = obj->phys_handle->vaddr + args->offset;
 568         char __user *user_data = u64_to_user_ptr(args->data_ptr);
 569
 570         /* We manually control the domain here and pretend that it
 571          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
 572          */
 573         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 574         if (copy_from_user(vaddr, user_data, args->size))
 575                 return -EFAULT;
 576
 577         drm_clflush_virt_range(vaddr, args->size);
 578         i915_gem_chipset_flush(to_i915(obj->base.dev));
 579
 580         intel_fb_obj_flush(obj, ORIGIN_CPU);
 581         return 0;
 582 }
 583
 584 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
 585 {
 586         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
 587 }
 588
 589 void i915_gem_object_free(struct drm_i915_gem_object *obj)
 590 {
 591         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 592         kmem_cache_free(dev_priv->objects, obj);
 593 }
 594
 595 static int
 596 i915_gem_create(struct drm_file *file,
 597                 struct drm_i915_private *dev_priv,
 598                 uint64_t size,
 599                 uint32_t *handle_p)
 600 {
 601         struct drm_i915_gem_object *obj;
 602         int ret;
 603         u32 handle;
 604
 605         size = roundup(size, PAGE_SIZE);
 606         if (size == 0)
 607                 return -EINVAL;
 608
 609         /* Allocate the new object */
 610         obj = i915_gem_object_create(dev_priv, size);
 611         if (IS_ERR(obj))
 612                 return PTR_ERR(obj);
 613
 614         ret = drm_gem_handle_create(file, &obj->base, &handle);
 615         /* drop reference from allocate - handle holds it now */
 616         i915_gem_object_put(obj);
 617         if (ret)
 618                 return ret;
 619
 620         *handle_p = handle;
 621         return 0;
 622 }
 623
 624 int
 625 i915_gem_dumb_create(struct drm_file *file,
 626                      struct drm_device *dev,
 627                      struct drm_mode_create_dumb *args)
 628 {
 629         /* have to work out size/pitch and return them */
 630         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
 631         args->size = args->pitch * args->height;
 632         return i915_gem_create(file, to_i915(dev),
 633                                args->size, &args->handle);
 634 }
 635
 636 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 637 {
 638         return !(obj->cache_level == I915_CACHE_NONE ||
 639                  obj->cache_level == I915_CACHE_WT);
 640 }
 641
 642 /**
 643  * Creates a new mm object and returns a handle to it.
 644  * @dev: drm device pointer
 645  * @data: ioctl data blob
 646  * @file: drm file pointer
 647  */
 648 int
 649 i915_gem_create_ioctl(struct drm_device *dev, void *data,
 650                       struct drm_file *file)
 651 {
 652         struct drm_i915_private *dev_priv = to_i915(dev);
 653         struct drm_i915_gem_create *args = data;
 654
 655         i915_gem_flush_free_objects(dev_priv);
 656
 657         return i915_gem_create(file, dev_priv,
 658                                args->size, &args->handle);
 659 }
 660
 661 static inline enum fb_op_origin
 662 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
 663 {
 664         return (domain == I915_GEM_DOMAIN_GTT ?
 665                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
 666 }
 667
 668 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
 669 {
 670         /*
 671          * No actual flushing is required for the GTT write domain for reads
 672          * from the GTT domain. Writes to it "immediately" go to main memory
 673          * as far as we know, so there's no chipset flush. It also doesn't
 674          * land in the GPU render cache.
 675          *
 676          * However, we do have to enforce the order so that all writes through
 677          * the GTT land before any writes to the device, such as updates to
 678          * the GATT itself.
 679          *
 680          * We also have to wait a bit for the writes to land from the GTT.
 681          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
 682          * timing. This issue has only been observed when switching quickly
 683          * between GTT writes and CPU reads from inside the kernel on recent hw,
 684          * and it appears to only affect discrete GTT blocks (i.e. on LLC
 685          * system agents we cannot reproduce this behaviour, until Cannonlake
 686          * that was!).
 687          */
 688
 689         wmb();
 690
 691         intel_runtime_pm_get(dev_priv);
 692         spin_lock_irq(&dev_priv->uncore.lock);
 693
 694         POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
 695
 696         spin_unlock_irq(&dev_priv->uncore.lock);
 697         intel_runtime_pm_put(dev_priv);
 698 }
 699
 700 static void
 701 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 702 {
 703         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 704         struct i915_vma *vma;
 705
 706         if (!(obj->write_domain & flush_domains))
 707                 return;
 708
 709         switch (obj->write_domain) {
 710         case I915_GEM_DOMAIN_GTT:
 711                 i915_gem_flush_ggtt_writes(dev_priv);
 712
 713                 intel_fb_obj_flush(obj,
 714                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
 715
 716                 for_each_ggtt_vma(vma, obj) {
 717                         if (vma->iomap)
 718                                 continue;
 719
 720                         i915_vma_unset_ggtt_write(vma);
 721                 }
 722                 break;
 723
 724         case I915_GEM_DOMAIN_CPU:
 725                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 726                 break;
 727
 728         case I915_GEM_DOMAIN_RENDER:
 729                 if (gpu_write_needs_clflush(obj))
 730                         obj->cache_dirty = true;
 731                 break;
 732         }
 733
 734         obj->write_domain = 0;
 735 }
 736
 737 static inline int
 738 __copy_to_user_swizzled(char __user *cpu_vaddr,
 739                         const char *gpu_vaddr, int gpu_offset,
 740                         int length)
 741 {
 742         int ret, cpu_offset = 0;
 743
 744         while (length > 0) {
 745                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
 746                 int this_length = min(cacheline_end - gpu_offset, length);
 747                 int swizzled_gpu_offset = gpu_offset ^ 64;
 748
 749                 ret = __copy_to_user(cpu_vaddr + cpu_offset,
 750                                      gpu_vaddr + swizzled_gpu_offset,
 751                                      this_length);
 752                 if (ret)
 753                         return ret + length;
 754
 755                 cpu_offset += this_length;
 756                 gpu_offset += this_length;
 757                 length -= this_length;
 758         }
 759
 760         return 0;
 761 }
 762
 763 static inline int
 764 __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
 765                           const char __user *cpu_vaddr,
 766                           int length)
 767 {
 768         int ret, cpu_offset = 0;
 769
 770         while (length > 0) {
 771                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
 772                 int this_length = min(cacheline_end - gpu_offset, length);
 773                 int swizzled_gpu_offset = gpu_offset ^ 64;
 774
 775                 ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
 776                                        cpu_vaddr + cpu_offset,
 777                                        this_length);
 778                 if (ret)
 779                         return ret + length;
 780
 781                 cpu_offset += this_length;
 782                 gpu_offset += this_length;
 783                 length -= this_length;
 784         }
 785
 786         return 0;
 787 }
 788
 789 /*
 790  * Pins the specified object's pages and synchronizes the object with
 791  * GPU accesses. Sets needs_clflush to non-zero if the caller should
 792  * flush the object from the CPU cache.
 793  */
 794 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 795                                     unsigned int *needs_clflush)
 796 {
 797         int ret;
 798
 799         lockdep_assert_held(&obj->base.dev->struct_mutex);
 800
 801         *needs_clflush = 0;
 802         if (!i915_gem_object_has_struct_page(obj))
 803                 return -ENODEV;
 804
 805         ret = i915_gem_object_wait(obj,
 806                                    I915_WAIT_INTERRUPTIBLE |
 807                                    I915_WAIT_LOCKED,
 808                                    MAX_SCHEDULE_TIMEOUT,
 809                                    NULL);
 810         if (ret)
 811                 return ret;
 812
 813         ret = i915_gem_object_pin_pages(obj);
 814         if (ret)
 815                 return ret;
 816
 817         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
 818             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 819                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
 820                 if (ret)
 821                         goto err_unpin;
 822                 else
 823                         goto out;
 824         }
 825
 826         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 827
 828         /* If we're not in the cpu read domain, set ourself into the gtt
 829          * read domain and manually flush cachelines (if required). This
 830          * optimizes for the case when the gpu will dirty the data
 831          * anyway again before the next pread happens.
 832          */
 833         if (!obj->cache_dirty &&
 834             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
 835                 *needs_clflush = CLFLUSH_BEFORE;
 836
 837 out:
 838         /* return with the pages pinned */
 839         return 0;
 840
 841 err_unpin:
 842         i915_gem_object_unpin_pages(obj);
 843         return ret;
 844 }
 845
 846 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 847                                      unsigned int *needs_clflush)
 848 {
 849         int ret;
 850
 851         lockdep_assert_held(&obj->base.dev->struct_mutex);
 852
 853         *needs_clflush = 0;
 854         if (!i915_gem_object_has_struct_page(obj))
 855                 return -ENODEV;
 856
 857         ret = i915_gem_object_wait(obj,
 858                                    I915_WAIT_INTERRUPTIBLE |
 859                                    I915_WAIT_LOCKED |
 860                                    I915_WAIT_ALL,
 861                                    MAX_SCHEDULE_TIMEOUT,
 862                                    NULL);
 863         if (ret)
 864                 return ret;
 865
 866         ret = i915_gem_object_pin_pages(obj);
 867         if (ret)
 868                 return ret;
 869
 870         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
 871             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 872                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
 873                 if (ret)
 874                         goto err_unpin;
 875                 else
 876                         goto out;
 877         }
 878
 879         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 880
 881         /* If we're not in the cpu write domain, set ourself into the
 882          * gtt write domain and manually flush cachelines (as required).
 883          * This optimizes for the case when the gpu will use the data
 884          * right away and we therefore have to clflush anyway.
 885          */
 886         if (!obj->cache_dirty) {
 887                 *needs_clflush |= CLFLUSH_AFTER;
 888
 889                 /*
 890                  * Same trick applies to invalidate partially written
 891                  * cachelines read before writing.
 892                  */
 893                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
 894                         *needs_clflush |= CLFLUSH_BEFORE;
 895         }
 896
 897 out:
 898         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 899         obj->mm.dirty = true;
 900         /* return with the pages pinned */
 901         return 0;
 902
 903 err_unpin:
 904         i915_gem_object_unpin_pages(obj);
 905         return ret;
 906 }
 907
 908 static void
 909 shmem_clflush_swizzled_range(char *addr, unsigned long length,
 910                              bool swizzled)
 911 {
 912         if (unlikely(swizzled)) {
 913                 unsigned long start = (unsigned long) addr;
 914                 unsigned long end = (unsigned long) addr + length;
 915
 916                 /* For swizzling simply ensure that we always flush both
 917                  * channels. Lame, but simple and it works. Swizzled
 918                  * pwrite/pread is far from a hotpath - current userspace
 919                  * doesn't use it at all. */
 920                 start = round_down(start, 128);
 921                 end = round_up(end, 128);
 922
 923                 drm_clflush_virt_range((void *)start, end - start);
 924         } else {
 925                 drm_clflush_virt_range(addr, length);
 926         }
 927
 928 }
 929
 930 /* Only difference to the fast-path function is that this can handle bit17
 931  * and uses non-atomic copy and kmap functions. */
 932 static int
 933 shmem_pread_slow(struct page *page, int offset, int length,
 934                  char __user *user_data,
 935                  bool page_do_bit17_swizzling, bool needs_clflush)
 936 {
 937         char *vaddr;
 938         int ret;
 939
 940         vaddr = kmap(page);
 941         if (needs_clflush)
 942                 shmem_clflush_swizzled_range(vaddr + offset, length,
 943                                              page_do_bit17_swizzling);
 944
 945         if (page_do_bit17_swizzling)
 946                 ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
 947         else
 948                 ret = __copy_to_user(user_data, vaddr + offset, length);
 949         kunmap(page);
 950
 951         return ret ? - EFAULT : 0;
 952 }
 953
 954 static int
 955 shmem_pread(struct page *page, int offset, int length, char __user *user_data,
 956             bool page_do_bit17_swizzling, bool needs_clflush)
 957 {
 958         int ret;
 959
 960         ret = -ENODEV;
 961         if (!page_do_bit17_swizzling) {
 962                 char *vaddr = kmap_atomic(page);
 963
 964                 if (needs_clflush)
 965                         drm_clflush_virt_range(vaddr + offset, length);
 966                 ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
 967                 kunmap_atomic(vaddr);
 968         }
 969         if (ret == 0)
 970                 return 0;
 971
 972         return shmem_pread_slow(page, offset, length, user_data,
 973                                 page_do_bit17_swizzling, needs_clflush);
 974 }
 975
 976 static int
 977 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
 978                      struct drm_i915_gem_pread *args)
 979 {
 980         char __user *user_data;
 981         u64 remain;
 982         unsigned int obj_do_bit17_swizzling;
 983         unsigned int needs_clflush;
 984         unsigned int idx, offset;
 985         int ret;
 986
 987         obj_do_bit17_swizzling = 0;
 988         if (i915_gem_object_needs_bit17_swizzle(obj))
 989                 obj_do_bit17_swizzling = BIT(17);
 990
 991         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
 992         if (ret)
 993                 return ret;
 994
 995         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
 996         mutex_unlock(&obj->base.dev->struct_mutex);
 997         if (ret)
 998                 return ret;
 999
1000         remain = args->size;
1001         user_data = u64_to_user_ptr(args->data_ptr);
1002         offset = offset_in_page(args->offset);
1003         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1004                 struct page *page = i915_gem_object_get_page(obj, idx);
1005                 int length;
1006
1007                 length = remain;
1008                 if (offset + length > PAGE_SIZE)
1009                         length = PAGE_SIZE - offset;
1010
1011                 ret = shmem_pread(page, offset, length, user_data,
1012                                   page_to_phys(page) & obj_do_bit17_swizzling,
1013                                   needs_clflush);
1014                 if (ret)
1015                         break;
1016
1017                 remain -= length;
1018                 user_data += length;
1019                 offset = 0;
1020         }
1021
1022         i915_gem_obj_finish_shmem_access(obj);
1023         return ret;
1024 }
1025
1026 static inline bool
1027 gtt_user_read(struct io_mapping *mapping,
1028               loff_t base, int offset,
1029               char __user *user_data, int length)
1030 {
1031         void __iomem *vaddr;
1032         unsigned long unwritten;
1033
1034         /* We can use the cpu mem copy function because this is X86. */
1035         vaddr = io_mapping_map_atomic_wc(mapping, base);
1036         unwritten = __copy_to_user_inatomic(user_data,
1037                                             (void __force *)vaddr + offset,
1038                                             length);
1039         io_mapping_unmap_atomic(vaddr);
1040         if (unwritten) {
1041                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1042                 unwritten = copy_to_user(user_data,
1043                                          (void __force *)vaddr + offset,
1044                                          length);
1045                 io_mapping_unmap(vaddr);
1046         }
1047         return unwritten;
1048 }
1049
1050 static int
1051 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1052                    const struct drm_i915_gem_pread *args)
1053 {
1054         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1055         struct i915_ggtt *ggtt = &i915->ggtt;
1056         struct drm_mm_node node;
1057         struct i915_vma *vma;
1058         void __user *user_data;
1059         u64 remain, offset;
1060         int ret;
1061
1062         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1063         if (ret)
1064                 return ret;
1065
1066         intel_runtime_pm_get(i915);
1067         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1068                                        PIN_MAPPABLE |
1069                                        PIN_NONFAULT |
1070                                        PIN_NONBLOCK);
1071         if (!IS_ERR(vma)) {
1072                 node.start = i915_ggtt_offset(vma);
1073                 node.allocated = false;
1074                 ret = i915_vma_put_fence(vma);
1075                 if (ret) {
1076                         i915_vma_unpin(vma);
1077                         vma = ERR_PTR(ret);
1078                 }
1079         }
1080         if (IS_ERR(vma)) {
1081                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1082                 if (ret)
1083                         goto out_unlock;
1084                 GEM_BUG_ON(!node.allocated);
1085         }
1086
1087         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1088         if (ret)
1089                 goto out_unpin;
1090
1091         mutex_unlock(&i915->drm.struct_mutex);
1092
1093         user_data = u64_to_user_ptr(args->data_ptr);
1094         remain = args->size;
1095         offset = args->offset;
1096
1097         while (remain > 0) {
1098                 /* Operation in this page
1099                  *
1100                  * page_base = page offset within aperture
1101                  * page_offset = offset within page
1102                  * page_length = bytes to copy for this page
1103                  */
1104                 u32 page_base = node.start;
1105                 unsigned page_offset = offset_in_page(offset);
1106                 unsigned page_length = PAGE_SIZE - page_offset;
1107                 page_length = remain < page_length ? remain : page_length;
1108                 if (node.allocated) {
1109                         wmb();
1110                         ggtt->base.insert_page(&ggtt->base,
1111                                                i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1112                                                node.start, I915_CACHE_NONE, 0);
1113                         wmb();
1114                 } else {
1115                         page_base += offset & PAGE_MASK;
1116                 }
1117
1118                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1119                                   user_data, page_length)) {
1120                         ret = -EFAULT;
1121                         break;
1122                 }
1123
1124                 remain -= page_length;
1125                 user_data += page_length;
1126                 offset += page_length;
1127         }
1128
1129         mutex_lock(&i915->drm.struct_mutex);
1130 out_unpin:
1131         if (node.allocated) {
1132                 wmb();
1133                 ggtt->base.clear_range(&ggtt->base,
1134                                        node.start, node.size);
1135                 remove_mappable_node(&node);
1136         } else {
1137                 i915_vma_unpin(vma);
1138         }
1139 out_unlock:
1140         intel_runtime_pm_put(i915);
1141         mutex_unlock(&i915->drm.struct_mutex);
1142
1143         return ret;
1144 }
1145
1146 /**
1147  * Reads data from the object referenced by handle.
1148  * @dev: drm device pointer
1149  * @data: ioctl data blob
1150  * @file: drm file pointer
1151  *
1152  * On error, the contents of *data are undefined.
1153  */
1154 int
1155 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1156                      struct drm_file *file)
1157 {
1158         struct drm_i915_gem_pread *args = data;
1159         struct drm_i915_gem_object *obj;
1160         int ret;
1161
1162         if (args->size == 0)
1163                 return 0;
1164
1165         if (!access_ok(VERIFY_WRITE,
1166                        u64_to_user_ptr(args->data_ptr),
1167                        args->size))
1168                 return -EFAULT;
1169
1170         obj = i915_gem_object_lookup(file, args->handle);
1171         if (!obj)
1172                 return -ENOENT;
1173
1174         /* Bounds check source.  */
1175         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1176                 ret = -EINVAL;
1177                 goto out;
1178         }
1179
1180         trace_i915_gem_object_pread(obj, args->offset, args->size);
1181
1182         ret = i915_gem_object_wait(obj,
1183                                    I915_WAIT_INTERRUPTIBLE,
1184                                    MAX_SCHEDULE_TIMEOUT,
1185                                    to_rps_client(file));
1186         if (ret)
1187                 goto out;
1188
1189         ret = i915_gem_object_pin_pages(obj);
1190         if (ret)
1191                 goto out;
1192
1193         ret = i915_gem_shmem_pread(obj, args);
1194         if (ret == -EFAULT || ret == -ENODEV)
1195                 ret = i915_gem_gtt_pread(obj, args);
1196
1197         i915_gem_object_unpin_pages(obj);
1198 out:
1199         i915_gem_object_put(obj);
1200         return ret;
1201 }
1202
1203 /* This is the fast write path which cannot handle
1204  * page faults in the source data
1205  */
1206
1207 static inline bool
1208 ggtt_write(struct io_mapping *mapping,
1209            loff_t base, int offset,
1210            char __user *user_data, int length)
1211 {
1212         void __iomem *vaddr;
1213         unsigned long unwritten;
1214
1215         /* We can use the cpu mem copy function because this is X86. */
1216         vaddr = io_mapping_map_atomic_wc(mapping, base);
1217         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1218                                                       user_data, length);
1219         io_mapping_unmap_atomic(vaddr);
1220         if (unwritten) {
1221                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1222                 unwritten = copy_from_user((void __force *)vaddr + offset,
1223                                            user_data, length);
1224                 io_mapping_unmap(vaddr);
1225         }
1226
1227         return unwritten;
1228 }
1229
1230 /**
1231  * This is the fast pwrite path, where we copy the data directly from the
1232  * user into the GTT, uncached.
1233  * @obj: i915 GEM object
1234  * @args: pwrite arguments structure
1235  */
1236 static int
1237 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1238                          const struct drm_i915_gem_pwrite *args)
1239 {
1240         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1241         struct i915_ggtt *ggtt = &i915->ggtt;
1242         struct drm_mm_node node;
1243         struct i915_vma *vma;
1244         u64 remain, offset;
1245         void __user *user_data;
1246         int ret;
1247
1248         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1249         if (ret)
1250                 return ret;
1251
1252         if (i915_gem_object_has_struct_page(obj)) {
1253                 /*
1254                  * Avoid waking the device up if we can fallback, as
1255                  * waking/resuming is very slow (worst-case 10-100 ms
1256                  * depending on PCI sleeps and our own resume time).
1257                  * This easily dwarfs any performance advantage from
1258                  * using the cache bypass of indirect GGTT access.
1259                  */
1260                 if (!intel_runtime_pm_get_if_in_use(i915)) {
1261                         ret = -EFAULT;
1262                         goto out_unlock;
1263                 }
1264         } else {
1265                 /* No backing pages, no fallback, we must force GGTT access */
1266                 intel_runtime_pm_get(i915);
1267         }
1268
1269         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1270                                        PIN_MAPPABLE |
1271                                        PIN_NONFAULT |
1272                                        PIN_NONBLOCK);
1273         if (!IS_ERR(vma)) {
1274                 node.start = i915_ggtt_offset(vma);
1275                 node.allocated = false;
1276                 ret = i915_vma_put_fence(vma);
1277                 if (ret) {
1278                         i915_vma_unpin(vma);
1279                         vma = ERR_PTR(ret);
1280                 }
1281         }
1282         if (IS_ERR(vma)) {
1283                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1284                 if (ret)
1285                         goto out_rpm;
1286                 GEM_BUG_ON(!node.allocated);
1287         }
1288
1289         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1290         if (ret)
1291                 goto out_unpin;
1292
1293         mutex_unlock(&i915->drm.struct_mutex);
1294
1295         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1296
1297         user_data = u64_to_user_ptr(args->data_ptr);
1298         offset = args->offset;
1299         remain = args->size;
1300         while (remain) {
1301                 /* Operation in this page
1302                  *
1303                  * page_base = page offset within aperture
1304                  * page_offset = offset within page
1305                  * page_length = bytes to copy for this page
1306                  */
1307                 u32 page_base = node.start;
1308                 unsigned int page_offset = offset_in_page(offset);
1309                 unsigned int page_length = PAGE_SIZE - page_offset;
1310                 page_length = remain < page_length ? remain : page_length;
1311                 if (node.allocated) {
1312                         wmb(); /* flush the write before we modify the GGTT */
1313                         ggtt->base.insert_page(&ggtt->base,
1314                                                i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1315                                                node.start, I915_CACHE_NONE, 0);
1316                         wmb(); /* flush modifications to the GGTT (insert_page) */
1317                 } else {
1318                         page_base += offset & PAGE_MASK;
1319                 }
1320                 /* If we get a fault while copying data, then (presumably) our
1321                  * source page isn't available.  Return the error and we'll
1322                  * retry in the slow path.
1323                  * If the object is non-shmem backed, we retry again with the
1324                  * path that handles page fault.
1325                  */
1326                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1327                                user_data, page_length)) {
1328                         ret = -EFAULT;
1329                         break;
1330                 }
1331
1332                 remain -= page_length;
1333                 user_data += page_length;
1334                 offset += page_length;
1335         }
1336         intel_fb_obj_flush(obj, ORIGIN_CPU);
1337
1338         mutex_lock(&i915->drm.struct_mutex);
1339 out_unpin:
1340         if (node.allocated) {
1341                 wmb();
1342                 ggtt->base.clear_range(&ggtt->base,
1343                                        node.start, node.size);
1344                 remove_mappable_node(&node);
1345         } else {
1346                 i915_vma_unpin(vma);
1347         }
1348 out_rpm:
1349         intel_runtime_pm_put(i915);
1350 out_unlock:
1351         mutex_unlock(&i915->drm.struct_mutex);
1352         return ret;
1353 }
1354
1355 static int
1356 shmem_pwrite_slow(struct page *page, int offset, int length,
1357                   char __user *user_data,
1358                   bool page_do_bit17_swizzling,
1359                   bool needs_clflush_before,
1360                   bool needs_clflush_after)
1361 {
1362         char *vaddr;
1363         int ret;
1364
1365         vaddr = kmap(page);
1366         if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
1367                 shmem_clflush_swizzled_range(vaddr + offset, length,
1368                                              page_do_bit17_swizzling);
1369         if (page_do_bit17_swizzling)
1370                 ret = __copy_from_user_swizzled(vaddr, offset, user_data,
1371                                                 length);
1372         else
1373                 ret = __copy_from_user(vaddr + offset, user_data, length);
1374         if (needs_clflush_after)
1375                 shmem_clflush_swizzled_range(vaddr + offset, length,
1376                                              page_do_bit17_swizzling);
1377         kunmap(page);
1378
1379         return ret ? -EFAULT : 0;
1380 }
1381
1382 /* Per-page copy function for the shmem pwrite fastpath.
1383  * Flushes invalid cachelines before writing to the target if
1384  * needs_clflush_before is set and flushes out any written cachelines after
1385  * writing if needs_clflush is set.
1386  */
1387 static int
1388 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1389              bool page_do_bit17_swizzling,
1390              bool needs_clflush_before,
1391              bool needs_clflush_after)
1392 {
1393         int ret;
1394
1395         ret = -ENODEV;
1396         if (!page_do_bit17_swizzling) {
1397                 char *vaddr = kmap_atomic(page);
1398
1399                 if (needs_clflush_before)
1400                         drm_clflush_virt_range(vaddr + offset, len);
1401                 ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
1402                 if (needs_clflush_after)
1403                         drm_clflush_virt_range(vaddr + offset, len);
1404
1405                 kunmap_atomic(vaddr);
1406         }
1407         if (ret == 0)
1408                 return ret;
1409
1410         return shmem_pwrite_slow(page, offset, len, user_data,
1411                                  page_do_bit17_swizzling,
1412                                  needs_clflush_before,
1413                                  needs_clflush_after);
1414 }
1415
1416 static int
1417 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1418                       const struct drm_i915_gem_pwrite *args)
1419 {
1420         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1421         void __user *user_data;
1422         u64 remain;
1423         unsigned int obj_do_bit17_swizzling;
1424         unsigned int partial_cacheline_write;
1425         unsigned int needs_clflush;
1426         unsigned int offset, idx;
1427         int ret;
1428
1429         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1430         if (ret)
1431                 return ret;
1432
1433         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1434         mutex_unlock(&i915->drm.struct_mutex);
1435         if (ret)
1436                 return ret;
1437
1438         obj_do_bit17_swizzling = 0;
1439         if (i915_gem_object_needs_bit17_swizzle(obj))
1440                 obj_do_bit17_swizzling = BIT(17);
1441
1442         /* If we don't overwrite a cacheline completely we need to be
1443          * careful to have up-to-date data by first clflushing. Don't
1444          * overcomplicate things and flush the entire patch.
1445          */
1446         partial_cacheline_write = 0;
1447         if (needs_clflush & CLFLUSH_BEFORE)
1448                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1449
1450         user_data = u64_to_user_ptr(args->data_ptr);
1451         remain = args->size;
1452         offset = offset_in_page(args->offset);
1453         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1454                 struct page *page = i915_gem_object_get_page(obj, idx);
1455                 int length;
1456
1457                 length = remain;
1458                 if (offset + length > PAGE_SIZE)
1459                         length = PAGE_SIZE - offset;
1460
1461                 ret = shmem_pwrite(page, offset, length, user_data,
1462                                    page_to_phys(page) & obj_do_bit17_swizzling,
1463                                    (offset | length) & partial_cacheline_write,
1464                                    needs_clflush & CLFLUSH_AFTER);
1465                 if (ret)
1466                         break;
1467
1468                 remain -= length;
1469                 user_data += length;
1470                 offset = 0;
1471         }
1472
1473         intel_fb_obj_flush(obj, ORIGIN_CPU);
1474         i915_gem_obj_finish_shmem_access(obj);
1475         return ret;
1476 }
1477
1478 /**
1479  * Writes data to the object referenced by handle.
1480  * @dev: drm device
1481  * @data: ioctl data blob
1482  * @file: drm file
1483  *
1484  * On error, the contents of the buffer that were to be modified are undefined.
1485  */
1486 int
1487 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1488                       struct drm_file *file)
1489 {
1490         struct drm_i915_gem_pwrite *args = data;
1491         struct drm_i915_gem_object *obj;
1492         int ret;
1493
1494         if (args->size == 0)
1495                 return 0;
1496
1497         if (!access_ok(VERIFY_READ,
1498                        u64_to_user_ptr(args->data_ptr),
1499                        args->size))
1500                 return -EFAULT;
1501
1502         obj = i915_gem_object_lookup(file, args->handle);
1503         if (!obj)
1504                 return -ENOENT;
1505
1506         /* Bounds check destination. */
1507         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1508                 ret = -EINVAL;
1509                 goto err;
1510         }
1511
1512         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1513
1514         ret = -ENODEV;
1515         if (obj->ops->pwrite)
1516                 ret = obj->ops->pwrite(obj, args);
1517         if (ret != -ENODEV)
1518                 goto err;
1519
1520         ret = i915_gem_object_wait(obj,
1521                                    I915_WAIT_INTERRUPTIBLE |
1522                                    I915_WAIT_ALL,
1523                                    MAX_SCHEDULE_TIMEOUT,
1524                                    to_rps_client(file));
1525         if (ret)
1526                 goto err;
1527
1528         ret = i915_gem_object_pin_pages(obj);
1529         if (ret)
1530                 goto err;
1531
1532         ret = -EFAULT;
1533         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1534          * it would end up going through the fenced access, and we'll get
1535          * different detiling behavior between reading and writing.
1536          * pread/pwrite currently are reading and writing from the CPU
1537          * perspective, requiring manual detiling by the client.
1538          */
1539         if (!i915_gem_object_has_struct_page(obj) ||
1540             cpu_write_needs_clflush(obj))
1541                 /* Note that the gtt paths might fail with non-page-backed user
1542                  * pointers (e.g. gtt mappings when moving data between
1543                  * textures). Fallback to the shmem path in that case.
1544                  */
1545                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1546
1547         if (ret == -EFAULT || ret == -ENOSPC) {
1548                 if (obj->phys_handle)
1549                         ret = i915_gem_phys_pwrite(obj, args, file);
1550                 else
1551                         ret = i915_gem_shmem_pwrite(obj, args);
1552         }
1553
1554         i915_gem_object_unpin_pages(obj);
1555 err:
1556         i915_gem_object_put(obj);
1557         return ret;
1558 }
1559
1560 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1561 {
1562         struct drm_i915_private *i915;
1563         struct list_head *list;
1564         struct i915_vma *vma;
1565
1566         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1567
1568         for_each_ggtt_vma(vma, obj) {
1569                 if (i915_vma_is_active(vma))
1570                         continue;
1571
1572                 if (!drm_mm_node_allocated(&vma->node))
1573                         continue;
1574
1575                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1576         }
1577
1578         i915 = to_i915(obj->base.dev);
1579         spin_lock(&i915->mm.obj_lock);
1580         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1581         list_move_tail(&obj->mm.link, list);
1582         spin_unlock(&i915->mm.obj_lock);
1583 }
1584
1585 /**
1586  * Called when user space prepares to use an object with the CPU, either
1587  * through the mmap ioctl's mapping or a GTT mapping.
1588  * @dev: drm device
1589  * @data: ioctl data blob
1590  * @file: drm file
1591  */
1592 int
1593 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1594                           struct drm_file *file)
1595 {
1596         struct drm_i915_gem_set_domain *args = data;
1597         struct drm_i915_gem_object *obj;
1598         uint32_t read_domains = args->read_domains;
1599         uint32_t write_domain = args->write_domain;
1600         int err;
1601
1602         /* Only handle setting domains to types used by the CPU. */
1603         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1604                 return -EINVAL;
1605
1606         /* Having something in the write domain implies it's in the read
1607          * domain, and only that read domain.  Enforce that in the request.
1608          */
1609         if (write_domain != 0 && read_domains != write_domain)
1610                 return -EINVAL;
1611
1612         obj = i915_gem_object_lookup(file, args->handle);
1613         if (!obj)
1614                 return -ENOENT;
1615
1616         /* Try to flush the object off the GPU without holding the lock.
1617          * We will repeat the flush holding the lock in the normal manner
1618          * to catch cases where we are gazumped.
1619          */
1620         err = i915_gem_object_wait(obj,
1621                                    I915_WAIT_INTERRUPTIBLE |
1622                                    (write_domain ? I915_WAIT_ALL : 0),
1623                                    MAX_SCHEDULE_TIMEOUT,
1624                                    to_rps_client(file));
1625         if (err)
1626                 goto out;
1627
1628         /*
1629          * Proxy objects do not control access to the backing storage, ergo
1630          * they cannot be used as a means to manipulate the cache domain
1631          * tracking for that backing storage. The proxy object is always
1632          * considered to be outside of any cache domain.
1633          */
1634         if (i915_gem_object_is_proxy(obj)) {
1635                 err = -ENXIO;
1636                 goto out;
1637         }
1638
1639         /*
1640          * Flush and acquire obj->pages so that we are coherent through
1641          * direct access in memory with previous cached writes through
1642          * shmemfs and that our cache domain tracking remains valid.
1643          * For example, if the obj->filp was moved to swap without us
1644          * being notified and releasing the pages, we would mistakenly
1645          * continue to assume that the obj remained out of the CPU cached
1646          * domain.
1647          */
1648         err = i915_gem_object_pin_pages(obj);
1649         if (err)
1650                 goto out;
1651
1652         err = i915_mutex_lock_interruptible(dev);
1653         if (err)
1654                 goto out_unpin;
1655
1656         if (read_domains & I915_GEM_DOMAIN_WC)
1657                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1658         else if (read_domains & I915_GEM_DOMAIN_GTT)
1659                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1660         else
1661                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1662
1663         /* And bump the LRU for this access */
1664         i915_gem_object_bump_inactive_ggtt(obj);
1665
1666         mutex_unlock(&dev->struct_mutex);
1667
1668         if (write_domain != 0)
1669                 intel_fb_obj_invalidate(obj,
1670                                         fb_write_origin(obj, write_domain));
1671
1672 out_unpin:
1673         i915_gem_object_unpin_pages(obj);
1674 out:
1675         i915_gem_object_put(obj);
1676         return err;
1677 }
1678
1679 /**
1680  * Called when user space has done writes to this buffer
1681  * @dev: drm device
1682  * @data: ioctl data blob
1683  * @file: drm file
1684  */
1685 int
1686 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1687                          struct drm_file *file)
1688 {
1689         struct drm_i915_gem_sw_finish *args = data;
1690         struct drm_i915_gem_object *obj;
1691
1692         obj = i915_gem_object_lookup(file, args->handle);
1693         if (!obj)
1694                 return -ENOENT;
1695
1696         /*
1697          * Proxy objects are barred from CPU access, so there is no
1698          * need to ban sw_finish as it is a nop.
1699          */
1700
1701         /* Pinned buffers may be scanout, so flush the cache */
1702         i915_gem_object_flush_if_display(obj);
1703         i915_gem_object_put(obj);
1704
1705         return 0;
1706 }
1707
1708 /**
1709  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1710  *                       it is mapped to.
1711  * @dev: drm device
1712  * @data: ioctl data blob
1713  * @file: drm file
1714  *
1715  * While the mapping holds a reference on the contents of the object, it doesn't
1716  * imply a ref on the object itself.
1717  *
1718  * IMPORTANT:
1719  *
1720  * DRM driver writers who look a this function as an example for how to do GEM
1721  * mmap support, please don't implement mmap support like here. The modern way
1722  * to implement DRM mmap support is with an mmap offset ioctl (like
1723  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1724  * That way debug tooling like valgrind will understand what's going on, hiding
1725  * the mmap call in a driver private ioctl will break that. The i915 driver only
1726  * does cpu mmaps this way because we didn't know better.
1727  */
1728 int
1729 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1730                     struct drm_file *file)
1731 {
1732         struct drm_i915_gem_mmap *args = data;
1733         struct drm_i915_gem_object *obj;
1734         unsigned long addr;
1735
1736         if (args->flags & ~(I915_MMAP_WC))
1737                 return -EINVAL;
1738
1739         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1740                 return -ENODEV;
1741
1742         obj = i915_gem_object_lookup(file, args->handle);
1743         if (!obj)
1744                 return -ENOENT;
1745
1746         /* prime objects have no backing filp to GEM mmap
1747          * pages from.
1748          */
1749         if (!obj->base.filp) {
1750                 i915_gem_object_put(obj);
1751                 return -ENXIO;
1752         }
1753
1754         addr = vm_mmap(obj->base.filp, 0, args->size,
1755                        PROT_READ | PROT_WRITE, MAP_SHARED,
1756                        args->offset);
1757         if (args->flags & I915_MMAP_WC) {
1758                 struct mm_struct *mm = current->mm;
1759                 struct vm_area_struct *vma;
1760
1761                 if (down_write_killable(&mm->mmap_sem)) {
1762                         i915_gem_object_put(obj);
1763                         return -EINTR;
1764                 }
1765                 vma = find_vma(mm, addr);
1766                 if (vma)
1767                         vma->vm_page_prot =
1768                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1769                 else
1770                         addr = -ENOMEM;
1771                 up_write(&mm->mmap_sem);
1772
1773                 /* This may race, but that's ok, it only gets set */
1774                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1775         }
1776         i915_gem_object_put(obj);
1777         if (IS_ERR((void *)addr))
1778                 return addr;
1779
1780         args->addr_ptr = (uint64_t) addr;
1781
1782         return 0;
1783 }
1784
1785 static unsigned int tile_row_pages(struct drm_i915_gem_object *obj)
1786 {
1787         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1788 }
1789
1790 /**
1791  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1792  *
1793  * A history of the GTT mmap interface:
1794  *
1795  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1796  *     aligned and suitable for fencing, and still fit into the available
1797  *     mappable space left by the pinned display objects. A classic problem
1798  *     we called the page-fault-of-doom where we would ping-pong between
1799  *     two objects that could not fit inside the GTT and so the memcpy
1800  *     would page one object in at the expense of the other between every
1801  *     single byte.
1802  *
1803  * 1 - Objects can be any size, and have any compatible fencing (X Y, or none
1804  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1805  *     object is too large for the available space (or simply too large
1806  *     for the mappable aperture!), a view is created instead and faulted
1807  *     into userspace. (This view is aligned and sized appropriately for
1808  *     fenced access.)
1809  *
1810  * 2 - Recognise WC as a separate cache domain so that we can flush the
1811  *     delayed writes via GTT before performing direct access via WC.
1812  *
1813  * Restrictions:
1814  *
1815  *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1816  *    hangs on some architectures, corruption on others. An attempt to service
1817  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1818  *
1819  *  * the object must be able to fit into RAM (physical memory, though no
1820  *    limited to the mappable aperture).
1821  *
1822  *
1823  * Caveats:
1824  *
1825  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1826  *    all data to system memory. Subsequent access will not be synchronized.
1827  *
1828  *  * all mappings are revoked on runtime device suspend.
1829  *
1830  *  * there are only 8, 16 or 32 fence registers to share between all users
1831  *    (older machines require fence register for display and blitter access
1832  *    as well). Contention of the fence registers will cause the previous users
1833  *    to be unmapped and any new access will generate new page faults.
1834  *
1835  *  * running out of memory while servicing a fault may generate a SIGBUS,
1836  *    rather than the expected SIGSEGV.
1837  */
1838 int i915_gem_mmap_gtt_version(void)
1839 {
1840         return 2;
1841 }
1842
1843 static inline struct i915_ggtt_view
1844 compute_partial_view(struct drm_i915_gem_object *obj,
1845                      pgoff_t page_offset,
1846                      unsigned int chunk)
1847 {
1848         struct i915_ggtt_view view;
1849
1850         if (i915_gem_object_is_tiled(obj))
1851                 chunk = roundup(chunk, tile_row_pages(obj));
1852
1853         view.type = I915_GGTT_VIEW_PARTIAL;
1854         view.partial.offset = rounddown(page_offset, chunk);
1855         view.partial.size =
1856                 min_t(unsigned int, chunk,
1857                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1858
1859         /* If the partial covers the entire object, just create a normal VMA. */
1860         if (chunk >= obj->base.size >> PAGE_SHIFT)
1861                 view.type = I915_GGTT_VIEW_NORMAL;
1862
1863         return view;
1864 }
1865
1866 /**
1867  * i915_gem_fault - fault a page into the GTT
1868  * @vmf: fault info
1869  *
1870  * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
1871  * from userspace.  The fault handler takes care of binding the object to
1872  * the GTT (if needed), allocating and programming a fence register (again,
1873  * only if needed based on whether the old reg is still valid or the object
1874  * is tiled) and inserting a new PTE into the faulting process.
1875  *
1876  * Note that the faulting process may involve evicting existing objects
1877  * from the GTT and/or fence registers to make room.  So performance may
1878  * suffer if the GTT working set is large or there are few fence registers
1879  * left.
1880  *
1881  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1882  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1883  */
1884 int i915_gem_fault(struct vm_fault *vmf)
1885 {
1886 #define MIN_CHUNK_PAGES ((1 << 20) >> PAGE_SHIFT) /* 1 MiB */
1887         struct vm_area_struct *area = vmf->vma;
1888         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1889         struct drm_device *dev = obj->base.dev;
1890         struct drm_i915_private *dev_priv = to_i915(dev);
1891         struct i915_ggtt *ggtt = &dev_priv->ggtt;
1892         bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
1893         struct i915_vma *vma;
1894         pgoff_t page_offset;
1895         unsigned int flags;
1896         int ret;
1897
1898         /* We don't use vmf->pgoff since that has the fake offset */
1899         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
1900
1901         trace_i915_gem_object_fault(obj, page_offset, true, write);
1902
1903         /* Try to flush the object off the GPU first without holding the lock.
1904          * Upon acquiring the lock, we will perform our sanity checks and then
1905          * repeat the flush holding the lock in the normal manner to catch cases
1906          * where we are gazumped.
1907          */
1908         ret = i915_gem_object_wait(obj,
1909                                    I915_WAIT_INTERRUPTIBLE,
1910                                    MAX_SCHEDULE_TIMEOUT,
1911                                    NULL);
1912         if (ret)
1913                 goto err;
1914
1915         ret = i915_gem_object_pin_pages(obj);
1916         if (ret)
1917                 goto err;
1918
1919         intel_runtime_pm_get(dev_priv);
1920
1921         ret = i915_mutex_lock_interruptible(dev);
1922         if (ret)
1923                 goto err_rpm;
1924
1925         /* Access to snoopable pages through the GTT is incoherent. */
1926         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
1927                 ret = -EFAULT;
1928                 goto err_unlock;
1929         }
1930
1931         /* If the object is smaller than a couple of partial vma, it is
1932          * not worth only creating a single partial vma - we may as well
1933          * clear enough space for the full object.
1934          */
1935         flags = PIN_MAPPABLE;
1936         if (obj->base.size > 2 * MIN_CHUNK_PAGES << PAGE_SHIFT)
1937                 flags |= PIN_NONBLOCK | PIN_NONFAULT;
1938
1939         /* Now pin it into the GTT as needed */
1940         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, flags);
1941         if (IS_ERR(vma)) {
1942                 /* Use a partial view if it is bigger than available space */
1943                 struct i915_ggtt_view view =
1944                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
1945
1946                 /* Userspace is now writing through an untracked VMA, abandon
1947                  * all hope that the hardware is able to track future writes.
1948                  */
1949                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
1950
1951                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, PIN_MAPPABLE);
1952         }
1953         if (IS_ERR(vma)) {
1954                 ret = PTR_ERR(vma);
1955                 goto err_unlock;
1956         }
1957
1958         ret = i915_gem_object_set_to_gtt_domain(obj, write);
1959         if (ret)
1960                 goto err_unpin;
1961
1962         ret = i915_vma_pin_fence(vma);
1963         if (ret)
1964                 goto err_unpin;
1965
1966         /* Finally, remap it using the new GTT offset */
1967         ret = remap_io_mapping(area,
1968                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
1969                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
1970                                min_t(u64, vma->size, area->vm_end - area->vm_start),
1971                                &ggtt->iomap);
1972         if (ret)
1973                 goto err_fence;
1974
1975         /* Mark as being mmapped into userspace for later revocation */
1976         assert_rpm_wakelock_held(dev_priv);
1977         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
1978                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
1979         GEM_BUG_ON(!obj->userfault_count);
1980
1981         i915_vma_set_ggtt_write(vma);
1982
1983 err_fence:
1984         i915_vma_unpin_fence(vma);
1985 err_unpin:
1986         __i915_vma_unpin(vma);
1987 err_unlock:
1988         mutex_unlock(&dev->struct_mutex);
1989 err_rpm:
1990         intel_runtime_pm_put(dev_priv);
1991         i915_gem_object_unpin_pages(obj);
1992 err:
1993         switch (ret) {
1994         case -EIO:
1995                 /*
1996                  * We eat errors when the gpu is terminally wedged to avoid
1997                  * userspace unduly crashing (gl has no provisions for mmaps to
1998                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
1999                  * and so needs to be reported.
2000                  */
2001                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
2002                         ret = VM_FAULT_SIGBUS;
2003                         break;
2004                 }
2005         case -EAGAIN:
2006                 /*
2007                  * EAGAIN means the gpu is hung and we'll wait for the error
2008                  * handler to reset everything when re-faulting in
2009                  * i915_mutex_lock_interruptible.
2010                  */
2011         case 0:
2012         case -ERESTARTSYS:
2013         case -EINTR:
2014         case -EBUSY:
2015                 /*
2016                  * EBUSY is ok: this just means that another thread
2017                  * already did the job.
2018                  */
2019                 ret = VM_FAULT_NOPAGE;
2020                 break;
2021         case -ENOMEM:
2022                 ret = VM_FAULT_OOM;
2023                 break;
2024         case -ENOSPC:
2025         case -EFAULT:
2026                 ret = VM_FAULT_SIGBUS;
2027                 break;
2028         default:
2029                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2030                 ret = VM_FAULT_SIGBUS;
2031                 break;
2032         }
2033         return ret;
2034 }
2035
2036 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2037 {
2038         struct i915_vma *vma;
2039
2040         GEM_BUG_ON(!obj->userfault_count);
2041
2042         obj->userfault_count = 0;
2043         list_del(&obj->userfault_link);
2044         drm_vma_node_unmap(&obj->base.vma_node,
2045                            obj->base.dev->anon_inode->i_mapping);
2046
2047         for_each_ggtt_vma(vma, obj)
2048                 i915_vma_unset_userfault(vma);
2049 }
2050
2051 /**
2052  * i915_gem_release_mmap - remove physical page mappings
2053  * @obj: obj in question
2054  *
2055  * Preserve the reservation of the mmapping with the DRM core code, but
2056  * relinquish ownership of the pages back to the system.
2057  *
2058  * It is vital that we remove the page mapping if we have mapped a tiled
2059  * object through the GTT and then lose the fence register due to
2060  * resource pressure. Similarly if the object has been moved out of the
2061  * aperture, than pages mapped into userspace must be revoked. Removing the
2062  * mapping will then trigger a page fault on the next user access, allowing
2063  * fixup by i915_gem_fault().
2064  */
2065 void
2066 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2067 {
2068         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2069
2070         /* Serialisation between user GTT access and our code depends upon
2071          * revoking the CPU's PTE whilst the mutex is held. The next user
2072          * pagefault then has to wait until we release the mutex.
2073          *
2074          * Note that RPM complicates somewhat by adding an additional
2075          * requirement that operations to the GGTT be made holding the RPM
2076          * wakeref.
2077          */
2078         lockdep_assert_held(&i915->drm.struct_mutex);
2079         intel_runtime_pm_get(i915);
2080
2081         if (!obj->userfault_count)
2082                 goto out;
2083
2084         __i915_gem_object_release_mmap(obj);
2085
2086         /* Ensure that the CPU's PTE are revoked and there are not outstanding
2087          * memory transactions from userspace before we return. The TLB
2088          * flushing implied above by changing the PTE above *should* be
2089          * sufficient, an extra barrier here just provides us with a bit
2090          * of paranoid documentation about our requirement to serialise
2091          * memory writes before touching registers / GSM.
2092          */
2093         wmb();
2094
2095 out:
2096         intel_runtime_pm_put(i915);
2097 }
2098
2099 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2100 {
2101         struct drm_i915_gem_object *obj, *on;
2102         int i;
2103
2104         /*
2105          * Only called during RPM suspend. All users of the userfault_list
2106          * must be holding an RPM wakeref to ensure that this can not
2107          * run concurrently with themselves (and use the struct_mutex for
2108          * protection between themselves).
2109          */
2110
2111         list_for_each_entry_safe(obj, on,
2112                                  &dev_priv->mm.userfault_list, userfault_link)
2113                 __i915_gem_object_release_mmap(obj);
2114
2115         /* The fence will be lost when the device powers down. If any were
2116          * in use by hardware (i.e. they are pinned), we should not be powering
2117          * down! All other fences will be reacquired by the user upon waking.
2118          */
2119         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2120                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2121
2122                 /* Ideally we want to assert that the fence register is not
2123                  * live at this point (i.e. that no piece of code will be
2124                  * trying to write through fence + GTT, as that both violates
2125                  * our tracking of activity and associated locking/barriers,
2126                  * but also is illegal given that the hw is powered down).
2127                  *
2128                  * Previously we used reg->pin_count as a "liveness" indicator.
2129                  * That is not sufficient, and we need a more fine-grained
2130                  * tool if we want to have a sanity check here.
2131                  */
2132
2133                 if (!reg->vma)
2134                         continue;
2135
2136                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2137                 reg->dirty = true;
2138         }
2139 }
2140
2141 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2142 {
2143         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2144         int err;
2145
2146         err = drm_gem_create_mmap_offset(&obj->base);
2147         if (likely(!err))
2148                 return 0;
2149
2150         /* Attempt to reap some mmap space from dead objects */
2151         do {
2152                 err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
2153                 if (err)
2154                         break;
2155
2156                 i915_gem_drain_freed_objects(dev_priv);
2157                 err = drm_gem_create_mmap_offset(&obj->base);
2158                 if (!err)
2159                         break;
2160
2161         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2162
2163         return err;
2164 }
2165
2166 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2167 {
2168         drm_gem_free_mmap_offset(&obj->base);
2169 }
2170
2171 int
2172 i915_gem_mmap_gtt(struct drm_file *file,
2173                   struct drm_device *dev,
2174                   uint32_t handle,
2175                   uint64_t *offset)
2176 {
2177         struct drm_i915_gem_object *obj;
2178         int ret;
2179
2180         obj = i915_gem_object_lookup(file, handle);
2181         if (!obj)
2182                 return -ENOENT;
2183
2184         ret = i915_gem_object_create_mmap_offset(obj);
2185         if (ret == 0)
2186                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2187
2188         i915_gem_object_put(obj);
2189         return ret;
2190 }
2191
2192 /**
2193  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2194  * @dev: DRM device
2195  * @data: GTT mapping ioctl data
2196  * @file: GEM object info
2197  *
2198  * Simply returns the fake offset to userspace so it can mmap it.
2199  * The mmap call will end up in drm_gem_mmap(), which will set things
2200  * up so we can get faults in the handler above.
2201  *
2202  * The fault handler will take care of binding the object into the GTT
2203  * (since it may have been evicted to make room for something), allocating
2204  * a fence register, and mapping the appropriate aperture address into
2205  * userspace.
2206  */
2207 int
2208 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2209                         struct drm_file *file)
2210 {
2211         struct drm_i915_gem_mmap_gtt *args = data;
2212
2213         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2214 }
2215
2216 /* Immediately discard the backing storage */
2217 static void
2218 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2219 {
2220         i915_gem_object_free_mmap_offset(obj);
2221
2222         if (obj->base.filp == NULL)
2223                 return;
2224
2225         /* Our goal here is to return as much of the memory as
2226          * is possible back to the system as we are called from OOM.
2227          * To do this we must instruct the shmfs to drop all of its
2228          * backing pages, *now*.
2229          */
2230         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2231         obj->mm.madv = __I915_MADV_PURGED;
2232         obj->mm.pages = ERR_PTR(-EFAULT);
2233 }
2234
2235 /* Try to discard unwanted pages */
2236 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2237 {
2238         struct address_space *mapping;
2239
2240         lockdep_assert_held(&obj->mm.lock);
2241         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2242
2243         switch (obj->mm.madv) {
2244         case I915_MADV_DONTNEED:
2245                 i915_gem_object_truncate(obj);
2246         case __I915_MADV_PURGED:
2247                 return;
2248         }
2249
2250         if (obj->base.filp == NULL)
2251                 return;
2252
2253         mapping = obj->base.filp->f_mapping,
2254         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2255 }
2256
2257 static void
2258 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2259                               struct sg_table *pages)
2260 {
2261         struct sgt_iter sgt_iter;
2262         struct page *page;
2263
2264         __i915_gem_object_release_shmem(obj, pages, true);
2265
2266         i915_gem_gtt_finish_pages(obj, pages);
2267
2268         if (i915_gem_object_needs_bit17_swizzle(obj))
2269                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2270
2271         for_each_sgt_page(page, sgt_iter, pages) {
2272                 if (obj->mm.dirty)
2273                         set_page_dirty(page);
2274
2275                 if (obj->mm.madv == I915_MADV_WILLNEED)
2276                         mark_page_accessed(page);
2277
2278                 put_page(page);
2279         }
2280         obj->mm.dirty = false;
2281
2282         sg_free_table(pages);
2283         kfree(pages);
2284 }
2285
2286 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2287 {
2288         struct radix_tree_iter iter;
2289         void __rcu **slot;
2290
2291         rcu_read_lock();
2292         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2293                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2294         rcu_read_unlock();
2295 }
2296
2297 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2298                                  enum i915_mm_subclass subclass)
2299 {
2300         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2301         struct sg_table *pages;
2302
2303         if (i915_gem_object_has_pinned_pages(obj))
2304                 return;
2305
2306         GEM_BUG_ON(obj->bind_count);
2307         if (!i915_gem_object_has_pages(obj))
2308                 return;
2309
2310         /* May be called by shrinker from within get_pages() (on another bo) */
2311         mutex_lock_nested(&obj->mm.lock, subclass);
2312         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2313                 goto unlock;
2314
2315         /* ->put_pages might need to allocate memory for the bit17 swizzle
2316          * array, hence protect them from being reaped by removing them from gtt
2317          * lists early. */
2318         pages = fetch_and_zero(&obj->mm.pages);
2319         GEM_BUG_ON(!pages);
2320
2321         spin_lock(&i915->mm.obj_lock);
2322         list_del(&obj->mm.link);
2323         spin_unlock(&i915->mm.obj_lock);
2324
2325         if (obj->mm.mapping) {
2326                 void *ptr;
2327
2328                 ptr = page_mask_bits(obj->mm.mapping);
2329                 if (is_vmalloc_addr(ptr))
2330                         vunmap(ptr);
2331                 else
2332                         kunmap(kmap_to_page(ptr));
2333
2334                 obj->mm.mapping = NULL;
2335         }
2336
2337         __i915_gem_object_reset_page_iter(obj);
2338
2339         if (!IS_ERR(pages))
2340                 obj->ops->put_pages(obj, pages);
2341
2342         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2343
2344 unlock:
2345         mutex_unlock(&obj->mm.lock);
2346 }
2347
2348 static bool i915_sg_trim(struct sg_table *orig_st)
2349 {
2350         struct sg_table new_st;
2351         struct scatterlist *sg, *new_sg;
2352         unsigned int i;
2353
2354         if (orig_st->nents == orig_st->orig_nents)
2355                 return false;
2356
2357         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2358                 return false;
2359
2360         new_sg = new_st.sgl;
2361         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2362                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2363                 /* called before being DMA mapped, no need to copy sg->dma_* */
2364                 new_sg = sg_next(new_sg);
2365         }
2366         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2367
2368         sg_free_table(orig_st);
2369
2370         *orig_st = new_st;
2371         return true;
2372 }
2373
2374 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2375 {
2376         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2377         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2378         unsigned long i;
2379         struct address_space *mapping;
2380         struct sg_table *st;
2381         struct scatterlist *sg;
2382         struct sgt_iter sgt_iter;
2383         struct page *page;
2384         unsigned long last_pfn = 0;     /* suppress gcc warning */
2385         unsigned int max_segment = i915_sg_segment_size();
2386         unsigned int sg_page_sizes;
2387         gfp_t noreclaim;
2388         int ret;
2389
2390         /* Assert that the object is not currently in any GPU domain. As it
2391          * wasn't in the GTT, there shouldn't be any way it could have been in
2392          * a GPU cache
2393          */
2394         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2395         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2396
2397         st = kmalloc(sizeof(*st), GFP_KERNEL);
2398         if (st == NULL)
2399                 return -ENOMEM;
2400
2401 rebuild_st:
2402         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2403                 kfree(st);
2404                 return -ENOMEM;
2405         }
2406
2407         /* Get the list of pages out of our struct file.  They'll be pinned
2408          * at this point until we release them.
2409          *
2410          * Fail silently without starting the shrinker
2411          */
2412         mapping = obj->base.filp->f_mapping;
2413         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2414         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2415
2416         sg = st->sgl;
2417         st->nents = 0;
2418         sg_page_sizes = 0;
2419         for (i = 0; i < page_count; i++) {
2420                 const unsigned int shrink[] = {
2421                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2422                         0,
2423                 }, *s = shrink;
2424                 gfp_t gfp = noreclaim;
2425
2426                 do {
2427                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2428                         if (likely(!IS_ERR(page)))
2429                                 break;
2430
2431                         if (!*s) {
2432                                 ret = PTR_ERR(page);
2433                                 goto err_sg;
2434                         }
2435
2436                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2437                         cond_resched();
2438
2439                         /* We've tried hard to allocate the memory by reaping
2440                          * our own buffer, now let the real VM do its job and
2441                          * go down in flames if truly OOM.
2442                          *
2443                          * However, since graphics tend to be disposable,
2444                          * defer the oom here by reporting the ENOMEM back
2445                          * to userspace.
2446                          */
2447                         if (!*s) {
2448                                 /* reclaim and warn, but no oom */
2449                                 gfp = mapping_gfp_mask(mapping);
2450
2451                                 /* Our bo are always dirty and so we require
2452                                  * kswapd to reclaim our pages (direct reclaim
2453                                  * does not effectively begin pageout of our
2454                                  * buffers on its own). However, direct reclaim
2455                                  * only waits for kswapd when under allocation
2456                                  * congestion. So as a result __GFP_RECLAIM is
2457                                  * unreliable and fails to actually reclaim our
2458                                  * dirty pages -- unless you try over and over
2459                                  * again with !__GFP_NORETRY. However, we still
2460                                  * want to fail this allocation rather than
2461                                  * trigger the out-of-memory killer and for
2462                                  * this we want __GFP_RETRY_MAYFAIL.
2463                                  */
2464                                 gfp |= __GFP_RETRY_MAYFAIL;
2465                         }
2466                 } while (1);
2467
2468                 if (!i ||
2469                     sg->length >= max_segment ||
2470                     page_to_pfn(page) != last_pfn + 1) {
2471                         if (i) {
2472                                 sg_page_sizes |= sg->length;
2473                                 sg = sg_next(sg);
2474                         }
2475                         st->nents++;
2476                         sg_set_page(sg, page, PAGE_SIZE, 0);
2477                 } else {
2478                         sg->length += PAGE_SIZE;
2479                 }
2480                 last_pfn = page_to_pfn(page);
2481
2482                 /* Check that the i965g/gm workaround works. */
2483                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2484         }
2485         if (sg) { /* loop terminated early; short sg table */
2486                 sg_page_sizes |= sg->length;
2487                 sg_mark_end(sg);
2488         }
2489
2490         /* Trim unused sg entries to avoid wasting memory. */
2491         i915_sg_trim(st);
2492
2493         ret = i915_gem_gtt_prepare_pages(obj, st);
2494         if (ret) {
2495                 /* DMA remapping failed? One possible cause is that
2496                  * it could not reserve enough large entries, asking
2497                  * for PAGE_SIZE chunks instead may be helpful.
2498                  */
2499                 if (max_segment > PAGE_SIZE) {
2500                         for_each_sgt_page(page, sgt_iter, st)
2501                                 put_page(page);
2502                         sg_free_table(st);
2503
2504                         max_segment = PAGE_SIZE;
2505                         goto rebuild_st;
2506                 } else {
2507                         dev_warn(&dev_priv->drm.pdev->dev,
2508                                  "Failed to DMA remap %lu pages\n",
2509                                  page_count);
2510                         goto err_pages;
2511                 }
2512         }
2513
2514         if (i915_gem_object_needs_bit17_swizzle(obj))
2515                 i915_gem_object_do_bit_17_swizzle(obj, st);
2516
2517         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2518
2519         return 0;
2520
2521 err_sg:
2522         sg_mark_end(sg);
2523 err_pages:
2524         for_each_sgt_page(page, sgt_iter, st)
2525                 put_page(page);
2526         sg_free_table(st);
2527         kfree(st);
2528
2529         /* shmemfs first checks if there is enough memory to allocate the page
2530          * and reports ENOSPC should there be insufficient, along with the usual
2531          * ENOMEM for a genuine allocation failure.
2532          *
2533          * We use ENOSPC in our driver to mean that we have run out of aperture
2534          * space and so want to translate the error from shmemfs back to our
2535          * usual understanding of ENOMEM.
2536          */
2537         if (ret == -ENOSPC)
2538                 ret = -ENOMEM;
2539
2540         return ret;
2541 }
2542
2543 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2544                                  struct sg_table *pages,
2545                                  unsigned int sg_page_sizes)
2546 {
2547         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2548         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2549         int i;
2550
2551         lockdep_assert_held(&obj->mm.lock);
2552
2553         obj->mm.get_page.sg_pos = pages->sgl;
2554         obj->mm.get_page.sg_idx = 0;
2555
2556         obj->mm.pages = pages;
2557
2558         if (i915_gem_object_is_tiled(obj) &&
2559             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2560                 GEM_BUG_ON(obj->mm.quirked);
2561                 __i915_gem_object_pin_pages(obj);
2562                 obj->mm.quirked = true;
2563         }
2564
2565         GEM_BUG_ON(!sg_page_sizes);
2566         obj->mm.page_sizes.phys = sg_page_sizes;
2567
2568         /*
2569          * Calculate the supported page-sizes which fit into the given
2570          * sg_page_sizes. This will give us the page-sizes which we may be able
2571          * to use opportunistically when later inserting into the GTT. For
2572          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2573          * 64K or 4K pages, although in practice this will depend on a number of
2574          * other factors.
2575          */
2576         obj->mm.page_sizes.sg = 0;
2577         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2578                 if (obj->mm.page_sizes.phys & ~0u << i)
2579                         obj->mm.page_sizes.sg |= BIT(i);
2580         }
2581         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2582
2583         spin_lock(&i915->mm.obj_lock);
2584         list_add(&obj->mm.link, &i915->mm.unbound_list);
2585         spin_unlock(&i915->mm.obj_lock);
2586 }
2587
2588 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2589 {
2590         int err;
2591
2592         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2593                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2594                 return -EFAULT;
2595         }
2596
2597         err = obj->ops->get_pages(obj);
2598         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2599
2600         return err;
2601 }
2602
2603 /* Ensure that the associated pages are gathered from the backing storage
2604  * and pinned into our object. i915_gem_object_pin_pages() may be called
2605  * multiple times before they are released by a single call to
2606  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2607  * either as a result of memory pressure (reaping pages under the shrinker)
2608  * or as the object is itself released.
2609  */
2610 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2611 {
2612         int err;
2613
2614         err = mutex_lock_interruptible(&obj->mm.lock);
2615         if (err)
2616                 return err;
2617
2618         if (unlikely(!i915_gem_object_has_pages(obj))) {
2619                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2620
2621                 err = ____i915_gem_object_get_pages(obj);
2622                 if (err)
2623                         goto unlock;
2624
2625                 smp_mb__before_atomic();
2626         }
2627         atomic_inc(&obj->mm.pages_pin_count);
2628
2629 unlock:
2630         mutex_unlock(&obj->mm.lock);
2631         return err;
2632 }
2633
2634 /* The 'mapping' part of i915_gem_object_pin_map() below */
2635 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2636                                  enum i915_map_type type)
2637 {
2638         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2639         struct sg_table *sgt = obj->mm.pages;
2640         struct sgt_iter sgt_iter;
2641         struct page *page;
2642         struct page *stack_pages[32];
2643         struct page **pages = stack_pages;
2644         unsigned long i = 0;
2645         pgprot_t pgprot;
2646         void *addr;
2647
2648         /* A single page can always be kmapped */
2649         if (n_pages == 1 && type == I915_MAP_WB)
2650                 return kmap(sg_page(sgt->sgl));
2651
2652         if (n_pages > ARRAY_SIZE(stack_pages)) {
2653                 /* Too big for stack -- allocate temporary array instead */
2654                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2655                 if (!pages)
2656                         return NULL;
2657         }
2658
2659         for_each_sgt_page(page, sgt_iter, sgt)
2660                 pages[i++] = page;
2661
2662         /* Check that we have the expected number of pages */
2663         GEM_BUG_ON(i != n_pages);
2664
2665         switch (type) {
2666         default:
2667                 MISSING_CASE(type);
2668                 /* fallthrough to use PAGE_KERNEL anyway */
2669         case I915_MAP_WB:
2670                 pgprot = PAGE_KERNEL;
2671                 break;
2672         case I915_MAP_WC:
2673                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2674                 break;
2675         }
2676         addr = vmap(pages, n_pages, 0, pgprot);
2677
2678         if (pages != stack_pages)
2679                 kvfree(pages);
2680
2681         return addr;
2682 }
2683
2684 /* get, pin, and map the pages of the object into kernel space */
2685 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2686                               enum i915_map_type type)
2687 {
2688         enum i915_map_type has_type;
2689         bool pinned;
2690         void *ptr;
2691         int ret;
2692
2693         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2694                 return ERR_PTR(-ENXIO);
2695
2696         ret = mutex_lock_interruptible(&obj->mm.lock);
2697         if (ret)
2698                 return ERR_PTR(ret);
2699
2700         pinned = !(type & I915_MAP_OVERRIDE);
2701         type &= ~I915_MAP_OVERRIDE;
2702
2703         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2704                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2705                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2706
2707                         ret = ____i915_gem_object_get_pages(obj);
2708                         if (ret)
2709                                 goto err_unlock;
2710
2711                         smp_mb__before_atomic();
2712                 }
2713                 atomic_inc(&obj->mm.pages_pin_count);
2714                 pinned = false;
2715         }
2716         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2717
2718         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2719         if (ptr && has_type != type) {
2720                 if (pinned) {
2721                         ret = -EBUSY;
2722                         goto err_unpin;
2723                 }
2724
2725                 if (is_vmalloc_addr(ptr))
2726                         vunmap(ptr);
2727                 else
2728                         kunmap(kmap_to_page(ptr));
2729
2730                 ptr = obj->mm.mapping = NULL;
2731         }
2732
2733         if (!ptr) {
2734                 ptr = i915_gem_object_map(obj, type);
2735                 if (!ptr) {
2736                         ret = -ENOMEM;
2737                         goto err_unpin;
2738                 }
2739
2740                 obj->mm.mapping = page_pack_bits(ptr, type);
2741         }
2742
2743 out_unlock:
2744         mutex_unlock(&obj->mm.lock);
2745         return ptr;
2746
2747 err_unpin:
2748         atomic_dec(&obj->mm.pages_pin_count);
2749 err_unlock:
2750         ptr = ERR_PTR(ret);
2751         goto out_unlock;
2752 }
2753
2754 static int
2755 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2756                            const struct drm_i915_gem_pwrite *arg)
2757 {
2758         struct address_space *mapping = obj->base.filp->f_mapping;
2759         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2760         u64 remain, offset;
2761         unsigned int pg;
2762
2763         /* Before we instantiate/pin the backing store for our use, we
2764          * can prepopulate the shmemfs filp efficiently using a write into
2765          * the pagecache. We avoid the penalty of instantiating all the
2766          * pages, important if the user is just writing to a few and never
2767          * uses the object on the GPU, and using a direct write into shmemfs
2768          * allows it to avoid the cost of retrieving a page (either swapin
2769          * or clearing-before-use) before it is overwritten.
2770          */
2771         if (i915_gem_object_has_pages(obj))
2772                 return -ENODEV;
2773
2774         if (obj->mm.madv != I915_MADV_WILLNEED)
2775                 return -EFAULT;
2776
2777         /* Before the pages are instantiated the object is treated as being
2778          * in the CPU domain. The pages will be clflushed as required before
2779          * use, and we can freely write into the pages directly. If userspace
2780          * races pwrite with any other operation; corruption will ensue -
2781          * that is userspace's prerogative!
2782          */
2783
2784         remain = arg->size;
2785         offset = arg->offset;
2786         pg = offset_in_page(offset);
2787
2788         do {
2789                 unsigned int len, unwritten;
2790                 struct page *page;
2791                 void *data, *vaddr;
2792                 int err;
2793
2794                 len = PAGE_SIZE - pg;
2795                 if (len > remain)
2796                         len = remain;
2797
2798                 err = pagecache_write_begin(obj->base.filp, mapping,
2799                                             offset, len, 0,
2800                                             &page, &data);
2801                 if (err < 0)
2802                         return err;
2803
2804                 vaddr = kmap(page);
2805                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2806                 kunmap(page);
2807
2808                 err = pagecache_write_end(obj->base.filp, mapping,
2809                                           offset, len, len - unwritten,
2810                                           page, data);
2811                 if (err < 0)
2812                         return err;
2813
2814                 if (unwritten)
2815                         return -EFAULT;
2816
2817                 remain -= len;
2818                 user_data += len;
2819                 offset += len;
2820                 pg = 0;
2821         } while (remain);
2822
2823         return 0;
2824 }
2825
2826 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
2827 {
2828         bool banned;
2829
2830         atomic_inc(&ctx->guilty_count);
2831
2832         banned = false;
2833         if (i915_gem_context_is_bannable(ctx)) {
2834                 unsigned int score;
2835
2836                 score = atomic_add_return(CONTEXT_SCORE_GUILTY,
2837                                           &ctx->ban_score);
2838                 banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
2839
2840                 DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
2841                                  ctx->name, score, yesno(banned));
2842         }
2843         if (!banned)
2844                 return;
2845
2846         i915_gem_context_set_banned(ctx);
2847         if (!IS_ERR_OR_NULL(ctx->file_priv)) {
2848                 atomic_inc(&ctx->file_priv->context_bans);
2849                 DRM_DEBUG_DRIVER("client %s has had %d context banned\n",
2850                                  ctx->name, atomic_read(&ctx->file_priv->context_bans));
2851         }
2852 }
2853
2854 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
2855 {
2856         atomic_inc(&ctx->active_count);
2857 }
2858
2859 struct drm_i915_gem_request *
2860 i915_gem_find_active_request(struct intel_engine_cs *engine)
2861 {
2862         struct drm_i915_gem_request *request, *active = NULL;
2863         unsigned long flags;
2864
2865         /* We are called by the error capture and reset at a random
2866          * point in time. In particular, note that neither is crucially
2867          * ordered with an interrupt. After a hang, the GPU is dead and we
2868          * assume that no more writes can happen (we waited long enough for
2869          * all writes that were in transaction to be flushed) - adding an
2870          * extra delay for a recent interrupt is pointless. Hence, we do
2871          * not need an engine->irq_seqno_barrier() before the seqno reads.
2872          */
2873         spin_lock_irqsave(&engine->timeline->lock, flags);
2874         list_for_each_entry(request, &engine->timeline->requests, link) {
2875                 if (__i915_gem_request_completed(request,
2876                                                  request->global_seqno))
2877                         continue;
2878
2879                 GEM_BUG_ON(request->engine != engine);
2880                 GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
2881                                     &request->fence.flags));
2882
2883                 active = request;
2884                 break;
2885         }
2886         spin_unlock_irqrestore(&engine->timeline->lock, flags);
2887
2888         return active;
2889 }
2890
2891 static bool engine_stalled(struct intel_engine_cs *engine)
2892 {
2893         if (!engine->hangcheck.stalled)
2894                 return false;
2895
2896         /* Check for possible seqno movement after hang declaration */
2897         if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) {
2898                 DRM_DEBUG_DRIVER("%s pardoned\n", engine->name);
2899                 return false;
2900         }
2901
2902         return true;
2903 }
2904
2905 /*
2906  * Ensure irq handler finishes, and not run again.
2907  * Also return the active request so that we only search for it once.
2908  */
2909 struct drm_i915_gem_request *
2910 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
2911 {
2912         struct drm_i915_gem_request *request = NULL;
2913
2914         /*
2915          * During the reset sequence, we must prevent the engine from
2916          * entering RC6. As the context state is undefined until we restart
2917          * the engine, if it does enter RC6 during the reset, the state
2918          * written to the powercontext is undefined and so we may lose
2919          * GPU state upon resume, i.e. fail to restart after a reset.
2920          */
2921         intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
2922
2923         /*
2924          * Prevent the signaler thread from updating the request
2925          * state (by calling dma_fence_signal) as we are processing
2926          * the reset. The write from the GPU of the seqno is
2927          * asynchronous and the signaler thread may see a different
2928          * value to us and declare the request complete, even though
2929          * the reset routine have picked that request as the active
2930          * (incomplete) request. This conflict is not handled
2931          * gracefully!
2932          */
2933         kthread_park(engine->breadcrumbs.signaler);
2934
2935         /*
2936          * Prevent request submission to the hardware until we have
2937          * completed the reset in i915_gem_reset_finish(). If a request
2938          * is completed by one engine, it may then queue a request
2939          * to a second via its execlists->tasklet *just* as we are
2940          * calling engine->init_hw() and also writing the ELSP.
2941          * Turning off the execlists->tasklet until the reset is over
2942          * prevents the race.
2943          */
2944         tasklet_kill(&engine->execlists.tasklet);
2945         tasklet_disable(&engine->execlists.tasklet);
2946
2947         /*
2948          * We're using worker to queue preemption requests from the tasklet in
2949          * GuC submission mode.
2950          * Even though tasklet was disabled, we may still have a worker queued.
2951          * Let's make sure that all workers scheduled before disabling the
2952          * tasklet are completed before continuing with the reset.
2953          */
2954         if (engine->i915->guc.preempt_wq)
2955                 flush_workqueue(engine->i915->guc.preempt_wq);
2956
2957         if (engine->irq_seqno_barrier)
2958                 engine->irq_seqno_barrier(engine);
2959
2960         request = i915_gem_find_active_request(engine);
2961         if (request && request->fence.error == -EIO)
2962                 request = ERR_PTR(-EIO); /* Previous reset failed! */
2963
2964         return request;
2965 }
2966
2967 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
2968 {
2969         struct intel_engine_cs *engine;
2970         struct drm_i915_gem_request *request;
2971         enum intel_engine_id id;
2972         int err = 0;
2973
2974         for_each_engine(engine, dev_priv, id) {
2975                 request = i915_gem_reset_prepare_engine(engine);
2976                 if (IS_ERR(request)) {
2977                         err = PTR_ERR(request);
2978                         continue;
2979                 }
2980
2981                 engine->hangcheck.active_request = request;
2982         }
2983
2984         i915_gem_revoke_fences(dev_priv);
2985
2986         return err;
2987 }
2988
2989 static void skip_request(struct drm_i915_gem_request *request)
2990 {
2991         void *vaddr = request->ring->vaddr;
2992         u32 head;
2993
2994         /* As this request likely depends on state from the lost
2995          * context, clear out all the user operations leaving the
2996          * breadcrumb at the end (so we get the fence notifications).
2997          */
2998         head = request->head;
2999         if (request->postfix < head) {
3000                 memset(vaddr + head, 0, request->ring->size - head);
3001                 head = 0;
3002         }
3003         memset(vaddr + head, 0, request->postfix - head);
3004
3005         dma_fence_set_error(&request->fence, -EIO);
3006 }
3007
3008 static void engine_skip_context(struct drm_i915_gem_request *request)
3009 {
3010         struct intel_engine_cs *engine = request->engine;
3011         struct i915_gem_context *hung_ctx = request->ctx;
3012         struct intel_timeline *timeline;
3013         unsigned long flags;
3014
3015         timeline = i915_gem_context_lookup_timeline(hung_ctx, engine);
3016
3017         spin_lock_irqsave(&engine->timeline->lock, flags);
3018         spin_lock(&timeline->lock);
3019
3020         list_for_each_entry_continue(request, &engine->timeline->requests, link)
3021                 if (request->ctx == hung_ctx)
3022                         skip_request(request);
3023
3024         list_for_each_entry(request, &timeline->requests, link)
3025                 skip_request(request);
3026
3027         spin_unlock(&timeline->lock);
3028         spin_unlock_irqrestore(&engine->timeline->lock, flags);
3029 }
3030
3031 /* Returns the request if it was guilty of the hang */
3032 static struct drm_i915_gem_request *
3033 i915_gem_reset_request(struct intel_engine_cs *engine,
3034                        struct drm_i915_gem_request *request)
3035 {
3036         /* The guilty request will get skipped on a hung engine.
3037          *
3038          * Users of client default contexts do not rely on logical
3039          * state preserved between batches so it is safe to execute
3040          * queued requests following the hang. Non default contexts
3041          * rely on preserved state, so skipping a batch loses the
3042          * evolution of the state and it needs to be considered corrupted.
3043          * Executing more queued batches on top of corrupted state is
3044          * risky. But we take the risk by trying to advance through
3045          * the queued requests in order to make the client behaviour
3046          * more predictable around resets, by not throwing away random
3047          * amount of batches it has prepared for execution. Sophisticated
3048          * clients can use gem_reset_stats_ioctl and dma fence status
3049          * (exported via sync_file info ioctl on explicit fences) to observe
3050          * when it loses the context state and should rebuild accordingly.
3051          *
3052          * The context ban, and ultimately the client ban, mechanism are safety
3053          * valves if client submission ends up resulting in nothing more than
3054          * subsequent hangs.
3055          */
3056
3057         if (engine_stalled(engine)) {
3058                 i915_gem_context_mark_guilty(request->ctx);
3059                 skip_request(request);
3060
3061                 /* If this context is now banned, skip all pending requests. */
3062                 if (i915_gem_context_is_banned(request->ctx))
3063                         engine_skip_context(request);
3064         } else {
3065                 /*
3066                  * Since this is not the hung engine, it may have advanced
3067                  * since the hang declaration. Double check by refinding
3068                  * the active request at the time of the reset.
3069                  */
3070                 request = i915_gem_find_active_request(engine);
3071                 if (request) {
3072                         i915_gem_context_mark_innocent(request->ctx);
3073                         dma_fence_set_error(&request->fence, -EAGAIN);
3074
3075                         /* Rewind the engine to replay the incomplete rq */
3076                         spin_lock_irq(&engine->timeline->lock);
3077                         request = list_prev_entry(request, link);
3078                         if (&request->link == &engine->timeline->requests)
3079                                 request = NULL;
3080                         spin_unlock_irq(&engine->timeline->lock);
3081                 }
3082         }
3083
3084         return request;
3085 }
3086
3087 void i915_gem_reset_engine(struct intel_engine_cs *engine,
3088                            struct drm_i915_gem_request *request)
3089 {
3090         /*
3091          * Make sure this write is visible before we re-enable the interrupt
3092          * handlers on another CPU, as tasklet_enable() resolves to just
3093          * a compiler barrier which is insufficient for our purpose here.
3094          */
3095         smp_store_mb(engine->irq_posted, 0);
3096
3097         if (request)
3098                 request = i915_gem_reset_request(engine, request);
3099
3100         if (request) {
3101                 DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
3102                                  engine->name, request->global_seqno);
3103         }
3104
3105         /* Setup the CS to resume from the breadcrumb of the hung request */
3106         engine->reset_hw(engine, request);
3107 }
3108
3109 void i915_gem_reset(struct drm_i915_private *dev_priv)
3110 {
3111         struct intel_engine_cs *engine;
3112         enum intel_engine_id id;
3113
3114         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3115
3116         i915_gem_retire_requests(dev_priv);
3117
3118         for_each_engine(engine, dev_priv, id) {
3119                 struct i915_gem_context *ctx;
3120
3121                 i915_gem_reset_engine(engine, engine->hangcheck.active_request);
3122                 ctx = fetch_and_zero(&engine->last_retired_context);
3123                 if (ctx)
3124                         engine->context_unpin(engine, ctx);
3125
3126                 /*
3127                  * Ostensibily, we always want a context loaded for powersaving,
3128                  * so if the engine is idle after the reset, send a request
3129                  * to load our scratch kernel_context.
3130                  *
3131                  * More mysteriously, if we leave the engine idle after a reset,
3132                  * the next userspace batch may hang, with what appears to be
3133                  * an incoherent read by the CS (presumably stale TLB). An
3134                  * empty request appears sufficient to paper over the glitch.
3135                  */
3136                 if (intel_engine_is_idle(engine)) {
3137                         struct drm_i915_gem_request *rq;
3138
3139                         rq = i915_gem_request_alloc(engine,
3140                                                     dev_priv->kernel_context);
3141                         if (!IS_ERR(rq))
3142                                 __i915_add_request(rq, false);
3143                 }
3144         }
3145
3146         i915_gem_restore_fences(dev_priv);
3147
3148         if (dev_priv->gt.awake) {
3149                 intel_sanitize_gt_powersave(dev_priv);
3150                 intel_enable_gt_powersave(dev_priv);
3151                 if (INTEL_GEN(dev_priv) >= 6)
3152                         gen6_rps_busy(dev_priv);
3153         }
3154 }
3155
3156 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3157 {
3158         tasklet_enable(&engine->execlists.tasklet);
3159         kthread_unpark(engine->breadcrumbs.signaler);
3160
3161         intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3162 }
3163
3164 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3165 {
3166         struct intel_engine_cs *engine;
3167         enum intel_engine_id id;
3168
3169         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3170
3171         for_each_engine(engine, dev_priv, id) {
3172                 engine->hangcheck.active_request = NULL;
3173                 i915_gem_reset_finish_engine(engine);
3174         }
3175 }
3176
3177 static void nop_submit_request(struct drm_i915_gem_request *request)
3178 {
3179         dma_fence_set_error(&request->fence, -EIO);
3180
3181         i915_gem_request_submit(request);
3182 }
3183
3184 static void nop_complete_submit_request(struct drm_i915_gem_request *request)
3185 {
3186         unsigned long flags;
3187
3188         dma_fence_set_error(&request->fence, -EIO);
3189
3190         spin_lock_irqsave(&request->engine->timeline->lock, flags);
3191         __i915_gem_request_submit(request);
3192         intel_engine_init_global_seqno(request->engine, request->global_seqno);
3193         spin_unlock_irqrestore(&request->engine->timeline->lock, flags);
3194 }
3195
3196 void i915_gem_set_wedged(struct drm_i915_private *i915)
3197 {
3198         struct intel_engine_cs *engine;
3199         enum intel_engine_id id;
3200
3201         if (drm_debug & DRM_UT_DRIVER) {
3202                 struct drm_printer p = drm_debug_printer(__func__);
3203
3204                 for_each_engine(engine, i915, id)
3205                         intel_engine_dump(engine, &p, "%s\n", engine->name);
3206         }
3207
3208         set_bit(I915_WEDGED, &i915->gpu_error.flags);
3209         smp_mb__after_atomic();
3210
3211         /*
3212          * First, stop submission to hw, but do not yet complete requests by
3213          * rolling the global seqno forward (since this would complete requests
3214          * for which we haven't set the fence error to EIO yet).
3215          */
3216         for_each_engine(engine, i915, id)
3217                 engine->submit_request = nop_submit_request;
3218
3219         /*
3220          * Make sure no one is running the old callback before we proceed with
3221          * cancelling requests and resetting the completion tracking. Otherwise
3222          * we might submit a request to the hardware which never completes.
3223          */
3224         synchronize_rcu();
3225
3226         for_each_engine(engine, i915, id) {
3227                 /* Mark all executing requests as skipped */
3228                 engine->cancel_requests(engine);
3229
3230                 /*
3231                  * Only once we've force-cancelled all in-flight requests can we
3232                  * start to complete all requests.
3233                  */
3234                 engine->submit_request = nop_complete_submit_request;
3235                 engine->schedule = NULL;
3236         }
3237
3238         i915->caps.scheduler = 0;
3239
3240         /*
3241          * Make sure no request can slip through without getting completed by
3242          * either this call here to intel_engine_init_global_seqno, or the one
3243          * in nop_complete_submit_request.
3244          */
3245         synchronize_rcu();
3246
3247         for_each_engine(engine, i915, id) {
3248                 unsigned long flags;
3249
3250                 /*
3251                  * Mark all pending requests as complete so that any concurrent
3252                  * (lockless) lookup doesn't try and wait upon the request as we
3253                  * reset it.
3254                  */
3255                 spin_lock_irqsave(&engine->timeline->lock, flags);
3256                 intel_engine_init_global_seqno(engine,
3257                                                intel_engine_last_submit(engine));
3258                 spin_unlock_irqrestore(&engine->timeline->lock, flags);
3259         }
3260
3261         wake_up_all(&i915->gpu_error.reset_queue);
3262 }
3263
3264 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3265 {
3266         struct i915_gem_timeline *tl;
3267         int i;
3268
3269         lockdep_assert_held(&i915->drm.struct_mutex);
3270         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3271                 return true;
3272
3273         /* Before unwedging, make sure that all pending operations
3274          * are flushed and errored out - we may have requests waiting upon
3275          * third party fences. We marked all inflight requests as EIO, and
3276          * every execbuf since returned EIO, for consistency we want all
3277          * the currently pending requests to also be marked as EIO, which
3278          * is done inside our nop_submit_request - and so we must wait.
3279          *
3280          * No more can be submitted until we reset the wedged bit.
3281          */
3282         list_for_each_entry(tl, &i915->gt.timelines, link) {
3283                 for (i = 0; i < ARRAY_SIZE(tl->engine); i++) {
3284                         struct drm_i915_gem_request *rq;
3285
3286                         rq = i915_gem_active_peek(&tl->engine[i].last_request,
3287                                                   &i915->drm.struct_mutex);
3288                         if (!rq)
3289                                 continue;
3290
3291                         /* We can't use our normal waiter as we want to
3292                          * avoid recursively trying to handle the current
3293                          * reset. The basic dma_fence_default_wait() installs
3294                          * a callback for dma_fence_signal(), which is
3295                          * triggered by our nop handler (indirectly, the
3296                          * callback enables the signaler thread which is
3297                          * woken by the nop_submit_request() advancing the seqno
3298                          * and when the seqno passes the fence, the signaler
3299                          * then signals the fence waking us up).
3300                          */
3301                         if (dma_fence_default_wait(&rq->fence, true,
3302                                                    MAX_SCHEDULE_TIMEOUT) < 0)
3303                                 return false;
3304                 }
3305         }
3306
3307         /* Undo nop_submit_request. We prevent all new i915 requests from
3308          * being queued (by disallowing execbuf whilst wedged) so having
3309          * waited for all active requests above, we know the system is idle
3310          * and do not have to worry about a thread being inside
3311          * engine->submit_request() as we swap over. So unlike installing
3312          * the nop_submit_request on reset, we can do this from normal
3313          * context and do not require stop_machine().
3314          */
3315         intel_engines_reset_default_submission(i915);
3316         i915_gem_contexts_lost(i915);
3317
3318         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3319         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3320
3321         return true;
3322 }
3323
3324 static void
3325 i915_gem_retire_work_handler(struct work_struct *work)
3326 {
3327         struct drm_i915_private *dev_priv =
3328                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3329         struct drm_device *dev = &dev_priv->drm;
3330
3331         /* Come back later if the device is busy... */
3332         if (mutex_trylock(&dev->struct_mutex)) {
3333                 i915_gem_retire_requests(dev_priv);
3334                 mutex_unlock(&dev->struct_mutex);
3335         }
3336
3337         /*
3338          * Keep the retire handler running until we are finally idle.
3339          * We do not need to do this test under locking as in the worst-case
3340          * we queue the retire worker once too often.
3341          */
3342         if (READ_ONCE(dev_priv->gt.awake))
3343                 queue_delayed_work(dev_priv->wq,
3344                                    &dev_priv->gt.retire_work,
3345                                    round_jiffies_up_relative(HZ));
3346 }
3347
3348 static void shrink_caches(struct drm_i915_private *i915)
3349 {
3350         /*
3351          * kmem_cache_shrink() discards empty slabs and reorders partially
3352          * filled slabs to prioritise allocating from the mostly full slabs,
3353          * with the aim of reducing fragmentation.
3354          */
3355         kmem_cache_shrink(i915->priorities);
3356         kmem_cache_shrink(i915->dependencies);
3357         kmem_cache_shrink(i915->requests);
3358         kmem_cache_shrink(i915->luts);
3359         kmem_cache_shrink(i915->vmas);
3360         kmem_cache_shrink(i915->objects);
3361 }
3362
3363 struct sleep_rcu_work {
3364         union {
3365                 struct rcu_head rcu;
3366                 struct work_struct work;
3367         };
3368         struct drm_i915_private *i915;
3369         unsigned int epoch;
3370 };
3371
3372 static inline bool
3373 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3374 {
3375         /*
3376          * There is a small chance that the epoch wrapped since we started
3377          * sleeping. If we assume that epoch is at least a u32, then it will
3378          * take at least 2^32 * 100ms for it to wrap, or about 326 years.
3379          */
3380         return epoch == READ_ONCE(i915->gt.epoch);
3381 }
3382
3383 static void __sleep_work(struct work_struct *work)
3384 {
3385         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3386         struct drm_i915_private *i915 = s->i915;
3387         unsigned int epoch = s->epoch;
3388
3389         kfree(s);
3390         if (same_epoch(i915, epoch))
3391                 shrink_caches(i915);
3392 }
3393
3394 static void __sleep_rcu(struct rcu_head *rcu)
3395 {
3396         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3397         struct drm_i915_private *i915 = s->i915;
3398
3399         if (same_epoch(i915, s->epoch)) {
3400                 INIT_WORK(&s->work, __sleep_work);
3401                 queue_work(i915->wq, &s->work);
3402         } else {
3403                 kfree(s);
3404         }
3405 }
3406
3407 static inline bool
3408 new_requests_since_last_retire(const struct drm_i915_private *i915)
3409 {
3410         return (READ_ONCE(i915->gt.active_requests) ||
3411                 work_pending(&i915->gt.idle_work.work));
3412 }
3413
3414 static void
3415 i915_gem_idle_work_handler(struct work_struct *work)
3416 {
3417         struct drm_i915_private *dev_priv =
3418                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3419         unsigned int epoch = I915_EPOCH_INVALID;
3420         bool rearm_hangcheck;
3421         ktime_t end;
3422
3423         if (!READ_ONCE(dev_priv->gt.awake))
3424                 return;
3425
3426         /*
3427          * Wait for last execlists context complete, but bail out in case a
3428          * new request is submitted.
3429          */
3430         end = ktime_add_ms(ktime_get(), I915_IDLE_ENGINES_TIMEOUT);
3431         do {
3432                 if (new_requests_since_last_retire(dev_priv))
3433                         return;
3434
3435                 if (intel_engines_are_idle(dev_priv))
3436                         break;
3437
3438                 usleep_range(100, 500);
3439         } while (ktime_before(ktime_get(), end));
3440
3441         rearm_hangcheck =
3442                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3443
3444         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3445                 /* Currently busy, come back later */
3446                 mod_delayed_work(dev_priv->wq,
3447                                  &dev_priv->gt.idle_work,
3448                                  msecs_to_jiffies(50));
3449                 goto out_rearm;
3450         }
3451
3452         /*
3453          * New request retired after this work handler started, extend active
3454          * period until next instance of the work.
3455          */
3456         if (new_requests_since_last_retire(dev_priv))
3457                 goto out_unlock;
3458
3459         /*
3460          * Be paranoid and flush a concurrent interrupt to make sure
3461          * we don't reactivate any irq tasklets after parking.
3462          *
3463          * FIXME: Note that even though we have waited for execlists to be idle,
3464          * there may still be an in-flight interrupt even though the CSB
3465          * is now empty. synchronize_irq() makes sure that a residual interrupt
3466          * is completed before we continue, but it doesn't prevent the HW from
3467          * raising a spurious interrupt later. To complete the shield we should
3468          * coordinate disabling the CS irq with flushing the interrupts.
3469          */
3470         synchronize_irq(dev_priv->drm.irq);
3471
3472         intel_engines_park(dev_priv);
3473         i915_gem_timelines_park(dev_priv);
3474
3475         i915_pmu_gt_parked(dev_priv);
3476
3477         GEM_BUG_ON(!dev_priv->gt.awake);
3478         dev_priv->gt.awake = false;
3479         epoch = dev_priv->gt.epoch;
3480         GEM_BUG_ON(epoch == I915_EPOCH_INVALID);
3481         rearm_hangcheck = false;
3482
3483         if (INTEL_GEN(dev_priv) >= 6)
3484                 gen6_rps_idle(dev_priv);
3485
3486         intel_display_power_put(dev_priv, POWER_DOMAIN_GT_IRQ);
3487
3488         intel_runtime_pm_put(dev_priv);
3489 out_unlock:
3490         mutex_unlock(&dev_priv->drm.struct_mutex);
3491
3492 out_rearm:
3493         if (rearm_hangcheck) {
3494                 GEM_BUG_ON(!dev_priv->gt.awake);
3495                 i915_queue_hangcheck(dev_priv);
3496         }
3497
3498         /*
3499          * When we are idle, it is an opportune time to reap our caches.
3500          * However, we have many objects that utilise RCU and the ordered
3501          * i915->wq that this work is executing on. To try and flush any
3502          * pending frees now we are idle, we first wait for an RCU grace
3503          * period, and then queue a task (that will run last on the wq) to
3504          * shrink and re-optimize the caches.
3505          */
3506         if (same_epoch(dev_priv, epoch)) {
3507                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3508                 if (s) {
3509                         s->i915 = dev_priv;
3510                         s->epoch = epoch;
3511                         call_rcu(&s->rcu, __sleep_rcu);
3512                 }
3513         }
3514 }
3515
3516 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3517 {
3518         struct drm_i915_private *i915 = to_i915(gem->dev);
3519         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3520         struct drm_i915_file_private *fpriv = file->driver_priv;
3521         struct i915_lut_handle *lut, *ln;
3522
3523         mutex_lock(&i915->drm.struct_mutex);
3524
3525         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3526                 struct i915_gem_context *ctx = lut->ctx;
3527                 struct i915_vma *vma;
3528
3529                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3530                 if (ctx->file_priv != fpriv)
3531                         continue;
3532
3533                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3534                 GEM_BUG_ON(vma->obj != obj);
3535
3536                 /* We allow the process to have multiple handles to the same
3537                  * vma, in the same fd namespace, by virtue of flink/open.
3538                  */
3539                 GEM_BUG_ON(!vma->open_count);
3540                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3541                         i915_vma_close(vma);
3542
3543                 list_del(&lut->obj_link);
3544                 list_del(&lut->ctx_link);
3545
3546                 kmem_cache_free(i915->luts, lut);
3547                 __i915_gem_object_release_unless_active(obj);
3548         }
3549
3550         mutex_unlock(&i915->drm.struct_mutex);
3551 }
3552
3553 static unsigned long to_wait_timeout(s64 timeout_ns)
3554 {
3555         if (timeout_ns < 0)
3556                 return MAX_SCHEDULE_TIMEOUT;
3557
3558         if (timeout_ns == 0)
3559                 return 0;
3560
3561         return nsecs_to_jiffies_timeout(timeout_ns);
3562 }
3563
3564 /**
3565  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3566  * @dev: drm device pointer
3567  * @data: ioctl data blob
3568  * @file: drm file pointer
3569  *
3570  * Returns 0 if successful, else an error is returned with the remaining time in
3571  * the timeout parameter.
3572  *  -ETIME: object is still busy after timeout
3573  *  -ERESTARTSYS: signal interrupted the wait
3574  *  -ENONENT: object doesn't exist
3575  * Also possible, but rare:
3576  *  -EAGAIN: incomplete, restart syscall
3577  *  -ENOMEM: damn
3578  *  -ENODEV: Internal IRQ fail
3579  *  -E?: The add request failed
3580  *
3581  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3582  * non-zero timeout parameter the wait ioctl will wait for the given number of
3583  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3584  * without holding struct_mutex the object may become re-busied before this
3585  * function completes. A similar but shorter * race condition exists in the busy
3586  * ioctl
3587  */
3588 int
3589 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3590 {
3591         struct drm_i915_gem_wait *args = data;
3592         struct drm_i915_gem_object *obj;
3593         ktime_t start;
3594         long ret;
3595
3596         if (args->flags != 0)
3597                 return -EINVAL;
3598
3599         obj = i915_gem_object_lookup(file, args->bo_handle);
3600         if (!obj)
3601                 return -ENOENT;
3602
3603         start = ktime_get();
3604
3605         ret = i915_gem_object_wait(obj,
3606                                    I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
3607                                    to_wait_timeout(args->timeout_ns),
3608                                    to_rps_client(file));
3609
3610         if (args->timeout_ns > 0) {
3611                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3612                 if (args->timeout_ns < 0)
3613                         args->timeout_ns = 0;
3614
3615                 /*
3616                  * Apparently ktime isn't accurate enough and occasionally has a
3617                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3618                  * things up to make the test happy. We allow up to 1 jiffy.
3619                  *
3620                  * This is a regression from the timespec->ktime conversion.
3621                  */
3622                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3623                         args->timeout_ns = 0;
3624
3625                 /* Asked to wait beyond the jiffie/scheduler precision? */
3626                 if (ret == -ETIME && args->timeout_ns)
3627                         ret = -EAGAIN;
3628         }
3629
3630         i915_gem_object_put(obj);
3631         return ret;
3632 }
3633
3634 static int wait_for_timeline(struct i915_gem_timeline *tl, unsigned int flags)
3635 {
3636         int ret, i;
3637
3638         for (i = 0; i < ARRAY_SIZE(tl->engine); i++) {
3639                 ret = i915_gem_active_wait(&tl->engine[i].last_request, flags);
3640                 if (ret)
3641                         return ret;
3642         }
3643
3644         return 0;
3645 }
3646
3647 static int wait_for_engines(struct drm_i915_private *i915)
3648 {
3649         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3650                 dev_err(i915->drm.dev,
3651                         "Failed to idle engines, declaring wedged!\n");
3652                 if (drm_debug & DRM_UT_DRIVER) {
3653                         struct drm_printer p = drm_debug_printer(__func__);
3654                         struct intel_engine_cs *engine;
3655                         enum intel_engine_id id;
3656
3657                         for_each_engine(engine, i915, id)
3658                                 intel_engine_dump(engine, &p,
3659                                                   "%s\n", engine->name);
3660                 }
3661
3662                 i915_gem_set_wedged(i915);
3663                 return -EIO;
3664         }
3665
3666         return 0;
3667 }
3668
3669 int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
3670 {
3671         int ret;
3672
3673         /* If the device is asleep, we have no requests outstanding */
3674         if (!READ_ONCE(i915->gt.awake))
3675                 return 0;
3676
3677         if (flags & I915_WAIT_LOCKED) {
3678                 struct i915_gem_timeline *tl;
3679
3680                 lockdep_assert_held(&i915->drm.struct_mutex);
3681
3682                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3683                         ret = wait_for_timeline(tl, flags);
3684                         if (ret)
3685                                 return ret;
3686                 }
3687                 i915_gem_retire_requests(i915);
3688
3689                 ret = wait_for_engines(i915);
3690         } else {
3691                 ret = wait_for_timeline(&i915->gt.global_timeline, flags);
3692         }
3693
3694         return ret;
3695 }
3696
3697 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3698 {
3699         /*
3700          * We manually flush the CPU domain so that we can override and
3701          * force the flush for the display, and perform it asyncrhonously.
3702          */
3703         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3704         if (obj->cache_dirty)
3705                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3706         obj->write_domain = 0;
3707 }
3708
3709 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3710 {
3711         if (!READ_ONCE(obj->pin_global))
3712                 return;
3713
3714         mutex_lock(&obj->base.dev->struct_mutex);
3715         __i915_gem_object_flush_for_display(obj);
3716         mutex_unlock(&obj->base.dev->struct_mutex);
3717 }
3718
3719 /**
3720  * Moves a single object to the WC read, and possibly write domain.
3721  * @obj: object to act on
3722  * @write: ask for write access or read only
3723  *
3724  * This function returns when the move is complete, including waiting on
3725  * flushes to occur.
3726  */
3727 int
3728 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3729 {
3730         int ret;
3731
3732         lockdep_assert_held(&obj->base.dev->struct_mutex);
3733
3734         ret = i915_gem_object_wait(obj,
3735                                    I915_WAIT_INTERRUPTIBLE |
3736                                    I915_WAIT_LOCKED |
3737                                    (write ? I915_WAIT_ALL : 0),
3738                                    MAX_SCHEDULE_TIMEOUT,
3739                                    NULL);
3740         if (ret)
3741                 return ret;
3742
3743         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3744                 return 0;
3745
3746         /* Flush and acquire obj->pages so that we are coherent through
3747          * direct access in memory with previous cached writes through
3748          * shmemfs and that our cache domain tracking remains valid.
3749          * For example, if the obj->filp was moved to swap without us
3750          * being notified and releasing the pages, we would mistakenly
3751          * continue to assume that the obj remained out of the CPU cached
3752          * domain.
3753          */
3754         ret = i915_gem_object_pin_pages(obj);
3755         if (ret)
3756                 return ret;
3757
3758         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3759
3760         /* Serialise direct access to this object with the barriers for
3761          * coherent writes from the GPU, by effectively invalidating the
3762          * WC domain upon first access.
3763          */
3764         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3765                 mb();
3766
3767         /* It should now be out of any other write domains, and we can update
3768          * the domain values for our changes.
3769          */
3770         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3771         obj->read_domains |= I915_GEM_DOMAIN_WC;
3772         if (write) {
3773                 obj->read_domains = I915_GEM_DOMAIN_WC;
3774                 obj->write_domain = I915_GEM_DOMAIN_WC;
3775                 obj->mm.dirty = true;
3776         }
3777
3778         i915_gem_object_unpin_pages(obj);
3779         return 0;
3780 }
3781
3782 /**
3783  * Moves a single object to the GTT read, and possibly write domain.
3784  * @obj: object to act on
3785  * @write: ask for write access or read only
3786  *
3787  * This function returns when the move is complete, including waiting on
3788  * flushes to occur.
3789  */
3790 int
3791 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3792 {
3793         int ret;
3794
3795         lockdep_assert_held(&obj->base.dev->struct_mutex);
3796
3797         ret = i915_gem_object_wait(obj,
3798                                    I915_WAIT_INTERRUPTIBLE |
3799                                    I915_WAIT_LOCKED |
3800                                    (write ? I915_WAIT_ALL : 0),
3801                                    MAX_SCHEDULE_TIMEOUT,
3802                                    NULL);
3803         if (ret)
3804                 return ret;
3805
3806         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3807                 return 0;
3808
3809         /* Flush and acquire obj->pages so that we are coherent through
3810          * direct access in memory with previous cached writes through
3811          * shmemfs and that our cache domain tracking remains valid.
3812          * For example, if the obj->filp was moved to swap without us
3813          * being notified and releasing the pages, we would mistakenly
3814          * continue to assume that the obj remained out of the CPU cached
3815          * domain.
3816          */
3817         ret = i915_gem_object_pin_pages(obj);
3818         if (ret)
3819                 return ret;
3820
3821         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3822
3823         /* Serialise direct access to this object with the barriers for
3824          * coherent writes from the GPU, by effectively invalidating the
3825          * GTT domain upon first access.
3826          */
3827         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3828                 mb();
3829
3830         /* It should now be out of any other write domains, and we can update
3831          * the domain values for our changes.
3832          */
3833         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3834         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3835         if (write) {
3836                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3837                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3838                 obj->mm.dirty = true;
3839         }
3840
3841         i915_gem_object_unpin_pages(obj);
3842         return 0;
3843 }
3844
3845 /**
3846  * Changes the cache-level of an object across all VMA.
3847  * @obj: object to act on
3848  * @cache_level: new cache level to set for the object
3849  *
3850  * After this function returns, the object will be in the new cache-level
3851  * across all GTT and the contents of the backing storage will be coherent,
3852  * with respect to the new cache-level. In order to keep the backing storage
3853  * coherent for all users, we only allow a single cache level to be set
3854  * globally on the object and prevent it from being changed whilst the
3855  * hardware is reading from the object. That is if the object is currently
3856  * on the scanout it will be set to uncached (or equivalent display
3857  * cache coherency) and all non-MOCS GPU access will also be uncached so
3858  * that all direct access to the scanout remains coherent.
3859  */
3860 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3861                                     enum i915_cache_level cache_level)
3862 {
3863         struct i915_vma *vma;
3864         int ret;
3865
3866         lockdep_assert_held(&obj->base.dev->struct_mutex);
3867
3868         if (obj->cache_level == cache_level)
3869                 return 0;
3870
3871         /* Inspect the list of currently bound VMA and unbind any that would
3872          * be invalid given the new cache-level. This is principally to
3873          * catch the issue of the CS prefetch crossing page boundaries and
3874          * reading an invalid PTE on older architectures.
3875          */
3876 restart:
3877         list_for_each_entry(vma, &obj->vma_list, obj_link) {
3878                 if (!drm_mm_node_allocated(&vma->node))
3879                         continue;
3880
3881                 if (i915_vma_is_pinned(vma)) {
3882                         DRM_DEBUG("can not change the cache level of pinned objects\n");
3883                         return -EBUSY;
3884                 }
3885
3886                 if (!i915_vma_is_closed(vma) &&
3887                     i915_gem_valid_gtt_space(vma, cache_level))
3888                         continue;
3889
3890                 ret = i915_vma_unbind(vma);
3891                 if (ret)
3892                         return ret;
3893
3894                 /* As unbinding may affect other elements in the
3895                  * obj->vma_list (due to side-effects from retiring
3896                  * an active vma), play safe and restart the iterator.
3897                  */
3898                 goto restart;
3899         }
3900
3901         /* We can reuse the existing drm_mm nodes but need to change the
3902          * cache-level on the PTE. We could simply unbind them all and
3903          * rebind with the correct cache-level on next use. However since
3904          * we already have a valid slot, dma mapping, pages etc, we may as
3905          * rewrite the PTE in the belief that doing so tramples upon less
3906          * state and so involves less work.
3907          */
3908         if (obj->bind_count) {
3909                 /* Before we change the PTE, the GPU must not be accessing it.
3910                  * If we wait upon the object, we know that all the bound
3911                  * VMA are no longer active.
3912                  */
3913                 ret = i915_gem_object_wait(obj,
3914                                            I915_WAIT_INTERRUPTIBLE |
3915                                            I915_WAIT_LOCKED |
3916                                            I915_WAIT_ALL,
3917                                            MAX_SCHEDULE_TIMEOUT,
3918                                            NULL);
3919                 if (ret)
3920                         return ret;
3921
3922                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
3923                     cache_level != I915_CACHE_NONE) {
3924                         /* Access to snoopable pages through the GTT is
3925                          * incoherent and on some machines causes a hard
3926                          * lockup. Relinquish the CPU mmaping to force
3927                          * userspace to refault in the pages and we can
3928                          * then double check if the GTT mapping is still
3929                          * valid for that pointer access.
3930                          */
3931                         i915_gem_release_mmap(obj);
3932
3933                         /* As we no longer need a fence for GTT access,
3934                          * we can relinquish it now (and so prevent having
3935                          * to steal a fence from someone else on the next
3936                          * fence request). Note GPU activity would have
3937                          * dropped the fence as all snoopable access is
3938                          * supposed to be linear.
3939                          */
3940                         for_each_ggtt_vma(vma, obj) {
3941                                 ret = i915_vma_put_fence(vma);
3942                                 if (ret)
3943                                         return ret;
3944                         }
3945                 } else {
3946                         /* We either have incoherent backing store and
3947                          * so no GTT access or the architecture is fully
3948                          * coherent. In such cases, existing GTT mmaps
3949                          * ignore the cache bit in the PTE and we can
3950                          * rewrite it without confusing the GPU or having
3951                          * to force userspace to fault back in its mmaps.
3952                          */
3953                 }
3954
3955                 list_for_each_entry(vma, &obj->vma_list, obj_link) {
3956                         if (!drm_mm_node_allocated(&vma->node))
3957                                 continue;
3958
3959                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
3960                         if (ret)
3961                                 return ret;
3962                 }
3963         }
3964
3965         list_for_each_entry(vma, &obj->vma_list, obj_link)
3966                 vma->node.color = cache_level;
3967         i915_gem_object_set_cache_coherency(obj, cache_level);
3968         obj->cache_dirty = true; /* Always invalidate stale cachelines */
3969
3970         return 0;
3971 }
3972
3973 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
3974                                struct drm_file *file)
3975 {
3976         struct drm_i915_gem_caching *args = data;
3977         struct drm_i915_gem_object *obj;
3978         int err = 0;
3979
3980         rcu_read_lock();
3981         obj = i915_gem_object_lookup_rcu(file, args->handle);
3982         if (!obj) {
3983                 err = -ENOENT;
3984                 goto out;
3985         }
3986
3987         switch (obj->cache_level) {
3988         case I915_CACHE_LLC:
3989         case I915_CACHE_L3_LLC:
3990                 args->caching = I915_CACHING_CACHED;
3991                 break;
3992
3993         case I915_CACHE_WT:
3994                 args->caching = I915_CACHING_DISPLAY;
3995                 break;
3996
3997         default:
3998                 args->caching = I915_CACHING_NONE;
3999                 break;
4000         }
4001 out:
4002         rcu_read_unlock();
4003         return err;
4004 }
4005
4006 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4007                                struct drm_file *file)
4008 {
4009         struct drm_i915_private *i915 = to_i915(dev);
4010         struct drm_i915_gem_caching *args = data;
4011         struct drm_i915_gem_object *obj;
4012         enum i915_cache_level level;
4013         int ret = 0;
4014
4015         switch (args->caching) {
4016         case I915_CACHING_NONE:
4017                 level = I915_CACHE_NONE;
4018                 break;
4019         case I915_CACHING_CACHED:
4020                 /*
4021                  * Due to a HW issue on BXT A stepping, GPU stores via a
4022                  * snooped mapping may leave stale data in a corresponding CPU
4023                  * cacheline, whereas normally such cachelines would get
4024                  * invalidated.
4025                  */
4026                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4027                         return -ENODEV;
4028
4029                 level = I915_CACHE_LLC;
4030                 break;
4031         case I915_CACHING_DISPLAY:
4032                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4033                 break;
4034         default:
4035                 return -EINVAL;
4036         }
4037
4038         obj = i915_gem_object_lookup(file, args->handle);
4039         if (!obj)
4040                 return -ENOENT;
4041
4042         /*
4043          * The caching mode of proxy object is handled by its generator, and
4044          * not allowed to be changed by userspace.
4045          */
4046         if (i915_gem_object_is_proxy(obj)) {
4047                 ret = -ENXIO;
4048                 goto out;
4049         }
4050
4051         if (obj->cache_level == level)
4052                 goto out;
4053
4054         ret = i915_gem_object_wait(obj,
4055                                    I915_WAIT_INTERRUPTIBLE,
4056                                    MAX_SCHEDULE_TIMEOUT,
4057                                    to_rps_client(file));
4058         if (ret)
4059                 goto out;
4060
4061         ret = i915_mutex_lock_interruptible(dev);
4062         if (ret)
4063                 goto out;
4064
4065         ret = i915_gem_object_set_cache_level(obj, level);
4066         mutex_unlock(&dev->struct_mutex);
4067
4068 out:
4069         i915_gem_object_put(obj);
4070         return ret;
4071 }
4072
4073 /*
4074  * Prepare buffer for display plane (scanout, cursors, etc).
4075  * Can be called from an uninterruptible phase (modesetting) and allows
4076  * any flushes to be pipelined (for pageflips).
4077  */
4078 struct i915_vma *
4079 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4080                                      u32 alignment,
4081                                      const struct i915_ggtt_view *view,
4082                                      unsigned int flags)
4083 {
4084         struct i915_vma *vma;
4085         int ret;
4086
4087         lockdep_assert_held(&obj->base.dev->struct_mutex);
4088
4089         /* Mark the global pin early so that we account for the
4090          * display coherency whilst setting up the cache domains.
4091          */
4092         obj->pin_global++;
4093
4094         /* The display engine is not coherent with the LLC cache on gen6.  As
4095          * a result, we make sure that the pinning that is about to occur is
4096          * done with uncached PTEs. This is lowest common denominator for all
4097          * chipsets.
4098          *
4099          * However for gen6+, we could do better by using the GFDT bit instead
4100          * of uncaching, which would allow us to flush all the LLC-cached data
4101          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4102          */
4103         ret = i915_gem_object_set_cache_level(obj,
4104                                               HAS_WT(to_i915(obj->base.dev)) ?
4105                                               I915_CACHE_WT : I915_CACHE_NONE);
4106         if (ret) {
4107                 vma = ERR_PTR(ret);
4108                 goto err_unpin_global;
4109         }
4110
4111         /* As the user may map the buffer once pinned in the display plane
4112          * (e.g. libkms for the bootup splash), we have to ensure that we
4113          * always use map_and_fenceable for all scanout buffers. However,
4114          * it may simply be too big to fit into mappable, in which case
4115          * put it anyway and hope that userspace can cope (but always first
4116          * try to preserve the existing ABI).
4117          */
4118         vma = ERR_PTR(-ENOSPC);
4119         if ((flags & PIN_MAPPABLE) == 0 &&
4120             (!view || view->type == I915_GGTT_VIEW_NORMAL))
4121                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4122                                                flags |
4123                                                PIN_MAPPABLE |
4124                                                PIN_NONBLOCK);
4125         if (IS_ERR(vma))
4126                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4127         if (IS_ERR(vma))
4128                 goto err_unpin_global;
4129
4130         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4131
4132         /* Treat this as an end-of-frame, like intel_user_framebuffer_dirty() */
4133         __i915_gem_object_flush_for_display(obj);
4134         intel_fb_obj_flush(obj, ORIGIN_DIRTYFB);
4135
4136         /* It should now be out of any other write domains, and we can update
4137          * the domain values for our changes.
4138          */
4139         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4140
4141         return vma;
4142
4143 err_unpin_global:
4144         obj->pin_global--;
4145         return vma;
4146 }
4147
4148 void
4149 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4150 {
4151         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4152
4153         if (WARN_ON(vma->obj->pin_global == 0))
4154                 return;
4155
4156         if (--vma->obj->pin_global == 0)
4157                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4158
4159         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4160         i915_gem_object_bump_inactive_ggtt(vma->obj);
4161
4162         i915_vma_unpin(vma);
4163 }
4164
4165 /**
4166  * Moves a single object to the CPU read, and possibly write domain.
4167  * @obj: object to act on
4168  * @write: requesting write or read-only access
4169  *
4170  * This function returns when the move is complete, including waiting on
4171  * flushes to occur.
4172  */
4173 int
4174 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4175 {
4176         int ret;
4177
4178         lockdep_assert_held(&obj->base.dev->struct_mutex);
4179
4180         ret = i915_gem_object_wait(obj,
4181                                    I915_WAIT_INTERRUPTIBLE |
4182                                    I915_WAIT_LOCKED |
4183                                    (write ? I915_WAIT_ALL : 0),
4184                                    MAX_SCHEDULE_TIMEOUT,
4185                                    NULL);
4186         if (ret)
4187                 return ret;
4188
4189         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4190
4191         /* Flush the CPU cache if it's still invalid. */
4192         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4193                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4194                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4195         }
4196
4197         /* It should now be out of any other write domains, and we can update
4198          * the domain values for our changes.
4199          */
4200         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4201
4202         /* If we're writing through the CPU, then the GPU read domains will
4203          * need to be invalidated at next use.
4204          */
4205         if (write)
4206                 __start_cpu_write(obj);
4207
4208         return 0;
4209 }
4210
4211 /* Throttle our rendering by waiting until the ring has completed our requests
4212  * emitted over 20 msec ago.
4213  *
4214  * Note that if we were to use the current jiffies each time around the loop,
4215  * we wouldn't escape the function with any frames outstanding if the time to
4216  * render a frame was over 20ms.
4217  *
4218  * This should get us reasonable parallelism between CPU and GPU but also
4219  * relatively low latency when blocking on a particular request to finish.
4220  */
4221 static int
4222 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4223 {
4224         struct drm_i915_private *dev_priv = to_i915(dev);
4225         struct drm_i915_file_private *file_priv = file->driver_priv;
4226         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4227         struct drm_i915_gem_request *request, *target = NULL;
4228         long ret;
4229
4230         /* ABI: return -EIO if already wedged */
4231         if (i915_terminally_wedged(&dev_priv->gpu_error))
4232                 return -EIO;
4233
4234         spin_lock(&file_priv->mm.lock);
4235         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4236                 if (time_after_eq(request->emitted_jiffies, recent_enough))
4237                         break;
4238
4239                 if (target) {
4240                         list_del(&target->client_link);
4241                         target->file_priv = NULL;
4242                 }
4243
4244                 target = request;
4245         }
4246         if (target)
4247                 i915_gem_request_get(target);
4248         spin_unlock(&file_priv->mm.lock);
4249
4250         if (target == NULL)
4251                 return 0;
4252
4253         ret = i915_wait_request(target,
4254                                 I915_WAIT_INTERRUPTIBLE,
4255                                 MAX_SCHEDULE_TIMEOUT);
4256         i915_gem_request_put(target);
4257
4258         return ret < 0 ? ret : 0;
4259 }
4260
4261 struct i915_vma *
4262 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4263                          const struct i915_ggtt_view *view,
4264                          u64 size,
4265                          u64 alignment,
4266                          u64 flags)
4267 {
4268         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4269         struct i915_address_space *vm = &dev_priv->ggtt.base;
4270         struct i915_vma *vma;
4271         int ret;
4272
4273         lockdep_assert_held(&obj->base.dev->struct_mutex);
4274
4275         if (flags & PIN_MAPPABLE &&
4276             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4277                 /* If the required space is larger than the available
4278                  * aperture, we will not able to find a slot for the
4279                  * object and unbinding the object now will be in
4280                  * vain. Worse, doing so may cause us to ping-pong
4281                  * the object in and out of the Global GTT and
4282                  * waste a lot of cycles under the mutex.
4283                  */
4284                 if (obj->base.size > dev_priv->ggtt.mappable_end)
4285                         return ERR_PTR(-E2BIG);
4286
4287                 /* If NONBLOCK is set the caller is optimistically
4288                  * trying to cache the full object within the mappable
4289                  * aperture, and *must* have a fallback in place for
4290                  * situations where we cannot bind the object. We
4291                  * can be a little more lax here and use the fallback
4292                  * more often to avoid costly migrations of ourselves
4293                  * and other objects within the aperture.
4294                  *
4295                  * Half-the-aperture is used as a simple heuristic.
4296                  * More interesting would to do search for a free
4297                  * block prior to making the commitment to unbind.
4298                  * That caters for the self-harm case, and with a
4299                  * little more heuristics (e.g. NOFAULT, NOEVICT)
4300                  * we could try to minimise harm to others.
4301                  */
4302                 if (flags & PIN_NONBLOCK &&
4303                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
4304                         return ERR_PTR(-ENOSPC);
4305         }
4306
4307         vma = i915_vma_instance(obj, vm, view);
4308         if (unlikely(IS_ERR(vma)))
4309                 return vma;
4310
4311         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4312                 if (flags & PIN_NONBLOCK) {
4313                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4314                                 return ERR_PTR(-ENOSPC);
4315
4316                         if (flags & PIN_MAPPABLE &&
4317                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4318                                 return ERR_PTR(-ENOSPC);
4319                 }
4320
4321                 WARN(i915_vma_is_pinned(vma),
4322                      "bo is already pinned in ggtt with incorrect alignment:"
4323                      " offset=%08x, req.alignment=%llx,"
4324                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4325                      i915_ggtt_offset(vma), alignment,
4326                      !!(flags & PIN_MAPPABLE),
4327                      i915_vma_is_map_and_fenceable(vma));
4328                 ret = i915_vma_unbind(vma);
4329                 if (ret)
4330                         return ERR_PTR(ret);
4331         }
4332
4333         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
4334         if (ret)
4335                 return ERR_PTR(ret);
4336
4337         return vma;
4338 }
4339
4340 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4341 {
4342         /* Note that we could alias engines in the execbuf API, but
4343          * that would be very unwise as it prevents userspace from
4344          * fine control over engine selection. Ahem.
4345          *
4346          * This should be something like EXEC_MAX_ENGINE instead of
4347          * I915_NUM_ENGINES.
4348          */
4349         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4350         return 0x10000 << id;
4351 }
4352
4353 static __always_inline unsigned int __busy_write_id(unsigned int id)
4354 {
4355         /* The uABI guarantees an active writer is also amongst the read
4356          * engines. This would be true if we accessed the activity tracking
4357          * under the lock, but as we perform the lookup of the object and
4358          * its activity locklessly we can not guarantee that the last_write
4359          * being active implies that we have set the same engine flag from
4360          * last_read - hence we always set both read and write busy for
4361          * last_write.
4362          */
4363         return id | __busy_read_flag(id);
4364 }
4365
4366 static __always_inline unsigned int
4367 __busy_set_if_active(const struct dma_fence *fence,
4368                      unsigned int (*flag)(unsigned int id))
4369 {
4370         struct drm_i915_gem_request *rq;
4371
4372         /* We have to check the current hw status of the fence as the uABI
4373          * guarantees forward progress. We could rely on the idle worker
4374          * to eventually flush us, but to minimise latency just ask the
4375          * hardware.
4376          *
4377          * Note we only report on the status of native fences.
4378          */
4379         if (!dma_fence_is_i915(fence))
4380                 return 0;
4381
4382         /* opencode to_request() in order to avoid const warnings */
4383         rq = container_of(fence, struct drm_i915_gem_request, fence);
4384         if (i915_gem_request_completed(rq))
4385                 return 0;
4386
4387         return flag(rq->engine->uabi_id);
4388 }
4389
4390 static __always_inline unsigned int
4391 busy_check_reader(const struct dma_fence *fence)
4392 {
4393         return __busy_set_if_active(fence, __busy_read_flag);
4394 }
4395
4396 static __always_inline unsigned int
4397 busy_check_writer(const struct dma_fence *fence)
4398 {
4399         if (!fence)
4400                 return 0;
4401
4402         return __busy_set_if_active(fence, __busy_write_id);
4403 }
4404
4405 int
4406 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4407                     struct drm_file *file)
4408 {
4409         struct drm_i915_gem_busy *args = data;
4410         struct drm_i915_gem_object *obj;
4411         struct reservation_object_list *list;
4412         unsigned int seq;
4413         int err;
4414
4415         err = -ENOENT;
4416         rcu_read_lock();
4417         obj = i915_gem_object_lookup_rcu(file, args->handle);
4418         if (!obj)
4419                 goto out;
4420
4421         /* A discrepancy here is that we do not report the status of
4422          * non-i915 fences, i.e. even though we may report the object as idle,
4423          * a call to set-domain may still stall waiting for foreign rendering.
4424          * This also means that wait-ioctl may report an object as busy,
4425          * where busy-ioctl considers it idle.
4426          *
4427          * We trade the ability to warn of foreign fences to report on which
4428          * i915 engines are active for the object.
4429          *
4430          * Alternatively, we can trade that extra information on read/write
4431          * activity with
4432          *      args->busy =
4433          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4434          * to report the overall busyness. This is what the wait-ioctl does.
4435          *
4436          */
4437 retry:
4438         seq = raw_read_seqcount(&obj->resv->seq);
4439
4440         /* Translate the exclusive fence to the READ *and* WRITE engine */
4441         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4442
4443         /* Translate shared fences to READ set of engines */
4444         list = rcu_dereference(obj->resv->fence);
4445         if (list) {
4446                 unsigned int shared_count = list->shared_count, i;
4447
4448                 for (i = 0; i < shared_count; ++i) {
4449                         struct dma_fence *fence =
4450                                 rcu_dereference(list->shared[i]);
4451
4452                         args->busy |= busy_check_reader(fence);
4453                 }
4454         }
4455
4456         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4457                 goto retry;
4458
4459         err = 0;
4460 out:
4461         rcu_read_unlock();
4462         return err;
4463 }
4464
4465 int
4466 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4467                         struct drm_file *file_priv)
4468 {
4469         return i915_gem_ring_throttle(dev, file_priv);
4470 }
4471
4472 int
4473 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4474                        struct drm_file *file_priv)
4475 {
4476         struct drm_i915_private *dev_priv = to_i915(dev);
4477         struct drm_i915_gem_madvise *args = data;
4478         struct drm_i915_gem_object *obj;
4479         int err;
4480
4481         switch (args->madv) {
4482         case I915_MADV_DONTNEED:
4483         case I915_MADV_WILLNEED:
4484             break;
4485         default:
4486             return -EINVAL;
4487         }
4488
4489         obj = i915_gem_object_lookup(file_priv, args->handle);
4490         if (!obj)
4491                 return -ENOENT;
4492
4493         err = mutex_lock_interruptible(&obj->mm.lock);
4494         if (err)
4495                 goto out;
4496
4497         if (i915_gem_object_has_pages(obj) &&
4498             i915_gem_object_is_tiled(obj) &&
4499             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4500                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4501                         GEM_BUG_ON(!obj->mm.quirked);
4502                         __i915_gem_object_unpin_pages(obj);
4503                         obj->mm.quirked = false;
4504                 }
4505                 if (args->madv == I915_MADV_WILLNEED) {
4506                         GEM_BUG_ON(obj->mm.quirked);
4507                         __i915_gem_object_pin_pages(obj);
4508                         obj->mm.quirked = true;
4509                 }
4510         }
4511
4512         if (obj->mm.madv != __I915_MADV_PURGED)
4513                 obj->mm.madv = args->madv;
4514
4515         /* if the object is no longer attached, discard its backing storage */
4516         if (obj->mm.madv == I915_MADV_DONTNEED &&
4517             !i915_gem_object_has_pages(obj))
4518                 i915_gem_object_truncate(obj);
4519
4520         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4521         mutex_unlock(&obj->mm.lock);
4522
4523 out:
4524         i915_gem_object_put(obj);
4525         return err;
4526 }
4527
4528 static void
4529 frontbuffer_retire(struct i915_gem_active *active,
4530                    struct drm_i915_gem_request *request)
4531 {
4532         struct drm_i915_gem_object *obj =
4533                 container_of(active, typeof(*obj), frontbuffer_write);
4534
4535         intel_fb_obj_flush(obj, ORIGIN_CS);
4536 }
4537
4538 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4539                           const struct drm_i915_gem_object_ops *ops)
4540 {
4541         mutex_init(&obj->mm.lock);
4542
4543         INIT_LIST_HEAD(&obj->vma_list);
4544         INIT_LIST_HEAD(&obj->lut_list);
4545         INIT_LIST_HEAD(&obj->batch_pool_link);
4546
4547         obj->ops = ops;
4548
4549         reservation_object_init(&obj->__builtin_resv);
4550         obj->resv = &obj->__builtin_resv;
4551
4552         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4553         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4554
4555         obj->mm.madv = I915_MADV_WILLNEED;
4556         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4557         mutex_init(&obj->mm.get_page.lock);
4558
4559         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4560 }
4561
4562 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4563         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4564                  I915_GEM_OBJECT_IS_SHRINKABLE,
4565
4566         .get_pages = i915_gem_object_get_pages_gtt,
4567         .put_pages = i915_gem_object_put_pages_gtt,
4568
4569         .pwrite = i915_gem_object_pwrite_gtt,
4570 };
4571
4572 static int i915_gem_object_create_shmem(struct drm_device *dev,
4573                                         struct drm_gem_object *obj,
4574                                         size_t size)
4575 {
4576         struct drm_i915_private *i915 = to_i915(dev);
4577         unsigned long flags = VM_NORESERVE;
4578         struct file *filp;
4579
4580         drm_gem_private_object_init(dev, obj, size);
4581
4582         if (i915->mm.gemfs)
4583                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4584                                                  flags);
4585         else
4586                 filp = shmem_file_setup("i915", size, flags);
4587
4588         if (IS_ERR(filp))
4589                 return PTR_ERR(filp);
4590
4591         obj->filp = filp;
4592
4593         return 0;
4594 }
4595
4596 struct drm_i915_gem_object *
4597 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4598 {
4599         struct drm_i915_gem_object *obj;
4600         struct address_space *mapping;
4601         unsigned int cache_level;
4602         gfp_t mask;
4603         int ret;
4604
4605         /* There is a prevalence of the assumption that we fit the object's
4606          * page count inside a 32bit _signed_ variable. Let's document this and
4607          * catch if we ever need to fix it. In the meantime, if you do spot
4608          * such a local variable, please consider fixing!
4609          */
4610         if (size >> PAGE_SHIFT > INT_MAX)
4611                 return ERR_PTR(-E2BIG);
4612
4613         if (overflows_type(size, obj->base.size))
4614                 return ERR_PTR(-E2BIG);
4615
4616         obj = i915_gem_object_alloc(dev_priv);
4617         if (obj == NULL)
4618                 return ERR_PTR(-ENOMEM);
4619
4620         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4621         if (ret)
4622                 goto fail;
4623
4624         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4625         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4626                 /* 965gm cannot relocate objects above 4GiB. */
4627                 mask &= ~__GFP_HIGHMEM;
4628                 mask |= __GFP_DMA32;
4629         }
4630
4631         mapping = obj->base.filp->f_mapping;
4632         mapping_set_gfp_mask(mapping, mask);
4633         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4634
4635         i915_gem_object_init(obj, &i915_gem_object_ops);
4636
4637         obj->write_domain = I915_GEM_DOMAIN_CPU;
4638         obj->read_domains = I915_GEM_DOMAIN_CPU;
4639
4640         if (HAS_LLC(dev_priv))
4641                 /* On some devices, we can have the GPU use the LLC (the CPU
4642                  * cache) for about a 10% performance improvement
4643                  * compared to uncached.  Graphics requests other than
4644                  * display scanout are coherent with the CPU in
4645                  * accessing this cache.  This means in this mode we
4646                  * don't need to clflush on the CPU side, and on the
4647                  * GPU side we only need to flush internal caches to
4648                  * get data visible to the CPU.
4649                  *
4650                  * However, we maintain the display planes as UC, and so
4651                  * need to rebind when first used as such.
4652                  */
4653                 cache_level = I915_CACHE_LLC;
4654         else
4655                 cache_level = I915_CACHE_NONE;
4656
4657         i915_gem_object_set_cache_coherency(obj, cache_level);
4658
4659         trace_i915_gem_object_create(obj);
4660
4661         return obj;
4662
4663 fail:
4664         i915_gem_object_free(obj);
4665         return ERR_PTR(ret);
4666 }
4667
4668 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4669 {
4670         /* If we are the last user of the backing storage (be it shmemfs
4671          * pages or stolen etc), we know that the pages are going to be
4672          * immediately released. In this case, we can then skip copying
4673          * back the contents from the GPU.
4674          */
4675
4676         if (obj->mm.madv != I915_MADV_WILLNEED)
4677                 return false;
4678
4679         if (obj->base.filp == NULL)
4680                 return true;
4681
4682         /* At first glance, this looks racy, but then again so would be
4683          * userspace racing mmap against close. However, the first external
4684          * reference to the filp can only be obtained through the
4685          * i915_gem_mmap_ioctl() which safeguards us against the user
4686          * acquiring such a reference whilst we are in the middle of
4687          * freeing the object.
4688          */
4689         return atomic_long_read(&obj->base.filp->f_count) == 1;
4690 }
4691
4692 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4693                                     struct llist_node *freed)
4694 {
4695         struct drm_i915_gem_object *obj, *on;
4696
4697         intel_runtime_pm_get(i915);
4698         llist_for_each_entry_safe(obj, on, freed, freed) {
4699                 struct i915_vma *vma, *vn;
4700
4701                 trace_i915_gem_object_destroy(obj);
4702
4703                 mutex_lock(&i915->drm.struct_mutex);
4704
4705                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4706                 list_for_each_entry_safe(vma, vn,
4707                                          &obj->vma_list, obj_link) {
4708                         GEM_BUG_ON(i915_vma_is_active(vma));
4709                         vma->flags &= ~I915_VMA_PIN_MASK;
4710                         i915_vma_close(vma);
4711                 }
4712                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4713                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4714
4715                 /* This serializes freeing with the shrinker. Since the free
4716                  * is delayed, first by RCU then by the workqueue, we want the
4717                  * shrinker to be able to free pages of unreferenced objects,
4718                  * or else we may oom whilst there are plenty of deferred
4719                  * freed objects.
4720                  */
4721                 if (i915_gem_object_has_pages(obj)) {
4722                         spin_lock(&i915->mm.obj_lock);
4723                         list_del_init(&obj->mm.link);
4724                         spin_unlock(&i915->mm.obj_lock);
4725                 }
4726
4727                 mutex_unlock(&i915->drm.struct_mutex);
4728
4729                 GEM_BUG_ON(obj->bind_count);
4730                 GEM_BUG_ON(obj->userfault_count);
4731                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4732                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4733
4734                 if (obj->ops->release)
4735                         obj->ops->release(obj);
4736
4737                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4738                         atomic_set(&obj->mm.pages_pin_count, 0);
4739                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4740                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4741
4742                 if (obj->base.import_attach)
4743                         drm_prime_gem_destroy(&obj->base, NULL);
4744
4745                 reservation_object_fini(&obj->__builtin_resv);
4746                 drm_gem_object_release(&obj->base);
4747                 i915_gem_info_remove_obj(i915, obj->base.size);
4748
4749                 kfree(obj->bit_17);
4750                 i915_gem_object_free(obj);
4751
4752                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4753                 atomic_dec(&i915->mm.free_count);
4754
4755                 if (on)
4756                         cond_resched();
4757         }
4758         intel_runtime_pm_put(i915);
4759 }
4760
4761 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4762 {
4763         struct llist_node *freed;
4764
4765         /* Free the oldest, most stale object to keep the free_list short */
4766         freed = NULL;
4767         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4768                 /* Only one consumer of llist_del_first() allowed */
4769                 spin_lock(&i915->mm.free_lock);
4770                 freed = llist_del_first(&i915->mm.free_list);
4771                 spin_unlock(&i915->mm.free_lock);
4772         }
4773         if (unlikely(freed)) {
4774                 freed->next = NULL;
4775                 __i915_gem_free_objects(i915, freed);
4776         }
4777 }
4778
4779 static void __i915_gem_free_work(struct work_struct *work)
4780 {
4781         struct drm_i915_private *i915 =
4782                 container_of(work, struct drm_i915_private, mm.free_work);
4783         struct llist_node *freed;
4784
4785         /*
4786          * All file-owned VMA should have been released by this point through
4787          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4788          * However, the object may also be bound into the global GTT (e.g.
4789          * older GPUs without per-process support, or for direct access through
4790          * the GTT either for the user or for scanout). Those VMA still need to
4791          * unbound now.
4792          */
4793
4794         spin_lock(&i915->mm.free_lock);
4795         while ((freed = llist_del_all(&i915->mm.free_list))) {
4796                 spin_unlock(&i915->mm.free_lock);
4797
4798                 __i915_gem_free_objects(i915, freed);
4799                 if (need_resched())
4800                         return;
4801
4802                 spin_lock(&i915->mm.free_lock);
4803         }
4804         spin_unlock(&i915->mm.free_lock);
4805 }
4806
4807 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4808 {
4809         struct drm_i915_gem_object *obj =
4810                 container_of(head, typeof(*obj), rcu);
4811         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4812
4813         /*
4814          * Since we require blocking on struct_mutex to unbind the freed
4815          * object from the GPU before releasing resources back to the
4816          * system, we can not do that directly from the RCU callback (which may
4817          * be a softirq context), but must instead then defer that work onto a
4818          * kthread. We use the RCU callback rather than move the freed object
4819          * directly onto the work queue so that we can mix between using the
4820          * worker and performing frees directly from subsequent allocations for
4821          * crude but effective memory throttling.
4822          */
4823         if (llist_add(&obj->freed, &i915->mm.free_list))
4824                 queue_work(i915->wq, &i915->mm.free_work);
4825 }
4826
4827 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4828 {
4829         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4830
4831         if (obj->mm.quirked)
4832                 __i915_gem_object_unpin_pages(obj);
4833
4834         if (discard_backing_storage(obj))
4835                 obj->mm.madv = I915_MADV_DONTNEED;
4836
4837         /*
4838          * Before we free the object, make sure any pure RCU-only
4839          * read-side critical sections are complete, e.g.
4840          * i915_gem_busy_ioctl(). For the corresponding synchronized
4841          * lookup see i915_gem_object_lookup_rcu().
4842          */
4843         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4844         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4845 }
4846
4847 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4848 {
4849         lockdep_assert_held(&obj->base.dev->struct_mutex);
4850
4851         if (!i915_gem_object_has_active_reference(obj) &&
4852             i915_gem_object_is_active(obj))
4853                 i915_gem_object_set_active_reference(obj);
4854         else
4855                 i915_gem_object_put(obj);
4856 }
4857
4858 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
4859 {
4860         struct i915_gem_context *kernel_context = i915->kernel_context;
4861         struct intel_engine_cs *engine;
4862         enum intel_engine_id id;
4863
4864         for_each_engine(engine, i915, id) {
4865                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline->last_request));
4866                 GEM_BUG_ON(engine->last_retired_context != kernel_context);
4867         }
4868 }
4869
4870 void i915_gem_sanitize(struct drm_i915_private *i915)
4871 {
4872         if (i915_terminally_wedged(&i915->gpu_error)) {
4873                 mutex_lock(&i915->drm.struct_mutex);
4874                 i915_gem_unset_wedged(i915);
4875                 mutex_unlock(&i915->drm.struct_mutex);
4876         }
4877
4878         /*
4879          * If we inherit context state from the BIOS or earlier occupants
4880          * of the GPU, the GPU may be in an inconsistent state when we
4881          * try to take over. The only way to remove the earlier state
4882          * is by resetting. However, resetting on earlier gen is tricky as
4883          * it may impact the display and we are uncertain about the stability
4884          * of the reset, so this could be applied to even earlier gen.
4885          */
4886         if (INTEL_GEN(i915) >= 5 && intel_has_gpu_reset(i915))
4887                 WARN_ON(intel_gpu_reset(i915, ALL_ENGINES));
4888 }
4889
4890 int i915_gem_suspend(struct drm_i915_private *dev_priv)
4891 {
4892         struct drm_device *dev = &dev_priv->drm;
4893         int ret;
4894
4895         intel_runtime_pm_get(dev_priv);
4896         intel_suspend_gt_powersave(dev_priv);
4897
4898         mutex_lock(&dev->struct_mutex);
4899
4900         /* We have to flush all the executing contexts to main memory so
4901          * that they can saved in the hibernation image. To ensure the last
4902          * context image is coherent, we have to switch away from it. That
4903          * leaves the dev_priv->kernel_context still active when
4904          * we actually suspend, and its image in memory may not match the GPU
4905          * state. Fortunately, the kernel_context is disposable and we do
4906          * not rely on its state.
4907          */
4908         if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
4909                 ret = i915_gem_switch_to_kernel_context(dev_priv);
4910                 if (ret)
4911                         goto err_unlock;
4912
4913                 ret = i915_gem_wait_for_idle(dev_priv,
4914                                              I915_WAIT_INTERRUPTIBLE |
4915                                              I915_WAIT_LOCKED);
4916                 if (ret && ret != -EIO)
4917                         goto err_unlock;
4918
4919                 assert_kernel_context_is_current(dev_priv);
4920         }
4921         i915_gem_contexts_lost(dev_priv);
4922         mutex_unlock(&dev->struct_mutex);
4923
4924         intel_guc_suspend(dev_priv);
4925
4926         cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
4927         cancel_delayed_work_sync(&dev_priv->gt.retire_work);
4928
4929         /* As the idle_work is rearming if it detects a race, play safe and
4930          * repeat the flush until it is definitely idle.
4931          */
4932         drain_delayed_work(&dev_priv->gt.idle_work);
4933
4934         /* Assert that we sucessfully flushed all the work and
4935          * reset the GPU back to its idle, low power state.
4936          */
4937         WARN_ON(dev_priv->gt.awake);
4938         if (WARN_ON(!intel_engines_are_idle(dev_priv)))
4939                 i915_gem_set_wedged(dev_priv); /* no hope, discard everything */
4940
4941         /*
4942          * Neither the BIOS, ourselves or any other kernel
4943          * expects the system to be in execlists mode on startup,
4944          * so we need to reset the GPU back to legacy mode. And the only
4945          * known way to disable logical contexts is through a GPU reset.
4946          *
4947          * So in order to leave the system in a known default configuration,
4948          * always reset the GPU upon unload and suspend. Afterwards we then
4949          * clean up the GEM state tracking, flushing off the requests and
4950          * leaving the system in a known idle state.
4951          *
4952          * Note that is of the upmost importance that the GPU is idle and
4953          * all stray writes are flushed *before* we dismantle the backing
4954          * storage for the pinned objects.
4955          *
4956          * However, since we are uncertain that resetting the GPU on older
4957          * machines is a good idea, we don't - just in case it leaves the
4958          * machine in an unusable condition.
4959          */
4960         i915_gem_sanitize(dev_priv);
4961
4962         intel_runtime_pm_put(dev_priv);
4963         return 0;
4964
4965 err_unlock:
4966         mutex_unlock(&dev->struct_mutex);
4967         intel_runtime_pm_put(dev_priv);
4968         return ret;
4969 }
4970
4971 void i915_gem_resume(struct drm_i915_private *i915)
4972 {
4973         WARN_ON(i915->gt.awake);
4974
4975         mutex_lock(&i915->drm.struct_mutex);
4976         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
4977
4978         i915_gem_restore_gtt_mappings(i915);
4979         i915_gem_restore_fences(i915);
4980
4981         /*
4982          * As we didn't flush the kernel context before suspend, we cannot
4983          * guarantee that the context image is complete. So let's just reset
4984          * it and start again.
4985          */
4986         i915->gt.resume(i915);
4987
4988         if (i915_gem_init_hw(i915))
4989                 goto err_wedged;
4990
4991         intel_guc_resume(i915);
4992
4993         /* Always reload a context for powersaving. */
4994         if (i915_gem_switch_to_kernel_context(i915))
4995                 goto err_wedged;
4996
4997 out_unlock:
4998         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
4999         mutex_unlock(&i915->drm.struct_mutex);
5000         return;
5001
5002 err_wedged:
5003         if (!i915_terminally_wedged(&i915->gpu_error)) {
5004                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5005                 i915_gem_set_wedged(i915);
5006         }
5007         goto out_unlock;
5008 }
5009
5010 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5011 {
5012         if (INTEL_GEN(dev_priv) < 5 ||
5013             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5014                 return;
5015
5016         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5017                                  DISP_TILE_SURFACE_SWIZZLING);
5018
5019         if (IS_GEN5(dev_priv))
5020                 return;
5021
5022         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5023         if (IS_GEN6(dev_priv))
5024                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5025         else if (IS_GEN7(dev_priv))
5026                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5027         else if (IS_GEN8(dev_priv))
5028                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5029         else
5030                 BUG();
5031 }
5032
5033 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5034 {
5035         I915_WRITE(RING_CTL(base), 0);
5036         I915_WRITE(RING_HEAD(base), 0);
5037         I915_WRITE(RING_TAIL(base), 0);
5038         I915_WRITE(RING_START(base), 0);
5039 }
5040
5041 static void init_unused_rings(struct drm_i915_private *dev_priv)
5042 {
5043         if (IS_I830(dev_priv)) {
5044                 init_unused_ring(dev_priv, PRB1_BASE);
5045                 init_unused_ring(dev_priv, SRB0_BASE);
5046                 init_unused_ring(dev_priv, SRB1_BASE);
5047                 init_unused_ring(dev_priv, SRB2_BASE);
5048                 init_unused_ring(dev_priv, SRB3_BASE);
5049         } else if (IS_GEN2(dev_priv)) {
5050                 init_unused_ring(dev_priv, SRB0_BASE);
5051                 init_unused_ring(dev_priv, SRB1_BASE);
5052         } else if (IS_GEN3(dev_priv)) {
5053                 init_unused_ring(dev_priv, PRB1_BASE);
5054                 init_unused_ring(dev_priv, PRB2_BASE);
5055         }
5056 }
5057
5058 static int __i915_gem_restart_engines(void *data)
5059 {
5060         struct drm_i915_private *i915 = data;
5061         struct intel_engine_cs *engine;
5062         enum intel_engine_id id;
5063         int err;
5064
5065         for_each_engine(engine, i915, id) {
5066                 err = engine->init_hw(engine);
5067                 if (err) {
5068                         DRM_ERROR("Failed to restart %s (%d)\n",
5069                                   engine->name, err);
5070                         return err;
5071                 }
5072         }
5073
5074         return 0;
5075 }
5076
5077 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5078 {
5079         int ret;
5080
5081         dev_priv->gt.last_init_time = ktime_get();
5082
5083         /* Double layer security blanket, see i915_gem_init() */
5084         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5085
5086         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5087                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5088
5089         if (IS_HASWELL(dev_priv))
5090                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5091                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5092
5093         if (HAS_PCH_NOP(dev_priv)) {
5094                 if (IS_IVYBRIDGE(dev_priv)) {
5095                         u32 temp = I915_READ(GEN7_MSG_CTL);
5096                         temp &= ~(WAIT_FOR_PCH_FLR_ACK | WAIT_FOR_PCH_RESET_ACK);
5097                         I915_WRITE(GEN7_MSG_CTL, temp);
5098                 } else if (INTEL_GEN(dev_priv) >= 7) {
5099                         u32 temp = I915_READ(HSW_NDE_RSTWRN_OPT);
5100                         temp &= ~RESET_PCH_HANDSHAKE_ENABLE;
5101                         I915_WRITE(HSW_NDE_RSTWRN_OPT, temp);
5102                 }
5103         }
5104
5105         i915_gem_init_swizzling(dev_priv);
5106
5107         /*
5108          * At least 830 can leave some of the unused rings
5109          * "active" (ie. head != tail) after resume which
5110          * will prevent c3 entry. Makes sure all unused rings
5111          * are totally idle.
5112          */
5113         init_unused_rings(dev_priv);
5114
5115         BUG_ON(!dev_priv->kernel_context);
5116         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5117                 ret = -EIO;
5118                 goto out;
5119         }
5120
5121         ret = i915_ppgtt_init_hw(dev_priv);
5122         if (ret) {
5123                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5124                 goto out;
5125         }
5126
5127         /* We can't enable contexts until all firmware is loaded */
5128         ret = intel_uc_init_hw(dev_priv);
5129         if (ret) {
5130                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5131                 goto out;
5132         }
5133
5134         intel_mocs_init_l3cc_table(dev_priv);
5135
5136         /* Only when the HW is re-initialised, can we replay the requests */
5137         ret = __i915_gem_restart_engines(dev_priv);
5138 out:
5139         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5140         return ret;
5141 }
5142
5143 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5144 {
5145         struct i915_gem_context *ctx;
5146         struct intel_engine_cs *engine;
5147         enum intel_engine_id id;
5148         int err;
5149
5150         /*
5151          * As we reset the gpu during very early sanitisation, the current
5152          * register state on the GPU should reflect its defaults values.
5153          * We load a context onto the hw (with restore-inhibit), then switch
5154          * over to a second context to save that default register state. We
5155          * can then prime every new context with that state so they all start
5156          * from the same default HW values.
5157          */
5158
5159         ctx = i915_gem_context_create_kernel(i915, 0);
5160         if (IS_ERR(ctx))
5161                 return PTR_ERR(ctx);
5162
5163         for_each_engine(engine, i915, id) {
5164                 struct drm_i915_gem_request *rq;
5165
5166                 rq = i915_gem_request_alloc(engine, ctx);
5167                 if (IS_ERR(rq)) {
5168                         err = PTR_ERR(rq);
5169                         goto out_ctx;
5170                 }
5171
5172                 err = 0;
5173                 if (engine->init_context)
5174                         err = engine->init_context(rq);
5175
5176                 __i915_add_request(rq, true);
5177                 if (err)
5178                         goto err_active;
5179         }
5180
5181         err = i915_gem_switch_to_kernel_context(i915);
5182         if (err)
5183                 goto err_active;
5184
5185         err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
5186         if (err)
5187                 goto err_active;
5188
5189         assert_kernel_context_is_current(i915);
5190
5191         for_each_engine(engine, i915, id) {
5192                 struct i915_vma *state;
5193
5194                 state = ctx->engine[id].state;
5195                 if (!state)
5196                         continue;
5197
5198                 /*
5199                  * As we will hold a reference to the logical state, it will
5200                  * not be torn down with the context, and importantly the
5201                  * object will hold onto its vma (making it possible for a
5202                  * stray GTT write to corrupt our defaults). Unmap the vma
5203                  * from the GTT to prevent such accidents and reclaim the
5204                  * space.
5205                  */
5206                 err = i915_vma_unbind(state);
5207                 if (err)
5208                         goto err_active;
5209
5210                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5211                 if (err)
5212                         goto err_active;
5213
5214                 engine->default_state = i915_gem_object_get(state->obj);
5215         }
5216
5217         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5218                 unsigned int found = intel_engines_has_context_isolation(i915);
5219
5220                 /*
5221                  * Make sure that classes with multiple engine instances all
5222                  * share the same basic configuration.
5223                  */
5224                 for_each_engine(engine, i915, id) {
5225                         unsigned int bit = BIT(engine->uabi_class);
5226                         unsigned int expected = engine->default_state ? bit : 0;
5227
5228                         if ((found & bit) != expected) {
5229                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5230                                           engine->uabi_class, engine->name);
5231                         }
5232                 }
5233         }
5234
5235 out_ctx:
5236         i915_gem_context_set_closed(ctx);
5237         i915_gem_context_put(ctx);
5238         return err;
5239
5240 err_active:
5241         /*
5242          * If we have to abandon now, we expect the engines to be idle
5243          * and ready to be torn-down. First try to flush any remaining
5244          * request, ensure we are pointing at the kernel context and
5245          * then remove it.
5246          */
5247         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5248                 goto out_ctx;
5249
5250         if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
5251                 goto out_ctx;
5252
5253         i915_gem_contexts_lost(i915);
5254         goto out_ctx;
5255 }
5256
5257 int i915_gem_init(struct drm_i915_private *dev_priv)
5258 {
5259         int ret;
5260
5261         /*
5262          * We need to fallback to 4K pages since gvt gtt handling doesn't
5263          * support huge page entries - we will need to check either hypervisor
5264          * mm can support huge guest page or just do emulation in gvt.
5265          */
5266         if (intel_vgpu_active(dev_priv))
5267                 mkwrite_device_info(dev_priv)->page_sizes =
5268                         I915_GTT_PAGE_SIZE_4K;
5269
5270         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5271
5272         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5273                 dev_priv->gt.resume = intel_lr_context_resume;
5274                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5275         } else {
5276                 dev_priv->gt.resume = intel_legacy_submission_resume;
5277                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5278         }
5279
5280         ret = i915_gem_init_userptr(dev_priv);
5281         if (ret)
5282                 return ret;
5283
5284         ret = intel_uc_init_misc(dev_priv);
5285         if (ret)
5286                 return ret;
5287
5288         /* This is just a security blanket to placate dragons.
5289          * On some systems, we very sporadically observe that the first TLBs
5290          * used by the CS may be stale, despite us poking the TLB reset. If
5291          * we hold the forcewake during initialisation these problems
5292          * just magically go away.
5293          */
5294         mutex_lock(&dev_priv->drm.struct_mutex);
5295         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5296
5297         ret = i915_gem_init_ggtt(dev_priv);
5298         if (ret) {
5299                 GEM_BUG_ON(ret == -EIO);
5300                 goto err_unlock;
5301         }
5302
5303         ret = i915_gem_contexts_init(dev_priv);
5304         if (ret) {
5305                 GEM_BUG_ON(ret == -EIO);
5306                 goto err_ggtt;
5307         }
5308
5309         ret = intel_engines_init(dev_priv);
5310         if (ret) {
5311                 GEM_BUG_ON(ret == -EIO);
5312                 goto err_context;
5313         }
5314
5315         intel_init_gt_powersave(dev_priv);
5316
5317         ret = intel_uc_init(dev_priv);
5318         if (ret)
5319                 goto err_pm;
5320
5321         ret = i915_gem_init_hw(dev_priv);
5322         if (ret)
5323                 goto err_uc_init;
5324
5325         /*
5326          * Despite its name intel_init_clock_gating applies both display
5327          * clock gating workarounds; GT mmio workarounds and the occasional
5328          * GT power context workaround. Worse, sometimes it includes a context
5329          * register workaround which we need to apply before we record the
5330          * default HW state for all contexts.
5331          *
5332          * FIXME: break up the workarounds and apply them at the right time!
5333          */
5334         intel_init_clock_gating(dev_priv);
5335
5336         ret = __intel_engines_record_defaults(dev_priv);
5337         if (ret)
5338                 goto err_init_hw;
5339
5340         if (i915_inject_load_failure()) {
5341                 ret = -ENODEV;
5342                 goto err_init_hw;
5343         }
5344
5345         if (i915_inject_load_failure()) {
5346                 ret = -EIO;
5347                 goto err_init_hw;
5348         }
5349
5350         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5351         mutex_unlock(&dev_priv->drm.struct_mutex);
5352
5353         return 0;
5354
5355         /*
5356          * Unwinding is complicated by that we want to handle -EIO to mean
5357          * disable GPU submission but keep KMS alive. We want to mark the
5358          * HW as irrevisibly wedged, but keep enough state around that the
5359          * driver doesn't explode during runtime.
5360          */
5361 err_init_hw:
5362         i915_gem_wait_for_idle(dev_priv, I915_WAIT_LOCKED);
5363         i915_gem_contexts_lost(dev_priv);
5364         intel_uc_fini_hw(dev_priv);
5365 err_uc_init:
5366         intel_uc_fini(dev_priv);
5367 err_pm:
5368         if (ret != -EIO) {
5369                 intel_cleanup_gt_powersave(dev_priv);
5370                 i915_gem_cleanup_engines(dev_priv);
5371         }
5372 err_context:
5373         if (ret != -EIO)
5374                 i915_gem_contexts_fini(dev_priv);
5375 err_ggtt:
5376 err_unlock:
5377         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5378         mutex_unlock(&dev_priv->drm.struct_mutex);
5379
5380         intel_uc_fini_misc(dev_priv);
5381
5382         if (ret != -EIO)
5383                 i915_gem_cleanup_userptr(dev_priv);
5384
5385         if (ret == -EIO) {
5386                 /*
5387                  * Allow engine initialisation to fail by marking the GPU as
5388                  * wedged. But we only want to do this where the GPU is angry,
5389                  * for all other failure, such as an allocation failure, bail.
5390                  */
5391                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5392                         DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
5393                         i915_gem_set_wedged(dev_priv);
5394                 }
5395                 ret = 0;
5396         }
5397
5398         i915_gem_drain_freed_objects(dev_priv);
5399         return ret;
5400 }
5401
5402 void i915_gem_init_mmio(struct drm_i915_private *i915)
5403 {
5404         i915_gem_sanitize(i915);
5405 }
5406
5407 void
5408 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5409 {
5410         struct intel_engine_cs *engine;
5411         enum intel_engine_id id;
5412
5413         for_each_engine(engine, dev_priv, id)
5414                 dev_priv->gt.cleanup_engine(engine);
5415 }
5416
5417 void
5418 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5419 {
5420         int i;
5421
5422         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5423             !IS_CHERRYVIEW(dev_priv))
5424                 dev_priv->num_fence_regs = 32;
5425         else if (INTEL_GEN(dev_priv) >= 4 ||
5426                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5427                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5428                 dev_priv->num_fence_regs = 16;
5429         else
5430                 dev_priv->num_fence_regs = 8;
5431
5432         if (intel_vgpu_active(dev_priv))
5433                 dev_priv->num_fence_regs =
5434                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5435
5436         /* Initialize fence registers to zero */
5437         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5438                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5439
5440                 fence->i915 = dev_priv;
5441                 fence->id = i;
5442                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5443         }
5444         i915_gem_restore_fences(dev_priv);
5445
5446         i915_gem_detect_bit_6_swizzle(dev_priv);
5447 }
5448
5449 static void i915_gem_init__mm(struct drm_i915_private *i915)
5450 {
5451         spin_lock_init(&i915->mm.object_stat_lock);
5452         spin_lock_init(&i915->mm.obj_lock);
5453         spin_lock_init(&i915->mm.free_lock);
5454
5455         init_llist_head(&i915->mm.free_list);
5456
5457         INIT_LIST_HEAD(&i915->mm.unbound_list);
5458         INIT_LIST_HEAD(&i915->mm.bound_list);
5459         INIT_LIST_HEAD(&i915->mm.fence_list);
5460         INIT_LIST_HEAD(&i915->mm.userfault_list);
5461
5462         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5463 }
5464
5465 int
5466 i915_gem_load_init(struct drm_i915_private *dev_priv)
5467 {
5468         int err = -ENOMEM;
5469
5470         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5471         if (!dev_priv->objects)
5472                 goto err_out;
5473
5474         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5475         if (!dev_priv->vmas)
5476                 goto err_objects;
5477
5478         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5479         if (!dev_priv->luts)
5480                 goto err_vmas;
5481
5482         dev_priv->requests = KMEM_CACHE(drm_i915_gem_request,
5483                                         SLAB_HWCACHE_ALIGN |
5484                                         SLAB_RECLAIM_ACCOUNT |
5485                                         SLAB_TYPESAFE_BY_RCU);
5486         if (!dev_priv->requests)
5487                 goto err_luts;
5488
5489         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5490                                             SLAB_HWCACHE_ALIGN |
5491                                             SLAB_RECLAIM_ACCOUNT);
5492         if (!dev_priv->dependencies)
5493                 goto err_requests;
5494
5495         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5496         if (!dev_priv->priorities)
5497                 goto err_dependencies;
5498
5499         mutex_lock(&dev_priv->drm.struct_mutex);
5500         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5501         err = i915_gem_timeline_init__global(dev_priv);
5502         mutex_unlock(&dev_priv->drm.struct_mutex);
5503         if (err)
5504                 goto err_priorities;
5505
5506         i915_gem_init__mm(dev_priv);
5507
5508         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5509                           i915_gem_retire_work_handler);
5510         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5511                           i915_gem_idle_work_handler);
5512         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5513         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5514
5515         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5516
5517         spin_lock_init(&dev_priv->fb_tracking.lock);
5518
5519         err = i915_gemfs_init(dev_priv);
5520         if (err)
5521                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5522
5523         return 0;
5524
5525 err_priorities:
5526         kmem_cache_destroy(dev_priv->priorities);
5527 err_dependencies:
5528         kmem_cache_destroy(dev_priv->dependencies);
5529 err_requests:
5530         kmem_cache_destroy(dev_priv->requests);
5531 err_luts:
5532         kmem_cache_destroy(dev_priv->luts);
5533 err_vmas:
5534         kmem_cache_destroy(dev_priv->vmas);
5535 err_objects:
5536         kmem_cache_destroy(dev_priv->objects);
5537 err_out:
5538         return err;
5539 }
5540
5541 void i915_gem_load_cleanup(struct drm_i915_private *dev_priv)
5542 {
5543         i915_gem_drain_freed_objects(dev_priv);
5544         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5545         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5546         WARN_ON(dev_priv->mm.object_count);
5547
5548         mutex_lock(&dev_priv->drm.struct_mutex);
5549         i915_gem_timeline_fini(&dev_priv->gt.global_timeline);
5550         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5551         mutex_unlock(&dev_priv->drm.struct_mutex);
5552
5553         kmem_cache_destroy(dev_priv->priorities);
5554         kmem_cache_destroy(dev_priv->dependencies);
5555         kmem_cache_destroy(dev_priv->requests);
5556         kmem_cache_destroy(dev_priv->luts);
5557         kmem_cache_destroy(dev_priv->vmas);
5558         kmem_cache_destroy(dev_priv->objects);
5559
5560         /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
5561         rcu_barrier();
5562
5563         i915_gemfs_fini(dev_priv);
5564 }
5565
5566 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5567 {
5568         /* Discard all purgeable objects, let userspace recover those as
5569          * required after resuming.
5570          */
5571         i915_gem_shrink_all(dev_priv);
5572
5573         return 0;
5574 }
5575
5576 int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
5577 {
5578         struct drm_i915_gem_object *obj;
5579         struct list_head *phases[] = {
5580                 &dev_priv->mm.unbound_list,
5581                 &dev_priv->mm.bound_list,
5582                 NULL
5583         }, **p;
5584
5585         /* Called just before we write the hibernation image.
5586          *
5587          * We need to update the domain tracking to reflect that the CPU
5588          * will be accessing all the pages to create and restore from the
5589          * hibernation, and so upon restoration those pages will be in the
5590          * CPU domain.
5591          *
5592          * To make sure the hibernation image contains the latest state,
5593          * we update that state just before writing out the image.
5594          *
5595          * To try and reduce the hibernation image, we manually shrink
5596          * the objects as well, see i915_gem_freeze()
5597          */
5598
5599         i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND);
5600         i915_gem_drain_freed_objects(dev_priv);
5601
5602         spin_lock(&dev_priv->mm.obj_lock);
5603         for (p = phases; *p; p++) {
5604                 list_for_each_entry(obj, *p, mm.link)
5605                         __start_cpu_write(obj);
5606         }
5607         spin_unlock(&dev_priv->mm.obj_lock);
5608
5609         return 0;
5610 }
5611
5612 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5613 {
5614         struct drm_i915_file_private *file_priv = file->driver_priv;
5615         struct drm_i915_gem_request *request;
5616
5617         /* Clean up our request list when the client is going away, so that
5618          * later retire_requests won't dereference our soon-to-be-gone
5619          * file_priv.
5620          */
5621         spin_lock(&file_priv->mm.lock);
5622         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5623                 request->file_priv = NULL;
5624         spin_unlock(&file_priv->mm.lock);
5625 }
5626
5627 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5628 {
5629         struct drm_i915_file_private *file_priv;
5630         int ret;
5631
5632         DRM_DEBUG("\n");
5633
5634         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5635         if (!file_priv)
5636                 return -ENOMEM;
5637
5638         file->driver_priv = file_priv;
5639         file_priv->dev_priv = i915;
5640         file_priv->file = file;
5641
5642         spin_lock_init(&file_priv->mm.lock);
5643         INIT_LIST_HEAD(&file_priv->mm.request_list);
5644
5645         file_priv->bsd_engine = -1;
5646
5647         ret = i915_gem_context_open(i915, file);
5648         if (ret)
5649                 kfree(file_priv);
5650
5651         return ret;
5652 }
5653
5654 /**
5655  * i915_gem_track_fb - update frontbuffer tracking
5656  * @old: current GEM buffer for the frontbuffer slots
5657  * @new: new GEM buffer for the frontbuffer slots
5658  * @frontbuffer_bits: bitmask of frontbuffer slots
5659  *
5660  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5661  * from @old and setting them in @new. Both @old and @new can be NULL.
5662  */
5663 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5664                        struct drm_i915_gem_object *new,
5665                        unsigned frontbuffer_bits)
5666 {
5667         /* Control of individual bits within the mask are guarded by
5668          * the owning plane->mutex, i.e. we can never see concurrent
5669          * manipulation of individual bits. But since the bitfield as a whole
5670          * is updated using RMW, we need to use atomics in order to update
5671          * the bits.
5672          */
5673         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5674                      sizeof(atomic_t) * BITS_PER_BYTE);
5675
5676         if (old) {
5677                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5678                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5679         }
5680
5681         if (new) {
5682                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5683                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5684         }
5685 }
5686
5687 /* Allocate a new GEM object and fill it with the supplied data */
5688 struct drm_i915_gem_object *
5689 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5690                                  const void *data, size_t size)
5691 {
5692         struct drm_i915_gem_object *obj;
5693         struct file *file;
5694         size_t offset;
5695         int err;
5696
5697         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5698         if (IS_ERR(obj))
5699                 return obj;
5700
5701         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5702
5703         file = obj->base.filp;
5704         offset = 0;
5705         do {
5706                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5707                 struct page *page;
5708                 void *pgdata, *vaddr;
5709
5710                 err = pagecache_write_begin(file, file->f_mapping,
5711                                             offset, len, 0,
5712                                             &page, &pgdata);
5713                 if (err < 0)
5714                         goto fail;
5715
5716                 vaddr = kmap(page);
5717                 memcpy(vaddr, data, len);
5718                 kunmap(page);
5719
5720                 err = pagecache_write_end(file, file->f_mapping,
5721                                           offset, len, len,
5722                                           page, pgdata);
5723                 if (err < 0)
5724                         goto fail;
5725
5726                 size -= len;
5727                 data += len;
5728                 offset += len;
5729         } while (size);
5730
5731         return obj;
5732
5733 fail:
5734         i915_gem_object_put(obj);
5735         return ERR_PTR(err);
5736 }
5737
5738 struct scatterlist *
5739 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5740                        unsigned int n,
5741                        unsigned int *offset)
5742 {
5743         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5744         struct scatterlist *sg;
5745         unsigned int idx, count;
5746
5747         might_sleep();
5748         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5749         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5750
5751         /* As we iterate forward through the sg, we record each entry in a
5752          * radixtree for quick repeated (backwards) lookups. If we have seen
5753          * this index previously, we will have an entry for it.
5754          *
5755          * Initial lookup is O(N), but this is amortized to O(1) for
5756          * sequential page access (where each new request is consecutive
5757          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5758          * i.e. O(1) with a large constant!
5759          */
5760         if (n < READ_ONCE(iter->sg_idx))
5761                 goto lookup;
5762
5763         mutex_lock(&iter->lock);
5764
5765         /* We prefer to reuse the last sg so that repeated lookup of this
5766          * (or the subsequent) sg are fast - comparing against the last
5767          * sg is faster than going through the radixtree.
5768          */
5769
5770         sg = iter->sg_pos;
5771         idx = iter->sg_idx;
5772         count = __sg_page_count(sg);
5773
5774         while (idx + count <= n) {
5775                 unsigned long exception, i;
5776                 int ret;
5777
5778                 /* If we cannot allocate and insert this entry, or the
5779                  * individual pages from this range, cancel updating the
5780                  * sg_idx so that on this lookup we are forced to linearly
5781                  * scan onwards, but on future lookups we will try the
5782                  * insertion again (in which case we need to be careful of
5783                  * the error return reporting that we have already inserted
5784                  * this index).
5785                  */
5786                 ret = radix_tree_insert(&iter->radix, idx, sg);
5787                 if (ret && ret != -EEXIST)
5788                         goto scan;
5789
5790                 exception =
5791                         RADIX_TREE_EXCEPTIONAL_ENTRY |
5792                         idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
5793                 for (i = 1; i < count; i++) {
5794                         ret = radix_tree_insert(&iter->radix, idx + i,
5795                                                 (void *)exception);
5796                         if (ret && ret != -EEXIST)
5797                                 goto scan;
5798                 }
5799
5800                 idx += count;
5801                 sg = ____sg_next(sg);
5802                 count = __sg_page_count(sg);
5803         }
5804
5805 scan:
5806         iter->sg_pos = sg;
5807         iter->sg_idx = idx;
5808
5809         mutex_unlock(&iter->lock);
5810
5811         if (unlikely(n < idx)) /* insertion completed by another thread */
5812                 goto lookup;
5813
5814         /* In case we failed to insert the entry into the radixtree, we need
5815          * to look beyond the current sg.
5816          */
5817         while (idx + count <= n) {
5818                 idx += count;
5819                 sg = ____sg_next(sg);
5820                 count = __sg_page_count(sg);
5821         }
5822
5823         *offset = n - idx;
5824         return sg;
5825
5826 lookup:
5827         rcu_read_lock();
5828
5829         sg = radix_tree_lookup(&iter->radix, n);
5830         GEM_BUG_ON(!sg);
5831
5832         /* If this index is in the middle of multi-page sg entry,
5833          * the radixtree will contain an exceptional entry that points
5834          * to the start of that range. We will return the pointer to
5835          * the base page and the offset of this page within the
5836          * sg entry's range.
5837          */
5838         *offset = 0;
5839         if (unlikely(radix_tree_exception(sg))) {
5840                 unsigned long base =
5841                         (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
5842
5843                 sg = radix_tree_lookup(&iter->radix, base);
5844                 GEM_BUG_ON(!sg);
5845
5846                 *offset = n - base;
5847         }
5848
5849         rcu_read_unlock();
5850
5851         return sg;
5852 }
5853
5854 struct page *
5855 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
5856 {
5857         struct scatterlist *sg;
5858         unsigned int offset;
5859
5860         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
5861
5862         sg = i915_gem_object_get_sg(obj, n, &offset);
5863         return nth_page(sg_page(sg), offset);
5864 }
5865
5866 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
5867 struct page *
5868 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
5869                                unsigned int n)
5870 {
5871         struct page *page;
5872
5873         page = i915_gem_object_get_page(obj, n);
5874         if (!obj->mm.dirty)
5875                 set_page_dirty(page);
5876
5877         return page;
5878 }
5879
5880 dma_addr_t
5881 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
5882                                 unsigned long n)
5883 {
5884         struct scatterlist *sg;
5885         unsigned int offset;
5886
5887         sg = i915_gem_object_get_sg(obj, n, &offset);
5888         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
5889 }
5890
5891 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
5892 {
5893         struct sg_table *pages;
5894         int err;
5895
5896         if (align > obj->base.size)
5897                 return -EINVAL;
5898
5899         if (obj->ops == &i915_gem_phys_ops)
5900                 return 0;
5901
5902         if (obj->ops != &i915_gem_object_ops)
5903                 return -EINVAL;
5904
5905         err = i915_gem_object_unbind(obj);
5906         if (err)
5907                 return err;
5908
5909         mutex_lock(&obj->mm.lock);
5910
5911         if (obj->mm.madv != I915_MADV_WILLNEED) {
5912                 err = -EFAULT;
5913                 goto err_unlock;
5914         }
5915
5916         if (obj->mm.quirked) {
5917                 err = -EFAULT;
5918                 goto err_unlock;
5919         }
5920
5921         if (obj->mm.mapping) {
5922                 err = -EBUSY;
5923                 goto err_unlock;
5924         }
5925
5926         pages = fetch_and_zero(&obj->mm.pages);
5927         if (pages) {
5928                 struct drm_i915_private *i915 = to_i915(obj->base.dev);
5929
5930                 __i915_gem_object_reset_page_iter(obj);
5931
5932                 spin_lock(&i915->mm.obj_lock);
5933                 list_del(&obj->mm.link);
5934                 spin_unlock(&i915->mm.obj_lock);
5935         }
5936
5937         obj->ops = &i915_gem_phys_ops;
5938
5939         err = ____i915_gem_object_get_pages(obj);
5940         if (err)
5941                 goto err_xfer;
5942
5943         /* Perma-pin (until release) the physical set of pages */
5944         __i915_gem_object_pin_pages(obj);
5945
5946         if (!IS_ERR_OR_NULL(pages))
5947                 i915_gem_object_ops.put_pages(obj, pages);
5948         mutex_unlock(&obj->mm.lock);
5949         return 0;
5950
5951 err_xfer:
5952         obj->ops = &i915_gem_object_ops;
5953         obj->mm.pages = pages;
5954 err_unlock:
5955         mutex_unlock(&obj->mm.lock);
5956         return err;
5957 }
5958
5959 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5960 #include "selftests/scatterlist.c"
5961 #include "selftests/mock_gem_device.c"
5962 #include "selftests/huge_gem_object.c"
5963 #include "selftests/huge_pages.c"
5964 #include "selftests/i915_gem_object.c"
5965 #include "selftests/i915_gem_coherency.c"
5966 #endif