drivers/gpu/drm/i915/i915_gem.c
1 /*
2  * Copyright © 2008-2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27
28 #include <drm/drmP.h>
29 #include <drm/drm_vma_manager.h>
30 #include <drm/i915_drm.h>
31 #include "i915_drv.h"
32 #include "i915_gem_clflush.h"
33 #include "i915_vgpu.h"
34 #include "i915_trace.h"
35 #include "intel_drv.h"
36 #include "intel_frontbuffer.h"
37 #include "intel_mocs.h"
38 #include "intel_workarounds.h"
39 #include "i915_gemfs.h"
40 #include <linux/dma-fence-array.h>
41 #include <linux/kthread.h>
42 #include <linux/reservation.h>
43 #include <linux/shmem_fs.h>
44 #include <linux/slab.h>
45 #include <linux/stop_machine.h>
46 #include <linux/swap.h>
47 #include <linux/pci.h>
48 #include <linux/dma-buf.h>
49
50 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
51
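/* Decide whether a CPU write must be followed by a clflush: objects already
 * marked cache dirty will be flushed later, anything not cache coherent for
 * writes needs flushing, and globally pinned objects are kept flushed as
 * they may be in use by the hardware.
 */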
52 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
53 {
54         if (obj->cache_dirty)
55                 return false;
56
57         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
58                 return true;
59
60         return obj->pin_global; /* currently in use by HW, keep flushed */
61 }
62
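/* Reserve a temporary node in the CPU-mappable range of the global GTT;
 * used by the pread/pwrite fallback paths below when the object cannot be
 * pinned into the mappable aperture directly.
 */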
63 static int
64 insert_mappable_node(struct i915_ggtt *ggtt,
65                      struct drm_mm_node *node, u32 size)
66 {
67         memset(node, 0, sizeof(*node));
68         return drm_mm_insert_node_in_range(&ggtt->base.mm, node,
69                                            size, 0, I915_COLOR_UNEVICTABLE,
70                                            0, ggtt->mappable_end,
71                                            DRM_MM_INSERT_LOW);
72 }
73
74 static void
75 remove_mappable_node(struct drm_mm_node *node)
76 {
77         drm_mm_remove_node(node);
78 }
79
80 /* some bookkeeping */
81 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
82                                   u64 size)
83 {
84         spin_lock(&dev_priv->mm.object_stat_lock);
85         dev_priv->mm.object_count++;
86         dev_priv->mm.object_memory += size;
87         spin_unlock(&dev_priv->mm.object_stat_lock);
88 }
89
90 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
91                                      u64 size)
92 {
93         spin_lock(&dev_priv->mm.object_stat_lock);
94         dev_priv->mm.object_count--;
95         dev_priv->mm.object_memory -= size;
96         spin_unlock(&dev_priv->mm.object_stat_lock);
97 }
98
99 static int
100 i915_gem_wait_for_error(struct i915_gpu_error *error)
101 {
102         int ret;
103
104         might_sleep();
105
106         /*
107          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
108          * userspace. If it takes that long something really bad is going on and
109          * we should simply try to bail out and fail as gracefully as possible.
110          */
111         ret = wait_event_interruptible_timeout(error->reset_queue,
112                                                !i915_reset_backoff(error),
113                                                I915_RESET_TIMEOUT);
114         if (ret == 0) {
115                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
116                 return -EIO;
117         } else if (ret < 0) {
118                 return ret;
119         } else {
120                 return 0;
121         }
122 }
123
124 int i915_mutex_lock_interruptible(struct drm_device *dev)
125 {
126         struct drm_i915_private *dev_priv = to_i915(dev);
127         int ret;
128
129         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
130         if (ret)
131                 return ret;
132
133         ret = mutex_lock_interruptible(&dev->struct_mutex);
134         if (ret)
135                 return ret;
136
137         return 0;
138 }
139
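/* Called once the GPU is idle: flush any residual interrupts, park the
 * engines and timelines, drop the GT IRQ power domain and the runtime pm
 * reference, and return the current busyness epoch.
 */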
140 static u32 __i915_gem_park(struct drm_i915_private *i915)
141 {
142         lockdep_assert_held(&i915->drm.struct_mutex);
143         GEM_BUG_ON(i915->gt.active_requests);
144         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
145
146         if (!i915->gt.awake)
147                 return I915_EPOCH_INVALID;
148
149         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
150
151         /*
152          * Be paranoid and flush a concurrent interrupt to make sure
153          * we don't reactivate any irq tasklets after parking.
154          *
155          * FIXME: Note that even though we have waited for execlists to be idle,
156          * there may still be an in-flight interrupt even though the CSB
157          * is now empty. synchronize_irq() makes sure that a residual interrupt
158          * is completed before we continue, but it doesn't prevent the HW from
159          * raising a spurious interrupt later. To complete the shield we should
160          * coordinate disabling the CS irq with flushing the interrupts.
161          */
162         synchronize_irq(i915->drm.irq);
163
164         intel_engines_park(i915);
165         i915_timelines_park(i915);
166
167         i915_pmu_gt_parked(i915);
168         i915_vma_parked(i915);
169
170         i915->gt.awake = false;
171
172         if (INTEL_GEN(i915) >= 6)
173                 gen6_rps_idle(i915);
174
175         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);
176
177         intel_runtime_pm_put(i915);
178
179         return i915->gt.epoch;
180 }
181
182 void i915_gem_park(struct drm_i915_private *i915)
183 {
184         lockdep_assert_held(&i915->drm.struct_mutex);
185         GEM_BUG_ON(i915->gt.active_requests);
186
187         if (!i915->gt.awake)
188                 return;
189
190         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
191         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
192 }
193
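/* Transition from parked to busy: take a runtime pm reference and the GT IRQ
 * power domain, advance the busyness epoch, re-enable powersave/RPS and kick
 * the hangcheck and retire workers.
 */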
194 void i915_gem_unpark(struct drm_i915_private *i915)
195 {
196         lockdep_assert_held(&i915->drm.struct_mutex);
197         GEM_BUG_ON(!i915->gt.active_requests);
198
199         if (i915->gt.awake)
200                 return;
201
202         intel_runtime_pm_get_noresume(i915);
203
204         /*
205          * It seems that the DMC likes to transition between the DC states a lot
206          * when there are no connected displays (no active power domains) during
207          * command submission.
208          *
209          * This activity has a negative impact on the performance of the chip,
210          * with huge latencies observed in the interrupt handler and elsewhere.
211          *
212          * Work around it by grabbing a GT IRQ power domain whilst there is any
213          * GT activity, preventing any DC state transitions.
214          */
215         intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
216
217         i915->gt.awake = true;
218         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
219                 i915->gt.epoch = 1;
220
221         intel_enable_gt_powersave(i915);
222         i915_update_gfx_val(i915);
223         if (INTEL_GEN(i915) >= 6)
224                 gen6_rps_busy(i915);
225         i915_pmu_gt_unparked(i915);
226
227         intel_engines_unpark(i915);
228
229         i915_queue_hangcheck(i915);
230
231         queue_delayed_work(i915->wq,
232                            &i915->gt.retire_work,
233                            round_jiffies_up_relative(HZ));
234 }
235
236 int
237 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
238                             struct drm_file *file)
239 {
240         struct drm_i915_private *dev_priv = to_i915(dev);
241         struct i915_ggtt *ggtt = &dev_priv->ggtt;
242         struct drm_i915_gem_get_aperture *args = data;
243         struct i915_vma *vma;
244         u64 pinned;
245
246         pinned = ggtt->base.reserved;
247         mutex_lock(&dev->struct_mutex);
248         list_for_each_entry(vma, &ggtt->base.active_list, vm_link)
249                 if (i915_vma_is_pinned(vma))
250                         pinned += vma->node.size;
251         list_for_each_entry(vma, &ggtt->base.inactive_list, vm_link)
252                 if (i915_vma_is_pinned(vma))
253                         pinned += vma->node.size;
254         mutex_unlock(&dev->struct_mutex);
255
256         args->aper_size = ggtt->base.total;
257         args->aper_available_size = args->aper_size - pinned;
258
259         return 0;
260 }
261
262 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
263 {
264         struct address_space *mapping = obj->base.filp->f_mapping;
265         drm_dma_handle_t *phys;
266         struct sg_table *st;
267         struct scatterlist *sg;
268         char *vaddr;
269         int i;
270         int err;
271
272         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
273                 return -EINVAL;
274
275         /* Always aligning to the object size allows a single allocation
276          * to handle all possible callers, and given typical object sizes,
277          * the alignment of the buddy allocation will naturally match.
278          */
279         phys = drm_pci_alloc(obj->base.dev,
280                              roundup_pow_of_two(obj->base.size),
281                              roundup_pow_of_two(obj->base.size));
282         if (!phys)
283                 return -ENOMEM;
284
285         vaddr = phys->vaddr;
286         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
287                 struct page *page;
288                 char *src;
289
290                 page = shmem_read_mapping_page(mapping, i);
291                 if (IS_ERR(page)) {
292                         err = PTR_ERR(page);
293                         goto err_phys;
294                 }
295
296                 src = kmap_atomic(page);
297                 memcpy(vaddr, src, PAGE_SIZE);
298                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
299                 kunmap_atomic(src);
300
301                 put_page(page);
302                 vaddr += PAGE_SIZE;
303         }
304
305         i915_gem_chipset_flush(to_i915(obj->base.dev));
306
307         st = kmalloc(sizeof(*st), GFP_KERNEL);
308         if (!st) {
309                 err = -ENOMEM;
310                 goto err_phys;
311         }
312
313         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
314                 kfree(st);
315                 err = -ENOMEM;
316                 goto err_phys;
317         }
318
319         sg = st->sgl;
320         sg->offset = 0;
321         sg->length = obj->base.size;
322
323         sg_dma_address(sg) = phys->busaddr;
324         sg_dma_len(sg) = obj->base.size;
325
326         obj->phys_handle = phys;
327
328         __i915_gem_object_set_pages(obj, st, sg->length);
329
330         return 0;
331
332 err_phys:
333         drm_pci_free(obj->base.dev, phys);
334
335         return err;
336 }
337
338 static void __start_cpu_write(struct drm_i915_gem_object *obj)
339 {
340         obj->read_domains = I915_GEM_DOMAIN_CPU;
341         obj->write_domain = I915_GEM_DOMAIN_CPU;
342         if (cpu_write_needs_clflush(obj))
343                 obj->cache_dirty = true;
344 }
345
346 static void
347 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
348                                 struct sg_table *pages,
349                                 bool needs_clflush)
350 {
351         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
352
353         if (obj->mm.madv == I915_MADV_DONTNEED)
354                 obj->mm.dirty = false;
355
356         if (needs_clflush &&
357             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
358             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
359                 drm_clflush_sg(pages);
360
361         __start_cpu_write(obj);
362 }
363
364 static void
365 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
366                                struct sg_table *pages)
367 {
368         __i915_gem_object_release_shmem(obj, pages, false);
369
370         if (obj->mm.dirty) {
371                 struct address_space *mapping = obj->base.filp->f_mapping;
372                 char *vaddr = obj->phys_handle->vaddr;
373                 int i;
374
375                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
376                         struct page *page;
377                         char *dst;
378
379                         page = shmem_read_mapping_page(mapping, i);
380                         if (IS_ERR(page))
381                                 continue;
382
383                         dst = kmap_atomic(page);
384                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
385                         memcpy(dst, vaddr, PAGE_SIZE);
386                         kunmap_atomic(dst);
387
388                         set_page_dirty(page);
389                         if (obj->mm.madv == I915_MADV_WILLNEED)
390                                 mark_page_accessed(page);
391                         put_page(page);
392                         vaddr += PAGE_SIZE;
393                 }
394                 obj->mm.dirty = false;
395         }
396
397         sg_free_table(pages);
398         kfree(pages);
399
400         drm_pci_free(obj->base.dev, obj->phys_handle);
401 }
402
403 static void
404 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
405 {
406         i915_gem_object_unpin_pages(obj);
407 }
408
409 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
410         .get_pages = i915_gem_object_get_pages_phys,
411         .put_pages = i915_gem_object_put_pages_phys,
412         .release = i915_gem_object_release_phys,
413 };
414
415 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
416
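/* Wait for the object to become idle and then unbind every VMA still
 * attached to it.
 */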
417 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
418 {
419         struct i915_vma *vma;
420         LIST_HEAD(still_in_list);
421         int ret;
422
423         lockdep_assert_held(&obj->base.dev->struct_mutex);
424
425         /* Closed vma are removed from the obj->vma_list, but they may
426          * still have an active binding on the object. To remove those we
427          * must wait for all rendering on the object to complete (as unbinding
428          * must do anyway), and retire the requests.
429          */
430         ret = i915_gem_object_set_to_cpu_domain(obj, false);
431         if (ret)
432                 return ret;
433
434         while ((vma = list_first_entry_or_null(&obj->vma_list,
435                                                struct i915_vma,
436                                                obj_link))) {
437                 list_move_tail(&vma->obj_link, &still_in_list);
438                 ret = i915_vma_unbind(vma);
439                 if (ret)
440                         break;
441         }
442         list_splice(&still_in_list, &obj->vma_list);
443
444         return ret;
445 }
446
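/* Wait on a single dma-fence, applying an optional RPS waitboost when the
 * fence is an i915 request that has not yet started, and retiring completed
 * requests when called with I915_WAIT_LOCKED.
 */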
447 static long
448 i915_gem_object_wait_fence(struct dma_fence *fence,
449                            unsigned int flags,
450                            long timeout,
451                            struct intel_rps_client *rps_client)
452 {
453         struct i915_request *rq;
454
455         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
456
457         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
458                 return timeout;
459
460         if (!dma_fence_is_i915(fence))
461                 return dma_fence_wait_timeout(fence,
462                                               flags & I915_WAIT_INTERRUPTIBLE,
463                                               timeout);
464
465         rq = to_request(fence);
466         if (i915_request_completed(rq))
467                 goto out;
468
469         /*
470          * This client is about to stall waiting for the GPU. In many cases
471          * this is undesirable and limits the throughput of the system, as
472          * many clients cannot continue processing user input/output whilst
473          * blocked. RPS autotuning may take tens of milliseconds to respond
474          * to the GPU load and thus incurs additional latency for the client.
475          * We can circumvent that by promoting the GPU frequency to maximum
476          * before we wait. This makes the GPU throttle up much more quickly
477          * (good for benchmarks and user experience, e.g. window animations),
478          * but at a cost of spending more power processing the workload
479          * (bad for battery). Not all clients even want their results
480          * immediately and for them we should just let the GPU select its own
481          * frequency to maximise efficiency. To prevent a single client from
482          * forcing the clocks too high for the whole system, we only allow
483          * each client to waitboost once in a busy period.
484          */
485         if (rps_client && !i915_request_started(rq)) {
486                 if (INTEL_GEN(rq->i915) >= 6)
487                         gen6_rps_boost(rq, rps_client);
488         }
489
490         timeout = i915_request_wait(rq, flags, timeout);
491
492 out:
493         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
494                 i915_request_retire_upto(rq);
495
496         return timeout;
497 }
498
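/* Wait on the fences tracked by a reservation object (all of them for
 * I915_WAIT_ALL, otherwise just the exclusive fence), opportunistically
 * pruning the fence array once everything is known to have signaled.
 */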
499 static long
500 i915_gem_object_wait_reservation(struct reservation_object *resv,
501                                  unsigned int flags,
502                                  long timeout,
503                                  struct intel_rps_client *rps_client)
504 {
505         unsigned int seq = __read_seqcount_begin(&resv->seq);
506         struct dma_fence *excl;
507         bool prune_fences = false;
508
509         if (flags & I915_WAIT_ALL) {
510                 struct dma_fence **shared;
511                 unsigned int count, i;
512                 int ret;
513
514                 ret = reservation_object_get_fences_rcu(resv,
515                                                         &excl, &count, &shared);
516                 if (ret)
517                         return ret;
518
519                 for (i = 0; i < count; i++) {
520                         timeout = i915_gem_object_wait_fence(shared[i],
521                                                              flags, timeout,
522                                                              rps_client);
523                         if (timeout < 0)
524                                 break;
525
526                         dma_fence_put(shared[i]);
527                 }
528
529                 for (; i < count; i++)
530                         dma_fence_put(shared[i]);
531                 kfree(shared);
532
533                 /*
534                  * If both shared fences and an exclusive fence exist,
535                  * then by construction the shared fences must be later
536                  * than the exclusive fence. If we successfully wait for
537                  * all the shared fences, we know that the exclusive fence
538                  * must also be signaled. If all the shared fences are
539                  * signaled, we can prune the array and recover the
540                  * floating references on the fences/requests.
541                  */
542                 prune_fences = count && timeout >= 0;
543         } else {
544                 excl = reservation_object_get_excl_rcu(resv);
545         }
546
547         if (excl && timeout >= 0)
548                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
549                                                      rps_client);
550
551         dma_fence_put(excl);
552
553         /*
554          * Opportunistically prune the fences iff we know they have *all* been
555          * signaled and that the reservation object has not been changed (i.e.
556          * no new fences have been added).
557          */
558         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
559                 if (reservation_object_trylock(resv)) {
560                         if (!__read_seqcount_retry(&resv->seq, seq))
561                                 reservation_object_add_excl_fence(resv, NULL);
562                         reservation_object_unlock(resv);
563                 }
564         }
565
566         return timeout;
567 }
568
569 static void __fence_set_priority(struct dma_fence *fence,
570                                  const struct i915_sched_attr *attr)
571 {
572         struct i915_request *rq;
573         struct intel_engine_cs *engine;
574
575         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
576                 return;
577
578         rq = to_request(fence);
579         engine = rq->engine;
580
581         local_bh_disable();
582         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
583         if (engine->schedule)
584                 engine->schedule(rq, attr);
585         rcu_read_unlock();
586         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
587 }
588
589 static void fence_set_priority(struct dma_fence *fence,
590                                const struct i915_sched_attr *attr)
591 {
592         /* Recurse once into a fence-array */
593         if (dma_fence_is_array(fence)) {
594                 struct dma_fence_array *array = to_dma_fence_array(fence);
595                 int i;
596
597                 for (i = 0; i < array->num_fences; i++)
598                         __fence_set_priority(array->fences[i], attr);
599         } else {
600                 __fence_set_priority(fence, attr);
601         }
602 }
603
604 int
605 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
606                               unsigned int flags,
607                               const struct i915_sched_attr *attr)
608 {
609         struct dma_fence *excl;
610
611         if (flags & I915_WAIT_ALL) {
612                 struct dma_fence **shared;
613                 unsigned int count, i;
614                 int ret;
615
616                 ret = reservation_object_get_fences_rcu(obj->resv,
617                                                         &excl, &count, &shared);
618                 if (ret)
619                         return ret;
620
621                 for (i = 0; i < count; i++) {
622                         fence_set_priority(shared[i], attr);
623                         dma_fence_put(shared[i]);
624                 }
625
626                 kfree(shared);
627         } else {
628                 excl = reservation_object_get_excl_rcu(obj->resv);
629         }
630
631         if (excl) {
632                 fence_set_priority(excl, attr);
633                 dma_fence_put(excl);
634         }
635         return 0;
636 }
637
638 /**
639  * Waits for rendering to the object to be completed
640  * @obj: i915 gem object
641  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
642  * @timeout: how long to wait
643  * @rps_client: client (user process) to charge for any waitboosting
644  */
645 int
646 i915_gem_object_wait(struct drm_i915_gem_object *obj,
647                      unsigned int flags,
648                      long timeout,
649                      struct intel_rps_client *rps_client)
650 {
651         might_sleep();
652 #if IS_ENABLED(CONFIG_LOCKDEP)
653         GEM_BUG_ON(debug_locks &&
654                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
655                    !!(flags & I915_WAIT_LOCKED));
656 #endif
657         GEM_BUG_ON(timeout < 0);
658
659         timeout = i915_gem_object_wait_reservation(obj->resv,
660                                                    flags, timeout,
661                                                    rps_client);
662         return timeout < 0 ? timeout : 0;
663 }
664
665 static struct intel_rps_client *to_rps_client(struct drm_file *file)
666 {
667         struct drm_i915_file_private *fpriv = file->driver_priv;
668
669         return &fpriv->rps_client;
670 }
671
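/* Write userspace data straight into the contiguous backing storage of a
 * phys object, then clflush the range and flush the chipset.
 */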
672 static int
673 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
674                      struct drm_i915_gem_pwrite *args,
675                      struct drm_file *file)
676 {
677         void *vaddr = obj->phys_handle->vaddr + args->offset;
678         char __user *user_data = u64_to_user_ptr(args->data_ptr);
679
680         /* We manually control the domain here and pretend that it
681          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
682          */
683         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
684         if (copy_from_user(vaddr, user_data, args->size))
685                 return -EFAULT;
686
687         drm_clflush_virt_range(vaddr, args->size);
688         i915_gem_chipset_flush(to_i915(obj->base.dev));
689
690         intel_fb_obj_flush(obj, ORIGIN_CPU);
691         return 0;
692 }
693
694 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
695 {
696         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
697 }
698
699 void i915_gem_object_free(struct drm_i915_gem_object *obj)
700 {
701         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
702         kmem_cache_free(dev_priv->objects, obj);
703 }
704
705 static int
706 i915_gem_create(struct drm_file *file,
707                 struct drm_i915_private *dev_priv,
708                 uint64_t size,
709                 uint32_t *handle_p)
710 {
711         struct drm_i915_gem_object *obj;
712         int ret;
713         u32 handle;
714
715         size = roundup(size, PAGE_SIZE);
716         if (size == 0)
717                 return -EINVAL;
718
719         /* Allocate the new object */
720         obj = i915_gem_object_create(dev_priv, size);
721         if (IS_ERR(obj))
722                 return PTR_ERR(obj);
723
724         ret = drm_gem_handle_create(file, &obj->base, &handle);
725         /* drop reference from allocate - handle holds it now */
726         i915_gem_object_put(obj);
727         if (ret)
728                 return ret;
729
730         *handle_p = handle;
731         return 0;
732 }
733
734 int
735 i915_gem_dumb_create(struct drm_file *file,
736                      struct drm_device *dev,
737                      struct drm_mode_create_dumb *args)
738 {
739         /* have to work out size/pitch and return them */
740         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
741         args->size = args->pitch * args->height;
742         return i915_gem_create(file, to_i915(dev),
743                                args->size, &args->handle);
744 }
745
746 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
747 {
748         return !(obj->cache_level == I915_CACHE_NONE ||
749                  obj->cache_level == I915_CACHE_WT);
750 }
751
752 /**
753  * Creates a new mm object and returns a handle to it.
754  * @dev: drm device pointer
755  * @data: ioctl data blob
756  * @file: drm file pointer
757  */
758 int
759 i915_gem_create_ioctl(struct drm_device *dev, void *data,
760                       struct drm_file *file)
761 {
762         struct drm_i915_private *dev_priv = to_i915(dev);
763         struct drm_i915_gem_create *args = data;
764
765         i915_gem_flush_free_objects(dev_priv);
766
767         return i915_gem_create(file, dev_priv,
768                                args->size, &args->handle);
769 }
770
771 static inline enum fb_op_origin
772 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
773 {
774         return (domain == I915_GEM_DOMAIN_GTT ?
775                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
776 }
777
778 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
779 {
780         /*
781          * No actual flushing is required for the GTT write domain for reads
782          * from the GTT domain. Writes to it "immediately" go to main memory
783          * as far as we know, so there's no chipset flush. It also doesn't
784          * land in the GPU render cache.
785          *
786          * However, we do have to enforce the order so that all writes through
787          * the GTT land before any writes to the device, such as updates to
788          * the GATT itself.
789          *
790          * We also have to wait a bit for the writes to land from the GTT.
791          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
792          * timing. This issue has only been observed when switching quickly
793          * between GTT writes and CPU reads from inside the kernel on recent hw,
794          * and it appears to only affect discrete GTT blocks (i.e. on LLC
795          * system agents we could not reproduce this behaviour; that
796          * changed with Cannonlake).
797          */
798
799         wmb();
800
801         intel_runtime_pm_get(dev_priv);
802         spin_lock_irq(&dev_priv->uncore.lock);
803
804         POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
805
806         spin_unlock_irq(&dev_priv->uncore.lock);
807         intel_runtime_pm_put(dev_priv);
808 }
809
810 static void
811 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
812 {
813         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
814         struct i915_vma *vma;
815
816         if (!(obj->write_domain & flush_domains))
817                 return;
818
819         switch (obj->write_domain) {
820         case I915_GEM_DOMAIN_GTT:
821                 i915_gem_flush_ggtt_writes(dev_priv);
822
823                 intel_fb_obj_flush(obj,
824                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
825
826                 for_each_ggtt_vma(vma, obj) {
827                         if (vma->iomap)
828                                 continue;
829
830                         i915_vma_unset_ggtt_write(vma);
831                 }
832                 break;
833
834         case I915_GEM_DOMAIN_CPU:
835                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
836                 break;
837
838         case I915_GEM_DOMAIN_RENDER:
839                 if (gpu_write_needs_clflush(obj))
840                         obj->cache_dirty = true;
841                 break;
842         }
843
844         obj->write_domain = 0;
845 }
846
847 static inline int
848 __copy_to_user_swizzled(char __user *cpu_vaddr,
849                         const char *gpu_vaddr, int gpu_offset,
850                         int length)
851 {
852         int ret, cpu_offset = 0;
853
854         while (length > 0) {
855                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
856                 int this_length = min(cacheline_end - gpu_offset, length);
857                 int swizzled_gpu_offset = gpu_offset ^ 64;
858
859                 ret = __copy_to_user(cpu_vaddr + cpu_offset,
860                                      gpu_vaddr + swizzled_gpu_offset,
861                                      this_length);
862                 if (ret)
863                         return ret + length;
864
865                 cpu_offset += this_length;
866                 gpu_offset += this_length;
867                 length -= this_length;
868         }
869
870         return 0;
871 }
872
873 static inline int
874 __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
875                           const char __user *cpu_vaddr,
876                           int length)
877 {
878         int ret, cpu_offset = 0;
879
880         while (length > 0) {
881                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
882                 int this_length = min(cacheline_end - gpu_offset, length);
883                 int swizzled_gpu_offset = gpu_offset ^ 64;
884
885                 ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
886                                        cpu_vaddr + cpu_offset,
887                                        this_length);
888                 if (ret)
889                         return ret + length;
890
891                 cpu_offset += this_length;
892                 gpu_offset += this_length;
893                 length -= this_length;
894         }
895
896         return 0;
897 }
898
899 /*
900  * Pins the specified object's pages and synchronizes the object with
901  * GPU accesses. Sets needs_clflush to non-zero if the caller should
902  * flush the object from the CPU cache.
903  */
904 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
905                                     unsigned int *needs_clflush)
906 {
907         int ret;
908
909         lockdep_assert_held(&obj->base.dev->struct_mutex);
910
911         *needs_clflush = 0;
912         if (!i915_gem_object_has_struct_page(obj))
913                 return -ENODEV;
914
915         ret = i915_gem_object_wait(obj,
916                                    I915_WAIT_INTERRUPTIBLE |
917                                    I915_WAIT_LOCKED,
918                                    MAX_SCHEDULE_TIMEOUT,
919                                    NULL);
920         if (ret)
921                 return ret;
922
923         ret = i915_gem_object_pin_pages(obj);
924         if (ret)
925                 return ret;
926
927         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
928             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
929                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
930                 if (ret)
931                         goto err_unpin;
932                 else
933                         goto out;
934         }
935
936         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
937
938         /* If we're not in the cpu read domain, set ourself into the gtt
939          * read domain and manually flush cachelines (if required). This
940          * optimizes for the case when the gpu will dirty the data
941          * anyway again before the next pread happens.
942          */
943         if (!obj->cache_dirty &&
944             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
945                 *needs_clflush = CLFLUSH_BEFORE;
946
947 out:
948         /* return with the pages pinned */
949         return 0;
950
951 err_unpin:
952         i915_gem_object_unpin_pages(obj);
953         return ret;
954 }
955
956 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
957                                      unsigned int *needs_clflush)
958 {
959         int ret;
960
961         lockdep_assert_held(&obj->base.dev->struct_mutex);
962
963         *needs_clflush = 0;
964         if (!i915_gem_object_has_struct_page(obj))
965                 return -ENODEV;
966
967         ret = i915_gem_object_wait(obj,
968                                    I915_WAIT_INTERRUPTIBLE |
969                                    I915_WAIT_LOCKED |
970                                    I915_WAIT_ALL,
971                                    MAX_SCHEDULE_TIMEOUT,
972                                    NULL);
973         if (ret)
974                 return ret;
975
976         ret = i915_gem_object_pin_pages(obj);
977         if (ret)
978                 return ret;
979
980         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
981             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
982                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
983                 if (ret)
984                         goto err_unpin;
985                 else
986                         goto out;
987         }
988
989         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
990
991         /* If we're not in the cpu write domain, set ourself into the
992          * gtt write domain and manually flush cachelines (as required).
993          * This optimizes for the case when the gpu will use the data
994          * right away and we therefore have to clflush anyway.
995          */
996         if (!obj->cache_dirty) {
997                 *needs_clflush |= CLFLUSH_AFTER;
998
999                 /*
1000                  * Same trick applies to invalidate partially written
1001                  * cachelines read before writing.
1002                  */
1003                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
1004                         *needs_clflush |= CLFLUSH_BEFORE;
1005         }
1006
1007 out:
1008         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1009         obj->mm.dirty = true;
1010         /* return with the pages pinned */
1011         return 0;
1012
1013 err_unpin:
1014         i915_gem_object_unpin_pages(obj);
1015         return ret;
1016 }
1017
1018 static void
1019 shmem_clflush_swizzled_range(char *addr, unsigned long length,
1020                              bool swizzled)
1021 {
1022         if (unlikely(swizzled)) {
1023                 unsigned long start = (unsigned long) addr;
1024                 unsigned long end = (unsigned long) addr + length;
1025
1026                 /* For swizzling simply ensure that we always flush both
1027                  * channels. Lame, but simple and it works. Swizzled
1028                  * pwrite/pread is far from a hotpath - current userspace
1029                  * doesn't use it at all. */
1030                 start = round_down(start, 128);
1031                 end = round_up(end, 128);
1032
1033                 drm_clflush_virt_range((void *)start, end - start);
1034         } else {
1035                 drm_clflush_virt_range(addr, length);
1036         }
1037
1038 }
1039
1040 /* The only difference from the fast-path function is that this one can handle
1041  * bit17 swizzling and uses non-atomic copy and kmap functions. */
1042 static int
1043 shmem_pread_slow(struct page *page, int offset, int length,
1044                  char __user *user_data,
1045                  bool page_do_bit17_swizzling, bool needs_clflush)
1046 {
1047         char *vaddr;
1048         int ret;
1049
1050         vaddr = kmap(page);
1051         if (needs_clflush)
1052                 shmem_clflush_swizzled_range(vaddr + offset, length,
1053                                              page_do_bit17_swizzling);
1054
1055         if (page_do_bit17_swizzling)
1056                 ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
1057         else
1058                 ret = __copy_to_user(user_data, vaddr + offset, length);
1059         kunmap(page);
1060
1061         return ret ? - EFAULT : 0;
1062 }
1063
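/* Per-page pread fastpath: atomic kmap plus inatomic copy, falling back to
 * the slow, swizzle-aware path above on failure or when bit17 swizzling is
 * required.
 */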
1064 static int
1065 shmem_pread(struct page *page, int offset, int length, char __user *user_data,
1066             bool page_do_bit17_swizzling, bool needs_clflush)
1067 {
1068         int ret;
1069
1070         ret = -ENODEV;
1071         if (!page_do_bit17_swizzling) {
1072                 char *vaddr = kmap_atomic(page);
1073
1074                 if (needs_clflush)
1075                         drm_clflush_virt_range(vaddr + offset, length);
1076                 ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
1077                 kunmap_atomic(vaddr);
1078         }
1079         if (ret == 0)
1080                 return 0;
1081
1082         return shmem_pread_slow(page, offset, length, user_data,
1083                                 page_do_bit17_swizzling, needs_clflush);
1084 }
1085
1086 static int
1087 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1088                      struct drm_i915_gem_pread *args)
1089 {
1090         char __user *user_data;
1091         u64 remain;
1092         unsigned int obj_do_bit17_swizzling;
1093         unsigned int needs_clflush;
1094         unsigned int idx, offset;
1095         int ret;
1096
1097         obj_do_bit17_swizzling = 0;
1098         if (i915_gem_object_needs_bit17_swizzle(obj))
1099                 obj_do_bit17_swizzling = BIT(17);
1100
1101         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1102         if (ret)
1103                 return ret;
1104
1105         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1106         mutex_unlock(&obj->base.dev->struct_mutex);
1107         if (ret)
1108                 return ret;
1109
1110         remain = args->size;
1111         user_data = u64_to_user_ptr(args->data_ptr);
1112         offset = offset_in_page(args->offset);
1113         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1114                 struct page *page = i915_gem_object_get_page(obj, idx);
1115                 int length;
1116
1117                 length = remain;
1118                 if (offset + length > PAGE_SIZE)
1119                         length = PAGE_SIZE - offset;
1120
1121                 ret = shmem_pread(page, offset, length, user_data,
1122                                   page_to_phys(page) & obj_do_bit17_swizzling,
1123                                   needs_clflush);
1124                 if (ret)
1125                         break;
1126
1127                 remain -= length;
1128                 user_data += length;
1129                 offset = 0;
1130         }
1131
1132         i915_gem_obj_finish_shmem_access(obj);
1133         return ret;
1134 }
1135
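/* Copy from the GGTT aperture to userspace via an atomic WC mapping,
 * retrying with a regular (faultable) mapping if the atomic copy fails.
 */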
1136 static inline bool
1137 gtt_user_read(struct io_mapping *mapping,
1138               loff_t base, int offset,
1139               char __user *user_data, int length)
1140 {
1141         void __iomem *vaddr;
1142         unsigned long unwritten;
1143
1144         /* We can use the cpu mem copy function because this is X86. */
1145         vaddr = io_mapping_map_atomic_wc(mapping, base);
1146         unwritten = __copy_to_user_inatomic(user_data,
1147                                             (void __force *)vaddr + offset,
1148                                             length);
1149         io_mapping_unmap_atomic(vaddr);
1150         if (unwritten) {
1151                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1152                 unwritten = copy_to_user(user_data,
1153                                          (void __force *)vaddr + offset,
1154                                          length);
1155                 io_mapping_unmap(vaddr);
1156         }
1157         return unwritten;
1158 }
1159
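/* Fallback pread path via the GGTT aperture: pin the object into the
 * mappable aperture, or remap it page by page through a temporary GTT node,
 * and copy to userspace through the WC mapping.
 */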
1160 static int
1161 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1162                    const struct drm_i915_gem_pread *args)
1163 {
1164         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1165         struct i915_ggtt *ggtt = &i915->ggtt;
1166         struct drm_mm_node node;
1167         struct i915_vma *vma;
1168         void __user *user_data;
1169         u64 remain, offset;
1170         int ret;
1171
1172         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1173         if (ret)
1174                 return ret;
1175
1176         intel_runtime_pm_get(i915);
1177         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1178                                        PIN_MAPPABLE |
1179                                        PIN_NONFAULT |
1180                                        PIN_NONBLOCK);
1181         if (!IS_ERR(vma)) {
1182                 node.start = i915_ggtt_offset(vma);
1183                 node.allocated = false;
1184                 ret = i915_vma_put_fence(vma);
1185                 if (ret) {
1186                         i915_vma_unpin(vma);
1187                         vma = ERR_PTR(ret);
1188                 }
1189         }
1190         if (IS_ERR(vma)) {
1191                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1192                 if (ret)
1193                         goto out_unlock;
1194                 GEM_BUG_ON(!node.allocated);
1195         }
1196
1197         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1198         if (ret)
1199                 goto out_unpin;
1200
1201         mutex_unlock(&i915->drm.struct_mutex);
1202
1203         user_data = u64_to_user_ptr(args->data_ptr);
1204         remain = args->size;
1205         offset = args->offset;
1206
1207         while (remain > 0) {
1208                 /* Operation in this page
1209                  *
1210                  * page_base = page offset within aperture
1211                  * page_offset = offset within page
1212                  * page_length = bytes to copy for this page
1213                  */
1214                 u32 page_base = node.start;
1215                 unsigned page_offset = offset_in_page(offset);
1216                 unsigned page_length = PAGE_SIZE - page_offset;
1217                 page_length = remain < page_length ? remain : page_length;
1218                 if (node.allocated) {
1219                         wmb();
1220                         ggtt->base.insert_page(&ggtt->base,
1221                                                i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1222                                                node.start, I915_CACHE_NONE, 0);
1223                         wmb();
1224                 } else {
1225                         page_base += offset & PAGE_MASK;
1226                 }
1227
1228                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1229                                   user_data, page_length)) {
1230                         ret = -EFAULT;
1231                         break;
1232                 }
1233
1234                 remain -= page_length;
1235                 user_data += page_length;
1236                 offset += page_length;
1237         }
1238
1239         mutex_lock(&i915->drm.struct_mutex);
1240 out_unpin:
1241         if (node.allocated) {
1242                 wmb();
1243                 ggtt->base.clear_range(&ggtt->base,
1244                                        node.start, node.size);
1245                 remove_mappable_node(&node);
1246         } else {
1247                 i915_vma_unpin(vma);
1248         }
1249 out_unlock:
1250         intel_runtime_pm_put(i915);
1251         mutex_unlock(&i915->drm.struct_mutex);
1252
1253         return ret;
1254 }
1255
1256 /**
1257  * Reads data from the object referenced by handle.
1258  * @dev: drm device pointer
1259  * @data: ioctl data blob
1260  * @file: drm file pointer
1261  *
1262  * On error, the contents of *data are undefined.
1263  */
1264 int
1265 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1266                      struct drm_file *file)
1267 {
1268         struct drm_i915_gem_pread *args = data;
1269         struct drm_i915_gem_object *obj;
1270         int ret;
1271
1272         if (args->size == 0)
1273                 return 0;
1274
1275         if (!access_ok(VERIFY_WRITE,
1276                        u64_to_user_ptr(args->data_ptr),
1277                        args->size))
1278                 return -EFAULT;
1279
1280         obj = i915_gem_object_lookup(file, args->handle);
1281         if (!obj)
1282                 return -ENOENT;
1283
1284         /* Bounds check source.  */
1285         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1286                 ret = -EINVAL;
1287                 goto out;
1288         }
1289
1290         trace_i915_gem_object_pread(obj, args->offset, args->size);
1291
1292         ret = i915_gem_object_wait(obj,
1293                                    I915_WAIT_INTERRUPTIBLE,
1294                                    MAX_SCHEDULE_TIMEOUT,
1295                                    to_rps_client(file));
1296         if (ret)
1297                 goto out;
1298
1299         ret = i915_gem_object_pin_pages(obj);
1300         if (ret)
1301                 goto out;
1302
1303         ret = i915_gem_shmem_pread(obj, args);
1304         if (ret == -EFAULT || ret == -ENODEV)
1305                 ret = i915_gem_gtt_pread(obj, args);
1306
1307         i915_gem_object_unpin_pages(obj);
1308 out:
1309         i915_gem_object_put(obj);
1310         return ret;
1311 }
1312
1313 /* This is the fast write path which cannot handle
1314  * page faults in the source data
1315  */
1316
1317 static inline bool
1318 ggtt_write(struct io_mapping *mapping,
1319            loff_t base, int offset,
1320            char __user *user_data, int length)
1321 {
1322         void __iomem *vaddr;
1323         unsigned long unwritten;
1324
1325         /* We can use the cpu mem copy function because this is X86. */
1326         vaddr = io_mapping_map_atomic_wc(mapping, base);
1327         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1328                                                       user_data, length);
1329         io_mapping_unmap_atomic(vaddr);
1330         if (unwritten) {
1331                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1332                 unwritten = copy_from_user((void __force *)vaddr + offset,
1333                                            user_data, length);
1334                 io_mapping_unmap(vaddr);
1335         }
1336
1337         return unwritten;
1338 }
1339
1340 /**
1341  * This is the fast pwrite path, where we copy the data directly from the
1342  * user into the GTT, uncached.
1343  * @obj: i915 GEM object
1344  * @args: pwrite arguments structure
1345  */
1346 static int
1347 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1348                          const struct drm_i915_gem_pwrite *args)
1349 {
1350         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1351         struct i915_ggtt *ggtt = &i915->ggtt;
1352         struct drm_mm_node node;
1353         struct i915_vma *vma;
1354         u64 remain, offset;
1355         void __user *user_data;
1356         int ret;
1357
1358         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1359         if (ret)
1360                 return ret;
1361
1362         if (i915_gem_object_has_struct_page(obj)) {
1363                 /*
1364                  * Avoid waking the device up if we can fallback, as
1365                  * waking/resuming is very slow (worst-case 10-100 ms
1366                  * depending on PCI sleeps and our own resume time).
1367                  * This easily dwarfs any performance advantage from
1368                  * using the cache bypass of indirect GGTT access.
1369                  */
1370                 if (!intel_runtime_pm_get_if_in_use(i915)) {
1371                         ret = -EFAULT;
1372                         goto out_unlock;
1373                 }
1374         } else {
1375                 /* No backing pages, no fallback, we must force GGTT access */
1376                 intel_runtime_pm_get(i915);
1377         }
1378
1379         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1380                                        PIN_MAPPABLE |
1381                                        PIN_NONFAULT |
1382                                        PIN_NONBLOCK);
1383         if (!IS_ERR(vma)) {
1384                 node.start = i915_ggtt_offset(vma);
1385                 node.allocated = false;
1386                 ret = i915_vma_put_fence(vma);
1387                 if (ret) {
1388                         i915_vma_unpin(vma);
1389                         vma = ERR_PTR(ret);
1390                 }
1391         }
1392         if (IS_ERR(vma)) {
1393                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1394                 if (ret)
1395                         goto out_rpm;
1396                 GEM_BUG_ON(!node.allocated);
1397         }
1398
1399         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1400         if (ret)
1401                 goto out_unpin;
1402
1403         mutex_unlock(&i915->drm.struct_mutex);
1404
1405         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1406
1407         user_data = u64_to_user_ptr(args->data_ptr);
1408         offset = args->offset;
1409         remain = args->size;
1410         while (remain) {
1411                 /* Operation in this page
1412                  *
1413                  * page_base = page offset within aperture
1414                  * page_offset = offset within page
1415                  * page_length = bytes to copy for this page
1416                  */
1417                 u32 page_base = node.start;
1418                 unsigned int page_offset = offset_in_page(offset);
1419                 unsigned int page_length = PAGE_SIZE - page_offset;
1420                 page_length = remain < page_length ? remain : page_length;
1421                 if (node.allocated) {
1422                         wmb(); /* flush the write before we modify the GGTT */
1423                         ggtt->base.insert_page(&ggtt->base,
1424                                                i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1425                                                node.start, I915_CACHE_NONE, 0);
1426                         wmb(); /* flush modifications to the GGTT (insert_page) */
1427                 } else {
1428                         page_base += offset & PAGE_MASK;
1429                 }
1430                 /* If we get a fault while copying data, then (presumably) our
1431                  * source page isn't available.  Return the error and we'll
1432                  * retry in the slow path.
1433                  * If the object is non-shmem backed, we retry with the
1434                  * path that handles page faults.
1435                  */
1436                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1437                                user_data, page_length)) {
1438                         ret = -EFAULT;
1439                         break;
1440                 }
1441
1442                 remain -= page_length;
1443                 user_data += page_length;
1444                 offset += page_length;
1445         }
1446         intel_fb_obj_flush(obj, ORIGIN_CPU);
1447
1448         mutex_lock(&i915->drm.struct_mutex);
1449 out_unpin:
1450         if (node.allocated) {
1451                 wmb();
1452                 ggtt->base.clear_range(&ggtt->base,
1453                                        node.start, node.size);
1454                 remove_mappable_node(&node);
1455         } else {
1456                 i915_vma_unpin(vma);
1457         }
1458 out_rpm:
1459         intel_runtime_pm_put(i915);
1460 out_unlock:
1461         mutex_unlock(&i915->drm.struct_mutex);
1462         return ret;
1463 }
1464
1465 static int
1466 shmem_pwrite_slow(struct page *page, int offset, int length,
1467                   char __user *user_data,
1468                   bool page_do_bit17_swizzling,
1469                   bool needs_clflush_before,
1470                   bool needs_clflush_after)
1471 {
1472         char *vaddr;
1473         int ret;
1474
1475         vaddr = kmap(page);
1476         if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
1477                 shmem_clflush_swizzled_range(vaddr + offset, length,
1478                                              page_do_bit17_swizzling);
1479         if (page_do_bit17_swizzling)
1480                 ret = __copy_from_user_swizzled(vaddr, offset, user_data,
1481                                                 length);
1482         else
1483                 ret = __copy_from_user(vaddr + offset, user_data, length);
1484         if (needs_clflush_after)
1485                 shmem_clflush_swizzled_range(vaddr + offset, length,
1486                                              page_do_bit17_swizzling);
1487         kunmap(page);
1488
1489         return ret ? -EFAULT : 0;
1490 }
1491
1492 /* Per-page copy function for the shmem pwrite fastpath.
1493  * Flushes invalid cachelines before writing to the target if
1494  * needs_clflush_before is set and flushes out any written cachelines after
1495  * writing if needs_clflush_after is set.
1496  */
1497 static int
1498 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1499              bool page_do_bit17_swizzling,
1500              bool needs_clflush_before,
1501              bool needs_clflush_after)
1502 {
1503         int ret;
1504
1505         ret = -ENODEV;
1506         if (!page_do_bit17_swizzling) {
1507                 char *vaddr = kmap_atomic(page);
1508
1509                 if (needs_clflush_before)
1510                         drm_clflush_virt_range(vaddr + offset, len);
1511                 ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
1512                 if (needs_clflush_after)
1513                         drm_clflush_virt_range(vaddr + offset, len);
1514
1515                 kunmap_atomic(vaddr);
1516         }
1517         if (ret == 0)
1518                 return ret;
1519
1520         return shmem_pwrite_slow(page, offset, len, user_data,
1521                                  page_do_bit17_swizzling,
1522                                  needs_clflush_before,
1523                                  needs_clflush_after);
1524 }
1525
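/* Write the user buffer into the object via its shmemfs backing pages,
 * one page at a time, using the clflush hints computed by
 * i915_gem_obj_prepare_shmem_write() to keep the CPU caches coherent.
 */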
1526 static int
1527 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1528                       const struct drm_i915_gem_pwrite *args)
1529 {
1530         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1531         void __user *user_data;
1532         u64 remain;
1533         unsigned int obj_do_bit17_swizzling;
1534         unsigned int partial_cacheline_write;
1535         unsigned int needs_clflush;
1536         unsigned int offset, idx;
1537         int ret;
1538
1539         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1540         if (ret)
1541                 return ret;
1542
1543         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1544         mutex_unlock(&i915->drm.struct_mutex);
1545         if (ret)
1546                 return ret;
1547
1548         obj_do_bit17_swizzling = 0;
1549         if (i915_gem_object_needs_bit17_swizzle(obj))
1550                 obj_do_bit17_swizzling = BIT(17);
1551
1552         /* If we don't overwrite a cacheline completely we need to be
1553          * careful to have up-to-date data by first clflushing. Don't
1554          * overcomplicate things and flush the entire patch.
1555          */
1556         partial_cacheline_write = 0;
1557         if (needs_clflush & CLFLUSH_BEFORE)
1558                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1559
1560         user_data = u64_to_user_ptr(args->data_ptr);
1561         remain = args->size;
1562         offset = offset_in_page(args->offset);
1563         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1564                 struct page *page = i915_gem_object_get_page(obj, idx);
1565                 int length;
1566
1567                 length = remain;
1568                 if (offset + length > PAGE_SIZE)
1569                         length = PAGE_SIZE - offset;
1570
1571                 ret = shmem_pwrite(page, offset, length, user_data,
1572                                    page_to_phys(page) & obj_do_bit17_swizzling,
1573                                    (offset | length) & partial_cacheline_write,
1574                                    needs_clflush & CLFLUSH_AFTER);
1575                 if (ret)
1576                         break;
1577
1578                 remain -= length;
1579                 user_data += length;
1580                 offset = 0;
1581         }
1582
1583         intel_fb_obj_flush(obj, ORIGIN_CPU);
1584         i915_gem_obj_finish_shmem_access(obj);
1585         return ret;
1586 }
1587
1588 /**
1589  * i915_gem_pwrite_ioctl - Writes data to the object referenced by handle.
1590  * @dev: drm device
1591  * @data: ioctl data blob
1592  * @file: drm file
1593  *
1594  * On error, the contents of the buffer that were to be modified are undefined.
1595  */
1596 int
1597 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1598                       struct drm_file *file)
1599 {
1600         struct drm_i915_gem_pwrite *args = data;
1601         struct drm_i915_gem_object *obj;
1602         int ret;
1603
1604         if (args->size == 0)
1605                 return 0;
1606
1607         if (!access_ok(VERIFY_READ,
1608                        u64_to_user_ptr(args->data_ptr),
1609                        args->size))
1610                 return -EFAULT;
1611
1612         obj = i915_gem_object_lookup(file, args->handle);
1613         if (!obj)
1614                 return -ENOENT;
1615
1616         /* Bounds check destination. */
1617         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1618                 ret = -EINVAL;
1619                 goto err;
1620         }
1621
1622         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1623
1624         ret = -ENODEV;
1625         if (obj->ops->pwrite)
1626                 ret = obj->ops->pwrite(obj, args);
1627         if (ret != -ENODEV)
1628                 goto err;
1629
1630         ret = i915_gem_object_wait(obj,
1631                                    I915_WAIT_INTERRUPTIBLE |
1632                                    I915_WAIT_ALL,
1633                                    MAX_SCHEDULE_TIMEOUT,
1634                                    to_rps_client(file));
1635         if (ret)
1636                 goto err;
1637
1638         ret = i915_gem_object_pin_pages(obj);
1639         if (ret)
1640                 goto err;
1641
1642         ret = -EFAULT;
1643         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1644          * it would end up going through the fenced access, and we'll get
1645          * different detiling behavior between reading and writing.
1646          * pread/pwrite currently are reading and writing from the CPU
1647          * perspective, requiring manual detiling by the client.
1648          */
1649         if (!i915_gem_object_has_struct_page(obj) ||
1650             cpu_write_needs_clflush(obj))
1651                 /* Note that the gtt paths might fail with non-page-backed user
1652                  * pointers (e.g. gtt mappings when moving data between
1653                  * textures). Fall back to the shmem path in that case.
1654                  */
1655                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1656
1657         if (ret == -EFAULT || ret == -ENOSPC) {
1658                 if (obj->phys_handle)
1659                         ret = i915_gem_phys_pwrite(obj, args, file);
1660                 else
1661                         ret = i915_gem_shmem_pwrite(obj, args);
1662         }
1663
1664         i915_gem_object_unpin_pages(obj);
1665 err:
1666         i915_gem_object_put(obj);
1667         return ret;
1668 }
1669
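/* Bump the object in the eviction LRUs following CPU access: move its
 * idle GGTT vmas to the tail of their inactive lists and resort the
 * object onto the bound/unbound list, marking it as recently used.
 */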
1670 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1671 {
1672         struct drm_i915_private *i915;
1673         struct list_head *list;
1674         struct i915_vma *vma;
1675
1676         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1677
1678         for_each_ggtt_vma(vma, obj) {
1679                 if (i915_vma_is_active(vma))
1680                         continue;
1681
1682                 if (!drm_mm_node_allocated(&vma->node))
1683                         continue;
1684
1685                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1686         }
1687
1688         i915 = to_i915(obj->base.dev);
1689         spin_lock(&i915->mm.obj_lock);
1690         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1691         list_move_tail(&obj->mm.link, list);
1692         spin_unlock(&i915->mm.obj_lock);
1693 }
1694
1695 /**
1696  * i915_gem_set_domain_ioctl - Called when user space prepares to use an object
1697  * with the CPU, either through the mmap ioctl's mapping or a GTT mapping.
1698  * @dev: drm device
1699  * @data: ioctl data blob
1700  * @file: drm file
1701  */
1702 int
1703 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1704                           struct drm_file *file)
1705 {
1706         struct drm_i915_gem_set_domain *args = data;
1707         struct drm_i915_gem_object *obj;
1708         uint32_t read_domains = args->read_domains;
1709         uint32_t write_domain = args->write_domain;
1710         int err;
1711
1712         /* Only handle setting domains to types used by the CPU. */
1713         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1714                 return -EINVAL;
1715
1716         /* Having something in the write domain implies it's in the read
1717          * domain, and only that read domain.  Enforce that in the request.
1718          */
1719         if (write_domain != 0 && read_domains != write_domain)
1720                 return -EINVAL;
1721
1722         obj = i915_gem_object_lookup(file, args->handle);
1723         if (!obj)
1724                 return -ENOENT;
1725
1726         /* Try to flush the object off the GPU without holding the lock.
1727          * We will repeat the flush holding the lock in the normal manner
1728          * to catch cases where we are gazumped.
1729          */
1730         err = i915_gem_object_wait(obj,
1731                                    I915_WAIT_INTERRUPTIBLE |
1732                                    (write_domain ? I915_WAIT_ALL : 0),
1733                                    MAX_SCHEDULE_TIMEOUT,
1734                                    to_rps_client(file));
1735         if (err)
1736                 goto out;
1737
1738         /*
1739          * Proxy objects do not control access to the backing storage, ergo
1740          * they cannot be used as a means to manipulate the cache domain
1741          * tracking for that backing storage. The proxy object is always
1742          * considered to be outside of any cache domain.
1743          */
1744         if (i915_gem_object_is_proxy(obj)) {
1745                 err = -ENXIO;
1746                 goto out;
1747         }
1748
1749         /*
1750          * Flush and acquire obj->pages so that we are coherent through
1751          * direct access in memory with previous cached writes through
1752          * shmemfs and that our cache domain tracking remains valid.
1753          * For example, if the obj->filp was moved to swap without us
1754          * being notified and releasing the pages, we would mistakenly
1755          * continue to assume that the obj remained out of the CPU cached
1756          * domain.
1757          */
1758         err = i915_gem_object_pin_pages(obj);
1759         if (err)
1760                 goto out;
1761
1762         err = i915_mutex_lock_interruptible(dev);
1763         if (err)
1764                 goto out_unpin;
1765
1766         if (read_domains & I915_GEM_DOMAIN_WC)
1767                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1768         else if (read_domains & I915_GEM_DOMAIN_GTT)
1769                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1770         else
1771                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1772
1773         /* And bump the LRU for this access */
1774         i915_gem_object_bump_inactive_ggtt(obj);
1775
1776         mutex_unlock(&dev->struct_mutex);
1777
1778         if (write_domain != 0)
1779                 intel_fb_obj_invalidate(obj,
1780                                         fb_write_origin(obj, write_domain));
1781
1782 out_unpin:
1783         i915_gem_object_unpin_pages(obj);
1784 out:
1785         i915_gem_object_put(obj);
1786         return err;
1787 }
1788
1789 /**
1790  * i915_gem_sw_finish_ioctl - Called when user space has done writes to this buffer
1791  * @dev: drm device
1792  * @data: ioctl data blob
1793  * @file: drm file
1794  */
1795 int
1796 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1797                          struct drm_file *file)
1798 {
1799         struct drm_i915_gem_sw_finish *args = data;
1800         struct drm_i915_gem_object *obj;
1801
1802         obj = i915_gem_object_lookup(file, args->handle);
1803         if (!obj)
1804                 return -ENOENT;
1805
1806         /*
1807          * Proxy objects are barred from CPU access, so there is no
1808          * need to ban sw_finish as it is a nop.
1809          */
1810
1811         /* Pinned buffers may be scanout, so flush the cache */
1812         i915_gem_object_flush_if_display(obj);
1813         i915_gem_object_put(obj);
1814
1815         return 0;
1816 }
1817
1818 /**
1819  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1820  *                       it is mapped to.
1821  * @dev: drm device
1822  * @data: ioctl data blob
1823  * @file: drm file
1824  *
1825  * While the mapping holds a reference on the contents of the object, it doesn't
1826  * imply a ref on the object itself.
1827  *
1828  * IMPORTANT:
1829  *
1830  * DRM driver writers who look at this function as an example for how to do GEM
1831  * mmap support, please don't implement mmap support like this. The modern way
1832  * to implement DRM mmap support is with an mmap offset ioctl (like
1833  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1834  * That way debug tooling like valgrind will understand what's going on; hiding
1835  * the mmap call in a driver-private ioctl will break that. The i915 driver only
1836  * does cpu mmaps this way because we didn't know better.
1837  */
1838 int
1839 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1840                     struct drm_file *file)
1841 {
1842         struct drm_i915_gem_mmap *args = data;
1843         struct drm_i915_gem_object *obj;
1844         unsigned long addr;
1845
1846         if (args->flags & ~(I915_MMAP_WC))
1847                 return -EINVAL;
1848
1849         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1850                 return -ENODEV;
1851
1852         obj = i915_gem_object_lookup(file, args->handle);
1853         if (!obj)
1854                 return -ENOENT;
1855
1856         /* prime objects have no backing filp to GEM mmap
1857          * pages from.
1858          */
1859         if (!obj->base.filp) {
1860                 i915_gem_object_put(obj);
1861                 return -ENXIO;
1862         }
1863
1864         addr = vm_mmap(obj->base.filp, 0, args->size,
1865                        PROT_READ | PROT_WRITE, MAP_SHARED,
1866                        args->offset);
1867         if (args->flags & I915_MMAP_WC) {
1868                 struct mm_struct *mm = current->mm;
1869                 struct vm_area_struct *vma;
1870
1871                 if (down_write_killable(&mm->mmap_sem)) {
1872                         i915_gem_object_put(obj);
1873                         return -EINTR;
1874                 }
1875                 vma = find_vma(mm, addr);
1876                 if (vma)
1877                         vma->vm_page_prot =
1878                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1879                 else
1880                         addr = -ENOMEM;
1881                 up_write(&mm->mmap_sem);
1882
1883                 /* This may race, but that's ok, it only gets set */
1884                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1885         }
1886         i915_gem_object_put(obj);
1887         if (IS_ERR((void *)addr))
1888                 return addr;
1889
1890         args->addr_ptr = (uint64_t) addr;
1891
1892         return 0;
1893 }
1894
1895 static unsigned int tile_row_pages(struct drm_i915_gem_object *obj)
1896 {
1897         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1898 }
1899
1900 /**
1901  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1902  *
1903  * A history of the GTT mmap interface:
1904  *
1905  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1906  *     be aligned and suitable for fencing, and still fit into the available
1907  *     mappable space left by the pinned display objects. A classic problem
1908  *     we called the page-fault-of-doom where we would ping-pong between
1909  *     two objects that could not fit inside the GTT and so the memcpy
1910  *     would page one object in at the expense of the other between every
1911  *     single byte.
1912  *
1913  * 1 - Objects can be any size, and have any compatible fencing (X, Y, or none
1914  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1915  *     object is too large for the available space (or simply too large
1916  *     for the mappable aperture!), a view is created instead and faulted
1917  *     into userspace. (This view is aligned and sized appropriately for
1918  *     fenced access.)
1919  *
1920  * 2 - Recognise WC as a separate cache domain so that we can flush the
1921  *     delayed writes via GTT before performing direct access via WC.
1922  *
1923  * Restrictions:
1924  *
1925  *  * snoopable objects cannot be accessed via the GTT. Doing so can cause
1926  *    machine hangs on some architectures, corruption on others. An attempt to
1927  *    service a GTT page fault from a snoopable object will generate a SIGBUS.
1928  *
1929  *  * the object must be able to fit into RAM (physical memory, though not
1930  *    limited to the mappable aperture).
1931  *
1932  *
1933  * Caveats:
1934  *
1935  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1936  *    all data to system memory. Subsequent access will not be synchronized.
1937  *
1938  *  * all mappings are revoked on runtime device suspend.
1939  *
1940  *  * there are only 8, 16 or 32 fence registers to share between all users
1941  *    (older machines require a fence register for display and blitter access
1942  *    as well). Contention of the fence registers will cause the previous users
1943  *    to be unmapped and any new access will generate new page faults.
1944  *
1945  *  * running out of memory while servicing a fault may generate a SIGBUS,
1946  *    rather than the expected SIGSEGV.
1947  */
1948 int i915_gem_mmap_gtt_version(void)
1949 {
1950         return 2;
1951 }
1952
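/* Compute a partial GGTT view covering the faulting page: the view is
 * aligned to the chunk size (rounded up to whole tile rows for tiled
 * objects) and clamped to the object; if the chunk would span the whole
 * object, a normal view is used instead.
 */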
1953 static inline struct i915_ggtt_view
1954 compute_partial_view(struct drm_i915_gem_object *obj,
1955                      pgoff_t page_offset,
1956                      unsigned int chunk)
1957 {
1958         struct i915_ggtt_view view;
1959
1960         if (i915_gem_object_is_tiled(obj))
1961                 chunk = roundup(chunk, tile_row_pages(obj));
1962
1963         view.type = I915_GGTT_VIEW_PARTIAL;
1964         view.partial.offset = rounddown(page_offset, chunk);
1965         view.partial.size =
1966                 min_t(unsigned int, chunk,
1967                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1968
1969         /* If the partial covers the entire object, just create a normal VMA. */
1970         if (chunk >= obj->base.size >> PAGE_SHIFT)
1971                 view.type = I915_GGTT_VIEW_NORMAL;
1972
1973         return view;
1974 }
1975
1976 /**
1977  * i915_gem_fault - fault a page into the GTT
1978  * @vmf: fault info
1979  *
1980  * The fault handler is set up by drm_gem_mmap() when an object is GTT mapped
1981  * from userspace.  The fault handler takes care of binding the object to
1982  * the GTT (if needed), allocating and programming a fence register (again,
1983  * only if needed based on whether the old reg is still valid or the object
1984  * is tiled) and inserting a new PTE into the faulting process.
1985  *
1986  * Note that the faulting process may involve evicting existing objects
1987  * from the GTT and/or fence registers to make room.  So performance may
1988  * suffer if the GTT working set is large or there are few fence registers
1989  * left.
1990  *
1991  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1992  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1993  */
1994 int i915_gem_fault(struct vm_fault *vmf)
1995 {
1996 #define MIN_CHUNK_PAGES ((1 << 20) >> PAGE_SHIFT) /* 1 MiB */
1997         struct vm_area_struct *area = vmf->vma;
1998         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1999         struct drm_device *dev = obj->base.dev;
2000         struct drm_i915_private *dev_priv = to_i915(dev);
2001         struct i915_ggtt *ggtt = &dev_priv->ggtt;
2002         bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
2003         struct i915_vma *vma;
2004         pgoff_t page_offset;
2005         unsigned int flags;
2006         int ret;
2007
2008         /* We don't use vmf->pgoff since that has the fake offset */
2009         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
2010
2011         trace_i915_gem_object_fault(obj, page_offset, true, write);
2012
2013         /* Try to flush the object off the GPU first without holding the lock.
2014          * Upon acquiring the lock, we will perform our sanity checks and then
2015          * repeat the flush holding the lock in the normal manner to catch cases
2016          * where we are gazumped.
2017          */
2018         ret = i915_gem_object_wait(obj,
2019                                    I915_WAIT_INTERRUPTIBLE,
2020                                    MAX_SCHEDULE_TIMEOUT,
2021                                    NULL);
2022         if (ret)
2023                 goto err;
2024
2025         ret = i915_gem_object_pin_pages(obj);
2026         if (ret)
2027                 goto err;
2028
2029         intel_runtime_pm_get(dev_priv);
2030
2031         ret = i915_mutex_lock_interruptible(dev);
2032         if (ret)
2033                 goto err_rpm;
2034
2035         /* Access to snoopable pages through the GTT is incoherent. */
2036         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
2037                 ret = -EFAULT;
2038                 goto err_unlock;
2039         }
2040
2041         /* If the object is smaller than a couple of partial vmas, it is
2042          * not worth creating only a single partial vma - we may as well
2043          * clear enough space for the full object.
2044          */
2045         flags = PIN_MAPPABLE;
2046         if (obj->base.size > 2 * MIN_CHUNK_PAGES << PAGE_SHIFT)
2047                 flags |= PIN_NONBLOCK | PIN_NONFAULT;
2048
2049         /* Now pin it into the GTT as needed */
2050         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, flags);
2051         if (IS_ERR(vma)) {
2052                 /* Use a partial view if it is bigger than available space */
2053                 struct i915_ggtt_view view =
2054                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
2055
2056                 /* Userspace is now writing through an untracked VMA, abandon
2057                  * all hope that the hardware is able to track future writes.
2058                  */
2059                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
2060
2061                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, PIN_MAPPABLE);
2062         }
2063         if (IS_ERR(vma)) {
2064                 ret = PTR_ERR(vma);
2065                 goto err_unlock;
2066         }
2067
2068         ret = i915_gem_object_set_to_gtt_domain(obj, write);
2069         if (ret)
2070                 goto err_unpin;
2071
2072         ret = i915_vma_pin_fence(vma);
2073         if (ret)
2074                 goto err_unpin;
2075
2076         /* Finally, remap it using the new GTT offset */
2077         ret = remap_io_mapping(area,
2078                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
2079                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
2080                                min_t(u64, vma->size, area->vm_end - area->vm_start),
2081                                &ggtt->iomap);
2082         if (ret)
2083                 goto err_fence;
2084
2085         /* Mark as being mmapped into userspace for later revocation */
2086         assert_rpm_wakelock_held(dev_priv);
2087         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
2088                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
2089         GEM_BUG_ON(!obj->userfault_count);
2090
2091         i915_vma_set_ggtt_write(vma);
2092
2093 err_fence:
2094         i915_vma_unpin_fence(vma);
2095 err_unpin:
2096         __i915_vma_unpin(vma);
2097 err_unlock:
2098         mutex_unlock(&dev->struct_mutex);
2099 err_rpm:
2100         intel_runtime_pm_put(dev_priv);
2101         i915_gem_object_unpin_pages(obj);
2102 err:
2103         switch (ret) {
2104         case -EIO:
2105                 /*
2106                  * We eat errors when the gpu is terminally wedged to avoid
2107                  * userspace unduly crashing (gl has no provisions for mmaps to
2108                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
2109                  * and so needs to be reported.
2110                  */
2111                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
2112                         ret = VM_FAULT_SIGBUS;
2113                         break;
2114                 }
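                /* else: fall through - the GPU is wedged, eat the error */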
2115         case -EAGAIN:
2116                 /*
2117                  * EAGAIN means the gpu is hung and we'll wait for the error
2118                  * handler to reset everything when re-faulting in
2119                  * i915_mutex_lock_interruptible.
2120                  */
2121         case 0:
2122         case -ERESTARTSYS:
2123         case -EINTR:
2124         case -EBUSY:
2125                 /*
2126                  * EBUSY is ok: this just means that another thread
2127                  * already did the job.
2128                  */
2129                 ret = VM_FAULT_NOPAGE;
2130                 break;
2131         case -ENOMEM:
2132                 ret = VM_FAULT_OOM;
2133                 break;
2134         case -ENOSPC:
2135         case -EFAULT:
2136                 ret = VM_FAULT_SIGBUS;
2137                 break;
2138         default:
2139                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2140                 ret = VM_FAULT_SIGBUS;
2141                 break;
2142         }
2143         return ret;
2144 }
2145
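/* Revoke the CPU PTEs for every GGTT mmap of this object and clear the
 * userfault tracking on the object and its vmas. Callers provide the
 * required serialisation (see i915_gem_release_mmap() and
 * i915_gem_runtime_suspend()).
 */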
2146 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2147 {
2148         struct i915_vma *vma;
2149
2150         GEM_BUG_ON(!obj->userfault_count);
2151
2152         obj->userfault_count = 0;
2153         list_del(&obj->userfault_link);
2154         drm_vma_node_unmap(&obj->base.vma_node,
2155                            obj->base.dev->anon_inode->i_mapping);
2156
2157         for_each_ggtt_vma(vma, obj)
2158                 i915_vma_unset_userfault(vma);
2159 }
2160
2161 /**
2162  * i915_gem_release_mmap - remove physical page mappings
2163  * @obj: obj in question
2164  *
2165  * Preserve the reservation of the mmapping with the DRM core code, but
2166  * relinquish ownership of the pages back to the system.
2167  *
2168  * It is vital that we remove the page mapping if we have mapped a tiled
2169  * object through the GTT and then lose the fence register due to
2170  * resource pressure. Similarly if the object has been moved out of the
2171  * aperture, then pages mapped into userspace must be revoked. Removing the
2172  * mapping will then trigger a page fault on the next user access, allowing
2173  * fixup by i915_gem_fault().
2174  */
2175 void
2176 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2177 {
2178         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2179
2180         /* Serialisation between user GTT access and our code depends upon
2181          * revoking the CPU's PTE whilst the mutex is held. The next user
2182          * pagefault then has to wait until we release the mutex.
2183          *
2184          * Note that RPM complicates matters somewhat by adding an additional
2185          * requirement that operations to the GGTT be made holding the RPM
2186          * wakeref.
2187          */
2188         lockdep_assert_held(&i915->drm.struct_mutex);
2189         intel_runtime_pm_get(i915);
2190
2191         if (!obj->userfault_count)
2192                 goto out;
2193
2194         __i915_gem_object_release_mmap(obj);
2195
2196         /* Ensure that the CPU's PTEs are revoked and there are no outstanding
2197          * memory transactions from userspace before we return. The TLB
2198          * flushing implied by changing the PTE above *should* be
2199          * sufficient; an extra barrier here just provides us with a bit
2200          * of paranoid documentation about our requirement to serialise
2201          * memory writes before touching registers / GSM.
2202          */
2203         wmb();
2204
2205 out:
2206         intel_runtime_pm_put(i915);
2207 }
2208
2209 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2210 {
2211         struct drm_i915_gem_object *obj, *on;
2212         int i;
2213
2214         /*
2215          * Only called during RPM suspend. All users of the userfault_list
2216          * must be holding an RPM wakeref to ensure that this cannot
2217          * run concurrently with themselves (and use the struct_mutex for
2218          * protection between themselves).
2219          */
2220
2221         list_for_each_entry_safe(obj, on,
2222                                  &dev_priv->mm.userfault_list, userfault_link)
2223                 __i915_gem_object_release_mmap(obj);
2224
2225         /* The fences will be lost when the device powers down. If any were
2226          * in use by hardware (i.e. they are pinned), we should not be powering
2227          * down! All other fences will be reacquired by the user upon waking.
2228          */
2229         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2230                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2231
2232                 /* Ideally we want to assert that the fence register is not
2233                  * live at this point (i.e. that no piece of code will be
2234                  * trying to write through fence + GTT, as that both violates
2235                  * our tracking of activity and associated locking/barriers,
2236                  * and is illegal given that the hw is powered down).
2237                  *
2238                  * Previously we used reg->pin_count as a "liveness" indicator.
2239                  * That is not sufficient, and we need a more fine-grained
2240                  * tool if we want to have a sanity check here.
2241                  */
2242
2243                 if (!reg->vma)
2244                         continue;
2245
2246                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2247                 reg->dirty = true;
2248         }
2249 }
2250
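/* Reserve a fake mmap offset for the object in the DRM vma manager,
 * waiting for the GPU to idle and draining freed objects to reclaim
 * address space if the first attempt fails.
 */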
2251 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2252 {
2253         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2254         int err;
2255
2256         err = drm_gem_create_mmap_offset(&obj->base);
2257         if (likely(!err))
2258                 return 0;
2259
2260         /* Attempt to reap some mmap space from dead objects */
2261         do {
2262                 err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
2263                 if (err)
2264                         break;
2265
2266                 i915_gem_drain_freed_objects(dev_priv);
2267                 err = drm_gem_create_mmap_offset(&obj->base);
2268                 if (!err)
2269                         break;
2270
2271         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2272
2273         return err;
2274 }
2275
2276 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2277 {
2278         drm_gem_free_mmap_offset(&obj->base);
2279 }
2280
2281 int
2282 i915_gem_mmap_gtt(struct drm_file *file,
2283                   struct drm_device *dev,
2284                   uint32_t handle,
2285                   uint64_t *offset)
2286 {
2287         struct drm_i915_gem_object *obj;
2288         int ret;
2289
2290         obj = i915_gem_object_lookup(file, handle);
2291         if (!obj)
2292                 return -ENOENT;
2293
2294         ret = i915_gem_object_create_mmap_offset(obj);
2295         if (ret == 0)
2296                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2297
2298         i915_gem_object_put(obj);
2299         return ret;
2300 }
2301
2302 /**
2303  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2304  * @dev: DRM device
2305  * @data: GTT mapping ioctl data
2306  * @file: GEM object info
2307  *
2308  * Simply returns the fake offset to userspace so it can mmap it.
2309  * The mmap call will end up in drm_gem_mmap(), which will set things
2310  * up so we can get faults in the handler above.
2311  *
2312  * The fault handler will take care of binding the object into the GTT
2313  * (since it may have been evicted to make room for something), allocating
2314  * a fence register, and mapping the appropriate aperture address into
2315  * userspace.
2316  */
2317 int
2318 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2319                         struct drm_file *file)
2320 {
2321         struct drm_i915_gem_mmap_gtt *args = data;
2322
2323         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2324 }
2325
2326 /* Immediately discard the backing storage */
2327 static void
2328 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2329 {
2330         i915_gem_object_free_mmap_offset(obj);
2331
2332         if (obj->base.filp == NULL)
2333                 return;
2334
2335         /* Our goal here is to return as much of the memory as
2336          * is possible back to the system as we are called from OOM.
2337          * To do this we must instruct the shmfs to drop all of its
2338          * backing pages, *now*.
2339          */
2340         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2341         obj->mm.madv = __I915_MADV_PURGED;
2342         obj->mm.pages = ERR_PTR(-EFAULT);
2343 }
2344
2345 /* Try to discard unwanted pages */
2346 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2347 {
2348         struct address_space *mapping;
2349
2350         lockdep_assert_held(&obj->mm.lock);
2351         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2352
2353         switch (obj->mm.madv) {
2354         case I915_MADV_DONTNEED:
2355                 i915_gem_object_truncate(obj);
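                /* fall through */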
2356         case __I915_MADV_PURGED:
2357                 return;
2358         }
2359
2360         if (obj->base.filp == NULL)
2361                 return;
2362
2363         mapping = obj->base.filp->f_mapping;
2364         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2365 }
2366
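/* Return the shmemfs backing pages to the system: unmap them from the
 * DMA domain, preserve any bit17 swizzle state, write back dirty pages
 * and drop our page references before freeing the scatterlist.
 */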
2367 static void
2368 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2369                               struct sg_table *pages)
2370 {
2371         struct sgt_iter sgt_iter;
2372         struct page *page;
2373
2374         __i915_gem_object_release_shmem(obj, pages, true);
2375
2376         i915_gem_gtt_finish_pages(obj, pages);
2377
2378         if (i915_gem_object_needs_bit17_swizzle(obj))
2379                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2380
2381         for_each_sgt_page(page, sgt_iter, pages) {
2382                 if (obj->mm.dirty)
2383                         set_page_dirty(page);
2384
2385                 if (obj->mm.madv == I915_MADV_WILLNEED)
2386                         mark_page_accessed(page);
2387
2388                 put_page(page);
2389         }
2390         obj->mm.dirty = false;
2391
2392         sg_free_table(pages);
2393         kfree(pages);
2394 }
2395
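/* Invalidate the cached page lookups used by i915_gem_object_get_page()
 * now that the backing store is being released.
 */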
2396 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2397 {
2398         struct radix_tree_iter iter;
2399         void __rcu **slot;
2400
2401         rcu_read_lock();
2402         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2403                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2404         rcu_read_unlock();
2405 }
2406
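/* Release the object's backing pages once they are no longer pinned:
 * unlink the object from the mm lists, tear down any kernel mapping and
 * the cached page iterator, then hand the scatterlist back to the
 * backend via obj->ops->put_pages().
 */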
2407 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2408                                  enum i915_mm_subclass subclass)
2409 {
2410         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2411         struct sg_table *pages;
2412
2413         if (i915_gem_object_has_pinned_pages(obj))
2414                 return;
2415
2416         GEM_BUG_ON(obj->bind_count);
2417         if (!i915_gem_object_has_pages(obj))
2418                 return;
2419
2420         /* May be called by shrinker from within get_pages() (on another bo) */
2421         mutex_lock_nested(&obj->mm.lock, subclass);
2422         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2423                 goto unlock;
2424
2425         /* ->put_pages might need to allocate memory for the bit17 swizzle
2426          * array, hence protect them from being reaped by removing them from gtt
2427          * lists early. */
2428         pages = fetch_and_zero(&obj->mm.pages);
2429         GEM_BUG_ON(!pages);
2430
2431         spin_lock(&i915->mm.obj_lock);
2432         list_del(&obj->mm.link);
2433         spin_unlock(&i915->mm.obj_lock);
2434
2435         if (obj->mm.mapping) {
2436                 void *ptr;
2437
2438                 ptr = page_mask_bits(obj->mm.mapping);
2439                 if (is_vmalloc_addr(ptr))
2440                         vunmap(ptr);
2441                 else
2442                         kunmap(kmap_to_page(ptr));
2443
2444                 obj->mm.mapping = NULL;
2445         }
2446
2447         __i915_gem_object_reset_page_iter(obj);
2448
2449         if (!IS_ERR(pages))
2450                 obj->ops->put_pages(obj, pages);
2451
2452         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2453
2454 unlock:
2455         mutex_unlock(&obj->mm.lock);
2456 }
2457
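/* Trim an over-allocated sg_table down to the entries actually used
 * (coalescing may have left nents < orig_nents); returns true if a
 * smaller table was swapped in place of the original.
 */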
2458 static bool i915_sg_trim(struct sg_table *orig_st)
2459 {
2460         struct sg_table new_st;
2461         struct scatterlist *sg, *new_sg;
2462         unsigned int i;
2463
2464         if (orig_st->nents == orig_st->orig_nents)
2465                 return false;
2466
2467         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2468                 return false;
2469
2470         new_sg = new_st.sgl;
2471         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2472                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2473                 /* called before being DMA mapped, no need to copy sg->dma_* */
2474                 new_sg = sg_next(new_sg);
2475         }
2476         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2477
2478         sg_free_table(orig_st);
2479
2480         *orig_st = new_st;
2481         return true;
2482 }
2483
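/* Allocate the shmemfs backing pages for the object and assemble them
 * into a scatterlist, shrinking our own caches and relaxing the GFP
 * flags under memory pressure, before DMA-mapping the result and
 * publishing it via __i915_gem_object_set_pages().
 */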
2484 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2485 {
2486         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2487         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2488         unsigned long i;
2489         struct address_space *mapping;
2490         struct sg_table *st;
2491         struct scatterlist *sg;
2492         struct sgt_iter sgt_iter;
2493         struct page *page;
2494         unsigned long last_pfn = 0;     /* suppress gcc warning */
2495         unsigned int max_segment = i915_sg_segment_size();
2496         unsigned int sg_page_sizes;
2497         gfp_t noreclaim;
2498         int ret;
2499
2500         /* Assert that the object is not currently in any GPU domain. As it
2501          * wasn't in the GTT, there shouldn't be any way it could have been in
2502          * a GPU cache
2503          */
2504         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2505         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2506
2507         st = kmalloc(sizeof(*st), GFP_KERNEL);
2508         if (st == NULL)
2509                 return -ENOMEM;
2510
2511 rebuild_st:
2512         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2513                 kfree(st);
2514                 return -ENOMEM;
2515         }
2516
2517         /* Get the list of pages out of our struct file.  They'll be pinned
2518          * at this point until we release them.
2519          *
2520          * Fail silently without starting the shrinker
2521          */
2522         mapping = obj->base.filp->f_mapping;
2523         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2524         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2525
2526         sg = st->sgl;
2527         st->nents = 0;
2528         sg_page_sizes = 0;
2529         for (i = 0; i < page_count; i++) {
2530                 const unsigned int shrink[] = {
2531                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2532                         0,
2533                 }, *s = shrink;
2534                 gfp_t gfp = noreclaim;
2535
2536                 do {
2537                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2538                         if (likely(!IS_ERR(page)))
2539                                 break;
2540
2541                         if (!*s) {
2542                                 ret = PTR_ERR(page);
2543                                 goto err_sg;
2544                         }
2545
2546                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2547                         cond_resched();
2548
2549                         /* We've tried hard to allocate the memory by reaping
2550                          * our own buffers; now let the real VM do its job and
2551                          * go down in flames if truly OOM.
2552                          *
2553                          * However, since graphics tend to be disposable,
2554                          * defer the oom here by reporting the ENOMEM back
2555                          * to userspace.
2556                          */
2557                         if (!*s) {
2558                                 /* reclaim and warn, but no oom */
2559                                 gfp = mapping_gfp_mask(mapping);
2560
2561                                 /* Our bo are always dirty and so we require
2562                                  * kswapd to reclaim our pages (direct reclaim
2563                                  * does not effectively begin pageout of our
2564                                  * buffers on its own). However, direct reclaim
2565                                  * only waits for kswapd when under allocation
2566                                  * congestion. So as a result __GFP_RECLAIM is
2567                                  * unreliable and fails to actually reclaim our
2568                                  * dirty pages -- unless you try over and over
2569                                  * again with !__GFP_NORETRY. However, we still
2570                                  * want to fail this allocation rather than
2571                                  * trigger the out-of-memory killer and for
2572                                  * this we want __GFP_RETRY_MAYFAIL.
2573                                  */
2574                                 gfp |= __GFP_RETRY_MAYFAIL;
2575                         }
2576                 } while (1);
2577
2578                 if (!i ||
2579                     sg->length >= max_segment ||
2580                     page_to_pfn(page) != last_pfn + 1) {
2581                         if (i) {
2582                                 sg_page_sizes |= sg->length;
2583                                 sg = sg_next(sg);
2584                         }
2585                         st->nents++;
2586                         sg_set_page(sg, page, PAGE_SIZE, 0);
2587                 } else {
2588                         sg->length += PAGE_SIZE;
2589                 }
2590                 last_pfn = page_to_pfn(page);
2591
2592                 /* Check that the i965g/gm workaround works. */
2593                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2594         }
2595         if (sg) { /* loop terminated early; short sg table */
2596                 sg_page_sizes |= sg->length;
2597                 sg_mark_end(sg);
2598         }
2599
2600         /* Trim unused sg entries to avoid wasting memory. */
2601         i915_sg_trim(st);
2602
2603         ret = i915_gem_gtt_prepare_pages(obj, st);
2604         if (ret) {
2605                 /* DMA remapping failed? One possible cause is that
2606                  * it could not reserve enough large entries; asking
2607                  * for PAGE_SIZE chunks instead may be helpful.
2608                  */
2609                 if (max_segment > PAGE_SIZE) {
2610                         for_each_sgt_page(page, sgt_iter, st)
2611                                 put_page(page);
2612                         sg_free_table(st);
2613
2614                         max_segment = PAGE_SIZE;
2615                         goto rebuild_st;
2616                 } else {
2617                         dev_warn(&dev_priv->drm.pdev->dev,
2618                                  "Failed to DMA remap %lu pages\n",
2619                                  page_count);
2620                         goto err_pages;
2621                 }
2622         }
2623
2624         if (i915_gem_object_needs_bit17_swizzle(obj))
2625                 i915_gem_object_do_bit_17_swizzle(obj, st);
2626
2627         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2628
2629         return 0;
2630
2631 err_sg:
2632         sg_mark_end(sg);
2633 err_pages:
2634         for_each_sgt_page(page, sgt_iter, st)
2635                 put_page(page);
2636         sg_free_table(st);
2637         kfree(st);
2638
2639         /* shmemfs first checks if there is enough memory to allocate the page
2640          * and reports ENOSPC should there be insufficient, along with the usual
2641          * ENOMEM for a genuine allocation failure.
2642          *
2643          * We use ENOSPC in our driver to mean that we have run out of aperture
2644          * space and so want to translate the error from shmemfs back to our
2645          * usual understanding of ENOMEM.
2646          */
2647         if (ret == -ENOSPC)
2648                 ret = -ENOMEM;
2649
2650         return ret;
2651 }
2652
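/* Install a freshly acquired set of backing pages on the object: reset
 * the get_page iterator, apply the swizzle-quirk pinning, record which
 * GTT page sizes the backing store supports and place the object on the
 * unbound list.
 */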
2653 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2654                                  struct sg_table *pages,
2655                                  unsigned int sg_page_sizes)
2656 {
2657         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2658         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2659         int i;
2660
2661         lockdep_assert_held(&obj->mm.lock);
2662
2663         obj->mm.get_page.sg_pos = pages->sgl;
2664         obj->mm.get_page.sg_idx = 0;
2665
2666         obj->mm.pages = pages;
2667
2668         if (i915_gem_object_is_tiled(obj) &&
2669             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2670                 GEM_BUG_ON(obj->mm.quirked);
2671                 __i915_gem_object_pin_pages(obj);
2672                 obj->mm.quirked = true;
2673         }
2674
2675         GEM_BUG_ON(!sg_page_sizes);
2676         obj->mm.page_sizes.phys = sg_page_sizes;
2677
2678         /*
2679          * Calculate the supported page-sizes which fit into the given
2680          * sg_page_sizes. This will give us the page-sizes which we may be able
2681          * to use opportunistically when later inserting into the GTT. For
2682          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2683          * 64K or 4K pages, although in practice this will depend on a number of
2684          * other factors.
2685          */
2686         obj->mm.page_sizes.sg = 0;
2687         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2688                 if (obj->mm.page_sizes.phys & ~0u << i)
2689                         obj->mm.page_sizes.sg |= BIT(i);
2690         }
2691         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2692
2693         spin_lock(&i915->mm.obj_lock);
2694         list_add(&obj->mm.link, &i915->mm.unbound_list);
2695         spin_unlock(&i915->mm.obj_lock);
2696 }
2697
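/* Acquire the backing storage via the backend's get_pages(), rejecting
 * objects that have been marked purgeable. Called with obj->mm.lock held.
 */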
2698 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2699 {
2700         int err;
2701
2702         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2703                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2704                 return -EFAULT;
2705         }
2706
2707         err = obj->ops->get_pages(obj);
2708         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2709
2710         return err;
2711 }
2712
2713 /* Ensure that the associated pages are gathered from the backing storage
2714  * and pinned into our object. i915_gem_object_pin_pages() may be called
2715  * multiple times before they are released by a single call to
2716  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2717  * either as a result of memory pressure (reaping pages under the shrinker)
2718  * or as the object is itself released.
2719  */
2720 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2721 {
2722         int err;
2723
2724         err = mutex_lock_interruptible(&obj->mm.lock);
2725         if (err)
2726                 return err;
2727
2728         if (unlikely(!i915_gem_object_has_pages(obj))) {
2729                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2730
2731                 err = ____i915_gem_object_get_pages(obj);
2732                 if (err)
2733                         goto unlock;
2734
2735                 smp_mb__before_atomic();
2736         }
2737         atomic_inc(&obj->mm.pages_pin_count);
2738
2739 unlock:
2740         mutex_unlock(&obj->mm.lock);
2741         return err;
2742 }
2743
2744 /* The 'mapping' part of i915_gem_object_pin_map() below */
2745 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2746                                  enum i915_map_type type)
2747 {
2748         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2749         struct sg_table *sgt = obj->mm.pages;
2750         struct sgt_iter sgt_iter;
2751         struct page *page;
2752         struct page *stack_pages[32];
2753         struct page **pages = stack_pages;
2754         unsigned long i = 0;
2755         pgprot_t pgprot;
2756         void *addr;
2757
2758         /* A single page can always be kmapped */
2759         if (n_pages == 1 && type == I915_MAP_WB)
2760                 return kmap(sg_page(sgt->sgl));
2761
2762         if (n_pages > ARRAY_SIZE(stack_pages)) {
2763                 /* Too big for stack -- allocate temporary array instead */
2764                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2765                 if (!pages)
2766                         return NULL;
2767         }
2768
2769         for_each_sgt_page(page, sgt_iter, sgt)
2770                 pages[i++] = page;
2771
2772         /* Check that we have the expected number of pages */
2773         GEM_BUG_ON(i != n_pages);
2774
2775         switch (type) {
2776         default:
2777                 MISSING_CASE(type);
2778                 /* fallthrough to use PAGE_KERNEL anyway */
2779         case I915_MAP_WB:
2780                 pgprot = PAGE_KERNEL;
2781                 break;
2782         case I915_MAP_WC:
2783                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2784                 break;
2785         }
2786         addr = vmap(pages, n_pages, 0, pgprot);
2787
2788         if (pages != stack_pages)
2789                 kvfree(pages);
2790
2791         return addr;
2792 }
2793
2794 /* get, pin, and map the pages of the object into kernel space */
2795 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2796                               enum i915_map_type type)
2797 {
2798         enum i915_map_type has_type;
2799         bool pinned;
2800         void *ptr;
2801         int ret;
2802
2803         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2804                 return ERR_PTR(-ENXIO);
2805
2806         ret = mutex_lock_interruptible(&obj->mm.lock);
2807         if (ret)
2808                 return ERR_PTR(ret);
2809
2810         pinned = !(type & I915_MAP_OVERRIDE);
2811         type &= ~I915_MAP_OVERRIDE;
2812
2813         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2814                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2815                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2816
2817                         ret = ____i915_gem_object_get_pages(obj);
2818                         if (ret)
2819                                 goto err_unlock;
2820
2821                         smp_mb__before_atomic();
2822                 }
2823                 atomic_inc(&obj->mm.pages_pin_count);
2824                 pinned = false;
2825         }
2826         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2827
2828         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2829         if (ptr && has_type != type) {
2830                 if (pinned) {
2831                         ret = -EBUSY;
2832                         goto err_unpin;
2833                 }
2834
2835                 if (is_vmalloc_addr(ptr))
2836                         vunmap(ptr);
2837                 else
2838                         kunmap(kmap_to_page(ptr));
2839
2840                 ptr = obj->mm.mapping = NULL;
2841         }
2842
2843         if (!ptr) {
2844                 ptr = i915_gem_object_map(obj, type);
2845                 if (!ptr) {
2846                         ret = -ENOMEM;
2847                         goto err_unpin;
2848                 }
2849
2850                 obj->mm.mapping = page_pack_bits(ptr, type);
2851         }
2852
2853 out_unlock:
2854         mutex_unlock(&obj->mm.lock);
2855         return ptr;
2856
2857 err_unpin:
2858         atomic_dec(&obj->mm.pages_pin_count);
2859 err_unlock:
2860         ptr = ERR_PTR(ret);
2861         goto out_unlock;
2862 }
2863
2864 static int
2865 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2866                            const struct drm_i915_gem_pwrite *arg)
2867 {
2868         struct address_space *mapping = obj->base.filp->f_mapping;
2869         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2870         u64 remain, offset;
2871         unsigned int pg;
2872
2873         /* Before we instantiate/pin the backing store for our use, we
2874          * can prepopulate the shmemfs filp efficiently using a write into
2875          * the pagecache. We avoid the penalty of instantiating all the
2876          * pages, important if the user is just writing to a few and never
2877          * uses the object on the GPU, and using a direct write into shmemfs
2878          * allows it to avoid the cost of retrieving a page (either swapin
2879          * or clearing-before-use) before it is overwritten.
2880          */
2881         if (i915_gem_object_has_pages(obj))
2882                 return -ENODEV;
2883
2884         if (obj->mm.madv != I915_MADV_WILLNEED)
2885                 return -EFAULT;
2886
2887         /* Before the pages are instantiated the object is treated as being
2888          * in the CPU domain. The pages will be clflushed as required before
2889          * use, and we can freely write into the pages directly. If userspace
2890          * races pwrite with any other operation, corruption will ensue -
2891          * that is userspace's prerogative!
2892          */
2893
2894         remain = arg->size;
2895         offset = arg->offset;
2896         pg = offset_in_page(offset);
2897
2898         do {
2899                 unsigned int len, unwritten;
2900                 struct page *page;
2901                 void *data, *vaddr;
2902                 int err;
2903
2904                 len = PAGE_SIZE - pg;
2905                 if (len > remain)
2906                         len = remain;
2907
2908                 err = pagecache_write_begin(obj->base.filp, mapping,
2909                                             offset, len, 0,
2910                                             &page, &data);
2911                 if (err < 0)
2912                         return err;
2913
2914                 vaddr = kmap(page);
2915                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2916                 kunmap(page);
2917
2918                 err = pagecache_write_end(obj->base.filp, mapping,
2919                                           offset, len, len - unwritten,
2920                                           page, data);
2921                 if (err < 0)
2922                         return err;
2923
2924                 if (unwritten)
2925                         return -EFAULT;
2926
2927                 remain -= len;
2928                 user_data += len;
2929                 offset += len;
2930                 pg = 0;
2931         } while (remain);
2932
2933         return 0;
2934 }
2935
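/* Record that the context owned a batch that hung the GPU: bump its
 * guilty count and ban score, and ban the context (counting the ban
 * against its client) once a bannable context crosses the threshold.
 */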
2936 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
2937 {
2938         bool banned;
2939
2940         atomic_inc(&ctx->guilty_count);
2941
2942         banned = false;
2943         if (i915_gem_context_is_bannable(ctx)) {
2944                 unsigned int score;
2945
2946                 score = atomic_add_return(CONTEXT_SCORE_GUILTY,
2947                                           &ctx->ban_score);
2948                 banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
2949
2950                 DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
2951                                  ctx->name, score, yesno(banned));
2952         }
2953         if (!banned)
2954                 return;
2955
2956         i915_gem_context_set_banned(ctx);
2957         if (!IS_ERR_OR_NULL(ctx->file_priv)) {
2958                 atomic_inc(&ctx->file_priv->context_bans);
2959                 DRM_DEBUG_DRIVER("client %s has had %d contexts banned\n",
2960                                  ctx->name, atomic_read(&ctx->file_priv->context_bans));
2961         }
2962 }
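
/*
 * Worked example (illustrative only; the real values are defined in
 * i915_gem_context.h): assuming CONTEXT_SCORE_GUILTY is 10 and
 * CONTEXT_SCORE_BAN_THRESHOLD is 40, a bannable context accumulates 10
 * points per guilty hang and is banned on the hang that takes its score
 * to 40, i.e. its fourth guilty hang.
 */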
2963
2964 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
2965 {
2966         atomic_inc(&ctx->active_count);
2967 }
2968
2969 struct i915_request *
2970 i915_gem_find_active_request(struct intel_engine_cs *engine)
2971 {
2972         struct i915_request *request, *active = NULL;
2973         unsigned long flags;
2974
2975         /* We are called by the error capture and reset at a random
2976          * point in time. In particular, note that neither is crucially
2977          * ordered with an interrupt. After a hang, the GPU is dead and we
2978          * assume that no more writes can happen (we waited long enough for
2979          * all writes that were in transaction to be flushed) - adding an
2980          * all writes that were in flight to be flushed) - adding an
2981          * not need an engine->irq_seqno_barrier() before the seqno reads.
2982          */
2983         spin_lock_irqsave(&engine->timeline.lock, flags);
2984         list_for_each_entry(request, &engine->timeline.requests, link) {
2985                 if (__i915_request_completed(request, request->global_seqno))
2986                         continue;
2987
2988                 GEM_BUG_ON(request->engine != engine);
2989                 GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
2990                                     &request->fence.flags));
2991
2992                 active = request;
2993                 break;
2994         }
2995         spin_unlock_irqrestore(&engine->timeline.lock, flags);
2996
2997         return active;
2998 }
2999
3000 /*
3001  * Ensure the irq handler finishes, and is not run again.
3002  * Also return the active request so that we only search for it once.
3003  */
3004 struct i915_request *
3005 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
3006 {
3007         struct i915_request *request = NULL;
3008
3009         /*
3010          * During the reset sequence, we must prevent the engine from
3011          * entering RC6. As the context state is undefined until we restart
3012          * the engine, if it does enter RC6 during the reset, the state
3013          * written to the powercontext is undefined and so we may lose
3014          * GPU state upon resume, i.e. fail to restart after a reset.
3015          */
3016         intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
3017
3018         /*
3019          * Prevent the signaler thread from updating the request
3020          * state (by calling dma_fence_signal) as we are processing
3021          * the reset. The write from the GPU of the seqno is
3022          * asynchronous and the signaler thread may see a different
3023          * value to us and declare the request complete, even though
3024          * the reset routine has picked that request as the active
3025          * (incomplete) request. This conflict is not handled
3026          * gracefully!
3027          */
3028         kthread_park(engine->breadcrumbs.signaler);
3029
3030         /*
3031          * Prevent request submission to the hardware until we have
3032          * completed the reset in i915_gem_reset_finish(). If a request
3033          * is completed by one engine, it may then queue a request
3034          * to a second via its execlists->tasklet *just* as we are
3035          * calling engine->init_hw() and also writing the ELSP.
3036          * Turning off the execlists->tasklet until the reset is over
3037          * prevents the race.
3038          *
3039          * Note that this needs to be a single atomic operation on the
3040          * tasklet (flush existing tasks, prevent new tasks) to prevent
3041          * a race between reset and set-wedged. It is not, so we do the best
3042          * we can for now and make sure we don't lock the machine up in the more
3043          * common case of recursively being called from set-wedged from inside
3044          * i915_reset.
3045          */
3046         if (!atomic_read(&engine->execlists.tasklet.count))
3047                 tasklet_kill(&engine->execlists.tasklet);
3048         tasklet_disable(&engine->execlists.tasklet);
3049
3050         /*
3051          * We're using a worker to queue preemption requests from the tasklet in
3052          * GuC submission mode.
3053          * Even though the tasklet was disabled, we may still have a worker queued.
3054          * Let's make sure that all workers scheduled before disabling the
3055          * tasklet are completed before continuing with the reset.
3056          */
3057         if (engine->i915->guc.preempt_wq)
3058                 flush_workqueue(engine->i915->guc.preempt_wq);
3059
3060         if (engine->irq_seqno_barrier)
3061                 engine->irq_seqno_barrier(engine);
3062
3063         request = i915_gem_find_active_request(engine);
3064         if (request && request->fence.error == -EIO)
3065                 request = ERR_PTR(-EIO); /* Previous reset failed! */
3066
3067         return request;
3068 }
3069
3070 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
3071 {
3072         struct intel_engine_cs *engine;
3073         struct i915_request *request;
3074         enum intel_engine_id id;
3075         int err = 0;
3076
3077         for_each_engine(engine, dev_priv, id) {
3078                 request = i915_gem_reset_prepare_engine(engine);
3079                 if (IS_ERR(request)) {
3080                         err = PTR_ERR(request);
3081                         continue;
3082                 }
3083
3084                 engine->hangcheck.active_request = request;
3085         }
3086
3087         i915_gem_revoke_fences(dev_priv);
3088         intel_uc_sanitize(dev_priv);
3089
3090         return err;
3091 }
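
/*
 * Rough sketch (illustrative only, not built) of how the prepare/reset/finish
 * helpers pair up. The real sequencing lives in the reset path in i915_drv.c,
 * is done with struct_mutex held, and includes more steps (re-initialising
 * the engines, restarting requests, etc.) than shown here.
 */
#if 0
static int example_reset_flow(struct drm_i915_private *i915,
			      unsigned int stalled_mask)
{
	int err;

	err = i915_gem_reset_prepare(i915);	/* park signalers, disable tasklets */
	intel_gpu_reset(i915, ALL_ENGINES);	/* perform the hardware reset */
	i915_gem_reset(i915, stalled_mask);	/* skip guilty requests, replay the rest */
	i915_gem_reset_finish(i915);		/* re-enable tasklets and signalers */

	return err;
}
#endif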
3092
3093 static void skip_request(struct i915_request *request)
3094 {
3095         void *vaddr = request->ring->vaddr;
3096         u32 head;
3097
3098         /* As this request likely depends on state from the lost
3099          * context, clear out all the user operations leaving the
3100          * breadcrumb at the end (so we get the fence notifications).
3101          */
3102         head = request->head;
3103         if (request->postfix < head) {
3104                 memset(vaddr + head, 0, request->ring->size - head);
3105                 head = 0;
3106         }
3107         memset(vaddr + head, 0, request->postfix - head);
3108
3109         dma_fence_set_error(&request->fence, -EIO);
3110 }
3111
3112 static void engine_skip_context(struct i915_request *request)
3113 {
3114         struct intel_engine_cs *engine = request->engine;
3115         struct i915_gem_context *hung_ctx = request->ctx;
3116         struct i915_timeline *timeline = request->timeline;
3117         unsigned long flags;
3118
3119         GEM_BUG_ON(timeline == &engine->timeline);
3120
3121         spin_lock_irqsave(&engine->timeline.lock, flags);
3122         spin_lock_nested(&timeline->lock, SINGLE_DEPTH_NESTING);
3123
3124         list_for_each_entry_continue(request, &engine->timeline.requests, link)
3125                 if (request->ctx == hung_ctx)
3126                         skip_request(request);
3127
3128         list_for_each_entry(request, &timeline->requests, link)
3129                 skip_request(request);
3130
3131         spin_unlock(&timeline->lock);
3132         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3133 }
3134
3135 /* Returns the request if it was guilty of the hang */
3136 static struct i915_request *
3137 i915_gem_reset_request(struct intel_engine_cs *engine,
3138                        struct i915_request *request,
3139                        bool stalled)
3140 {
3141         /* The guilty request will get skipped on a hung engine.
3142          *
3143          * Users of client default contexts do not rely on logical
3144          * state preserved between batches so it is safe to execute
3145          * queued requests following the hang. Non default contexts
3146          * rely on preserved state, so skipping a batch loses the
3147          * evolution of the state and it needs to be considered corrupted.
3148          * Executing more queued batches on top of corrupted state is
3149          * risky. But we take the risk by trying to advance through
3150          * the queued requests in order to make the client behaviour
3151          * more predictable around resets, by not throwing away a random
3152          * amount of batches it has prepared for execution. Sophisticated
3153          * clients can use gem_reset_stats_ioctl and dma fence status
3154          * (exported via sync_file info ioctl on explicit fences) to observe
3155          * when they lose the context state and should rebuild accordingly.
3156          *
3157          * The context ban, and ultimately the client ban, mechanism are safety
3158          * valves if client submission ends up resulting in nothing more than
3159          * subsequent hangs.
3160          */
3161
3162         if (i915_request_completed(request)) {
3163                 GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
3164                           engine->name, request->global_seqno,
3165                           request->fence.context, request->fence.seqno,
3166                           intel_engine_get_seqno(engine));
3167                 stalled = false;
3168         }
3169
3170         if (stalled) {
3171                 i915_gem_context_mark_guilty(request->ctx);
3172                 skip_request(request);
3173
3174                 /* If this context is now banned, skip all pending requests. */
3175                 if (i915_gem_context_is_banned(request->ctx))
3176                         engine_skip_context(request);
3177         } else {
3178                 /*
3179                  * Since this is not the hung engine, it may have advanced
3180                  * since the hang declaration. Double check by refinding
3181                  * the active request at the time of the reset.
3182                  */
3183                 request = i915_gem_find_active_request(engine);
3184                 if (request) {
3185                         i915_gem_context_mark_innocent(request->ctx);
3186                         dma_fence_set_error(&request->fence, -EAGAIN);
3187
3188                         /* Rewind the engine to replay the incomplete rq */
3189                         spin_lock_irq(&engine->timeline.lock);
3190                         request = list_prev_entry(request, link);
3191                         if (&request->link == &engine->timeline.requests)
3192                                 request = NULL;
3193                         spin_unlock_irq(&engine->timeline.lock);
3194                 }
3195         }
3196
3197         return request;
3198 }
3199
3200 void i915_gem_reset_engine(struct intel_engine_cs *engine,
3201                            struct i915_request *request,
3202                            bool stalled)
3203 {
3204         /*
3205          * Make sure this write is visible before we re-enable the interrupt
3206          * handlers on another CPU, as tasklet_enable() resolves to just
3207          * a compiler barrier which is insufficient for our purpose here.
3208          */
3209         smp_store_mb(engine->irq_posted, 0);
3210
3211         if (request)
3212                 request = i915_gem_reset_request(engine, request, stalled);
3213
3214         if (request) {
3215                 DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
3216                                  engine->name, request->global_seqno);
3217         }
3218
3219         /* Set up the CS to resume from the breadcrumb of the hung request */
3220         engine->reset_hw(engine, request);
3221 }
3222
3223 void i915_gem_reset(struct drm_i915_private *dev_priv,
3224                     unsigned int stalled_mask)
3225 {
3226         struct intel_engine_cs *engine;
3227         enum intel_engine_id id;
3228
3229         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3230
3231         i915_retire_requests(dev_priv);
3232
3233         for_each_engine(engine, dev_priv, id) {
3234                 struct i915_gem_context *ctx;
3235
3236                 i915_gem_reset_engine(engine,
3237                                       engine->hangcheck.active_request,
3238                                       stalled_mask & ENGINE_MASK(id));
3239                 ctx = fetch_and_zero(&engine->last_retired_context);
3240                 if (ctx)
3241                         intel_context_unpin(ctx, engine);
3242
3243                 /*
3244                  * Ostensibly, we always want a context loaded for powersaving,
3245                  * so if the engine is idle after the reset, send a request
3246                  * to load our scratch kernel_context.
3247                  *
3248                  * More mysteriously, if we leave the engine idle after a reset,
3249                  * the next userspace batch may hang, with what appears to be
3250                  * an incoherent read by the CS (presumably stale TLB). An
3251                  * empty request appears sufficient to paper over the glitch.
3252                  */
3253                 if (intel_engine_is_idle(engine)) {
3254                         struct i915_request *rq;
3255
3256                         rq = i915_request_alloc(engine,
3257                                                 dev_priv->kernel_context);
3258                         if (!IS_ERR(rq))
3259                                 __i915_request_add(rq, false);
3260                 }
3261         }
3262
3263         i915_gem_restore_fences(dev_priv);
3264 }
3265
3266 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3267 {
3268         tasklet_enable(&engine->execlists.tasklet);
3269         kthread_unpark(engine->breadcrumbs.signaler);
3270
3271         intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3272 }
3273
3274 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3275 {
3276         struct intel_engine_cs *engine;
3277         enum intel_engine_id id;
3278
3279         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3280
3281         for_each_engine(engine, dev_priv, id) {
3282                 engine->hangcheck.active_request = NULL;
3283                 i915_gem_reset_finish_engine(engine);
3284         }
3285 }
3286
3287 static void nop_submit_request(struct i915_request *request)
3288 {
3289         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3290                   request->engine->name,
3291                   request->fence.context, request->fence.seqno);
3292         dma_fence_set_error(&request->fence, -EIO);
3293
3294         i915_request_submit(request);
3295 }
3296
3297 static void nop_complete_submit_request(struct i915_request *request)
3298 {
3299         unsigned long flags;
3300
3301         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3302                   request->engine->name,
3303                   request->fence.context, request->fence.seqno);
3304         dma_fence_set_error(&request->fence, -EIO);
3305
3306         spin_lock_irqsave(&request->engine->timeline.lock, flags);
3307         __i915_request_submit(request);
3308         intel_engine_init_global_seqno(request->engine, request->global_seqno);
3309         spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3310 }
3311
3312 void i915_gem_set_wedged(struct drm_i915_private *i915)
3313 {
3314         struct intel_engine_cs *engine;
3315         enum intel_engine_id id;
3316
3317         GEM_TRACE("start\n");
3318
3319         if (GEM_SHOW_DEBUG()) {
3320                 struct drm_printer p = drm_debug_printer(__func__);
3321
3322                 for_each_engine(engine, i915, id)
3323                         intel_engine_dump(engine, &p, "%s\n", engine->name);
3324         }
3325
3326         set_bit(I915_WEDGED, &i915->gpu_error.flags);
3327         smp_mb__after_atomic();
3328
3329         /*
3330          * First, stop submission to hw, but do not yet complete requests by
3331          * rolling the global seqno forward (since this would complete requests
3332          * for which we haven't set the fence error to EIO yet).
3333          */
3334         for_each_engine(engine, i915, id) {
3335                 i915_gem_reset_prepare_engine(engine);
3336
3337                 engine->submit_request = nop_submit_request;
3338                 engine->schedule = NULL;
3339         }
3340         i915->caps.scheduler = 0;
3341
3342         /* Even if the GPU reset fails, it should still stop the engines */
3343         intel_gpu_reset(i915, ALL_ENGINES);
3344
3345         /*
3346          * Make sure no one is running the old callback before we proceed with
3347          * cancelling requests and resetting the completion tracking. Otherwise
3348          * we might submit a request to the hardware which never completes.
3349          */
3350         synchronize_rcu();
3351
3352         for_each_engine(engine, i915, id) {
3353                 /* Mark all executing requests as skipped */
3354                 engine->cancel_requests(engine);
3355
3356                 /*
3357                  * Only once we've force-cancelled all in-flight requests can we
3358                  * start to complete all requests.
3359                  */
3360                 engine->submit_request = nop_complete_submit_request;
3361         }
3362
3363         /*
3364          * Make sure no request can slip through without getting completed by
3365          * either this call here to intel_engine_init_global_seqno, or the one
3366          * in nop_complete_submit_request.
3367          */
3368         synchronize_rcu();
3369
3370         for_each_engine(engine, i915, id) {
3371                 unsigned long flags;
3372
3373                 /*
3374                  * Mark all pending requests as complete so that any concurrent
3375                  * (lockless) lookup doesn't try and wait upon the request as we
3376                  * reset it.
3377                  */
3378                 spin_lock_irqsave(&engine->timeline.lock, flags);
3379                 intel_engine_init_global_seqno(engine,
3380                                                intel_engine_last_submit(engine));
3381                 spin_unlock_irqrestore(&engine->timeline.lock, flags);
3382
3383                 i915_gem_reset_finish_engine(engine);
3384         }
3385
3386         GEM_TRACE("end\n");
3387
3388         wake_up_all(&i915->gpu_error.reset_queue);
3389 }
3390
3391 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3392 {
3393         struct i915_timeline *tl;
3394
3395         lockdep_assert_held(&i915->drm.struct_mutex);
3396         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3397                 return true;
3398
3399         GEM_TRACE("start\n");
3400
3401         /*
3402          * Before unwedging, make sure that all pending operations
3403          * are flushed and errored out - we may have requests waiting upon
3404          * third party fences. We marked all inflight requests as EIO, and
3405          * every execbuf since has returned EIO; for consistency we want all
3406          * the currently pending requests to also be marked as EIO, which
3407          * is done inside our nop_submit_request - and so we must wait.
3408          *
3409          * No more can be submitted until we reset the wedged bit.
3410          */
3411         list_for_each_entry(tl, &i915->gt.timelines, link) {
3412                 struct i915_request *rq;
3413
3414                 rq = i915_gem_active_peek(&tl->last_request,
3415                                           &i915->drm.struct_mutex);
3416                 if (!rq)
3417                         continue;
3418
3419                 /*
3420                  * We can't use our normal waiter as we want to
3421                  * avoid recursively trying to handle the current
3422                  * reset. The basic dma_fence_default_wait() installs
3423                  * a callback for dma_fence_signal(), which is
3424                  * triggered by our nop handler (indirectly, the
3425                  * callback enables the signaler thread which is
3426                  * woken by the nop_submit_request() advancing the seqno
3427                  * and when the seqno passes the fence, the signaler
3428                  * then signals the fence waking us up).
3429                  */
3430                 if (dma_fence_default_wait(&rq->fence, true,
3431                                            MAX_SCHEDULE_TIMEOUT) < 0)
3432                         return false;
3433         }
3434         i915_retire_requests(i915);
3435         GEM_BUG_ON(i915->gt.active_requests);
3436
3437         /*
3438          * Undo nop_submit_request. We prevent all new i915 requests from
3439          * being queued (by disallowing execbuf whilst wedged) so having
3440          * waited for all active requests above, we know the system is idle
3441          * and do not have to worry about a thread being inside
3442          * engine->submit_request() as we swap over. So unlike installing
3443          * the nop_submit_request on reset, we can do this from normal
3444          * context and do not require stop_machine().
3445          */
3446         intel_engines_reset_default_submission(i915);
3447         i915_gem_contexts_lost(i915);
3448
3449         GEM_TRACE("end\n");
3450
3451         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3452         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3453
3454         return true;
3455 }
3456
3457 static void
3458 i915_gem_retire_work_handler(struct work_struct *work)
3459 {
3460         struct drm_i915_private *dev_priv =
3461                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3462         struct drm_device *dev = &dev_priv->drm;
3463
3464         /* Come back later if the device is busy... */
3465         if (mutex_trylock(&dev->struct_mutex)) {
3466                 i915_retire_requests(dev_priv);
3467                 mutex_unlock(&dev->struct_mutex);
3468         }
3469
3470         /*
3471          * Keep the retire handler running until we are finally idle.
3472          * We do not need to do this test under locking as in the worst-case
3473          * we queue the retire worker once too often.
3474          */
3475         if (READ_ONCE(dev_priv->gt.awake))
3476                 queue_delayed_work(dev_priv->wq,
3477                                    &dev_priv->gt.retire_work,
3478                                    round_jiffies_up_relative(HZ));
3479 }
3480
3481 static void shrink_caches(struct drm_i915_private *i915)
3482 {
3483         /*
3484          * kmem_cache_shrink() discards empty slabs and reorders partially
3485          * filled slabs to prioritise allocating from the mostly full slabs,
3486          * with the aim of reducing fragmentation.
3487          */
3488         kmem_cache_shrink(i915->priorities);
3489         kmem_cache_shrink(i915->dependencies);
3490         kmem_cache_shrink(i915->requests);
3491         kmem_cache_shrink(i915->luts);
3492         kmem_cache_shrink(i915->vmas);
3493         kmem_cache_shrink(i915->objects);
3494 }
3495
3496 struct sleep_rcu_work {
3497         union {
3498                 struct rcu_head rcu;
3499                 struct work_struct work;
3500         };
3501         struct drm_i915_private *i915;
3502         unsigned int epoch;
3503 };
3504
3505 static inline bool
3506 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3507 {
3508         /*
3509          * There is a small chance that the epoch wrapped since we started
3510          * sleeping. If we assume that epoch is at least a u32, then it will
3511          * take at least 2^32 * 100ms for it to wrap, or about 13 years.
3512          */
3513         return epoch == READ_ONCE(i915->gt.epoch);
3514 }
3515
3516 static void __sleep_work(struct work_struct *work)
3517 {
3518         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3519         struct drm_i915_private *i915 = s->i915;
3520         unsigned int epoch = s->epoch;
3521
3522         kfree(s);
3523         if (same_epoch(i915, epoch))
3524                 shrink_caches(i915);
3525 }
3526
3527 static void __sleep_rcu(struct rcu_head *rcu)
3528 {
3529         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3530         struct drm_i915_private *i915 = s->i915;
3531
3532         if (same_epoch(i915, s->epoch)) {
3533                 INIT_WORK(&s->work, __sleep_work);
3534                 queue_work(i915->wq, &s->work);
3535         } else {
3536                 kfree(s);
3537         }
3538 }
3539
3540 static inline bool
3541 new_requests_since_last_retire(const struct drm_i915_private *i915)
3542 {
3543         return (READ_ONCE(i915->gt.active_requests) ||
3544                 work_pending(&i915->gt.idle_work.work));
3545 }
3546
3547 static void
3548 i915_gem_idle_work_handler(struct work_struct *work)
3549 {
3550         struct drm_i915_private *dev_priv =
3551                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3552         unsigned int epoch = I915_EPOCH_INVALID;
3553         bool rearm_hangcheck;
3554
3555         if (!READ_ONCE(dev_priv->gt.awake))
3556                 return;
3557
3558         /*
3559          * Wait for last execlists context complete, but bail out in case a
3560          * new request is submitted. As we don't trust the hardware, we
3561          * continue on if the wait times out. This is necessary to allow
3562          * the machine to suspend even if the hardware dies, and we will
3563          * try to recover in resume (after depriving the hardware of power,
3564          * it may be in a better mood).
3565          */
3566         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3567                    intel_engines_are_idle(dev_priv),
3568                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3569                    10, 500);
3570
3571         rearm_hangcheck =
3572                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3573
3574         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3575                 /* Currently busy, come back later */
3576                 mod_delayed_work(dev_priv->wq,
3577                                  &dev_priv->gt.idle_work,
3578                                  msecs_to_jiffies(50));
3579                 goto out_rearm;
3580         }
3581
3582         /*
3583          * New request retired after this work handler started, extend active
3584          * period until next instance of the work.
3585          */
3586         if (new_requests_since_last_retire(dev_priv))
3587                 goto out_unlock;
3588
3589         epoch = __i915_gem_park(dev_priv);
3590
3591         rearm_hangcheck = false;
3592 out_unlock:
3593         mutex_unlock(&dev_priv->drm.struct_mutex);
3594
3595 out_rearm:
3596         if (rearm_hangcheck) {
3597                 GEM_BUG_ON(!dev_priv->gt.awake);
3598                 i915_queue_hangcheck(dev_priv);
3599         }
3600
3601         /*
3602          * When we are idle, it is an opportune time to reap our caches.
3603          * However, we have many objects that utilise RCU and the ordered
3604          * i915->wq that this work is executing on. To try and flush any
3605          * pending frees now we are idle, we first wait for an RCU grace
3606          * period, and then queue a task (that will run last on the wq) to
3607          * shrink and re-optimize the caches.
3608          */
3609         if (same_epoch(dev_priv, epoch)) {
3610                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3611                 if (s) {
3612                         s->i915 = dev_priv;
3613                         s->epoch = epoch;
3614                         call_rcu(&s->rcu, __sleep_rcu);
3615                 }
3616         }
3617 }
3618
3619 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3620 {
3621         struct drm_i915_private *i915 = to_i915(gem->dev);
3622         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3623         struct drm_i915_file_private *fpriv = file->driver_priv;
3624         struct i915_lut_handle *lut, *ln;
3625
3626         mutex_lock(&i915->drm.struct_mutex);
3627
3628         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3629                 struct i915_gem_context *ctx = lut->ctx;
3630                 struct i915_vma *vma;
3631
3632                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3633                 if (ctx->file_priv != fpriv)
3634                         continue;
3635
3636                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3637                 GEM_BUG_ON(vma->obj != obj);
3638
3639                 /* We allow the process to have multiple handles to the same
3640                  * vma, in the same fd namespace, by virtue of flink/open.
3641                  */
3642                 GEM_BUG_ON(!vma->open_count);
3643                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3644                         i915_vma_close(vma);
3645
3646                 list_del(&lut->obj_link);
3647                 list_del(&lut->ctx_link);
3648
3649                 kmem_cache_free(i915->luts, lut);
3650                 __i915_gem_object_release_unless_active(obj);
3651         }
3652
3653         mutex_unlock(&i915->drm.struct_mutex);
3654 }
3655
3656 static unsigned long to_wait_timeout(s64 timeout_ns)
3657 {
3658         if (timeout_ns < 0)
3659                 return MAX_SCHEDULE_TIMEOUT;
3660
3661         if (timeout_ns == 0)
3662                 return 0;
3663
3664         return nsecs_to_jiffies_timeout(timeout_ns);
3665 }
3666
3667 /**
3668  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3669  * @dev: drm device pointer
3670  * @data: ioctl data blob
3671  * @file: drm file pointer
3672  *
3673  * Returns 0 if successful, else an error is returned with the remaining time in
3674  * the timeout parameter.
3675  *  -ETIME: object is still busy after timeout
3676  *  -ERESTARTSYS: signal interrupted the wait
3677  *  -ENOENT: object doesn't exist
3678  * Also possible, but rare:
3679  *  -EAGAIN: incomplete, restart syscall
3680  *  -ENOMEM: damn
3681  *  -ENODEV: Internal IRQ fail
3682  *  -E?: The add request failed
3683  *
3684  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3685  * non-zero timeout parameter the wait ioctl will wait for the given number of
3686  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3687  * without holding struct_mutex the object may become re-busied before this
3688  * function completes. A similar but shorter * race condition exists in the busy
3689  * function completes. A similar but shorter race condition exists in the busy
3690  * ioctl.
3691 int
3692 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3693 {
3694         struct drm_i915_gem_wait *args = data;
3695         struct drm_i915_gem_object *obj;
3696         ktime_t start;
3697         long ret;
3698
3699         if (args->flags != 0)
3700                 return -EINVAL;
3701
3702         obj = i915_gem_object_lookup(file, args->bo_handle);
3703         if (!obj)
3704                 return -ENOENT;
3705
3706         start = ktime_get();
3707
3708         ret = i915_gem_object_wait(obj,
3709                                    I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
3710                                    to_wait_timeout(args->timeout_ns),
3711                                    to_rps_client(file));
3712
3713         if (args->timeout_ns > 0) {
3714                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3715                 if (args->timeout_ns < 0)
3716                         args->timeout_ns = 0;
3717
3718                 /*
3719                  * Apparently ktime isn't accurate enough and occasionally has a
3720                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3721                  * things up to make the test happy. We allow up to 1 jiffy.
3722                  *
3723                  * This is a regression from the timespec->ktime conversion.
3724                  */
3725                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3726                         args->timeout_ns = 0;
3727
3728                 /* Asked to wait beyond the jiffy/scheduler precision? */
3729                 if (ret == -ETIME && args->timeout_ns)
3730                         ret = -EAGAIN;
3731         }
3732
3733         i915_gem_object_put(obj);
3734         return ret;
3735 }
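
/*
 * Illustrative userspace sketch (not compiled into the driver) of the wait
 * ioctl semantics documented above; drm_fd and bo_handle are hypothetical.
 * On timeout the kernel writes the remaining time back into wait.timeout_ns.
 */
#if 0
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int wait_for_bo_idle(int drm_fd, uint32_t bo_handle, int64_t timeout_ns)
{
	struct drm_i915_gem_wait wait = {
		.bo_handle = bo_handle,
		.timeout_ns = timeout_ns,	/* <0 waits forever, 0 is a busy query */
	};

	return ioctl(drm_fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
}
#endif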
3736
3737 static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
3738 {
3739         return i915_gem_active_wait(&tl->last_request, flags);
3740 }
3741
3742 static int wait_for_engines(struct drm_i915_private *i915)
3743 {
3744         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3745                 dev_err(i915->drm.dev,
3746                         "Failed to idle engines, declaring wedged!\n");
3747                 GEM_TRACE_DUMP();
3748                 i915_gem_set_wedged(i915);
3749                 return -EIO;
3750         }
3751
3752         return 0;
3753 }
3754
3755 int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
3756 {
3757         /* If the device is asleep, we have no requests outstanding */
3758         if (!READ_ONCE(i915->gt.awake))
3759                 return 0;
3760
3761         if (flags & I915_WAIT_LOCKED) {
3762                 struct i915_timeline *tl;
3763                 int err;
3764
3765                 lockdep_assert_held(&i915->drm.struct_mutex);
3766
3767                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3768                         err = wait_for_timeline(tl, flags);
3769                         if (err)
3770                                 return err;
3771                 }
3772                 i915_retire_requests(i915);
3773
3774                 return wait_for_engines(i915);
3775         } else {
3776                 struct intel_engine_cs *engine;
3777                 enum intel_engine_id id;
3778                 int err;
3779
3780                 for_each_engine(engine, i915, id) {
3781                         err = wait_for_timeline(&engine->timeline, flags);
3782                         if (err)
3783                                 return err;
3784                 }
3785
3786                 return 0;
3787         }
3788 }
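
/*
 * Typical in-kernel usage sketch (illustrative only, not built): callers that
 * already hold struct_mutex pass I915_WAIT_LOCKED so the timeline-list walk
 * above is used; unlocked callers fall back to the per-engine wait.
 */
#if 0
static int example_wait_for_idle_locked(struct drm_i915_private *i915)
{
	lockdep_assert_held(&i915->drm.struct_mutex);

	return i915_gem_wait_for_idle(i915,
				      I915_WAIT_INTERRUPTIBLE |
				      I915_WAIT_LOCKED);
}
#endif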
3789
3790 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3791 {
3792         /*
3793          * We manually flush the CPU domain so that we can override and
3794          * force the flush for the display, and perform it asynchronously.
3795          */
3796         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3797         if (obj->cache_dirty)
3798                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3799         obj->write_domain = 0;
3800 }
3801
3802 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3803 {
3804         if (!READ_ONCE(obj->pin_global))
3805                 return;
3806
3807         mutex_lock(&obj->base.dev->struct_mutex);
3808         __i915_gem_object_flush_for_display(obj);
3809         mutex_unlock(&obj->base.dev->struct_mutex);
3810 }
3811
3812 /**
3813  * Moves a single object to the WC read, and possibly write domain.
3814  * @obj: object to act on
3815  * @write: ask for write access or read only
3816  *
3817  * This function returns when the move is complete, including waiting on
3818  * flushes to occur.
3819  */
3820 int
3821 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3822 {
3823         int ret;
3824
3825         lockdep_assert_held(&obj->base.dev->struct_mutex);
3826
3827         ret = i915_gem_object_wait(obj,
3828                                    I915_WAIT_INTERRUPTIBLE |
3829                                    I915_WAIT_LOCKED |
3830                                    (write ? I915_WAIT_ALL : 0),
3831                                    MAX_SCHEDULE_TIMEOUT,
3832                                    NULL);
3833         if (ret)
3834                 return ret;
3835
3836         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3837                 return 0;
3838
3839         /* Flush and acquire obj->pages so that we are coherent through
3840          * direct access in memory with previous cached writes through
3841          * shmemfs and that our cache domain tracking remains valid.
3842          * For example, if the obj->filp was moved to swap without us
3843          * being notified and releasing the pages, we would mistakenly
3844          * continue to assume that the obj remained out of the CPU cached
3845          * domain.
3846          */
3847         ret = i915_gem_object_pin_pages(obj);
3848         if (ret)
3849                 return ret;
3850
3851         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3852
3853         /* Serialise direct access to this object with the barriers for
3854          * coherent writes from the GPU, by effectively invalidating the
3855          * WC domain upon first access.
3856          */
3857         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3858                 mb();
3859
3860         /* It should now be out of any other write domains, and we can update
3861          * the domain values for our changes.
3862          */
3863         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3864         obj->read_domains |= I915_GEM_DOMAIN_WC;
3865         if (write) {
3866                 obj->read_domains = I915_GEM_DOMAIN_WC;
3867                 obj->write_domain = I915_GEM_DOMAIN_WC;
3868                 obj->mm.dirty = true;
3869         }
3870
3871         i915_gem_object_unpin_pages(obj);
3872         return 0;
3873 }
3874
3875 /**
3876  * Moves a single object to the GTT read, and possibly write domain.
3877  * @obj: object to act on
3878  * @write: ask for write access or read only
3879  *
3880  * This function returns when the move is complete, including waiting on
3881  * flushes to occur.
3882  */
3883 int
3884 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3885 {
3886         int ret;
3887
3888         lockdep_assert_held(&obj->base.dev->struct_mutex);
3889
3890         ret = i915_gem_object_wait(obj,
3891                                    I915_WAIT_INTERRUPTIBLE |
3892                                    I915_WAIT_LOCKED |
3893                                    (write ? I915_WAIT_ALL : 0),
3894                                    MAX_SCHEDULE_TIMEOUT,
3895                                    NULL);
3896         if (ret)
3897                 return ret;
3898
3899         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3900                 return 0;
3901
3902         /* Flush and acquire obj->pages so that we are coherent through
3903          * direct access in memory with previous cached writes through
3904          * shmemfs and that our cache domain tracking remains valid.
3905          * For example, if the obj->filp was moved to swap without us
3906          * being notified and releasing the pages, we would mistakenly
3907          * continue to assume that the obj remained out of the CPU cached
3908          * domain.
3909          */
3910         ret = i915_gem_object_pin_pages(obj);
3911         if (ret)
3912                 return ret;
3913
3914         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3915
3916         /* Serialise direct access to this object with the barriers for
3917          * coherent writes from the GPU, by effectively invalidating the
3918          * GTT domain upon first access.
3919          */
3920         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3921                 mb();
3922
3923         /* It should now be out of any other write domains, and we can update
3924          * the domain values for our changes.
3925          */
3926         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3927         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3928         if (write) {
3929                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3930                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3931                 obj->mm.dirty = true;
3932         }
3933
3934         i915_gem_object_unpin_pages(obj);
3935         return 0;
3936 }
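
/*
 * Usage sketch (illustrative only, not built): domain changes are made under
 * struct_mutex, after which writes through a GTT mapping are coherent without
 * a manual clflush.
 */
#if 0
static int example_prepare_for_gtt_write(struct drm_i915_gem_object *obj)
{
	int ret;

	lockdep_assert_held(&obj->base.dev->struct_mutex);

	ret = i915_gem_object_set_to_gtt_domain(obj, true);
	if (ret)
		return ret;

	/* obj->write_domain is now I915_GEM_DOMAIN_GTT and any stale CPU/WC
	 * writes have been flushed. */
	return 0;
}
#endif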
3937
3938 /**
3939  * Changes the cache-level of an object across all VMA.
3940  * @obj: object to act on
3941  * @cache_level: new cache level to set for the object
3942  *
3943  * After this function returns, the object will be in the new cache-level
3944  * across all GTT and the contents of the backing storage will be coherent,
3945  * with respect to the new cache-level. In order to keep the backing storage
3946  * coherent for all users, we only allow a single cache level to be set
3947  * globally on the object and prevent it from being changed whilst the
3948  * hardware is reading from the object. That is if the object is currently
3949  * on the scanout it will be set to uncached (or equivalent display
3950  * cache coherency) and all non-MOCS GPU access will also be uncached so
3951  * that all direct access to the scanout remains coherent.
3952  */
3953 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3954                                     enum i915_cache_level cache_level)
3955 {
3956         struct i915_vma *vma;
3957         int ret;
3958
3959         lockdep_assert_held(&obj->base.dev->struct_mutex);
3960
3961         if (obj->cache_level == cache_level)
3962                 return 0;
3963
3964         /* Inspect the list of currently bound VMA and unbind any that would
3965          * be invalid given the new cache-level. This is principally to
3966          * catch the issue of the CS prefetch crossing page boundaries and
3967          * reading an invalid PTE on older architectures.
3968          */
3969 restart:
3970         list_for_each_entry(vma, &obj->vma_list, obj_link) {
3971                 if (!drm_mm_node_allocated(&vma->node))
3972                         continue;
3973
3974                 if (i915_vma_is_pinned(vma)) {
3975                         DRM_DEBUG("can not change the cache level of pinned objects\n");
3976                         return -EBUSY;
3977                 }
3978
3979                 if (!i915_vma_is_closed(vma) &&
3980                     i915_gem_valid_gtt_space(vma, cache_level))
3981                         continue;
3982
3983                 ret = i915_vma_unbind(vma);
3984                 if (ret)
3985                         return ret;
3986
3987                 /* As unbinding may affect other elements in the
3988                  * obj->vma_list (due to side-effects from retiring
3989                  * an active vma), play safe and restart the iterator.
3990                  */
3991                 goto restart;
3992         }
3993
3994         /* We can reuse the existing drm_mm nodes but need to change the
3995          * cache-level on the PTE. We could simply unbind them all and
3996          * rebind with the correct cache-level on next use. However since
3997          * we already have a valid slot, dma mapping, pages etc., we may as well
3998          * rewrite the PTE in the belief that doing so tramples upon less
3999          * state and so involves less work.
4000          */
4001         if (obj->bind_count) {
4002                 /* Before we change the PTE, the GPU must not be accessing it.
4003                  * If we wait upon the object, we know that all the bound
4004                  * VMA are no longer active.
4005                  */
4006                 ret = i915_gem_object_wait(obj,
4007                                            I915_WAIT_INTERRUPTIBLE |
4008                                            I915_WAIT_LOCKED |
4009                                            I915_WAIT_ALL,
4010                                            MAX_SCHEDULE_TIMEOUT,
4011                                            NULL);
4012                 if (ret)
4013                         return ret;
4014
4015                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
4016                     cache_level != I915_CACHE_NONE) {
4017                         /* Access to snoopable pages through the GTT is
4018                          * incoherent and on some machines causes a hard
4019                          * lockup. Relinquish the CPU mmapping to force
4020                          * userspace to refault in the pages and we can
4021                          * then double check if the GTT mapping is still
4022                          * valid for that pointer access.
4023                          */
4024                         i915_gem_release_mmap(obj);
4025
4026                         /* As we no longer need a fence for GTT access,
4027                          * we can relinquish it now (and so prevent having
4028                          * to steal a fence from someone else on the next
4029                          * fence request). Note GPU activity would have
4030                          * dropped the fence as all snoopable access is
4031                          * supposed to be linear.
4032                          */
4033                         for_each_ggtt_vma(vma, obj) {
4034                                 ret = i915_vma_put_fence(vma);
4035                                 if (ret)
4036                                         return ret;
4037                         }
4038                 } else {
4039                         /* We either have incoherent backing store and
4040                          * so no GTT access or the architecture is fully
4041                          * coherent. In such cases, existing GTT mmaps
4042                          * ignore the cache bit in the PTE and we can
4043                          * rewrite it without confusing the GPU or having
4044                          * to force userspace to fault back in its mmaps.
4045                          */
4046                 }
4047
4048                 list_for_each_entry(vma, &obj->vma_list, obj_link) {
4049                         if (!drm_mm_node_allocated(&vma->node))
4050                                 continue;
4051
4052                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
4053                         if (ret)
4054                                 return ret;
4055                 }
4056         }
4057
4058         list_for_each_entry(vma, &obj->vma_list, obj_link)
4059                 vma->node.color = cache_level;
4060         i915_gem_object_set_cache_coherency(obj, cache_level);
4061         obj->cache_dirty = true; /* Always invalidate stale cachelines */
4062
4063         return 0;
4064 }
4065
4066 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
4067                                struct drm_file *file)
4068 {
4069         struct drm_i915_gem_caching *args = data;
4070         struct drm_i915_gem_object *obj;
4071         int err = 0;
4072
4073         rcu_read_lock();
4074         obj = i915_gem_object_lookup_rcu(file, args->handle);
4075         if (!obj) {
4076                 err = -ENOENT;
4077                 goto out;
4078         }
4079
4080         switch (obj->cache_level) {
4081         case I915_CACHE_LLC:
4082         case I915_CACHE_L3_LLC:
4083                 args->caching = I915_CACHING_CACHED;
4084                 break;
4085
4086         case I915_CACHE_WT:
4087                 args->caching = I915_CACHING_DISPLAY;
4088                 break;
4089
4090         default:
4091                 args->caching = I915_CACHING_NONE;
4092                 break;
4093         }
4094 out:
4095         rcu_read_unlock();
4096         return err;
4097 }
4098
4099 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4100                                struct drm_file *file)
4101 {
4102         struct drm_i915_private *i915 = to_i915(dev);
4103         struct drm_i915_gem_caching *args = data;
4104         struct drm_i915_gem_object *obj;
4105         enum i915_cache_level level;
4106         int ret = 0;
4107
4108         switch (args->caching) {
4109         case I915_CACHING_NONE:
4110                 level = I915_CACHE_NONE;
4111                 break;
4112         case I915_CACHING_CACHED:
4113                 /*
4114                  * Due to a HW issue on BXT A stepping, GPU stores via a
4115                  * snooped mapping may leave stale data in a corresponding CPU
4116                  * cacheline, whereas normally such cachelines would get
4117                  * invalidated.
4118                  */
4119                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4120                         return -ENODEV;
4121
4122                 level = I915_CACHE_LLC;
4123                 break;
4124         case I915_CACHING_DISPLAY:
4125                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4126                 break;
4127         default:
4128                 return -EINVAL;
4129         }
4130
4131         obj = i915_gem_object_lookup(file, args->handle);
4132         if (!obj)
4133                 return -ENOENT;
4134
4135         /*
4136          * The caching mode of proxy object is handled by its generator, and
4137          * not allowed to be changed by userspace.
4138          */
4139         if (i915_gem_object_is_proxy(obj)) {
4140                 ret = -ENXIO;
4141                 goto out;
4142         }
4143
4144         if (obj->cache_level == level)
4145                 goto out;
4146
4147         ret = i915_gem_object_wait(obj,
4148                                    I915_WAIT_INTERRUPTIBLE,
4149                                    MAX_SCHEDULE_TIMEOUT,
4150                                    to_rps_client(file));
4151         if (ret)
4152                 goto out;
4153
4154         ret = i915_mutex_lock_interruptible(dev);
4155         if (ret)
4156                 goto out;
4157
4158         ret = i915_gem_object_set_cache_level(obj, level);
4159         mutex_unlock(&dev->struct_mutex);
4160
4161 out:
4162         i915_gem_object_put(obj);
4163         return ret;
4164 }
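
/*
 * Illustrative userspace sketch (not compiled into the driver): changing the
 * caching mode with the ioctl above; drm_fd and handle are hypothetical.
 */
#if 0
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int make_bo_snooped(int drm_fd, uint32_t handle)
{
	struct drm_i915_gem_caching arg = {
		.handle = handle,
		.caching = I915_CACHING_CACHED,
	};

	/* Fails with ENODEV on platforms with neither LLC nor snooping. */
	return ioctl(drm_fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg);
}
#endif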
4165
4166 /*
4167  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4168  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4169  * (for pageflips). We only flush the caches while preparing the buffer for
4170  * display, the callers are responsible for frontbuffer flush.
4171  */
4172 struct i915_vma *
4173 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4174                                      u32 alignment,
4175                                      const struct i915_ggtt_view *view,
4176                                      unsigned int flags)
4177 {
4178         struct i915_vma *vma;
4179         int ret;
4180
4181         lockdep_assert_held(&obj->base.dev->struct_mutex);
4182
4183         /* Mark the global pin early so that we account for the
4184          * display coherency whilst setting up the cache domains.
4185          */
4186         obj->pin_global++;
4187
4188         /* The display engine is not coherent with the LLC cache on gen6.  As
4189          * a result, we make sure that the pinning that is about to occur is
4190          * done with uncached PTEs. This is the lowest common denominator for all
4191          * chipsets.
4192          *
4193          * However for gen6+, we could do better by using the GFDT bit instead
4194          * of uncaching, which would allow us to flush all the LLC-cached data
4195          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4196          */
4197         ret = i915_gem_object_set_cache_level(obj,
4198                                               HAS_WT(to_i915(obj->base.dev)) ?
4199                                               I915_CACHE_WT : I915_CACHE_NONE);
4200         if (ret) {
4201                 vma = ERR_PTR(ret);
4202                 goto err_unpin_global;
4203         }
4204
4205         /* As the user may map the buffer once pinned in the display plane
4206          * (e.g. libkms for the bootup splash), we have to ensure that we
4207          * always use map_and_fenceable for all scanout buffers. However,
4208          * it may simply be too big to fit into mappable, in which case
4209          * put it anyway and hope that userspace can cope (but always first
4210          * try to preserve the existing ABI).
4211          */
4212         vma = ERR_PTR(-ENOSPC);
4213         if ((flags & PIN_MAPPABLE) == 0 &&
4214             (!view || view->type == I915_GGTT_VIEW_NORMAL))
4215                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4216                                                flags |
4217                                                PIN_MAPPABLE |
4218                                                PIN_NONBLOCK);
4219         if (IS_ERR(vma))
4220                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4221         if (IS_ERR(vma))
4222                 goto err_unpin_global;
4223
4224         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4225
4226         __i915_gem_object_flush_for_display(obj);
4227
4228         /* It should now be out of any other write domains, and we can update
4229          * the domain values for our changes.
4230          */
4231         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4232
4233         return vma;
4234
4235 err_unpin_global:
4236         obj->pin_global--;
4237         return vma;
4238 }
4239
4240 void
4241 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4242 {
4243         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4244
4245         if (WARN_ON(vma->obj->pin_global == 0))
4246                 return;
4247
4248         if (--vma->obj->pin_global == 0)
4249                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4250
4251         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4252         i915_gem_object_bump_inactive_ggtt(vma->obj);
4253
4254         i915_vma_unpin(vma);
4255 }
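
/*
 * Pairing sketch (illustrative only, not built): the display code pins a
 * framebuffer object for scanout and drops the pin once the flip completes.
 * The alignment and flags used here are placeholders.
 */
#if 0
static int example_scanout_pin(struct drm_i915_gem_object *obj,
			       const struct i915_ggtt_view *view)
{
	struct i915_vma *vma;

	vma = i915_gem_object_pin_to_display_plane(obj, 0, view, PIN_MAPPABLE);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	/* ... scan out from vma; frontbuffer flushes are the caller's job ... */

	i915_gem_object_unpin_from_display_plane(vma);
	return 0;
}
#endif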
4256
4257 /**
4258  * Moves a single object to the CPU read, and possibly write domain.
4259  * @obj: object to act on
4260  * @write: requesting write or read-only access
4261  *
4262  * This function returns when the move is complete, including waiting on
4263  * flushes to occur.
4264  */
4265 int
4266 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4267 {
4268         int ret;
4269
4270         lockdep_assert_held(&obj->base.dev->struct_mutex);
4271
4272         ret = i915_gem_object_wait(obj,
4273                                    I915_WAIT_INTERRUPTIBLE |
4274                                    I915_WAIT_LOCKED |
4275                                    (write ? I915_WAIT_ALL : 0),
4276                                    MAX_SCHEDULE_TIMEOUT,
4277                                    NULL);
4278         if (ret)
4279                 return ret;
4280
4281         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4282
4283         /* Flush the CPU cache if it's still invalid. */
4284         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4285                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4286                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4287         }
4288
4289         /* It should now be out of any other write domains, and we can update
4290          * the domain values for our changes.
4291          */
4292         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4293
4294         /* If we're writing through the CPU, then the GPU read domains will
4295          * need to be invalidated at next use.
4296          */
4297         if (write)
4298                 __start_cpu_write(obj);
4299
4300         return 0;
4301 }
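/*
 * A minimal userspace sketch (not part of this file) of requesting the same
 * kind of CPU-domain transition through DRM_IOCTL_I915_GEM_SET_DOMAIN, the
 * uAPI route that ends up in a move like the one implemented above. It
 * assumes libdrm's drmIoctl() and an already-open i915 device fd.
 *
 *      #include <string.h>
 *      #include <xf86drm.h>
 *      #include <i915_drm.h>
 *
 *      static int set_cpu_domain(int fd, __u32 handle, int write)
 *      {
 *              struct drm_i915_gem_set_domain arg;
 *
 *              memset(&arg, 0, sizeof(arg));
 *              arg.handle = handle;
 *              arg.read_domains = I915_GEM_DOMAIN_CPU;
 *              arg.write_domain = write ? I915_GEM_DOMAIN_CPU : 0;
 *
 *              // Blocks until outstanding GPU writes are flushed and the
 *              // object's pages are coherent for CPU access.
 *              return drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &arg);
 *      }
 */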
4302
4303 /* Throttle our rendering by waiting until the ring has completed our requests
4304  * emitted over 20 msec ago.
4305  *
4306  * Note that if we were to use the current jiffies each time around the loop,
4307  * we wouldn't escape the function with any frames outstanding if the time to
4308  * render a frame was over 20ms.
4309  *
4310  * This should get us reasonable parallelism between CPU and GPU but also
4311  * relatively low latency when blocking on a particular request to finish.
4312  */
4313 static int
4314 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4315 {
4316         struct drm_i915_private *dev_priv = to_i915(dev);
4317         struct drm_i915_file_private *file_priv = file->driver_priv;
4318         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4319         struct i915_request *request, *target = NULL;
4320         long ret;
4321
4322         /* ABI: return -EIO if already wedged */
4323         if (i915_terminally_wedged(&dev_priv->gpu_error))
4324                 return -EIO;
4325
4326         spin_lock(&file_priv->mm.lock);
4327         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4328                 if (time_after_eq(request->emitted_jiffies, recent_enough))
4329                         break;
4330
4331                 if (target) {
4332                         list_del(&target->client_link);
4333                         target->file_priv = NULL;
4334                 }
4335
4336                 target = request;
4337         }
4338         if (target)
4339                 i915_request_get(target);
4340         spin_unlock(&file_priv->mm.lock);
4341
4342         if (target == NULL)
4343                 return 0;
4344
4345         ret = i915_request_wait(target,
4346                                 I915_WAIT_INTERRUPTIBLE,
4347                                 MAX_SCHEDULE_TIMEOUT);
4348         i915_request_put(target);
4349
4350         return ret < 0 ? ret : 0;
4351 }
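/*
 * A minimal userspace sketch (not part of this file) of the throttle uAPI
 * that reaches the helper above through i915_gem_throttle_ioctl() below.
 * The ioctl carries no payload; it simply blocks until requests emitted
 * more than the 20ms window ago have completed. Assumes libdrm's drmIoctl()
 * and an already-open i915 device fd.
 *
 *      #include <xf86drm.h>
 *      #include <i915_drm.h>
 *
 *      static int throttle(int fd)
 *      {
 *              return drmIoctl(fd, DRM_IOCTL_I915_GEM_THROTTLE, NULL);
 *      }
 */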
4352
4353 struct i915_vma *
4354 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4355                          const struct i915_ggtt_view *view,
4356                          u64 size,
4357                          u64 alignment,
4358                          u64 flags)
4359 {
4360         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4361         struct i915_address_space *vm = &dev_priv->ggtt.base;
4362         struct i915_vma *vma;
4363         int ret;
4364
4365         lockdep_assert_held(&obj->base.dev->struct_mutex);
4366
4367         if (flags & PIN_MAPPABLE &&
4368             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4369                 /* If the required space is larger than the available
4370                  * aperture, we will not be able to find a slot for the
4371                  * object and unbinding the object now will be in
4372                  * vain. Worse, doing so may cause us to ping-pong
4373                  * the object in and out of the Global GTT and
4374                  * waste a lot of cycles under the mutex.
4375                  */
4376                 if (obj->base.size > dev_priv->ggtt.mappable_end)
4377                         return ERR_PTR(-E2BIG);
4378
4379                 /* If NONBLOCK is set the caller is optimistically
4380                  * trying to cache the full object within the mappable
4381                  * aperture, and *must* have a fallback in place for
4382                  * situations where we cannot bind the object. We
4383                  * can be a little more lax here and use the fallback
4384                  * more often to avoid costly migrations of ourselves
4385                  * and other objects within the aperture.
4386                  *
4387                  * Half-the-aperture is used as a simple heuristic.
4388                  * More interesting would be to search for a free
4389                  * block prior to making the commitment to unbind.
4390                  * That caters for the self-harm case, and with a
4391                  * little more heuristics (e.g. NOFAULT, NOEVICT)
4392                  * we could try to minimise harm to others.
4393                  */
4394                 if (flags & PIN_NONBLOCK &&
4395                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
4396                         return ERR_PTR(-ENOSPC);
4397         }
4398
4399         vma = i915_vma_instance(obj, vm, view);
4400         if (unlikely(IS_ERR(vma)))
4401                 return vma;
4402
4403         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4404                 if (flags & PIN_NONBLOCK) {
4405                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4406                                 return ERR_PTR(-ENOSPC);
4407
4408                         if (flags & PIN_MAPPABLE &&
4409                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4410                                 return ERR_PTR(-ENOSPC);
4411                 }
4412
4413                 WARN(i915_vma_is_pinned(vma),
4414                      "bo is already pinned in ggtt with incorrect alignment:"
4415                      " offset=%08x, req.alignment=%llx,"
4416                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4417                      i915_ggtt_offset(vma), alignment,
4418                      !!(flags & PIN_MAPPABLE),
4419                      i915_vma_is_map_and_fenceable(vma));
4420                 ret = i915_vma_unbind(vma);
4421                 if (ret)
4422                         return ERR_PTR(ret);
4423         }
4424
4425         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
4426         if (ret)
4427                 return ERR_PTR(ret);
4428
4429         return vma;
4430 }
4431
4432 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4433 {
4434         /* Note that we could alias engines in the execbuf API, but
4435          * that would be very unwise as it prevents userspace from
4436          * having fine control over engine selection. Ahem.
4437          *
4438          * This should be something like EXEC_MAX_ENGINE instead of
4439          * I915_NUM_ENGINES.
4440          */
4441         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4442         return 0x10000 << id;
4443 }
4444
4445 static __always_inline unsigned int __busy_write_id(unsigned int id)
4446 {
4447         /* The uABI guarantees an active writer is also amongst the read
4448          * engines. This would be true if we accessed the activity tracking
4449          * under the lock, but as we perform the lookup of the object and
4450          * its activity locklessly we can not guarantee that the last_write
4451          * being active implies that we have set the same engine flag from
4452          * last_read - hence we always set both read and write busy for
4453          * last_write.
4454          */
4455         return id | __busy_read_flag(id);
4456 }
4457
4458 static __always_inline unsigned int
4459 __busy_set_if_active(const struct dma_fence *fence,
4460                      unsigned int (*flag)(unsigned int id))
4461 {
4462         struct i915_request *rq;
4463
4464         /* We have to check the current hw status of the fence as the uABI
4465          * guarantees forward progress. We could rely on the idle worker
4466          * to eventually flush us, but to minimise latency just ask the
4467          * hardware.
4468          *
4469          * Note we only report on the status of native fences.
4470          */
4471         if (!dma_fence_is_i915(fence))
4472                 return 0;
4473
4474         /* opencode to_request() in order to avoid const warnings */
4475         rq = container_of(fence, struct i915_request, fence);
4476         if (i915_request_completed(rq))
4477                 return 0;
4478
4479         return flag(rq->engine->uabi_id);
4480 }
4481
4482 static __always_inline unsigned int
4483 busy_check_reader(const struct dma_fence *fence)
4484 {
4485         return __busy_set_if_active(fence, __busy_read_flag);
4486 }
4487
4488 static __always_inline unsigned int
4489 busy_check_writer(const struct dma_fence *fence)
4490 {
4491         if (!fence)
4492                 return 0;
4493
4494         return __busy_set_if_active(fence, __busy_write_id);
4495 }
4496
4497 int
4498 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4499                     struct drm_file *file)
4500 {
4501         struct drm_i915_gem_busy *args = data;
4502         struct drm_i915_gem_object *obj;
4503         struct reservation_object_list *list;
4504         unsigned int seq;
4505         int err;
4506
4507         err = -ENOENT;
4508         rcu_read_lock();
4509         obj = i915_gem_object_lookup_rcu(file, args->handle);
4510         if (!obj)
4511                 goto out;
4512
4513         /* A discrepancy here is that we do not report the status of
4514          * non-i915 fences, i.e. even though we may report the object as idle,
4515          * a call to set-domain may still stall waiting for foreign rendering.
4516          * This also means that wait-ioctl may report an object as busy,
4517          * where busy-ioctl considers it idle.
4518          *
4519          * We trade the ability to warn of foreign fences to report on which
4520          * i915 engines are active for the object.
4521          *
4522          * Alternatively, we can trade that extra information on read/write
4523          * activity with
4524          *      args->busy =
4525          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4526          * to report the overall busyness. This is what the wait-ioctl does.
4527          *
4528          */
4529 retry:
4530         seq = raw_read_seqcount(&obj->resv->seq);
4531
4532         /* Translate the exclusive fence to the READ *and* WRITE engine */
4533         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4534
4535         /* Translate shared fences to READ set of engines */
4536         list = rcu_dereference(obj->resv->fence);
4537         if (list) {
4538                 unsigned int shared_count = list->shared_count, i;
4539
4540                 for (i = 0; i < shared_count; ++i) {
4541                         struct dma_fence *fence =
4542                                 rcu_dereference(list->shared[i]);
4543
4544                         args->busy |= busy_check_reader(fence);
4545                 }
4546         }
4547
4548         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4549                 goto retry;
4550
4551         err = 0;
4552 out:
4553         rcu_read_unlock();
4554         return err;
4555 }
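/*
 * A minimal userspace sketch (not part of this file) of querying and
 * decoding the busy state reported above: the low 16 bits hold the uabi id
 * of the single writing engine, the high 16 bits a bitmask of reading
 * engines (see __busy_read_flag() and __busy_write_id()). Assumes libdrm's
 * drmIoctl() and an already-open i915 device fd.
 *
 *      #include <stdio.h>
 *      #include <string.h>
 *      #include <xf86drm.h>
 *      #include <i915_drm.h>
 *
 *      static int query_busy(int fd, __u32 handle)
 *      {
 *              struct drm_i915_gem_busy arg;
 *
 *              memset(&arg, 0, sizeof(arg));
 *              arg.handle = handle;
 *              if (drmIoctl(fd, DRM_IOCTL_I915_GEM_BUSY, &arg))
 *                      return -1;
 *
 *              printf("writer id %u, reader mask 0x%x\n",
 *                     arg.busy & 0xffff, arg.busy >> 16);
 *              return arg.busy != 0;
 *      }
 */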
4556
4557 int
4558 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4559                         struct drm_file *file_priv)
4560 {
4561         return i915_gem_ring_throttle(dev, file_priv);
4562 }
4563
4564 int
4565 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4566                        struct drm_file *file_priv)
4567 {
4568         struct drm_i915_private *dev_priv = to_i915(dev);
4569         struct drm_i915_gem_madvise *args = data;
4570         struct drm_i915_gem_object *obj;
4571         int err;
4572
4573         switch (args->madv) {
4574         case I915_MADV_DONTNEED:
4575         case I915_MADV_WILLNEED:
4576             break;
4577         default:
4578             return -EINVAL;
4579         }
4580
4581         obj = i915_gem_object_lookup(file_priv, args->handle);
4582         if (!obj)
4583                 return -ENOENT;
4584
4585         err = mutex_lock_interruptible(&obj->mm.lock);
4586         if (err)
4587                 goto out;
4588
4589         if (i915_gem_object_has_pages(obj) &&
4590             i915_gem_object_is_tiled(obj) &&
4591             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4592                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4593                         GEM_BUG_ON(!obj->mm.quirked);
4594                         __i915_gem_object_unpin_pages(obj);
4595                         obj->mm.quirked = false;
4596                 }
4597                 if (args->madv == I915_MADV_WILLNEED) {
4598                         GEM_BUG_ON(obj->mm.quirked);
4599                         __i915_gem_object_pin_pages(obj);
4600                         obj->mm.quirked = true;
4601                 }
4602         }
4603
4604         if (obj->mm.madv != __I915_MADV_PURGED)
4605                 obj->mm.madv = args->madv;
4606
4607         /* if the object is no longer attached, discard its backing storage */
4608         if (obj->mm.madv == I915_MADV_DONTNEED &&
4609             !i915_gem_object_has_pages(obj))
4610                 i915_gem_object_truncate(obj);
4611
4612         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4613         mutex_unlock(&obj->mm.lock);
4614
4615 out:
4616         i915_gem_object_put(obj);
4617         return err;
4618 }
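/*
 * A minimal userspace sketch (not part of this file) of marking a buffer
 * purgeable via the madvise uAPI handled above, then checking whether its
 * backing storage was retained. Assumes libdrm's drmIoctl() and an
 * already-open i915 device fd.
 *
 *      #include <string.h>
 *      #include <xf86drm.h>
 *      #include <i915_drm.h>
 *
 *      static int mark_purgeable(int fd, __u32 handle)
 *      {
 *              struct drm_i915_gem_madvise arg;
 *
 *              memset(&arg, 0, sizeof(arg));
 *              arg.handle = handle;
 *              arg.madv = I915_MADV_DONTNEED;
 *              if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MADVISE, &arg))
 *                      return -1;
 *
 *              // retained == 0 means the pages were already discarded and
 *              // the old contents are gone.
 *              return arg.retained;
 *      }
 */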
4619
4620 static void
4621 frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4622 {
4623         struct drm_i915_gem_object *obj =
4624                 container_of(active, typeof(*obj), frontbuffer_write);
4625
4626         intel_fb_obj_flush(obj, ORIGIN_CS);
4627 }
4628
4629 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4630                           const struct drm_i915_gem_object_ops *ops)
4631 {
4632         mutex_init(&obj->mm.lock);
4633
4634         INIT_LIST_HEAD(&obj->vma_list);
4635         INIT_LIST_HEAD(&obj->lut_list);
4636         INIT_LIST_HEAD(&obj->batch_pool_link);
4637
4638         obj->ops = ops;
4639
4640         reservation_object_init(&obj->__builtin_resv);
4641         obj->resv = &obj->__builtin_resv;
4642
4643         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4644         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4645
4646         obj->mm.madv = I915_MADV_WILLNEED;
4647         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4648         mutex_init(&obj->mm.get_page.lock);
4649
4650         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4651 }
4652
4653 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4654         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4655                  I915_GEM_OBJECT_IS_SHRINKABLE,
4656
4657         .get_pages = i915_gem_object_get_pages_gtt,
4658         .put_pages = i915_gem_object_put_pages_gtt,
4659
4660         .pwrite = i915_gem_object_pwrite_gtt,
4661 };
4662
4663 static int i915_gem_object_create_shmem(struct drm_device *dev,
4664                                         struct drm_gem_object *obj,
4665                                         size_t size)
4666 {
4667         struct drm_i915_private *i915 = to_i915(dev);
4668         unsigned long flags = VM_NORESERVE;
4669         struct file *filp;
4670
4671         drm_gem_private_object_init(dev, obj, size);
4672
4673         if (i915->mm.gemfs)
4674                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4675                                                  flags);
4676         else
4677                 filp = shmem_file_setup("i915", size, flags);
4678
4679         if (IS_ERR(filp))
4680                 return PTR_ERR(filp);
4681
4682         obj->filp = filp;
4683
4684         return 0;
4685 }
4686
4687 struct drm_i915_gem_object *
4688 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4689 {
4690         struct drm_i915_gem_object *obj;
4691         struct address_space *mapping;
4692         unsigned int cache_level;
4693         gfp_t mask;
4694         int ret;
4695
4696         /* There is a prevalence of the assumption that we fit the object's
4697          * page count inside a 32bit _signed_ variable. Let's document this and
4698          * catch if we ever need to fix it. In the meantime, if you do spot
4699          * such a local variable, please consider fixing!
4700          */
4701         if (size >> PAGE_SHIFT > INT_MAX)
4702                 return ERR_PTR(-E2BIG);
4703
4704         if (overflows_type(size, obj->base.size))
4705                 return ERR_PTR(-E2BIG);
4706
4707         obj = i915_gem_object_alloc(dev_priv);
4708         if (obj == NULL)
4709                 return ERR_PTR(-ENOMEM);
4710
4711         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4712         if (ret)
4713                 goto fail;
4714
4715         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4716         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4717                 /* 965gm cannot relocate objects above 4GiB. */
4718                 mask &= ~__GFP_HIGHMEM;
4719                 mask |= __GFP_DMA32;
4720         }
4721
4722         mapping = obj->base.filp->f_mapping;
4723         mapping_set_gfp_mask(mapping, mask);
4724         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4725
4726         i915_gem_object_init(obj, &i915_gem_object_ops);
4727
4728         obj->write_domain = I915_GEM_DOMAIN_CPU;
4729         obj->read_domains = I915_GEM_DOMAIN_CPU;
4730
4731         if (HAS_LLC(dev_priv))
4732                 /* On some devices, we can have the GPU use the LLC (the CPU
4733                  * cache) for about a 10% performance improvement
4734                  * compared to uncached.  Graphics requests other than
4735                  * display scanout are coherent with the CPU in
4736                  * accessing this cache.  This means in this mode we
4737                  * don't need to clflush on the CPU side, and on the
4738                  * GPU side we only need to flush internal caches to
4739                  * get data visible to the CPU.
4740                  *
4741                  * However, we maintain the display planes as UC, and so
4742                  * need to rebind when first used as such.
4743                  */
4744                 cache_level = I915_CACHE_LLC;
4745         else
4746                 cache_level = I915_CACHE_NONE;
4747
4748         i915_gem_object_set_cache_coherency(obj, cache_level);
4749
4750         trace_i915_gem_object_create(obj);
4751
4752         return obj;
4753
4754 fail:
4755         i915_gem_object_free(obj);
4756         return ERR_PTR(ret);
4757 }
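/*
 * A minimal userspace sketch (not part of this file) of allocating a
 * shmem-backed object like the one constructed above, via
 * DRM_IOCTL_I915_GEM_CREATE, whose handler ultimately calls
 * i915_gem_object_create(). Assumes libdrm's drmIoctl() and an
 * already-open i915 device fd.
 *
 *      #include <string.h>
 *      #include <xf86drm.h>
 *      #include <i915_drm.h>
 *
 *      static int gem_create(int fd, __u64 size, __u32 *handle)
 *      {
 *              struct drm_i915_gem_create arg;
 *
 *              memset(&arg, 0, sizeof(arg));
 *              arg.size = size; // rounded up to the page size by the driver
 *
 *              if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE, &arg))
 *                      return -1;
 *
 *              *handle = arg.handle;
 *              return 0;
 *      }
 */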
4758
4759 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4760 {
4761         /* If we are the last user of the backing storage (be it shmemfs
4762          * pages or stolen etc), we know that the pages are going to be
4763          * immediately released. In this case, we can then skip copying
4764          * back the contents from the GPU.
4765          */
4766
4767         if (obj->mm.madv != I915_MADV_WILLNEED)
4768                 return false;
4769
4770         if (obj->base.filp == NULL)
4771                 return true;
4772
4773         /* At first glance, this looks racy, but then again so would be
4774          * userspace racing mmap against close. However, the first external
4775          * reference to the filp can only be obtained through the
4776          * i915_gem_mmap_ioctl() which safeguards us against the user
4777          * acquiring such a reference whilst we are in the middle of
4778          * freeing the object.
4779          */
4780         return atomic_long_read(&obj->base.filp->f_count) == 1;
4781 }
4782
4783 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4784                                     struct llist_node *freed)
4785 {
4786         struct drm_i915_gem_object *obj, *on;
4787
4788         intel_runtime_pm_get(i915);
4789         llist_for_each_entry_safe(obj, on, freed, freed) {
4790                 struct i915_vma *vma, *vn;
4791
4792                 trace_i915_gem_object_destroy(obj);
4793
4794                 mutex_lock(&i915->drm.struct_mutex);
4795
4796                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4797                 list_for_each_entry_safe(vma, vn,
4798                                          &obj->vma_list, obj_link) {
4799                         GEM_BUG_ON(i915_vma_is_active(vma));
4800                         vma->flags &= ~I915_VMA_PIN_MASK;
4801                         i915_vma_destroy(vma);
4802                 }
4803                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4804                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4805
4806                 /* This serializes freeing with the shrinker. Since the free
4807                  * is delayed, first by RCU then by the workqueue, we want the
4808                  * shrinker to be able to free pages of unreferenced objects,
4809                  * or else we may oom whilst there are plenty of deferred
4810                  * freed objects.
4811                  */
4812                 if (i915_gem_object_has_pages(obj)) {
4813                         spin_lock(&i915->mm.obj_lock);
4814                         list_del_init(&obj->mm.link);
4815                         spin_unlock(&i915->mm.obj_lock);
4816                 }
4817
4818                 mutex_unlock(&i915->drm.struct_mutex);
4819
4820                 GEM_BUG_ON(obj->bind_count);
4821                 GEM_BUG_ON(obj->userfault_count);
4822                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4823                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4824
4825                 if (obj->ops->release)
4826                         obj->ops->release(obj);
4827
4828                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4829                         atomic_set(&obj->mm.pages_pin_count, 0);
4830                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4831                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4832
4833                 if (obj->base.import_attach)
4834                         drm_prime_gem_destroy(&obj->base, NULL);
4835
4836                 reservation_object_fini(&obj->__builtin_resv);
4837                 drm_gem_object_release(&obj->base);
4838                 i915_gem_info_remove_obj(i915, obj->base.size);
4839
4840                 kfree(obj->bit_17);
4841                 i915_gem_object_free(obj);
4842
4843                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4844                 atomic_dec(&i915->mm.free_count);
4845
4846                 if (on)
4847                         cond_resched();
4848         }
4849         intel_runtime_pm_put(i915);
4850 }
4851
4852 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4853 {
4854         struct llist_node *freed;
4855
4856         /* Free the oldest, most stale object to keep the free_list short */
4857         freed = NULL;
4858         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4859                 /* Only one consumer of llist_del_first() allowed */
4860                 spin_lock(&i915->mm.free_lock);
4861                 freed = llist_del_first(&i915->mm.free_list);
4862                 spin_unlock(&i915->mm.free_lock);
4863         }
4864         if (unlikely(freed)) {
4865                 freed->next = NULL;
4866                 __i915_gem_free_objects(i915, freed);
4867         }
4868 }
4869
4870 static void __i915_gem_free_work(struct work_struct *work)
4871 {
4872         struct drm_i915_private *i915 =
4873                 container_of(work, struct drm_i915_private, mm.free_work);
4874         struct llist_node *freed;
4875
4876         /*
4877          * All file-owned VMA should have been released by this point through
4878          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4879          * However, the object may also be bound into the global GTT (e.g.
4880          * older GPUs without per-process support, or for direct access through
4881          * the GTT either for the user or for scanout). Those VMA still need to
4882          * be unbound now.
4883          */
4884
4885         spin_lock(&i915->mm.free_lock);
4886         while ((freed = llist_del_all(&i915->mm.free_list))) {
4887                 spin_unlock(&i915->mm.free_lock);
4888
4889                 __i915_gem_free_objects(i915, freed);
4890                 if (need_resched())
4891                         return;
4892
4893                 spin_lock(&i915->mm.free_lock);
4894         }
4895         spin_unlock(&i915->mm.free_lock);
4896 }
4897
4898 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4899 {
4900         struct drm_i915_gem_object *obj =
4901                 container_of(head, typeof(*obj), rcu);
4902         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4903
4904         /*
4905          * Since we require blocking on struct_mutex to unbind the freed
4906          * object from the GPU before releasing resources back to the
4907          * system, we can not do that directly from the RCU callback (which may
4908          * be a softirq context), but must instead defer that work onto a
4909          * kthread. We use the RCU callback rather than move the freed object
4910          * directly onto the work queue so that we can mix between using the
4911          * worker and performing frees directly from subsequent allocations for
4912          * crude but effective memory throttling.
4913          */
4914         if (llist_add(&obj->freed, &i915->mm.free_list))
4915                 queue_work(i915->wq, &i915->mm.free_work);
4916 }
4917
4918 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4919 {
4920         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4921
4922         if (obj->mm.quirked)
4923                 __i915_gem_object_unpin_pages(obj);
4924
4925         if (discard_backing_storage(obj))
4926                 obj->mm.madv = I915_MADV_DONTNEED;
4927
4928         /*
4929          * Before we free the object, make sure any pure RCU-only
4930          * read-side critical sections are complete, e.g.
4931          * i915_gem_busy_ioctl(). For the corresponding synchronized
4932          * lookup see i915_gem_object_lookup_rcu().
4933          */
4934         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4935         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4936 }
4937
4938 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4939 {
4940         lockdep_assert_held(&obj->base.dev->struct_mutex);
4941
4942         if (!i915_gem_object_has_active_reference(obj) &&
4943             i915_gem_object_is_active(obj))
4944                 i915_gem_object_set_active_reference(obj);
4945         else
4946                 i915_gem_object_put(obj);
4947 }
4948
4949 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
4950 {
4951         struct i915_gem_context *kernel_context = i915->kernel_context;
4952         struct intel_engine_cs *engine;
4953         enum intel_engine_id id;
4954
4955         for_each_engine(engine, i915, id) {
4956                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
4957                 GEM_BUG_ON(engine->last_retired_context != kernel_context);
4958         }
4959 }
4960
4961 void i915_gem_sanitize(struct drm_i915_private *i915)
4962 {
4963         if (i915_terminally_wedged(&i915->gpu_error)) {
4964                 mutex_lock(&i915->drm.struct_mutex);
4965                 i915_gem_unset_wedged(i915);
4966                 mutex_unlock(&i915->drm.struct_mutex);
4967         }
4968
4969         /*
4970          * If we inherit context state from the BIOS or earlier occupants
4971          * of the GPU, the GPU may be in an inconsistent state when we
4972          * try to take over. The only way to remove the earlier state
4973          * is by resetting. However, resetting on earlier gen is tricky as
4974          * it may impact the display and we are uncertain about the stability
4975          * of the reset; otherwise this could be applied to even earlier gens.
4976          */
4977         if (INTEL_GEN(i915) >= 5 && intel_has_gpu_reset(i915))
4978                 WARN_ON(intel_gpu_reset(i915, ALL_ENGINES));
4979 }
4980
4981 int i915_gem_suspend(struct drm_i915_private *dev_priv)
4982 {
4983         struct drm_device *dev = &dev_priv->drm;
4984         int ret;
4985
4986         intel_runtime_pm_get(dev_priv);
4987         intel_suspend_gt_powersave(dev_priv);
4988
4989         mutex_lock(&dev->struct_mutex);
4990
4991         /* We have to flush all the executing contexts to main memory so
4992          * that they can be saved in the hibernation image. To ensure the last
4993          * context image is coherent, we have to switch away from it. That
4994          * leaves the dev_priv->kernel_context still active when
4995          * we actually suspend, and its image in memory may not match the GPU
4996          * state. Fortunately, the kernel_context is disposable and we do
4997          * not rely on its state.
4998          */
4999         if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5000                 ret = i915_gem_switch_to_kernel_context(dev_priv);
5001                 if (ret)
5002                         goto err_unlock;
5003
5004                 ret = i915_gem_wait_for_idle(dev_priv,
5005                                              I915_WAIT_INTERRUPTIBLE |
5006                                              I915_WAIT_LOCKED);
5007                 if (ret && ret != -EIO)
5008                         goto err_unlock;
5009
5010                 assert_kernel_context_is_current(dev_priv);
5011         }
5012         i915_gem_contexts_lost(dev_priv);
5013         mutex_unlock(&dev->struct_mutex);
5014
5015         intel_uc_suspend(dev_priv);
5016
5017         cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
5018         cancel_delayed_work_sync(&dev_priv->gt.retire_work);
5019
5020         /* As the idle_work will rearm itself if it detects a race, play safe and
5021          * repeat the flush until it is definitely idle.
5022          */
5023         drain_delayed_work(&dev_priv->gt.idle_work);
5024
5025         /* Assert that we successfully flushed all the work and
5026          * reset the GPU back to its idle, low power state.
5027          */
5028         WARN_ON(dev_priv->gt.awake);
5029         if (WARN_ON(!intel_engines_are_idle(dev_priv)))
5030                 i915_gem_set_wedged(dev_priv); /* no hope, discard everything */
5031
5032         /*
5033          * Neither the BIOS, ourselves nor any other kernel
5034          * expects the system to be in execlists mode on startup,
5035          * so we need to reset the GPU back to legacy mode. And the only
5036          * known way to disable logical contexts is through a GPU reset.
5037          *
5038          * So in order to leave the system in a known default configuration,
5039          * always reset the GPU upon unload and suspend. Afterwards we then
5040          * clean up the GEM state tracking, flushing off the requests and
5041          * leaving the system in a known idle state.
5042          *
5043          * Note that it is of the utmost importance that the GPU is idle and
5044          * all stray writes are flushed *before* we dismantle the backing
5045          * storage for the pinned objects.
5046          *
5047          * However, since we are uncertain that resetting the GPU on older
5048          * machines is a good idea, we don't - just in case it leaves the
5049          * machine in an unusable condition.
5050          */
5051         intel_uc_sanitize(dev_priv);
5052         i915_gem_sanitize(dev_priv);
5053
5054         intel_runtime_pm_put(dev_priv);
5055         return 0;
5056
5057 err_unlock:
5058         mutex_unlock(&dev->struct_mutex);
5059         intel_runtime_pm_put(dev_priv);
5060         return ret;
5061 }
5062
5063 void i915_gem_resume(struct drm_i915_private *i915)
5064 {
5065         WARN_ON(i915->gt.awake);
5066
5067         mutex_lock(&i915->drm.struct_mutex);
5068         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5069
5070         i915_gem_restore_gtt_mappings(i915);
5071         i915_gem_restore_fences(i915);
5072
5073         /*
5074          * As we didn't flush the kernel context before suspend, we cannot
5075          * guarantee that the context image is complete. So let's just reset
5076          * it and start again.
5077          */
5078         i915->gt.resume(i915);
5079
5080         if (i915_gem_init_hw(i915))
5081                 goto err_wedged;
5082
5083         intel_uc_resume(i915);
5084
5085         /* Always reload a context for powersaving. */
5086         if (i915_gem_switch_to_kernel_context(i915))
5087                 goto err_wedged;
5088
5089 out_unlock:
5090         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5091         mutex_unlock(&i915->drm.struct_mutex);
5092         return;
5093
5094 err_wedged:
5095         if (!i915_terminally_wedged(&i915->gpu_error)) {
5096                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5097                 i915_gem_set_wedged(i915);
5098         }
5099         goto out_unlock;
5100 }
5101
5102 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5103 {
5104         if (INTEL_GEN(dev_priv) < 5 ||
5105             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5106                 return;
5107
5108         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5109                                  DISP_TILE_SURFACE_SWIZZLING);
5110
5111         if (IS_GEN5(dev_priv))
5112                 return;
5113
5114         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5115         if (IS_GEN6(dev_priv))
5116                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5117         else if (IS_GEN7(dev_priv))
5118                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5119         else if (IS_GEN8(dev_priv))
5120                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5121         else
5122                 BUG();
5123 }
5124
5125 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5126 {
5127         I915_WRITE(RING_CTL(base), 0);
5128         I915_WRITE(RING_HEAD(base), 0);
5129         I915_WRITE(RING_TAIL(base), 0);
5130         I915_WRITE(RING_START(base), 0);
5131 }
5132
5133 static void init_unused_rings(struct drm_i915_private *dev_priv)
5134 {
5135         if (IS_I830(dev_priv)) {
5136                 init_unused_ring(dev_priv, PRB1_BASE);
5137                 init_unused_ring(dev_priv, SRB0_BASE);
5138                 init_unused_ring(dev_priv, SRB1_BASE);
5139                 init_unused_ring(dev_priv, SRB2_BASE);
5140                 init_unused_ring(dev_priv, SRB3_BASE);
5141         } else if (IS_GEN2(dev_priv)) {
5142                 init_unused_ring(dev_priv, SRB0_BASE);
5143                 init_unused_ring(dev_priv, SRB1_BASE);
5144         } else if (IS_GEN3(dev_priv)) {
5145                 init_unused_ring(dev_priv, PRB1_BASE);
5146                 init_unused_ring(dev_priv, PRB2_BASE);
5147         }
5148 }
5149
5150 static int __i915_gem_restart_engines(void *data)
5151 {
5152         struct drm_i915_private *i915 = data;
5153         struct intel_engine_cs *engine;
5154         enum intel_engine_id id;
5155         int err;
5156
5157         for_each_engine(engine, i915, id) {
5158                 err = engine->init_hw(engine);
5159                 if (err) {
5160                         DRM_ERROR("Failed to restart %s (%d)\n",
5161                                   engine->name, err);
5162                         return err;
5163                 }
5164         }
5165
5166         return 0;
5167 }
5168
5169 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5170 {
5171         int ret;
5172
5173         dev_priv->gt.last_init_time = ktime_get();
5174
5175         /* Double layer security blanket, see i915_gem_init() */
5176         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5177
5178         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5179                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5180
5181         if (IS_HASWELL(dev_priv))
5182                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5183                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5184
5185         if (HAS_PCH_NOP(dev_priv)) {
5186                 if (IS_IVYBRIDGE(dev_priv)) {
5187                         u32 temp = I915_READ(GEN7_MSG_CTL);
5188                         temp &= ~(WAIT_FOR_PCH_FLR_ACK | WAIT_FOR_PCH_RESET_ACK);
5189                         I915_WRITE(GEN7_MSG_CTL, temp);
5190                 } else if (INTEL_GEN(dev_priv) >= 7) {
5191                         u32 temp = I915_READ(HSW_NDE_RSTWRN_OPT);
5192                         temp &= ~RESET_PCH_HANDSHAKE_ENABLE;
5193                         I915_WRITE(HSW_NDE_RSTWRN_OPT, temp);
5194                 }
5195         }
5196
5197         intel_gt_workarounds_apply(dev_priv);
5198
5199         i915_gem_init_swizzling(dev_priv);
5200
5201         /*
5202          * At least 830 can leave some of the unused rings
5203          * "active" (i.e. head != tail) after resume which
5204          * will prevent c3 entry. Make sure all unused rings
5205          * are totally idle.
5206          */
5207         init_unused_rings(dev_priv);
5208
5209         BUG_ON(!dev_priv->kernel_context);
5210         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5211                 ret = -EIO;
5212                 goto out;
5213         }
5214
5215         ret = i915_ppgtt_init_hw(dev_priv);
5216         if (ret) {
5217                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5218                 goto out;
5219         }
5220
5221         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5222         if (ret) {
5223                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5224                 goto out;
5225         }
5226
5227         /* We can't enable contexts until all firmware is loaded */
5228         ret = intel_uc_init_hw(dev_priv);
5229         if (ret) {
5230                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5231                 goto out;
5232         }
5233
5234         intel_mocs_init_l3cc_table(dev_priv);
5235
5236         /* Only when the HW is re-initialised, can we replay the requests */
5237         ret = __i915_gem_restart_engines(dev_priv);
5238 out:
5239         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5240         return ret;
5241 }
5242
5243 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5244 {
5245         struct i915_gem_context *ctx;
5246         struct intel_engine_cs *engine;
5247         enum intel_engine_id id;
5248         int err;
5249
5250         /*
5251          * As we reset the gpu during very early sanitisation, the current
5252          * register state on the GPU should reflect its default values.
5253          * We load a context onto the hw (with restore-inhibit), then switch
5254          * over to a second context to save that default register state. We
5255          * can then prime every new context with that state so they all start
5256          * from the same default HW values.
5257          */
5258
5259         ctx = i915_gem_context_create_kernel(i915, 0);
5260         if (IS_ERR(ctx))
5261                 return PTR_ERR(ctx);
5262
5263         for_each_engine(engine, i915, id) {
5264                 struct i915_request *rq;
5265
5266                 rq = i915_request_alloc(engine, ctx);
5267                 if (IS_ERR(rq)) {
5268                         err = PTR_ERR(rq);
5269                         goto out_ctx;
5270                 }
5271
5272                 err = 0;
5273                 if (engine->init_context)
5274                         err = engine->init_context(rq);
5275
5276                 __i915_request_add(rq, true);
5277                 if (err)
5278                         goto err_active;
5279         }
5280
5281         err = i915_gem_switch_to_kernel_context(i915);
5282         if (err)
5283                 goto err_active;
5284
5285         err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
5286         if (err)
5287                 goto err_active;
5288
5289         assert_kernel_context_is_current(i915);
5290
5291         for_each_engine(engine, i915, id) {
5292                 struct i915_vma *state;
5293
5294                 state = to_intel_context(ctx, engine)->state;
5295                 if (!state)
5296                         continue;
5297
5298                 /*
5299                  * As we will hold a reference to the logical state, it will
5300                  * not be torn down with the context, and importantly the
5301                  * object will hold onto its vma (making it possible for a
5302                  * stray GTT write to corrupt our defaults). Unmap the vma
5303                  * from the GTT to prevent such accidents and reclaim the
5304                  * space.
5305                  */
5306                 err = i915_vma_unbind(state);
5307                 if (err)
5308                         goto err_active;
5309
5310                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5311                 if (err)
5312                         goto err_active;
5313
5314                 engine->default_state = i915_gem_object_get(state->obj);
5315         }
5316
5317         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5318                 unsigned int found = intel_engines_has_context_isolation(i915);
5319
5320                 /*
5321                  * Make sure that classes with multiple engine instances all
5322                  * share the same basic configuration.
5323                  */
5324                 for_each_engine(engine, i915, id) {
5325                         unsigned int bit = BIT(engine->uabi_class);
5326                         unsigned int expected = engine->default_state ? bit : 0;
5327
5328                         if ((found & bit) != expected) {
5329                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5330                                           engine->uabi_class, engine->name);
5331                         }
5332                 }
5333         }
5334
5335 out_ctx:
5336         i915_gem_context_set_closed(ctx);
5337         i915_gem_context_put(ctx);
5338         return err;
5339
5340 err_active:
5341         /*
5342          * If we have to abandon now, we expect the engines to be idle
5343          * and ready to be torn-down. First try to flush any remaining
5344          * request, ensure we are pointing at the kernel context and
5345          * then remove it.
5346          */
5347         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5348                 goto out_ctx;
5349
5350         if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
5351                 goto out_ctx;
5352
5353         i915_gem_contexts_lost(i915);
5354         goto out_ctx;
5355 }
5356
5357 int i915_gem_init(struct drm_i915_private *dev_priv)
5358 {
5359         int ret;
5360
5361         /*
5362          * We need to fall back to 4K pages since gvt gtt handling doesn't
5363          * support huge page entries - we will need to check whether the hypervisor
5364          * mm can support huge guest pages, or else do the emulation in gvt.
5365          */
5366         if (intel_vgpu_active(dev_priv))
5367                 mkwrite_device_info(dev_priv)->page_sizes =
5368                         I915_GTT_PAGE_SIZE_4K;
5369
5370         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5371
5372         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5373                 dev_priv->gt.resume = intel_lr_context_resume;
5374                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5375         } else {
5376                 dev_priv->gt.resume = intel_legacy_submission_resume;
5377                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5378         }
5379
5380         ret = i915_gem_init_userptr(dev_priv);
5381         if (ret)
5382                 return ret;
5383
5384         ret = intel_wopcm_init(&dev_priv->wopcm);
5385         if (ret)
5386                 return ret;
5387
5388         ret = intel_uc_init_misc(dev_priv);
5389         if (ret)
5390                 return ret;
5391
5392         /* This is just a security blanket to placate dragons.
5393          * On some systems, we very sporadically observe that the first TLBs
5394          * used by the CS may be stale, despite us poking the TLB reset. If
5395          * we hold the forcewake during initialisation these problems
5396          * just magically go away.
5397          */
5398         mutex_lock(&dev_priv->drm.struct_mutex);
5399         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5400
5401         ret = i915_gem_init_ggtt(dev_priv);
5402         if (ret) {
5403                 GEM_BUG_ON(ret == -EIO);
5404                 goto err_unlock;
5405         }
5406
5407         ret = i915_gem_contexts_init(dev_priv);
5408         if (ret) {
5409                 GEM_BUG_ON(ret == -EIO);
5410                 goto err_ggtt;
5411         }
5412
5413         ret = intel_engines_init(dev_priv);
5414         if (ret) {
5415                 GEM_BUG_ON(ret == -EIO);
5416                 goto err_context;
5417         }
5418
5419         intel_init_gt_powersave(dev_priv);
5420
5421         ret = intel_uc_init(dev_priv);
5422         if (ret)
5423                 goto err_pm;
5424
5425         ret = i915_gem_init_hw(dev_priv);
5426         if (ret)
5427                 goto err_uc_init;
5428
5429         /*
5430          * Despite its name, intel_init_clock_gating applies both display
5431          * clock gating workarounds and GT mmio workarounds, plus the occasional
5432          * GT power context workaround. Worse, sometimes it includes a context
5433          * register workaround which we need to apply before we record the
5434          * default HW state for all contexts.
5435          *
5436          * FIXME: break up the workarounds and apply them at the right time!
5437          */
5438         intel_init_clock_gating(dev_priv);
5439
5440         ret = __intel_engines_record_defaults(dev_priv);
5441         if (ret)
5442                 goto err_init_hw;
5443
5444         if (i915_inject_load_failure()) {
5445                 ret = -ENODEV;
5446                 goto err_init_hw;
5447         }
5448
5449         if (i915_inject_load_failure()) {
5450                 ret = -EIO;
5451                 goto err_init_hw;
5452         }
5453
5454         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5455         mutex_unlock(&dev_priv->drm.struct_mutex);
5456
5457         return 0;
5458
5459         /*
5460          * Unwinding is complicated by the fact that we want to handle -EIO
5461          * to mean disable GPU submission but keep KMS alive. We want to mark
5462          * the HW as irreversibly wedged, but keep enough state around that the
5463          * driver doesn't explode during runtime.
5464          */
5465 err_init_hw:
5466         i915_gem_wait_for_idle(dev_priv, I915_WAIT_LOCKED);
5467         i915_gem_contexts_lost(dev_priv);
5468         intel_uc_fini_hw(dev_priv);
5469 err_uc_init:
5470         intel_uc_fini(dev_priv);
5471 err_pm:
5472         if (ret != -EIO) {
5473                 intel_cleanup_gt_powersave(dev_priv);
5474                 i915_gem_cleanup_engines(dev_priv);
5475         }
5476 err_context:
5477         if (ret != -EIO)
5478                 i915_gem_contexts_fini(dev_priv);
5479 err_ggtt:
5480 err_unlock:
5481         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5482         mutex_unlock(&dev_priv->drm.struct_mutex);
5483
5484         intel_uc_fini_misc(dev_priv);
5485
5486         if (ret != -EIO)
5487                 i915_gem_cleanup_userptr(dev_priv);
5488
5489         if (ret == -EIO) {
5490                 /*
5491                  * Allow engine initialisation to fail by marking the GPU as
5492                  * wedged. But we only want to do this when the GPU is angry;
5493                  * for all other failures, such as an allocation failure, bail.
5494                  */
5495                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5496                         DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
5497                         i915_gem_set_wedged(dev_priv);
5498                 }
5499                 ret = 0;
5500         }
5501
5502         i915_gem_drain_freed_objects(dev_priv);
5503         return ret;
5504 }
5505
5506 void i915_gem_init_mmio(struct drm_i915_private *i915)
5507 {
5508         i915_gem_sanitize(i915);
5509 }
5510
5511 void
5512 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5513 {
5514         struct intel_engine_cs *engine;
5515         enum intel_engine_id id;
5516
5517         for_each_engine(engine, dev_priv, id)
5518                 dev_priv->gt.cleanup_engine(engine);
5519 }
5520
5521 void
5522 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5523 {
5524         int i;
5525
5526         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5527             !IS_CHERRYVIEW(dev_priv))
5528                 dev_priv->num_fence_regs = 32;
5529         else if (INTEL_GEN(dev_priv) >= 4 ||
5530                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5531                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5532                 dev_priv->num_fence_regs = 16;
5533         else
5534                 dev_priv->num_fence_regs = 8;
5535
5536         if (intel_vgpu_active(dev_priv))
5537                 dev_priv->num_fence_regs =
5538                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5539
5540         /* Initialize fence registers to zero */
5541         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5542                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5543
5544                 fence->i915 = dev_priv;
5545                 fence->id = i;
5546                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5547         }
5548         i915_gem_restore_fences(dev_priv);
5549
5550         i915_gem_detect_bit_6_swizzle(dev_priv);
5551 }
5552
5553 static void i915_gem_init__mm(struct drm_i915_private *i915)
5554 {
5555         spin_lock_init(&i915->mm.object_stat_lock);
5556         spin_lock_init(&i915->mm.obj_lock);
5557         spin_lock_init(&i915->mm.free_lock);
5558
5559         init_llist_head(&i915->mm.free_list);
5560
5561         INIT_LIST_HEAD(&i915->mm.unbound_list);
5562         INIT_LIST_HEAD(&i915->mm.bound_list);
5563         INIT_LIST_HEAD(&i915->mm.fence_list);
5564         INIT_LIST_HEAD(&i915->mm.userfault_list);
5565
5566         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5567 }
5568
5569 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5570 {
5571         int err = -ENOMEM;
5572
5573         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5574         if (!dev_priv->objects)
5575                 goto err_out;
5576
5577         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5578         if (!dev_priv->vmas)
5579                 goto err_objects;
5580
5581         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5582         if (!dev_priv->luts)
5583                 goto err_vmas;
5584
5585         dev_priv->requests = KMEM_CACHE(i915_request,
5586                                         SLAB_HWCACHE_ALIGN |
5587                                         SLAB_RECLAIM_ACCOUNT |
5588                                         SLAB_TYPESAFE_BY_RCU);
5589         if (!dev_priv->requests)
5590                 goto err_luts;
5591
5592         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5593                                             SLAB_HWCACHE_ALIGN |
5594                                             SLAB_RECLAIM_ACCOUNT);
5595         if (!dev_priv->dependencies)
5596                 goto err_requests;
5597
5598         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5599         if (!dev_priv->priorities)
5600                 goto err_dependencies;
5601
5602         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5603         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5604         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5605
5606         i915_gem_init__mm(dev_priv);
5607
5608         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5609                           i915_gem_retire_work_handler);
5610         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5611                           i915_gem_idle_work_handler);
5612         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5613         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5614
5615         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5616
5617         spin_lock_init(&dev_priv->fb_tracking.lock);
5618
5619         err = i915_gemfs_init(dev_priv);
5620         if (err)
5621                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5622
5623         return 0;
5624
5625 err_dependencies:
5626         kmem_cache_destroy(dev_priv->dependencies);
5627 err_requests:
5628         kmem_cache_destroy(dev_priv->requests);
5629 err_luts:
5630         kmem_cache_destroy(dev_priv->luts);
5631 err_vmas:
5632         kmem_cache_destroy(dev_priv->vmas);
5633 err_objects:
5634         kmem_cache_destroy(dev_priv->objects);
5635 err_out:
5636         return err;
5637 }
5638
5639 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5640 {
5641         i915_gem_drain_freed_objects(dev_priv);
5642         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5643         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5644         WARN_ON(dev_priv->mm.object_count);
5645         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5646
5647         kmem_cache_destroy(dev_priv->priorities);
5648         kmem_cache_destroy(dev_priv->dependencies);
5649         kmem_cache_destroy(dev_priv->requests);
5650         kmem_cache_destroy(dev_priv->luts);
5651         kmem_cache_destroy(dev_priv->vmas);
5652         kmem_cache_destroy(dev_priv->objects);
5653
5654         /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
5655         rcu_barrier();
5656
5657         i915_gemfs_fini(dev_priv);
5658 }
5659
5660 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5661 {
5662         /* Discard all purgeable objects, let userspace recover those as
5663          * required after resuming.
5664          */
5665         i915_gem_shrink_all(dev_priv);
5666
5667         return 0;
5668 }
5669
5670 int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
5671 {
5672         struct drm_i915_gem_object *obj;
5673         struct list_head *phases[] = {
5674                 &dev_priv->mm.unbound_list,
5675                 &dev_priv->mm.bound_list,
5676                 NULL
5677         }, **p;
5678
5679         /* Called just before we write the hibernation image.
5680          *
5681          * We need to update the domain tracking to reflect that the CPU
5682          * will be accessing all the pages to create and restore from the
5683          * hibernation, and so upon restoration those pages will be in the
5684          * CPU domain.
5685          *
5686          * To make sure the hibernation image contains the latest state,
5687          * we update that state just before writing out the image.
5688          *
5689          * To try and reduce the hibernation image, we manually shrink
5690          * the objects as well, see i915_gem_freeze()
5691          */
5692
5693         i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND);
5694         i915_gem_drain_freed_objects(dev_priv);
5695
5696         spin_lock(&dev_priv->mm.obj_lock);
5697         for (p = phases; *p; p++) {
5698                 list_for_each_entry(obj, *p, mm.link)
5699                         __start_cpu_write(obj);
5700         }
5701         spin_unlock(&dev_priv->mm.obj_lock);
5702
5703         return 0;
5704 }
5705
5706 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5707 {
5708         struct drm_i915_file_private *file_priv = file->driver_priv;
5709         struct i915_request *request;
5710
5711         /* Clean up our request list when the client is going away, so that
5712          * later retire_requests won't dereference our soon-to-be-gone
5713          * file_priv.
5714          */
5715         spin_lock(&file_priv->mm.lock);
5716         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5717                 request->file_priv = NULL;
5718         spin_unlock(&file_priv->mm.lock);
5719 }
5720
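/**
 * i915_gem_open - set up per-client GEM state for a new drm file
 * @i915: i915 device private
 * @file: drm file being opened
 *
 * Allocates the drm_i915_file_private for this client, initialises its
 * request list and sets up the client's GEM context state via
 * i915_gem_context_open().
 */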
5721 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5722 {
5723         struct drm_i915_file_private *file_priv;
5724         int ret;
5725
5726         DRM_DEBUG("\n");
5727
5728         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5729         if (!file_priv)
5730                 return -ENOMEM;
5731
5732         file->driver_priv = file_priv;
5733         file_priv->dev_priv = i915;
5734         file_priv->file = file;
5735
5736         spin_lock_init(&file_priv->mm.lock);
5737         INIT_LIST_HEAD(&file_priv->mm.request_list);
5738
5739         file_priv->bsd_engine = -1;
5740
5741         ret = i915_gem_context_open(i915, file);
5742         if (ret)
5743                 kfree(file_priv);
5744
5745         return ret;
5746 }
5747
5748 /**
5749  * i915_gem_track_fb - update frontbuffer tracking
5750  * @old: current GEM buffer for the frontbuffer slots
5751  * @new: new GEM buffer for the frontbuffer slots
5752  * @frontbuffer_bits: bitmask of frontbuffer slots
5753  *
5754  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5755  * from @old and setting them in @new. Both @old and @new can be NULL.
5756  */
5757 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5758                        struct drm_i915_gem_object *new,
5759                        unsigned frontbuffer_bits)
5760 {
5761         /* Control of individual bits within the mask is guarded by
5762          * the owning plane->mutex, i.e. we can never see concurrent
5763          * manipulation of individual bits. But since the bitfield as a whole
5764          * is updated using RMW, we need to use atomics in order to update
5765          * the bits.
5766          */
5767         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5768                      sizeof(atomic_t) * BITS_PER_BYTE);
5769
5770         if (old) {
5771                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5772                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5773         }
5774
5775         if (new) {
5776                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5777                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5778         }
5779 }
5780
5781 /* Allocate a new GEM object and fill it with the supplied data */
5782 struct drm_i915_gem_object *
5783 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5784                                  const void *data, size_t size)
5785 {
5786         struct drm_i915_gem_object *obj;
5787         struct file *file;
5788         size_t offset;
5789         int err;
5790
5791         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5792         if (IS_ERR(obj))
5793                 return obj;
5794
5795         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5796
5797         file = obj->base.filp;
5798         offset = 0;
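        /* Copy the data into the object's shmemfs backing store one page
         * at a time, using the same write_begin/write_end protocol as a
         * regular pagecache write.
         */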
5799         do {
5800                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5801                 struct page *page;
5802                 void *pgdata, *vaddr;
5803
5804                 err = pagecache_write_begin(file, file->f_mapping,
5805                                             offset, len, 0,
5806                                             &page, &pgdata);
5807                 if (err < 0)
5808                         goto fail;
5809
5810                 vaddr = kmap(page);
5811                 memcpy(vaddr, data, len);
5812                 kunmap(page);
5813
5814                 err = pagecache_write_end(file, file->f_mapping,
5815                                           offset, len, len,
5816                                           page, pgdata);
5817                 if (err < 0)
5818                         goto fail;
5819
5820                 size -= len;
5821                 data += len;
5822                 offset += len;
5823         } while (size);
5824
5825         return obj;
5826
5827 fail:
5828         i915_gem_object_put(obj);
5829         return ERR_PTR(err);
5830 }
5831
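/* Look up the scatterlist entry covering page index @n of the object,
 * returning via @offset how far into that entry the page lies. The caller
 * must already hold a pin on the object's pages.
 */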
5832 struct scatterlist *
5833 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5834                        unsigned int n,
5835                        unsigned int *offset)
5836 {
5837         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5838         struct scatterlist *sg;
5839         unsigned int idx, count;
5840
5841         might_sleep();
5842         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5843         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5844
5845         /* As we iterate forward through the sg, we record each entry in a
5846          * radixtree for quick repeated (backwards) lookups. If we have seen
5847          * this index previously, we will have an entry for it.
5848          *
5849          * Initial lookup is O(N), but this is amortized to O(1) for
5850          * sequential page access (where each new request is consecutive
5851          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5852          * i.e. O(1) with a large constant!
5853          */
5854         if (n < READ_ONCE(iter->sg_idx))
5855                 goto lookup;
5856
5857         mutex_lock(&iter->lock);
5858
5859         /* We prefer to reuse the last sg so that repeated lookups of this
5860          * (or the subsequent) sg are fast - comparing against the last
5861          * sg is faster than going through the radixtree.
5862          */
5863
5864         sg = iter->sg_pos;
5865         idx = iter->sg_idx;
5866         count = __sg_page_count(sg);
5867
5868         while (idx + count <= n) {
5869                 unsigned long exception, i;
5870                 int ret;
5871
5872                 /* If we cannot allocate and insert this entry, or the
5873                  * individual pages from this range, do not update the
5874                  * sg_idx: this lookup then falls back to a linear scan
5875                  * onwards, but future lookups will retry the insertion
5876                  * (in which case we need to be careful of the error
5877                  * return reporting that we have already inserted this
5878                  * index).
5879                  */
5880                 ret = radix_tree_insert(&iter->radix, idx, sg);
5881                 if (ret && ret != -EEXIST)
5882                         goto scan;
5883
5884                 exception =
5885                         RADIX_TREE_EXCEPTIONAL_ENTRY |
5886                         idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
5887                 for (i = 1; i < count; i++) {
5888                         ret = radix_tree_insert(&iter->radix, idx + i,
5889                                                 (void *)exception);
5890                         if (ret && ret != -EEXIST)
5891                                 goto scan;
5892                 }
5893
5894                 idx += count;
5895                 sg = ____sg_next(sg);
5896                 count = __sg_page_count(sg);
5897         }
5898
5899 scan:
5900         iter->sg_pos = sg;
5901         iter->sg_idx = idx;
5902
5903         mutex_unlock(&iter->lock);
5904
5905         if (unlikely(n < idx)) /* insertion completed by another thread */
5906                 goto lookup;
5907
5908         /* In case we failed to insert the entry into the radixtree, we need
5909          * to look beyond the current sg.
5910          */
5911         while (idx + count <= n) {
5912                 idx += count;
5913                 sg = ____sg_next(sg);
5914                 count = __sg_page_count(sg);
5915         }
5916
5917         *offset = n - idx;
5918         return sg;
5919
5920 lookup:
5921         rcu_read_lock();
5922
5923         sg = radix_tree_lookup(&iter->radix, n);
5924         GEM_BUG_ON(!sg);
5925
5926         /* If this index is in the middle of a multi-page sg entry,
5927          * the radixtree will contain an exceptional entry that points
5928          * to the start of that range. We will return the pointer to
5929          * the base page and the offset of this page within the
5930          * sg entry's range.
5931          */
5932         *offset = 0;
5933         if (unlikely(radix_tree_exception(sg))) {
5934                 unsigned long base =
5935                         (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
5936
5937                 sg = radix_tree_lookup(&iter->radix, base);
5938                 GEM_BUG_ON(!sg);
5939
5940                 *offset = n - base;
5941         }
5942
5943         rcu_read_unlock();
5944
5945         return sg;
5946 }
5947
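/* Return the struct page backing page index @n of the object; the pages
 * must already be pinned by the caller.
 */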
5948 struct page *
5949 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
5950 {
5951         struct scatterlist *sg;
5952         unsigned int offset;
5953
5954         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
5955
5956         sg = i915_gem_object_get_sg(obj, n, &offset);
5957         return nth_page(sg_page(sg), offset);
5958 }
5959
5960 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
5961 struct page *
5962 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
5963                                unsigned int n)
5964 {
5965         struct page *page;
5966
5967         page = i915_gem_object_get_page(obj, n);
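        /* When obj->mm.dirty is set, the backend marks every page dirty as
         * the pages are released, so only flag the individual page here
         * when that is not (yet) the case.
         */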
5968         if (!obj->mm.dirty)
5969                 set_page_dirty(page);
5970
5971         return page;
5972 }
5973
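/* Return the DMA address of page index @n within the object's backing
 * storage; the pages must be pinned and DMA mapped.
 */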
5974 dma_addr_t
5975 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
5976                                 unsigned long n)
5977 {
5978         struct scatterlist *sg;
5979         unsigned int offset;
5980
5981         sg = i915_gem_object_get_sg(obj, n, &offset);
5982         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
5983 }
5984
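/* Convert a default (shmemfs-backed) object over to the physically
 * contiguous phys backend. Any GGTT/PPGTT bindings are dropped first, and
 * the object must not be mapped, quirked or marked DONTNEED; the requested
 * alignment must not exceed the object size.
 */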
5985 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
5986 {
5987         struct sg_table *pages;
5988         int err;
5989
5990         if (align > obj->base.size)
5991                 return -EINVAL;
5992
5993         if (obj->ops == &i915_gem_phys_ops)
5994                 return 0;
5995
5996         if (obj->ops != &i915_gem_object_ops)
5997                 return -EINVAL;
5998
5999         err = i915_gem_object_unbind(obj);
6000         if (err)
6001                 return err;
6002
6003         mutex_lock(&obj->mm.lock);
6004
6005         if (obj->mm.madv != I915_MADV_WILLNEED) {
6006                 err = -EFAULT;
6007                 goto err_unlock;
6008         }
6009
6010         if (obj->mm.quirked) {
6011                 err = -EFAULT;
6012                 goto err_unlock;
6013         }
6014
6015         if (obj->mm.mapping) {
6016                 err = -EBUSY;
6017                 goto err_unlock;
6018         }
6019
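        /* Unhook the existing shmemfs page set before switching obj->ops;
         * it is released below once the new phys page set has been created,
         * or restored on failure.
         */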
6020         pages = fetch_and_zero(&obj->mm.pages);
6021         if (pages) {
6022                 struct drm_i915_private *i915 = to_i915(obj->base.dev);
6023
6024                 __i915_gem_object_reset_page_iter(obj);
6025
6026                 spin_lock(&i915->mm.obj_lock);
6027                 list_del(&obj->mm.link);
6028                 spin_unlock(&i915->mm.obj_lock);
6029         }
6030
6031         obj->ops = &i915_gem_phys_ops;
6032
6033         err = ____i915_gem_object_get_pages(obj);
6034         if (err)
6035                 goto err_xfer;
6036
6037         /* Perma-pin (until release) the physical set of pages */
6038         __i915_gem_object_pin_pages(obj);
6039
6040         if (!IS_ERR_OR_NULL(pages))
6041                 i915_gem_object_ops.put_pages(obj, pages);
6042         mutex_unlock(&obj->mm.lock);
6043         return 0;
6044
6045 err_xfer:
6046         obj->ops = &i915_gem_object_ops;
6047         obj->mm.pages = pages;
6048 err_unlock:
6049         mutex_unlock(&obj->mm.lock);
6050         return err;
6051 }
6052
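/* Build the selftests into this translation unit so that they can exercise
 * the static helpers above.
 */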
6053 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6054 #include "selftests/scatterlist.c"
6055 #include "selftests/mock_gem_device.c"
6056 #include "selftests/huge_gem_object.c"
6057 #include "selftests/huge_pages.c"
6058 #include "selftests/i915_gem_object.c"
6059 #include "selftests/i915_gem_coherency.c"
6060 #endif