drivers/gpu/drm/i915/i915_gem.c

   1 /*
   2  * Copyright © 2008-2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include <drm/drmP.h>
  29 #include <drm/drm_vma_manager.h>
  30 #include <drm/i915_drm.h>
  31 #include "i915_drv.h"
  32 #include "i915_gem_clflush.h"
  33 #include "i915_vgpu.h"
  34 #include "i915_trace.h"
  35 #include "intel_drv.h"
  36 #include "intel_frontbuffer.h"
  37 #include "intel_mocs.h"
  38 #include "intel_workarounds.h"
  39 #include "i915_gemfs.h"
  40 #include <linux/dma-fence-array.h>
  41 #include <linux/kthread.h>
  42 #include <linux/reservation.h>
  43 #include <linux/shmem_fs.h>
  44 #include <linux/slab.h>
  45 #include <linux/stop_machine.h>
  46 #include <linux/swap.h>
  47 #include <linux/pci.h>
  48 #include <linux/dma-buf.h>
  49
  50 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
  51
  52 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
  53 {
  54         if (obj->cache_dirty)
  55                 return false;
  56
  57         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
  58                 return true;
  59
  60         return obj->pin_global; /* currently in use by HW, keep flushed */
  61 }
  62
  63 static int
  64 insert_mappable_node(struct i915_ggtt *ggtt,
  65                      struct drm_mm_node *node, u32 size)
  66 {
  67         memset(node, 0, sizeof(*node));
  68         return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
  69                                            size, 0, I915_COLOR_UNEVICTABLE,
  70                                            0, ggtt->mappable_end,
  71                                            DRM_MM_INSERT_LOW);
  72 }
  73
  74 static void
  75 remove_mappable_node(struct drm_mm_node *node)
  76 {
  77         drm_mm_remove_node(node);
  78 }
  79
  80 /* some bookkeeping */
  81 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
  82                                   u64 size)
  83 {
  84         spin_lock(&dev_priv->mm.object_stat_lock);
  85         dev_priv->mm.object_count++;
  86         dev_priv->mm.object_memory += size;
  87         spin_unlock(&dev_priv->mm.object_stat_lock);
  88 }
  89
  90 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
  91                                      u64 size)
  92 {
  93         spin_lock(&dev_priv->mm.object_stat_lock);
  94         dev_priv->mm.object_count--;
  95         dev_priv->mm.object_memory -= size;
  96         spin_unlock(&dev_priv->mm.object_stat_lock);
  97 }
  98
  99 static int
 100 i915_gem_wait_for_error(struct i915_gpu_error *error)
 101 {
 102         int ret;
 103
 104         might_sleep();
 105
 106         /*
 107          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 108          * userspace. If it takes that long something really bad is going on and
 109          * we should simply try to bail out and fail as gracefully as possible.
 110          */
 111         ret = wait_event_interruptible_timeout(error->reset_queue,
 112                                                !i915_reset_backoff(error),
 113                                                I915_RESET_TIMEOUT);
 114         if (ret == 0) {
 115                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 116                 return -EIO;
 117         } else if (ret < 0) {
 118                 return ret;
 119         } else {
 120                 return 0;
 121         }
 122 }
 123
 124 int i915_mutex_lock_interruptible(struct drm_device *dev)
 125 {
 126         struct drm_i915_private *dev_priv = to_i915(dev);
 127         int ret;
 128
 129         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
 130         if (ret)
 131                 return ret;
 132
 133         ret = mutex_lock_interruptible(&dev->struct_mutex);
 134         if (ret)
 135                 return ret;
 136
 137         return 0;
 138 }
 139
 140 static u32 __i915_gem_park(struct drm_i915_private *i915)
 141 {
 142         GEM_TRACE("\n");
 143
 144         lockdep_assert_held(&i915->drm.struct_mutex);
 145         GEM_BUG_ON(i915->gt.active_requests);
 146         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
 147
 148         if (!i915->gt.awake)
 149                 return I915_EPOCH_INVALID;
 150
 151         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
 152
 153         /*
 154          * Be paranoid and flush a concurrent interrupt to make sure
 155          * we don't reactivate any irq tasklets after parking.
 156          *
 157          * FIXME: Note that even though we have waited for execlists to be idle,
 158          * there may still be an in-flight interrupt even though the CSB
 159          * is now empty. synchronize_irq() makes sure that a residual interrupt
 160          * is completed before we continue, but it doesn't prevent the HW from
 161          * raising a spurious interrupt later. To complete the shield we should
 162          * coordinate disabling the CS irq with flushing the interrupts.
 163          */
 164         synchronize_irq(i915->drm.irq);
 165
 166         intel_engines_park(i915);
 167         i915_timelines_park(i915);
 168
 169         i915_pmu_gt_parked(i915);
 170         i915_vma_parked(i915);
 171
 172         i915->gt.awake = false;
 173
 174         if (INTEL_GEN(i915) >= 6)
 175                 gen6_rps_idle(i915);
 176
 177         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);
 178
 179         intel_runtime_pm_put(i915);
 180
 181         return i915->gt.epoch;
 182 }
 183
 184 void i915_gem_park(struct drm_i915_private *i915)
 185 {
 186         GEM_TRACE("\n");
 187
 188         lockdep_assert_held(&i915->drm.struct_mutex);
 189         GEM_BUG_ON(i915->gt.active_requests);
 190
 191         if (!i915->gt.awake)
 192                 return;
 193
 194         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
 195         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
 196 }
 197
 198 void i915_gem_unpark(struct drm_i915_private *i915)
 199 {
 200         GEM_TRACE("\n");
 201
 202         lockdep_assert_held(&i915->drm.struct_mutex);
 203         GEM_BUG_ON(!i915->gt.active_requests);
 204
 205         if (i915->gt.awake)
 206                 return;
 207
 208         intel_runtime_pm_get_noresume(i915);
 209
 210         /*
 211          * It seems that the DMC likes to transition between the DC states a lot
 212          * when there are no connected displays (no active power domains) during
 213          * command submission.
 214          *
 215          * This activity has negative impact on the performance of the chip with
 216          * huge latencies observed in the interrupt handler and elsewhere.
 217          *
 218          * Work around it by grabbing a GT IRQ power domain whilst there is any
 219          * GT activity, preventing any DC state transitions.
 220          */
 221         intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
 222
 223         i915->gt.awake = true;
 224         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
 225                 i915->gt.epoch = 1;
 226
 227         intel_enable_gt_powersave(i915);
 228         i915_update_gfx_val(i915);
 229         if (INTEL_GEN(i915) >= 6)
 230                 gen6_rps_busy(i915);
 231         i915_pmu_gt_unparked(i915);
 232
 233         intel_engines_unpark(i915);
 234
 235         i915_queue_hangcheck(i915);
 236
 237         queue_delayed_work(i915->wq,
 238                            &i915->gt.retire_work,
 239                            round_jiffies_up_relative(HZ));
 240 }
 241
 242 int
 243 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 244                             struct drm_file *file)
 245 {
 246         struct drm_i915_private *dev_priv = to_i915(dev);
 247         struct i915_ggtt *ggtt = &dev_priv->ggtt;
 248         struct drm_i915_gem_get_aperture *args = data;
 249         struct i915_vma *vma;
 250         u64 pinned;
 251
 252         pinned = ggtt->vm.reserved;
 253         mutex_lock(&dev->struct_mutex);
 254         list_for_each_entry(vma, &ggtt->vm.active_list, vm_link)
 255                 if (i915_vma_is_pinned(vma))
 256                         pinned += vma->node.size;
 257         list_for_each_entry(vma, &ggtt->vm.inactive_list, vm_link)
 258                 if (i915_vma_is_pinned(vma))
 259                         pinned += vma->node.size;
 260         mutex_unlock(&dev->struct_mutex);
 261
 262         args->aper_size = ggtt->vm.total;
 263         args->aper_available_size = args->aper_size - pinned;
 264
 265         return 0;
 266 }
 267
 268 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 269 {
 270         struct address_space *mapping = obj->base.filp->f_mapping;
 271         drm_dma_handle_t *phys;
 272         struct sg_table *st;
 273         struct scatterlist *sg;
 274         char *vaddr;
 275         int i;
 276         int err;
 277
 278         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
 279                 return -EINVAL;
 280
 281         /* Always aligning to the object size, allows a single allocation
 282          * to handle all possible callers, and given typical object sizes,
 283          * the alignment of the buddy allocation will naturally match.
 284          */
 285         phys = drm_pci_alloc(obj->base.dev,
 286                              roundup_pow_of_two(obj->base.size),
 287                              roundup_pow_of_two(obj->base.size));
 288         if (!phys)
 289                 return -ENOMEM;
 290
 291         vaddr = phys->vaddr;
 292         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 293                 struct page *page;
 294                 char *src;
 295
 296                 page = shmem_read_mapping_page(mapping, i);
 297                 if (IS_ERR(page)) {
 298                         err = PTR_ERR(page);
 299                         goto err_phys;
 300                 }
 301
 302                 src = kmap_atomic(page);
 303                 memcpy(vaddr, src, PAGE_SIZE);
 304                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
 305                 kunmap_atomic(src);
 306
 307                 put_page(page);
 308                 vaddr += PAGE_SIZE;
 309         }
 310
 311         i915_gem_chipset_flush(to_i915(obj->base.dev));
 312
 313         st = kmalloc(sizeof(*st), GFP_KERNEL);
 314         if (!st) {
 315                 err = -ENOMEM;
 316                 goto err_phys;
 317         }
 318
 319         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
 320                 kfree(st);
 321                 err = -ENOMEM;
 322                 goto err_phys;
 323         }
 324
 325         sg = st->sgl;
 326         sg->offset = 0;
 327         sg->length = obj->base.size;
 328
 329         sg_dma_address(sg) = phys->busaddr;
 330         sg_dma_len(sg) = obj->base.size;
 331
 332         obj->phys_handle = phys;
 333
 334         __i915_gem_object_set_pages(obj, st, sg->length);
 335
 336         return 0;
 337
 338 err_phys:
 339         drm_pci_free(obj->base.dev, phys);
 340
 341         return err;
 342 }
 343
 344 static void __start_cpu_write(struct drm_i915_gem_object *obj)
 345 {
 346         obj->read_domains = I915_GEM_DOMAIN_CPU;
 347         obj->write_domain = I915_GEM_DOMAIN_CPU;
 348         if (cpu_write_needs_clflush(obj))
 349                 obj->cache_dirty = true;
 350 }
 351
 352 static void
 353 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 354                                 struct sg_table *pages,
 355                                 bool needs_clflush)
 356 {
 357         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
 358
 359         if (obj->mm.madv == I915_MADV_DONTNEED)
 360                 obj->mm.dirty = false;
 361
 362         if (needs_clflush &&
 363             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
 364             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
 365                 drm_clflush_sg(pages);
 366
 367         __start_cpu_write(obj);
 368 }
 369
 370 static void
 371 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
 372                                struct sg_table *pages)
 373 {
 374         __i915_gem_object_release_shmem(obj, pages, false);
 375
 376         if (obj->mm.dirty) {
 377                 struct address_space *mapping = obj->base.filp->f_mapping;
 378                 char *vaddr = obj->phys_handle->vaddr;
 379                 int i;
 380
 381                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 382                         struct page *page;
 383                         char *dst;
 384
 385                         page = shmem_read_mapping_page(mapping, i);
 386                         if (IS_ERR(page))
 387                                 continue;
 388
 389                         dst = kmap_atomic(page);
 390                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
 391                         memcpy(dst, vaddr, PAGE_SIZE);
 392                         kunmap_atomic(dst);
 393
 394                         set_page_dirty(page);
 395                         if (obj->mm.madv == I915_MADV_WILLNEED)
 396                                 mark_page_accessed(page);
 397                         put_page(page);
 398                         vaddr += PAGE_SIZE;
 399                 }
 400                 obj->mm.dirty = false;
 401         }
 402
 403         sg_free_table(pages);
 404         kfree(pages);
 405
 406         drm_pci_free(obj->base.dev, obj->phys_handle);
 407 }
 408
 409 static void
 410 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
 411 {
 412         i915_gem_object_unpin_pages(obj);
 413 }
 414
 415 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
 416         .get_pages = i915_gem_object_get_pages_phys,
 417         .put_pages = i915_gem_object_put_pages_phys,
 418         .release = i915_gem_object_release_phys,
 419 };
 420
 421 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
 422
 423 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 424 {
 425         struct i915_vma *vma;
 426         LIST_HEAD(still_in_list);
 427         int ret;
 428
 429         lockdep_assert_held(&obj->base.dev->struct_mutex);
 430
 431         /* Closed vma are removed from the obj->vma_list - but they may
 432          * still have an active binding on the object. To remove those we
 433          * must wait for all rendering to complete to the object (as unbinding
 434          * must anyway), and retire the requests.
 435          */
 436         ret = i915_gem_object_set_to_cpu_domain(obj, false);
 437         if (ret)
 438                 return ret;
 439
 440         while ((vma = list_first_entry_or_null(&obj->vma_list,
 441                                                struct i915_vma,
 442                                                obj_link))) {
 443                 list_move_tail(&vma->obj_link, &still_in_list);
 444                 ret = i915_vma_unbind(vma);
 445                 if (ret)
 446                         break;
 447         }
 448         list_splice(&still_in_list, &obj->vma_list);
 449
 450         return ret;
 451 }
 452
 453 static long
 454 i915_gem_object_wait_fence(struct dma_fence *fence,
 455                            unsigned int flags,
 456                            long timeout,
 457                            struct intel_rps_client *rps_client)
 458 {
 459         struct i915_request *rq;
 460
 461         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
 462
 463         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 464                 return timeout;
 465
 466         if (!dma_fence_is_i915(fence))
 467                 return dma_fence_wait_timeout(fence,
 468                                               flags & I915_WAIT_INTERRUPTIBLE,
 469                                               timeout);
 470
 471         rq = to_request(fence);
 472         if (i915_request_completed(rq))
 473                 goto out;
 474
 475         /*
 476          * This client is about to stall waiting for the GPU. In many cases
 477          * this is undesirable and limits the throughput of the system, as
 478          * many clients cannot continue processing user input/output whilst
 479          * blocked. RPS autotuning may take tens of milliseconds to respond
 480          * to the GPU load and thus incurs additional latency for the client.
 481          * We can circumvent that by promoting the GPU frequency to maximum
 482          * before we wait. This makes the GPU throttle up much more quickly
 483          * (good for benchmarks and user experience, e.g. window animations),
 484          * but at a cost of spending more power processing the workload
 485          * (bad for battery). Not all clients even want their results
 486          * immediately and for them we should just let the GPU select its own
 487          * frequency to maximise efficiency. To prevent a single client from
 488          * forcing the clocks too high for the whole system, we only allow
 489          * each client to waitboost once in a busy period.
 490          */
 491         if (rps_client && !i915_request_started(rq)) {
 492                 if (INTEL_GEN(rq->i915) >= 6)
 493                         gen6_rps_boost(rq, rps_client);
 494         }
 495
 496         timeout = i915_request_wait(rq, flags, timeout);
 497
 498 out:
 499         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
 500                 i915_request_retire_upto(rq);
 501
 502         return timeout;
 503 }
 504
 505 static long
 506 i915_gem_object_wait_reservation(struct reservation_object *resv,
 507                                  unsigned int flags,
 508                                  long timeout,
 509                                  struct intel_rps_client *rps_client)
 510 {
 511         unsigned int seq = __read_seqcount_begin(&resv->seq);
 512         struct dma_fence *excl;
 513         bool prune_fences = false;
 514
 515         if (flags & I915_WAIT_ALL) {
 516                 struct dma_fence **shared;
 517                 unsigned int count, i;
 518                 int ret;
 519
 520                 ret = reservation_object_get_fences_rcu(resv,
 521                                                         &excl, &count, &shared);
 522                 if (ret)
 523                         return ret;
 524
 525                 for (i = 0; i < count; i++) {
 526                         timeout = i915_gem_object_wait_fence(shared[i],
 527                                                              flags, timeout,
 528                                                              rps_client);
 529                         if (timeout < 0)
 530                                 break;
 531
 532                         dma_fence_put(shared[i]);
 533                 }
 534
 535                 for (; i < count; i++)
 536                         dma_fence_put(shared[i]);
 537                 kfree(shared);
 538
 539                 /*
 540                  * If both shared fences and an exclusive fence exist,
 541                  * then by construction the shared fences must be later
 542                  * than the exclusive fence. If we successfully wait for
 543                  * all the shared fences, we know that the exclusive fence
 544                  * must all be signaled. If all the shared fences are
 545                  * signaled, we can prune the array and recover the
 546                  * floating references on the fences/requests.
 547                  */
 548                 prune_fences = count && timeout >= 0;
 549         } else {
 550                 excl = reservation_object_get_excl_rcu(resv);
 551         }
 552
 553         if (excl && timeout >= 0)
 554                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
 555                                                      rps_client);
 556
 557         dma_fence_put(excl);
 558
 559         /*
 560          * Opportunistically prune the fences iff we know they have *all* been
 561          * signaled and that the reservation object has not been changed (i.e.
 562          * no new fences have been added).
 563          */
 564         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
 565                 if (reservation_object_trylock(resv)) {
 566                         if (!__read_seqcount_retry(&resv->seq, seq))
 567                                 reservation_object_add_excl_fence(resv, NULL);
 568                         reservation_object_unlock(resv);
 569                 }
 570         }
 571
 572         return timeout;
 573 }
 574
 575 static void __fence_set_priority(struct dma_fence *fence,
 576                                  const struct i915_sched_attr *attr)
 577 {
 578         struct i915_request *rq;
 579         struct intel_engine_cs *engine;
 580
 581         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
 582                 return;
 583
 584         rq = to_request(fence);
 585         engine = rq->engine;
 586
 587         local_bh_disable();
 588         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
 589         if (engine->schedule)
 590                 engine->schedule(rq, attr);
 591         rcu_read_unlock();
 592         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
 593 }
 594
 595 static void fence_set_priority(struct dma_fence *fence,
 596                                const struct i915_sched_attr *attr)
 597 {
 598         /* Recurse once into a fence-array */
 599         if (dma_fence_is_array(fence)) {
 600                 struct dma_fence_array *array = to_dma_fence_array(fence);
 601                 int i;
 602
 603                 for (i = 0; i < array->num_fences; i++)
 604                         __fence_set_priority(array->fences[i], attr);
 605         } else {
 606                 __fence_set_priority(fence, attr);
 607         }
 608 }
 609
 610 int
 611 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
 612                               unsigned int flags,
 613                               const struct i915_sched_attr *attr)
 614 {
 615         struct dma_fence *excl;
 616
 617         if (flags & I915_WAIT_ALL) {
 618                 struct dma_fence **shared;
 619                 unsigned int count, i;
 620                 int ret;
 621
 622                 ret = reservation_object_get_fences_rcu(obj->resv,
 623                                                         &excl, &count, &shared);
 624                 if (ret)
 625                         return ret;
 626
 627                 for (i = 0; i < count; i++) {
 628                         fence_set_priority(shared[i], attr);
 629                         dma_fence_put(shared[i]);
 630                 }
 631
 632                 kfree(shared);
 633         } else {
 634                 excl = reservation_object_get_excl_rcu(obj->resv);
 635         }
 636
 637         if (excl) {
 638                 fence_set_priority(excl, attr);
 639                 dma_fence_put(excl);
 640         }
 641         return 0;
 642 }
 643
 644 /**
 645  * Waits for rendering to the object to be completed
 646  * @obj: i915 gem object
 647  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
 648  * @timeout: how long to wait
 649  * @rps_client: client (user process) to charge for any waitboosting
 650  */
 651 int
 652 i915_gem_object_wait(struct drm_i915_gem_object *obj,
 653                      unsigned int flags,
 654                      long timeout,
 655                      struct intel_rps_client *rps_client)
 656 {
 657         might_sleep();
 658 #if IS_ENABLED(CONFIG_LOCKDEP)
 659         GEM_BUG_ON(debug_locks &&
 660                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
 661                    !!(flags & I915_WAIT_LOCKED));
 662 #endif
 663         GEM_BUG_ON(timeout < 0);
 664
 665         timeout = i915_gem_object_wait_reservation(obj->resv,
 666                                                    flags, timeout,
 667                                                    rps_client);
 668         return timeout < 0 ? timeout : 0;
 669 }
 670
 671 static struct intel_rps_client *to_rps_client(struct drm_file *file)
 672 {
 673         struct drm_i915_file_private *fpriv = file->driver_priv;
 674
 675         return &fpriv->rps_client;
 676 }
 677
 678 static int
 679 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
 680                      struct drm_i915_gem_pwrite *args,
 681                      struct drm_file *file)
 682 {
 683         void *vaddr = obj->phys_handle->vaddr + args->offset;
 684         char __user *user_data = u64_to_user_ptr(args->data_ptr);
 685
 686         /* We manually control the domain here and pretend that it
 687          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
 688          */
 689         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 690         if (copy_from_user(vaddr, user_data, args->size))
 691                 return -EFAULT;
 692
 693         drm_clflush_virt_range(vaddr, args->size);
 694         i915_gem_chipset_flush(to_i915(obj->base.dev));
 695
 696         intel_fb_obj_flush(obj, ORIGIN_CPU);
 697         return 0;
 698 }
 699
 700 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
 701 {
 702         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
 703 }
 704
 705 void i915_gem_object_free(struct drm_i915_gem_object *obj)
 706 {
 707         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 708         kmem_cache_free(dev_priv->objects, obj);
 709 }
 710
 711 static int
 712 i915_gem_create(struct drm_file *file,
 713                 struct drm_i915_private *dev_priv,
 714                 uint64_t size,
 715                 uint32_t *handle_p)
 716 {
 717         struct drm_i915_gem_object *obj;
 718         int ret;
 719         u32 handle;
 720
 721         size = roundup(size, PAGE_SIZE);
 722         if (size == 0)
 723                 return -EINVAL;
 724
 725         /* Allocate the new object */
 726         obj = i915_gem_object_create(dev_priv, size);
 727         if (IS_ERR(obj))
 728                 return PTR_ERR(obj);
 729
 730         ret = drm_gem_handle_create(file, &obj->base, &handle);
 731         /* drop reference from allocate - handle holds it now */
 732         i915_gem_object_put(obj);
 733         if (ret)
 734                 return ret;
 735
 736         *handle_p = handle;
 737         return 0;
 738 }
 739
 740 int
 741 i915_gem_dumb_create(struct drm_file *file,
 742                      struct drm_device *dev,
 743                      struct drm_mode_create_dumb *args)
 744 {
 745         /* have to work out size/pitch and return them */
 746         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
 747         args->size = args->pitch * args->height;
 748         return i915_gem_create(file, to_i915(dev),
 749                                args->size, &args->handle);
 750 }
 751
 752 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 753 {
 754         return !(obj->cache_level == I915_CACHE_NONE ||
 755                  obj->cache_level == I915_CACHE_WT);
 756 }
 757
 758 /**
 759  * Creates a new mm object and returns a handle to it.
 760  * @dev: drm device pointer
 761  * @data: ioctl data blob
 762  * @file: drm file pointer
 763  */
 764 int
 765 i915_gem_create_ioctl(struct drm_device *dev, void *data,
 766                       struct drm_file *file)
 767 {
 768         struct drm_i915_private *dev_priv = to_i915(dev);
 769         struct drm_i915_gem_create *args = data;
 770
 771         i915_gem_flush_free_objects(dev_priv);
 772
 773         return i915_gem_create(file, dev_priv,
 774                                args->size, &args->handle);
 775 }
 776
 777 static inline enum fb_op_origin
 778 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
 779 {
 780         return (domain == I915_GEM_DOMAIN_GTT ?
 781                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
 782 }
 783
 784 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
 785 {
 786         /*
 787          * No actual flushing is required for the GTT write domain for reads
 788          * from the GTT domain. Writes to it "immediately" go to main memory
 789          * as far as we know, so there's no chipset flush. It also doesn't
 790          * land in the GPU render cache.
 791          *
 792          * However, we do have to enforce the order so that all writes through
 793          * the GTT land before any writes to the device, such as updates to
 794          * the GATT itself.
 795          *
 796          * We also have to wait a bit for the writes to land from the GTT.
 797          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
 798          * timing. This issue has only been observed when switching quickly
 799          * between GTT writes and CPU reads from inside the kernel on recent hw,
 800          * and it appears to only affect discrete GTT blocks (i.e. on LLC
 801          * system agents we cannot reproduce this behaviour, until Cannonlake
 802          * that was!).
 803          */
 804
 805         wmb();
 806
 807         if (INTEL_INFO(dev_priv)->has_coherent_ggtt)
 808                 return;
 809
 810         i915_gem_chipset_flush(dev_priv);
 811
 812         intel_runtime_pm_get(dev_priv);
 813         spin_lock_irq(&dev_priv->uncore.lock);
 814
 815         POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
 816
 817         spin_unlock_irq(&dev_priv->uncore.lock);
 818         intel_runtime_pm_put(dev_priv);
 819 }
 820
 821 static void
 822 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 823 {
 824         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 825         struct i915_vma *vma;
 826
 827         if (!(obj->write_domain & flush_domains))
 828                 return;
 829
 830         switch (obj->write_domain) {
 831         case I915_GEM_DOMAIN_GTT:
 832                 i915_gem_flush_ggtt_writes(dev_priv);
 833
 834                 intel_fb_obj_flush(obj,
 835                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
 836
 837                 for_each_ggtt_vma(vma, obj) {
 838                         if (vma->iomap)
 839                                 continue;
 840
 841                         i915_vma_unset_ggtt_write(vma);
 842                 }
 843                 break;
 844
 845         case I915_GEM_DOMAIN_WC:
 846                 wmb();
 847                 break;
 848
 849         case I915_GEM_DOMAIN_CPU:
 850                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 851                 break;
 852
 853         case I915_GEM_DOMAIN_RENDER:
 854                 if (gpu_write_needs_clflush(obj))
 855                         obj->cache_dirty = true;
 856                 break;
 857         }
 858
 859         obj->write_domain = 0;
 860 }
 861
 862 static inline int
 863 __copy_to_user_swizzled(char __user *cpu_vaddr,
 864                         const char *gpu_vaddr, int gpu_offset,
 865                         int length)
 866 {
 867         int ret, cpu_offset = 0;
 868
 869         while (length > 0) {
 870                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
 871                 int this_length = min(cacheline_end - gpu_offset, length);
 872                 int swizzled_gpu_offset = gpu_offset ^ 64;
 873
 874                 ret = __copy_to_user(cpu_vaddr + cpu_offset,
 875                                      gpu_vaddr + swizzled_gpu_offset,
 876                                      this_length);
 877                 if (ret)
 878                         return ret + length;
 879
 880                 cpu_offset += this_length;
 881                 gpu_offset += this_length;
 882                 length -= this_length;
 883         }
 884
 885         return 0;
 886 }
 887
 888 static inline int
 889 __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
 890                           const char __user *cpu_vaddr,
 891                           int length)
 892 {
 893         int ret, cpu_offset = 0;
 894
 895         while (length > 0) {
 896                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
 897                 int this_length = min(cacheline_end - gpu_offset, length);
 898                 int swizzled_gpu_offset = gpu_offset ^ 64;
 899
 900                 ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
 901                                        cpu_vaddr + cpu_offset,
 902                                        this_length);
 903                 if (ret)
 904                         return ret + length;
 905
 906                 cpu_offset += this_length;
 907                 gpu_offset += this_length;
 908                 length -= this_length;
 909         }
 910
 911         return 0;
 912 }
 913
 914 /*
 915  * Pins the specified object's pages and synchronizes the object with
 916  * GPU accesses. Sets needs_clflush to non-zero if the caller should
 917  * flush the object from the CPU cache.
 918  */
 919 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 920                                     unsigned int *needs_clflush)
 921 {
 922         int ret;
 923
 924         lockdep_assert_held(&obj->base.dev->struct_mutex);
 925
 926         *needs_clflush = 0;
 927         if (!i915_gem_object_has_struct_page(obj))
 928                 return -ENODEV;
 929
 930         ret = i915_gem_object_wait(obj,
 931                                    I915_WAIT_INTERRUPTIBLE |
 932                                    I915_WAIT_LOCKED,
 933                                    MAX_SCHEDULE_TIMEOUT,
 934                                    NULL);
 935         if (ret)
 936                 return ret;
 937
 938         ret = i915_gem_object_pin_pages(obj);
 939         if (ret)
 940                 return ret;
 941
 942         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
 943             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 944                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
 945                 if (ret)
 946                         goto err_unpin;
 947                 else
 948                         goto out;
 949         }
 950
 951         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 952
 953         /* If we're not in the cpu read domain, set ourself into the gtt
 954          * read domain and manually flush cachelines (if required). This
 955          * optimizes for the case when the gpu will dirty the data
 956          * anyway again before the next pread happens.
 957          */
 958         if (!obj->cache_dirty &&
 959             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
 960                 *needs_clflush = CLFLUSH_BEFORE;
 961
 962 out:
 963         /* return with the pages pinned */
 964         return 0;
 965
 966 err_unpin:
 967         i915_gem_object_unpin_pages(obj);
 968         return ret;
 969 }
 970
 971 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 972                                      unsigned int *needs_clflush)
 973 {
 974         int ret;
 975
 976         lockdep_assert_held(&obj->base.dev->struct_mutex);
 977
 978         *needs_clflush = 0;
 979         if (!i915_gem_object_has_struct_page(obj))
 980                 return -ENODEV;
 981
 982         ret = i915_gem_object_wait(obj,
 983                                    I915_WAIT_INTERRUPTIBLE |
 984                                    I915_WAIT_LOCKED |
 985                                    I915_WAIT_ALL,
 986                                    MAX_SCHEDULE_TIMEOUT,
 987                                    NULL);
 988         if (ret)
 989                 return ret;
 990
 991         ret = i915_gem_object_pin_pages(obj);
 992         if (ret)
 993                 return ret;
 994
 995         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
 996             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 997                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
 998                 if (ret)
 999                         goto err_unpin;
1000                 else
1001                         goto out;
1002         }
1003
1004         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
1005
1006         /* If we're not in the cpu write domain, set ourself into the
1007          * gtt write domain and manually flush cachelines (as required).
1008          * This optimizes for the case when the gpu will use the data
1009          * right away and we therefore have to clflush anyway.
1010          */
1011         if (!obj->cache_dirty) {
1012                 *needs_clflush |= CLFLUSH_AFTER;
1013
1014                 /*
1015                  * Same trick applies to invalidate partially written
1016                  * cachelines read before writing.
1017                  */
1018                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
1019                         *needs_clflush |= CLFLUSH_BEFORE;
1020         }
1021
1022 out:
1023         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1024         obj->mm.dirty = true;
1025         /* return with the pages pinned */
1026         return 0;
1027
1028 err_unpin:
1029         i915_gem_object_unpin_pages(obj);
1030         return ret;
1031 }
1032
1033 static void
1034 shmem_clflush_swizzled_range(char *addr, unsigned long length,
1035                              bool swizzled)
1036 {
1037         if (unlikely(swizzled)) {
1038                 unsigned long start = (unsigned long) addr;
1039                 unsigned long end = (unsigned long) addr + length;
1040
1041                 /* For swizzling simply ensure that we always flush both
1042                  * channels. Lame, but simple and it works. Swizzled
1043                  * pwrite/pread is far from a hotpath - current userspace
1044                  * doesn't use it at all. */
1045                 start = round_down(start, 128);
1046                 end = round_up(end, 128);
1047
1048                 drm_clflush_virt_range((void *)start, end - start);
1049         } else {
1050                 drm_clflush_virt_range(addr, length);
1051         }
1052
1053 }
1054
1055 /* Only difference to the fast-path function is that this can handle bit17
1056  * and uses non-atomic copy and kmap functions. */
1057 static int
1058 shmem_pread_slow(struct page *page, int offset, int length,
1059                  char __user *user_data,
1060                  bool page_do_bit17_swizzling, bool needs_clflush)
1061 {
1062         char *vaddr;
1063         int ret;
1064
1065         vaddr = kmap(page);
1066         if (needs_clflush)
1067                 shmem_clflush_swizzled_range(vaddr + offset, length,
1068                                              page_do_bit17_swizzling);
1069
1070         if (page_do_bit17_swizzling)
1071                 ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
1072         else
1073                 ret = __copy_to_user(user_data, vaddr + offset, length);
1074         kunmap(page);
1075
1076         return ret ? - EFAULT : 0;
1077 }
1078
1079 static int
1080 shmem_pread(struct page *page, int offset, int length, char __user *user_data,
1081             bool page_do_bit17_swizzling, bool needs_clflush)
1082 {
1083         int ret;
1084
1085         ret = -ENODEV;
1086         if (!page_do_bit17_swizzling) {
1087                 char *vaddr = kmap_atomic(page);
1088
1089                 if (needs_clflush)
1090                         drm_clflush_virt_range(vaddr + offset, length);
1091                 ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
1092                 kunmap_atomic(vaddr);
1093         }
1094         if (ret == 0)
1095                 return 0;
1096
1097         return shmem_pread_slow(page, offset, length, user_data,
1098                                 page_do_bit17_swizzling, needs_clflush);
1099 }
1100
1101 static int
1102 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1103                      struct drm_i915_gem_pread *args)
1104 {
1105         char __user *user_data;
1106         u64 remain;
1107         unsigned int obj_do_bit17_swizzling;
1108         unsigned int needs_clflush;
1109         unsigned int idx, offset;
1110         int ret;
1111
1112         obj_do_bit17_swizzling = 0;
1113         if (i915_gem_object_needs_bit17_swizzle(obj))
1114                 obj_do_bit17_swizzling = BIT(17);
1115
1116         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1117         if (ret)
1118                 return ret;
1119
1120         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1121         mutex_unlock(&obj->base.dev->struct_mutex);
1122         if (ret)
1123                 return ret;
1124
1125         remain = args->size;
1126         user_data = u64_to_user_ptr(args->data_ptr);
1127         offset = offset_in_page(args->offset);
1128         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1129                 struct page *page = i915_gem_object_get_page(obj, idx);
1130                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1131
1132                 ret = shmem_pread(page, offset, length, user_data,
1133                                   page_to_phys(page) & obj_do_bit17_swizzling,
1134                                   needs_clflush);
1135                 if (ret)
1136                         break;
1137
1138                 remain -= length;
1139                 user_data += length;
1140                 offset = 0;
1141         }
1142
1143         i915_gem_obj_finish_shmem_access(obj);
1144         return ret;
1145 }
1146
1147 static inline bool
1148 gtt_user_read(struct io_mapping *mapping,
1149               loff_t base, int offset,
1150               char __user *user_data, int length)
1151 {
1152         void __iomem *vaddr;
1153         unsigned long unwritten;
1154
1155         /* We can use the cpu mem copy function because this is X86. */
1156         vaddr = io_mapping_map_atomic_wc(mapping, base);
1157         unwritten = __copy_to_user_inatomic(user_data,
1158                                             (void __force *)vaddr + offset,
1159                                             length);
1160         io_mapping_unmap_atomic(vaddr);
1161         if (unwritten) {
1162                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1163                 unwritten = copy_to_user(user_data,
1164                                          (void __force *)vaddr + offset,
1165                                          length);
1166                 io_mapping_unmap(vaddr);
1167         }
1168         return unwritten;
1169 }
1170
1171 static int
1172 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1173                    const struct drm_i915_gem_pread *args)
1174 {
1175         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1176         struct i915_ggtt *ggtt = &i915->ggtt;
1177         struct drm_mm_node node;
1178         struct i915_vma *vma;
1179         void __user *user_data;
1180         u64 remain, offset;
1181         int ret;
1182
1183         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1184         if (ret)
1185                 return ret;
1186
1187         intel_runtime_pm_get(i915);
1188         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1189                                        PIN_MAPPABLE |
1190                                        PIN_NONFAULT |
1191                                        PIN_NONBLOCK);
1192         if (!IS_ERR(vma)) {
1193                 node.start = i915_ggtt_offset(vma);
1194                 node.allocated = false;
1195                 ret = i915_vma_put_fence(vma);
1196                 if (ret) {
1197                         i915_vma_unpin(vma);
1198                         vma = ERR_PTR(ret);
1199                 }
1200         }
1201         if (IS_ERR(vma)) {
1202                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1203                 if (ret)
1204                         goto out_unlock;
1205                 GEM_BUG_ON(!node.allocated);
1206         }
1207
1208         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1209         if (ret)
1210                 goto out_unpin;
1211
1212         mutex_unlock(&i915->drm.struct_mutex);
1213
1214         user_data = u64_to_user_ptr(args->data_ptr);
1215         remain = args->size;
1216         offset = args->offset;
1217
1218         while (remain > 0) {
1219                 /* Operation in this page
1220                  *
1221                  * page_base = page offset within aperture
1222                  * page_offset = offset within page
1223                  * page_length = bytes to copy for this page
1224                  */
1225                 u32 page_base = node.start;
1226                 unsigned page_offset = offset_in_page(offset);
1227                 unsigned page_length = PAGE_SIZE - page_offset;
1228                 page_length = remain < page_length ? remain : page_length;
1229                 if (node.allocated) {
1230                         wmb();
1231                         ggtt->vm.insert_page(&ggtt->vm,
1232                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1233                                              node.start, I915_CACHE_NONE, 0);
1234                         wmb();
1235                 } else {
1236                         page_base += offset & PAGE_MASK;
1237                 }
1238
1239                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1240                                   user_data, page_length)) {
1241                         ret = -EFAULT;
1242                         break;
1243                 }
1244
1245                 remain -= page_length;
1246                 user_data += page_length;
1247                 offset += page_length;
1248         }
1249
1250         mutex_lock(&i915->drm.struct_mutex);
1251 out_unpin:
1252         if (node.allocated) {
1253                 wmb();
1254                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1255                 remove_mappable_node(&node);
1256         } else {
1257                 i915_vma_unpin(vma);
1258         }
1259 out_unlock:
1260         intel_runtime_pm_put(i915);
1261         mutex_unlock(&i915->drm.struct_mutex);
1262
1263         return ret;
1264 }
1265
1266 /**
1267  * Reads data from the object referenced by handle.
1268  * @dev: drm device pointer
1269  * @data: ioctl data blob
1270  * @file: drm file pointer
1271  *
1272  * On error, the contents of *data are undefined.
1273  */
1274 int
1275 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1276                      struct drm_file *file)
1277 {
1278         struct drm_i915_gem_pread *args = data;
1279         struct drm_i915_gem_object *obj;
1280         int ret;
1281
1282         if (args->size == 0)
1283                 return 0;
1284
1285         if (!access_ok(VERIFY_WRITE,
1286                        u64_to_user_ptr(args->data_ptr),
1287                        args->size))
1288                 return -EFAULT;
1289
1290         obj = i915_gem_object_lookup(file, args->handle);
1291         if (!obj)
1292                 return -ENOENT;
1293
1294         /* Bounds check source.  */
1295         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1296                 ret = -EINVAL;
1297                 goto out;
1298         }
1299
1300         trace_i915_gem_object_pread(obj, args->offset, args->size);
1301
1302         ret = i915_gem_object_wait(obj,
1303                                    I915_WAIT_INTERRUPTIBLE,
1304                                    MAX_SCHEDULE_TIMEOUT,
1305                                    to_rps_client(file));
1306         if (ret)
1307                 goto out;
1308
1309         ret = i915_gem_object_pin_pages(obj);
1310         if (ret)
1311                 goto out;
1312
1313         ret = i915_gem_shmem_pread(obj, args);
1314         if (ret == -EFAULT || ret == -ENODEV)
1315                 ret = i915_gem_gtt_pread(obj, args);
1316
1317         i915_gem_object_unpin_pages(obj);
1318 out:
1319         i915_gem_object_put(obj);
1320         return ret;
1321 }
1322
1323 /* This is the fast write path which cannot handle
1324  * page faults in the source data
1325  */
1326
1327 static inline bool
1328 ggtt_write(struct io_mapping *mapping,
1329            loff_t base, int offset,
1330            char __user *user_data, int length)
1331 {
1332         void __iomem *vaddr;
1333         unsigned long unwritten;
1334
1335         /* We can use the cpu mem copy function because this is X86. */
1336         vaddr = io_mapping_map_atomic_wc(mapping, base);
1337         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1338                                                       user_data, length);
1339         io_mapping_unmap_atomic(vaddr);
1340         if (unwritten) {
1341                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1342                 unwritten = copy_from_user((void __force *)vaddr + offset,
1343                                            user_data, length);
1344                 io_mapping_unmap(vaddr);
1345         }
1346
1347         return unwritten;
1348 }
1349
1350 /**
1351  * This is the fast pwrite path, where we copy the data directly from the
1352  * user into the GTT, uncached.
1353  * @obj: i915 GEM object
1354  * @args: pwrite arguments structure
1355  */
1356 static int
1357 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1358                          const struct drm_i915_gem_pwrite *args)
1359 {
1360         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1361         struct i915_ggtt *ggtt = &i915->ggtt;
1362         struct drm_mm_node node;
1363         struct i915_vma *vma;
1364         u64 remain, offset;
1365         void __user *user_data;
1366         int ret;
1367
1368         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1369         if (ret)
1370                 return ret;
1371
1372         if (i915_gem_object_has_struct_page(obj)) {
1373                 /*
1374                  * Avoid waking the device up if we can fallback, as
1375                  * waking/resuming is very slow (worst-case 10-100 ms
1376                  * depending on PCI sleeps and our own resume time).
1377                  * This easily dwarfs any performance advantage from
1378                  * using the cache bypass of indirect GGTT access.
1379                  */
1380                 if (!intel_runtime_pm_get_if_in_use(i915)) {
1381                         ret = -EFAULT;
1382                         goto out_unlock;
1383                 }
1384         } else {
1385                 /* No backing pages, no fallback, we must force GGTT access */
1386                 intel_runtime_pm_get(i915);
1387         }
1388
1389         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1390                                        PIN_MAPPABLE |
1391                                        PIN_NONFAULT |
1392                                        PIN_NONBLOCK);
1393         if (!IS_ERR(vma)) {
1394                 node.start = i915_ggtt_offset(vma);
1395                 node.allocated = false;
1396                 ret = i915_vma_put_fence(vma);
1397                 if (ret) {
1398                         i915_vma_unpin(vma);
1399                         vma = ERR_PTR(ret);
1400                 }
1401         }
1402         if (IS_ERR(vma)) {
1403                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1404                 if (ret)
1405                         goto out_rpm;
1406                 GEM_BUG_ON(!node.allocated);
1407         }
1408
1409         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1410         if (ret)
1411                 goto out_unpin;
1412
1413         mutex_unlock(&i915->drm.struct_mutex);
1414
1415         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1416
1417         user_data = u64_to_user_ptr(args->data_ptr);
1418         offset = args->offset;
1419         remain = args->size;
1420         while (remain) {
1421                 /* Operation in this page
1422                  *
1423                  * page_base = page offset within aperture
1424                  * page_offset = offset within page
1425                  * page_length = bytes to copy for this page
1426                  */
1427                 u32 page_base = node.start;
1428                 unsigned int page_offset = offset_in_page(offset);
1429                 unsigned int page_length = PAGE_SIZE - page_offset;
1430                 page_length = remain < page_length ? remain : page_length;
1431                 if (node.allocated) {
1432                         wmb(); /* flush the write before we modify the GGTT */
1433                         ggtt->vm.insert_page(&ggtt->vm,
1434                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1435                                              node.start, I915_CACHE_NONE, 0);
1436                         wmb(); /* flush modifications to the GGTT (insert_page) */
1437                 } else {
1438                         page_base += offset & PAGE_MASK;
1439                 }
1440                 /* If we get a fault while copying data, then (presumably) our
1441                  * source page isn't available.  Return the error and we'll
1442                  * retry in the slow path.
1443                  * If the object is non-shmem backed, we retry again with the
1444                  * path that handles page fault.
1445                  */
1446                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1447                                user_data, page_length)) {
1448                         ret = -EFAULT;
1449                         break;
1450                 }
1451
1452                 remain -= page_length;
1453                 user_data += page_length;
1454                 offset += page_length;
1455         }
1456         intel_fb_obj_flush(obj, ORIGIN_CPU);
1457
1458         mutex_lock(&i915->drm.struct_mutex);
1459 out_unpin:
1460         if (node.allocated) {
1461                 wmb();
1462                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1463                 remove_mappable_node(&node);
1464         } else {
1465                 i915_vma_unpin(vma);
1466         }
1467 out_rpm:
1468         intel_runtime_pm_put(i915);
1469 out_unlock:
1470         mutex_unlock(&i915->drm.struct_mutex);
1471         return ret;
1472 }
1473
1474 static int
1475 shmem_pwrite_slow(struct page *page, int offset, int length,
1476                   char __user *user_data,
1477                   bool page_do_bit17_swizzling,
1478                   bool needs_clflush_before,
1479                   bool needs_clflush_after)
1480 {
1481         char *vaddr;
1482         int ret;
1483
1484         vaddr = kmap(page);
1485         if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
1486                 shmem_clflush_swizzled_range(vaddr + offset, length,
1487                                              page_do_bit17_swizzling);
1488         if (page_do_bit17_swizzling)
1489                 ret = __copy_from_user_swizzled(vaddr, offset, user_data,
1490                                                 length);
1491         else
1492                 ret = __copy_from_user(vaddr + offset, user_data, length);
1493         if (needs_clflush_after)
1494                 shmem_clflush_swizzled_range(vaddr + offset, length,
1495                                              page_do_bit17_swizzling);
1496         kunmap(page);
1497
1498         return ret ? -EFAULT : 0;
1499 }
1500
1501 /* Per-page copy function for the shmem pwrite fastpath.
1502  * Flushes invalid cachelines before writing to the target if
1503  * needs_clflush_before is set and flushes out any written cachelines after
1504  * writing if needs_clflush is set.
1505  */
1506 static int
1507 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1508              bool page_do_bit17_swizzling,
1509              bool needs_clflush_before,
1510              bool needs_clflush_after)
1511 {
1512         int ret;
1513
1514         ret = -ENODEV;
1515         if (!page_do_bit17_swizzling) {
1516                 char *vaddr = kmap_atomic(page);
1517
1518                 if (needs_clflush_before)
1519                         drm_clflush_virt_range(vaddr + offset, len);
1520                 ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
1521                 if (needs_clflush_after)
1522                         drm_clflush_virt_range(vaddr + offset, len);
1523
1524                 kunmap_atomic(vaddr);
1525         }
1526         if (ret == 0)
1527                 return ret;
1528
1529         return shmem_pwrite_slow(page, offset, len, user_data,
1530                                  page_do_bit17_swizzling,
1531                                  needs_clflush_before,
1532                                  needs_clflush_after);
1533 }
1534
1535 static int
1536 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1537                       const struct drm_i915_gem_pwrite *args)
1538 {
1539         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1540         void __user *user_data;
1541         u64 remain;
1542         unsigned int obj_do_bit17_swizzling;
1543         unsigned int partial_cacheline_write;
1544         unsigned int needs_clflush;
1545         unsigned int offset, idx;
1546         int ret;
1547
1548         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1549         if (ret)
1550                 return ret;
1551
1552         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1553         mutex_unlock(&i915->drm.struct_mutex);
1554         if (ret)
1555                 return ret;
1556
1557         obj_do_bit17_swizzling = 0;
1558         if (i915_gem_object_needs_bit17_swizzle(obj))
1559                 obj_do_bit17_swizzling = BIT(17);
1560
1561         /* If we don't overwrite a cacheline completely we need to be
1562          * careful to have up-to-date data by first clflushing. Don't
1563          * overcomplicate things and flush the entire patch.
1564          */
1565         partial_cacheline_write = 0;
1566         if (needs_clflush & CLFLUSH_BEFORE)
1567                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1568
1569         user_data = u64_to_user_ptr(args->data_ptr);
1570         remain = args->size;
1571         offset = offset_in_page(args->offset);
1572         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1573                 struct page *page = i915_gem_object_get_page(obj, idx);
1574                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1575
1576                 ret = shmem_pwrite(page, offset, length, user_data,
1577                                    page_to_phys(page) & obj_do_bit17_swizzling,
1578                                    (offset | length) & partial_cacheline_write,
1579                                    needs_clflush & CLFLUSH_AFTER);
1580                 if (ret)
1581                         break;
1582
1583                 remain -= length;
1584                 user_data += length;
1585                 offset = 0;
1586         }
1587
1588         intel_fb_obj_flush(obj, ORIGIN_CPU);
1589         i915_gem_obj_finish_shmem_access(obj);
1590         return ret;
1591 }
1592
1593 /**
1594  * Writes data to the object referenced by handle.
1595  * @dev: drm device
1596  * @data: ioctl data blob
1597  * @file: drm file
1598  *
1599  * On error, the contents of the buffer that were to be modified are undefined.
1600  */
1601 int
1602 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1603                       struct drm_file *file)
1604 {
1605         struct drm_i915_gem_pwrite *args = data;
1606         struct drm_i915_gem_object *obj;
1607         int ret;
1608
1609         if (args->size == 0)
1610                 return 0;
1611
1612         if (!access_ok(VERIFY_READ,
1613                        u64_to_user_ptr(args->data_ptr),
1614                        args->size))
1615                 return -EFAULT;
1616
1617         obj = i915_gem_object_lookup(file, args->handle);
1618         if (!obj)
1619                 return -ENOENT;
1620
1621         /* Bounds check destination. */
1622         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1623                 ret = -EINVAL;
1624                 goto err;
1625         }
1626
1627         /* Writes not allowed into this read-only object */
1628         if (i915_gem_object_is_readonly(obj)) {
1629                 ret = -EINVAL;
1630                 goto err;
1631         }
1632
1633         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1634
1635         ret = -ENODEV;
1636         if (obj->ops->pwrite)
1637                 ret = obj->ops->pwrite(obj, args);
1638         if (ret != -ENODEV)
1639                 goto err;
1640
1641         ret = i915_gem_object_wait(obj,
1642                                    I915_WAIT_INTERRUPTIBLE |
1643                                    I915_WAIT_ALL,
1644                                    MAX_SCHEDULE_TIMEOUT,
1645                                    to_rps_client(file));
1646         if (ret)
1647                 goto err;
1648
1649         ret = i915_gem_object_pin_pages(obj);
1650         if (ret)
1651                 goto err;
1652
1653         ret = -EFAULT;
1654         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1655          * it would end up going through the fenced access, and we'll get
1656          * different detiling behavior between reading and writing.
1657          * pread/pwrite currently are reading and writing from the CPU
1658          * perspective, requiring manual detiling by the client.
1659          */
1660         if (!i915_gem_object_has_struct_page(obj) ||
1661             cpu_write_needs_clflush(obj))
1662                 /* Note that the gtt paths might fail with non-page-backed user
1663                  * pointers (e.g. gtt mappings when moving data between
1664                  * textures). Fallback to the shmem path in that case.
1665                  */
1666                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1667
1668         if (ret == -EFAULT || ret == -ENOSPC) {
1669                 if (obj->phys_handle)
1670                         ret = i915_gem_phys_pwrite(obj, args, file);
1671                 else
1672                         ret = i915_gem_shmem_pwrite(obj, args);
1673         }
1674
1675         i915_gem_object_unpin_pages(obj);
1676 err:
1677         i915_gem_object_put(obj);
1678         return ret;
1679 }
1680
1681 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1682 {
1683         struct drm_i915_private *i915;
1684         struct list_head *list;
1685         struct i915_vma *vma;
1686
1687         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1688
1689         for_each_ggtt_vma(vma, obj) {
1690                 if (i915_vma_is_active(vma))
1691                         continue;
1692
1693                 if (!drm_mm_node_allocated(&vma->node))
1694                         continue;
1695
1696                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1697         }
1698
1699         i915 = to_i915(obj->base.dev);
1700         spin_lock(&i915->mm.obj_lock);
1701         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1702         list_move_tail(&obj->mm.link, list);
1703         spin_unlock(&i915->mm.obj_lock);
1704 }
1705
1706 /**
1707  * Called when user space prepares to use an object with the CPU, either
1708  * through the mmap ioctl's mapping or a GTT mapping.
1709  * @dev: drm device
1710  * @data: ioctl data blob
1711  * @file: drm file
1712  */
1713 int
1714 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1715                           struct drm_file *file)
1716 {
1717         struct drm_i915_gem_set_domain *args = data;
1718         struct drm_i915_gem_object *obj;
1719         uint32_t read_domains = args->read_domains;
1720         uint32_t write_domain = args->write_domain;
1721         int err;
1722
1723         /* Only handle setting domains to types used by the CPU. */
1724         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1725                 return -EINVAL;
1726
1727         /* Having something in the write domain implies it's in the read
1728          * domain, and only that read domain.  Enforce that in the request.
1729          */
1730         if (write_domain != 0 && read_domains != write_domain)
1731                 return -EINVAL;
1732
1733         obj = i915_gem_object_lookup(file, args->handle);
1734         if (!obj)
1735                 return -ENOENT;
1736
1737         /* Try to flush the object off the GPU without holding the lock.
1738          * We will repeat the flush holding the lock in the normal manner
1739          * to catch cases where we are gazumped.
1740          */
1741         err = i915_gem_object_wait(obj,
1742                                    I915_WAIT_INTERRUPTIBLE |
1743                                    (write_domain ? I915_WAIT_ALL : 0),
1744                                    MAX_SCHEDULE_TIMEOUT,
1745                                    to_rps_client(file));
1746         if (err)
1747                 goto out;
1748
1749         /*
1750          * Proxy objects do not control access to the backing storage, ergo
1751          * they cannot be used as a means to manipulate the cache domain
1752          * tracking for that backing storage. The proxy object is always
1753          * considered to be outside of any cache domain.
1754          */
1755         if (i915_gem_object_is_proxy(obj)) {
1756                 err = -ENXIO;
1757                 goto out;
1758         }
1759
1760         /*
1761          * Flush and acquire obj->pages so that we are coherent through
1762          * direct access in memory with previous cached writes through
1763          * shmemfs and that our cache domain tracking remains valid.
1764          * For example, if the obj->filp was moved to swap without us
1765          * being notified and releasing the pages, we would mistakenly
1766          * continue to assume that the obj remained out of the CPU cached
1767          * domain.
1768          */
1769         err = i915_gem_object_pin_pages(obj);
1770         if (err)
1771                 goto out;
1772
1773         err = i915_mutex_lock_interruptible(dev);
1774         if (err)
1775                 goto out_unpin;
1776
1777         if (read_domains & I915_GEM_DOMAIN_WC)
1778                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1779         else if (read_domains & I915_GEM_DOMAIN_GTT)
1780                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1781         else
1782                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1783
1784         /* And bump the LRU for this access */
1785         i915_gem_object_bump_inactive_ggtt(obj);
1786
1787         mutex_unlock(&dev->struct_mutex);
1788
1789         if (write_domain != 0)
1790                 intel_fb_obj_invalidate(obj,
1791                                         fb_write_origin(obj, write_domain));
1792
1793 out_unpin:
1794         i915_gem_object_unpin_pages(obj);
1795 out:
1796         i915_gem_object_put(obj);
1797         return err;
1798 }
1799
1800 /**
1801  * Called when user space has done writes to this buffer
1802  * @dev: drm device
1803  * @data: ioctl data blob
1804  * @file: drm file
1805  */
1806 int
1807 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1808                          struct drm_file *file)
1809 {
1810         struct drm_i915_gem_sw_finish *args = data;
1811         struct drm_i915_gem_object *obj;
1812
1813         obj = i915_gem_object_lookup(file, args->handle);
1814         if (!obj)
1815                 return -ENOENT;
1816
1817         /*
1818          * Proxy objects are barred from CPU access, so there is no
1819          * need to ban sw_finish as it is a nop.
1820          */
1821
1822         /* Pinned buffers may be scanout, so flush the cache */
1823         i915_gem_object_flush_if_display(obj);
1824         i915_gem_object_put(obj);
1825
1826         return 0;
1827 }
1828
1829 /**
1830  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1831  *                       it is mapped to.
1832  * @dev: drm device
1833  * @data: ioctl data blob
1834  * @file: drm file
1835  *
1836  * While the mapping holds a reference on the contents of the object, it doesn't
1837  * imply a ref on the object itself.
1838  *
1839  * IMPORTANT:
1840  *
1841  * DRM driver writers who look a this function as an example for how to do GEM
1842  * mmap support, please don't implement mmap support like here. The modern way
1843  * to implement DRM mmap support is with an mmap offset ioctl (like
1844  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1845  * That way debug tooling like valgrind will understand what's going on, hiding
1846  * the mmap call in a driver private ioctl will break that. The i915 driver only
1847  * does cpu mmaps this way because we didn't know better.
1848  */
1849 int
1850 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1851                     struct drm_file *file)
1852 {
1853         struct drm_i915_gem_mmap *args = data;
1854         struct drm_i915_gem_object *obj;
1855         unsigned long addr;
1856
1857         if (args->flags & ~(I915_MMAP_WC))
1858                 return -EINVAL;
1859
1860         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1861                 return -ENODEV;
1862
1863         obj = i915_gem_object_lookup(file, args->handle);
1864         if (!obj)
1865                 return -ENOENT;
1866
1867         /* prime objects have no backing filp to GEM mmap
1868          * pages from.
1869          */
1870         if (!obj->base.filp) {
1871                 i915_gem_object_put(obj);
1872                 return -ENXIO;
1873         }
1874
1875         addr = vm_mmap(obj->base.filp, 0, args->size,
1876                        PROT_READ | PROT_WRITE, MAP_SHARED,
1877                        args->offset);
1878         if (args->flags & I915_MMAP_WC) {
1879                 struct mm_struct *mm = current->mm;
1880                 struct vm_area_struct *vma;
1881
1882                 if (down_write_killable(&mm->mmap_sem)) {
1883                         i915_gem_object_put(obj);
1884                         return -EINTR;
1885                 }
1886                 vma = find_vma(mm, addr);
1887                 if (vma)
1888                         vma->vm_page_prot =
1889                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1890                 else
1891                         addr = -ENOMEM;
1892                 up_write(&mm->mmap_sem);
1893
1894                 /* This may race, but that's ok, it only gets set */
1895                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1896         }
1897         i915_gem_object_put(obj);
1898         if (IS_ERR((void *)addr))
1899                 return addr;
1900
1901         args->addr_ptr = (uint64_t) addr;
1902
1903         return 0;
1904 }
1905
1906 static unsigned int tile_row_pages(const struct drm_i915_gem_object *obj)
1907 {
1908         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1909 }
1910
1911 /**
1912  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1913  *
1914  * A history of the GTT mmap interface:
1915  *
1916  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1917  *     aligned and suitable for fencing, and still fit into the available
1918  *     mappable space left by the pinned display objects. A classic problem
1919  *     we called the page-fault-of-doom where we would ping-pong between
1920  *     two objects that could not fit inside the GTT and so the memcpy
1921  *     would page one object in at the expense of the other between every
1922  *     single byte.
1923  *
1924  * 1 - Objects can be any size, and have any compatible fencing (X Y, or none
1925  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1926  *     object is too large for the available space (or simply too large
1927  *     for the mappable aperture!), a view is created instead and faulted
1928  *     into userspace. (This view is aligned and sized appropriately for
1929  *     fenced access.)
1930  *
1931  * 2 - Recognise WC as a separate cache domain so that we can flush the
1932  *     delayed writes via GTT before performing direct access via WC.
1933  *
1934  * Restrictions:
1935  *
1936  *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1937  *    hangs on some architectures, corruption on others. An attempt to service
1938  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1939  *
1940  *  * the object must be able to fit into RAM (physical memory, though no
1941  *    limited to the mappable aperture).
1942  *
1943  *
1944  * Caveats:
1945  *
1946  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1947  *    all data to system memory. Subsequent access will not be synchronized.
1948  *
1949  *  * all mappings are revoked on runtime device suspend.
1950  *
1951  *  * there are only 8, 16 or 32 fence registers to share between all users
1952  *    (older machines require fence register for display and blitter access
1953  *    as well). Contention of the fence registers will cause the previous users
1954  *    to be unmapped and any new access will generate new page faults.
1955  *
1956  *  * running out of memory while servicing a fault may generate a SIGBUS,
1957  *    rather than the expected SIGSEGV.
1958  */
1959 int i915_gem_mmap_gtt_version(void)
1960 {
1961         return 2;
1962 }
1963
1964 static inline struct i915_ggtt_view
1965 compute_partial_view(const struct drm_i915_gem_object *obj,
1966                      pgoff_t page_offset,
1967                      unsigned int chunk)
1968 {
1969         struct i915_ggtt_view view;
1970
1971         if (i915_gem_object_is_tiled(obj))
1972                 chunk = roundup(chunk, tile_row_pages(obj));
1973
1974         view.type = I915_GGTT_VIEW_PARTIAL;
1975         view.partial.offset = rounddown(page_offset, chunk);
1976         view.partial.size =
1977                 min_t(unsigned int, chunk,
1978                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1979
1980         /* If the partial covers the entire object, just create a normal VMA. */
1981         if (chunk >= obj->base.size >> PAGE_SHIFT)
1982                 view.type = I915_GGTT_VIEW_NORMAL;
1983
1984         return view;
1985 }
1986
1987 /**
1988  * i915_gem_fault - fault a page into the GTT
1989  * @vmf: fault info
1990  *
1991  * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
1992  * from userspace.  The fault handler takes care of binding the object to
1993  * the GTT (if needed), allocating and programming a fence register (again,
1994  * only if needed based on whether the old reg is still valid or the object
1995  * is tiled) and inserting a new PTE into the faulting process.
1996  *
1997  * Note that the faulting process may involve evicting existing objects
1998  * from the GTT and/or fence registers to make room.  So performance may
1999  * suffer if the GTT working set is large or there are few fence registers
2000  * left.
2001  *
2002  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
2003  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
2004  */
2005 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
2006 {
2007 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
2008         struct vm_area_struct *area = vmf->vma;
2009         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
2010         struct drm_device *dev = obj->base.dev;
2011         struct drm_i915_private *dev_priv = to_i915(dev);
2012         struct i915_ggtt *ggtt = &dev_priv->ggtt;
2013         bool write = area->vm_flags & VM_WRITE;
2014         struct i915_vma *vma;
2015         pgoff_t page_offset;
2016         int ret;
2017
2018         /* Sanity check that we allow writing into this object */
2019         if (i915_gem_object_is_readonly(obj) && write)
2020                 return VM_FAULT_SIGBUS;
2021
2022         /* We don't use vmf->pgoff since that has the fake offset */
2023         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
2024
2025         trace_i915_gem_object_fault(obj, page_offset, true, write);
2026
2027         /* Try to flush the object off the GPU first without holding the lock.
2028          * Upon acquiring the lock, we will perform our sanity checks and then
2029          * repeat the flush holding the lock in the normal manner to catch cases
2030          * where we are gazumped.
2031          */
2032         ret = i915_gem_object_wait(obj,
2033                                    I915_WAIT_INTERRUPTIBLE,
2034                                    MAX_SCHEDULE_TIMEOUT,
2035                                    NULL);
2036         if (ret)
2037                 goto err;
2038
2039         ret = i915_gem_object_pin_pages(obj);
2040         if (ret)
2041                 goto err;
2042
2043         intel_runtime_pm_get(dev_priv);
2044
2045         ret = i915_mutex_lock_interruptible(dev);
2046         if (ret)
2047                 goto err_rpm;
2048
2049         /* Access to snoopable pages through the GTT is incoherent. */
2050         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
2051                 ret = -EFAULT;
2052                 goto err_unlock;
2053         }
2054
2055
2056         /* Now pin it into the GTT as needed */
2057         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
2058                                        PIN_MAPPABLE |
2059                                        PIN_NONBLOCK |
2060                                        PIN_NONFAULT);
2061         if (IS_ERR(vma)) {
2062                 /* Use a partial view if it is bigger than available space */
2063                 struct i915_ggtt_view view =
2064                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
2065                 unsigned int flags;
2066
2067                 flags = PIN_MAPPABLE;
2068                 if (view.type == I915_GGTT_VIEW_NORMAL)
2069                         flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
2070
2071                 /*
2072                  * Userspace is now writing through an untracked VMA, abandon
2073                  * all hope that the hardware is able to track future writes.
2074                  */
2075                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
2076
2077                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2078                 if (IS_ERR(vma) && !view.type) {
2079                         flags = PIN_MAPPABLE;
2080                         view.type = I915_GGTT_VIEW_PARTIAL;
2081                         vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2082                 }
2083         }
2084         if (IS_ERR(vma)) {
2085                 ret = PTR_ERR(vma);
2086                 goto err_unlock;
2087         }
2088
2089         ret = i915_gem_object_set_to_gtt_domain(obj, write);
2090         if (ret)
2091                 goto err_unpin;
2092
2093         ret = i915_vma_pin_fence(vma);
2094         if (ret)
2095                 goto err_unpin;
2096
2097         /* Finally, remap it using the new GTT offset */
2098         ret = remap_io_mapping(area,
2099                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
2100                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
2101                                min_t(u64, vma->size, area->vm_end - area->vm_start),
2102                                &ggtt->iomap);
2103         if (ret)
2104                 goto err_fence;
2105
2106         /* Mark as being mmapped into userspace for later revocation */
2107         assert_rpm_wakelock_held(dev_priv);
2108         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
2109                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
2110         GEM_BUG_ON(!obj->userfault_count);
2111
2112         i915_vma_set_ggtt_write(vma);
2113
2114 err_fence:
2115         i915_vma_unpin_fence(vma);
2116 err_unpin:
2117         __i915_vma_unpin(vma);
2118 err_unlock:
2119         mutex_unlock(&dev->struct_mutex);
2120 err_rpm:
2121         intel_runtime_pm_put(dev_priv);
2122         i915_gem_object_unpin_pages(obj);
2123 err:
2124         switch (ret) {
2125         case -EIO:
2126                 /*
2127                  * We eat errors when the gpu is terminally wedged to avoid
2128                  * userspace unduly crashing (gl has no provisions for mmaps to
2129                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
2130                  * and so needs to be reported.
2131                  */
2132                 if (!i915_terminally_wedged(&dev_priv->gpu_error))
2133                         return VM_FAULT_SIGBUS;
2134                 /* else: fall through */
2135         case -EAGAIN:
2136                 /*
2137                  * EAGAIN means the gpu is hung and we'll wait for the error
2138                  * handler to reset everything when re-faulting in
2139                  * i915_mutex_lock_interruptible.
2140                  */
2141         case 0:
2142         case -ERESTARTSYS:
2143         case -EINTR:
2144         case -EBUSY:
2145                 /*
2146                  * EBUSY is ok: this just means that another thread
2147                  * already did the job.
2148                  */
2149                 return VM_FAULT_NOPAGE;
2150         case -ENOMEM:
2151                 return VM_FAULT_OOM;
2152         case -ENOSPC:
2153         case -EFAULT:
2154                 return VM_FAULT_SIGBUS;
2155         default:
2156                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2157                 return VM_FAULT_SIGBUS;
2158         }
2159 }
2160
2161 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2162 {
2163         struct i915_vma *vma;
2164
2165         GEM_BUG_ON(!obj->userfault_count);
2166
2167         obj->userfault_count = 0;
2168         list_del(&obj->userfault_link);
2169         drm_vma_node_unmap(&obj->base.vma_node,
2170                            obj->base.dev->anon_inode->i_mapping);
2171
2172         for_each_ggtt_vma(vma, obj)
2173                 i915_vma_unset_userfault(vma);
2174 }
2175
2176 /**
2177  * i915_gem_release_mmap - remove physical page mappings
2178  * @obj: obj in question
2179  *
2180  * Preserve the reservation of the mmapping with the DRM core code, but
2181  * relinquish ownership of the pages back to the system.
2182  *
2183  * It is vital that we remove the page mapping if we have mapped a tiled
2184  * object through the GTT and then lose the fence register due to
2185  * resource pressure. Similarly if the object has been moved out of the
2186  * aperture, than pages mapped into userspace must be revoked. Removing the
2187  * mapping will then trigger a page fault on the next user access, allowing
2188  * fixup by i915_gem_fault().
2189  */
2190 void
2191 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2192 {
2193         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2194
2195         /* Serialisation between user GTT access and our code depends upon
2196          * revoking the CPU's PTE whilst the mutex is held. The next user
2197          * pagefault then has to wait until we release the mutex.
2198          *
2199          * Note that RPM complicates somewhat by adding an additional
2200          * requirement that operations to the GGTT be made holding the RPM
2201          * wakeref.
2202          */
2203         lockdep_assert_held(&i915->drm.struct_mutex);
2204         intel_runtime_pm_get(i915);
2205
2206         if (!obj->userfault_count)
2207                 goto out;
2208
2209         __i915_gem_object_release_mmap(obj);
2210
2211         /* Ensure that the CPU's PTE are revoked and there are not outstanding
2212          * memory transactions from userspace before we return. The TLB
2213          * flushing implied above by changing the PTE above *should* be
2214          * sufficient, an extra barrier here just provides us with a bit
2215          * of paranoid documentation about our requirement to serialise
2216          * memory writes before touching registers / GSM.
2217          */
2218         wmb();
2219
2220 out:
2221         intel_runtime_pm_put(i915);
2222 }
2223
2224 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2225 {
2226         struct drm_i915_gem_object *obj, *on;
2227         int i;
2228
2229         /*
2230          * Only called during RPM suspend. All users of the userfault_list
2231          * must be holding an RPM wakeref to ensure that this can not
2232          * run concurrently with themselves (and use the struct_mutex for
2233          * protection between themselves).
2234          */
2235
2236         list_for_each_entry_safe(obj, on,
2237                                  &dev_priv->mm.userfault_list, userfault_link)
2238                 __i915_gem_object_release_mmap(obj);
2239
2240         /* The fence will be lost when the device powers down. If any were
2241          * in use by hardware (i.e. they are pinned), we should not be powering
2242          * down! All other fences will be reacquired by the user upon waking.
2243          */
2244         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2245                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2246
2247                 /* Ideally we want to assert that the fence register is not
2248                  * live at this point (i.e. that no piece of code will be
2249                  * trying to write through fence + GTT, as that both violates
2250                  * our tracking of activity and associated locking/barriers,
2251                  * but also is illegal given that the hw is powered down).
2252                  *
2253                  * Previously we used reg->pin_count as a "liveness" indicator.
2254                  * That is not sufficient, and we need a more fine-grained
2255                  * tool if we want to have a sanity check here.
2256                  */
2257
2258                 if (!reg->vma)
2259                         continue;
2260
2261                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2262                 reg->dirty = true;
2263         }
2264 }
2265
2266 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2267 {
2268         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2269         int err;
2270
2271         err = drm_gem_create_mmap_offset(&obj->base);
2272         if (likely(!err))
2273                 return 0;
2274
2275         /* Attempt to reap some mmap space from dead objects */
2276         do {
2277                 err = i915_gem_wait_for_idle(dev_priv,
2278                                              I915_WAIT_INTERRUPTIBLE,
2279                                              MAX_SCHEDULE_TIMEOUT);
2280                 if (err)
2281                         break;
2282
2283                 i915_gem_drain_freed_objects(dev_priv);
2284                 err = drm_gem_create_mmap_offset(&obj->base);
2285                 if (!err)
2286                         break;
2287
2288         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2289
2290         return err;
2291 }
2292
2293 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2294 {
2295         drm_gem_free_mmap_offset(&obj->base);
2296 }
2297
2298 int
2299 i915_gem_mmap_gtt(struct drm_file *file,
2300                   struct drm_device *dev,
2301                   uint32_t handle,
2302                   uint64_t *offset)
2303 {
2304         struct drm_i915_gem_object *obj;
2305         int ret;
2306
2307         obj = i915_gem_object_lookup(file, handle);
2308         if (!obj)
2309                 return -ENOENT;
2310
2311         ret = i915_gem_object_create_mmap_offset(obj);
2312         if (ret == 0)
2313                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2314
2315         i915_gem_object_put(obj);
2316         return ret;
2317 }
2318
2319 /**
2320  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2321  * @dev: DRM device
2322  * @data: GTT mapping ioctl data
2323  * @file: GEM object info
2324  *
2325  * Simply returns the fake offset to userspace so it can mmap it.
2326  * The mmap call will end up in drm_gem_mmap(), which will set things
2327  * up so we can get faults in the handler above.
2328  *
2329  * The fault handler will take care of binding the object into the GTT
2330  * (since it may have been evicted to make room for something), allocating
2331  * a fence register, and mapping the appropriate aperture address into
2332  * userspace.
2333  */
2334 int
2335 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2336                         struct drm_file *file)
2337 {
2338         struct drm_i915_gem_mmap_gtt *args = data;
2339
2340         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2341 }
2342
2343 /* Immediately discard the backing storage */
2344 static void
2345 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2346 {
2347         i915_gem_object_free_mmap_offset(obj);
2348
2349         if (obj->base.filp == NULL)
2350                 return;
2351
2352         /* Our goal here is to return as much of the memory as
2353          * is possible back to the system as we are called from OOM.
2354          * To do this we must instruct the shmfs to drop all of its
2355          * backing pages, *now*.
2356          */
2357         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2358         obj->mm.madv = __I915_MADV_PURGED;
2359         obj->mm.pages = ERR_PTR(-EFAULT);
2360 }
2361
2362 /* Try to discard unwanted pages */
2363 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2364 {
2365         struct address_space *mapping;
2366
2367         lockdep_assert_held(&obj->mm.lock);
2368         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2369
2370         switch (obj->mm.madv) {
2371         case I915_MADV_DONTNEED:
2372                 i915_gem_object_truncate(obj);
2373         case __I915_MADV_PURGED:
2374                 return;
2375         }
2376
2377         if (obj->base.filp == NULL)
2378                 return;
2379
2380         mapping = obj->base.filp->f_mapping,
2381         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2382 }
2383
2384 static void
2385 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2386                               struct sg_table *pages)
2387 {
2388         struct sgt_iter sgt_iter;
2389         struct page *page;
2390
2391         __i915_gem_object_release_shmem(obj, pages, true);
2392
2393         i915_gem_gtt_finish_pages(obj, pages);
2394
2395         if (i915_gem_object_needs_bit17_swizzle(obj))
2396                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2397
2398         for_each_sgt_page(page, sgt_iter, pages) {
2399                 if (obj->mm.dirty)
2400                         set_page_dirty(page);
2401
2402                 if (obj->mm.madv == I915_MADV_WILLNEED)
2403                         mark_page_accessed(page);
2404
2405                 put_page(page);
2406         }
2407         obj->mm.dirty = false;
2408
2409         sg_free_table(pages);
2410         kfree(pages);
2411 }
2412
2413 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2414 {
2415         struct radix_tree_iter iter;
2416         void __rcu **slot;
2417
2418         rcu_read_lock();
2419         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2420                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2421         rcu_read_unlock();
2422 }
2423
2424 static struct sg_table *
2425 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2426 {
2427         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2428         struct sg_table *pages;
2429
2430         pages = fetch_and_zero(&obj->mm.pages);
2431         if (!pages)
2432                 return NULL;
2433
2434         spin_lock(&i915->mm.obj_lock);
2435         list_del(&obj->mm.link);
2436         spin_unlock(&i915->mm.obj_lock);
2437
2438         if (obj->mm.mapping) {
2439                 void *ptr;
2440
2441                 ptr = page_mask_bits(obj->mm.mapping);
2442                 if (is_vmalloc_addr(ptr))
2443                         vunmap(ptr);
2444                 else
2445                         kunmap(kmap_to_page(ptr));
2446
2447                 obj->mm.mapping = NULL;
2448         }
2449
2450         __i915_gem_object_reset_page_iter(obj);
2451         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2452
2453         return pages;
2454 }
2455
2456 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2457                                  enum i915_mm_subclass subclass)
2458 {
2459         struct sg_table *pages;
2460
2461         if (i915_gem_object_has_pinned_pages(obj))
2462                 return;
2463
2464         GEM_BUG_ON(obj->bind_count);
2465         if (!i915_gem_object_has_pages(obj))
2466                 return;
2467
2468         /* May be called by shrinker from within get_pages() (on another bo) */
2469         mutex_lock_nested(&obj->mm.lock, subclass);
2470         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2471                 goto unlock;
2472
2473         /*
2474          * ->put_pages might need to allocate memory for the bit17 swizzle
2475          * array, hence protect them from being reaped by removing them from gtt
2476          * lists early.
2477          */
2478         pages = __i915_gem_object_unset_pages(obj);
2479         if (!IS_ERR(pages))
2480                 obj->ops->put_pages(obj, pages);
2481
2482 unlock:
2483         mutex_unlock(&obj->mm.lock);
2484 }
2485
2486 static bool i915_sg_trim(struct sg_table *orig_st)
2487 {
2488         struct sg_table new_st;
2489         struct scatterlist *sg, *new_sg;
2490         unsigned int i;
2491
2492         if (orig_st->nents == orig_st->orig_nents)
2493                 return false;
2494
2495         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2496                 return false;
2497
2498         new_sg = new_st.sgl;
2499         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2500                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2501                 sg_dma_address(new_sg) = sg_dma_address(sg);
2502                 sg_dma_len(new_sg) = sg_dma_len(sg);
2503
2504                 new_sg = sg_next(new_sg);
2505         }
2506         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2507
2508         sg_free_table(orig_st);
2509
2510         *orig_st = new_st;
2511         return true;
2512 }
2513
2514 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2515 {
2516         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2517         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2518         unsigned long i;
2519         struct address_space *mapping;
2520         struct sg_table *st;
2521         struct scatterlist *sg;
2522         struct sgt_iter sgt_iter;
2523         struct page *page;
2524         unsigned long last_pfn = 0;     /* suppress gcc warning */
2525         unsigned int max_segment = i915_sg_segment_size();
2526         unsigned int sg_page_sizes;
2527         gfp_t noreclaim;
2528         int ret;
2529
2530         /*
2531          * Assert that the object is not currently in any GPU domain. As it
2532          * wasn't in the GTT, there shouldn't be any way it could have been in
2533          * a GPU cache
2534          */
2535         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2536         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2537
2538         /*
2539          * If there's no chance of allocating enough pages for the whole
2540          * object, bail early.
2541          */
2542         if (page_count > totalram_pages)
2543                 return -ENOMEM;
2544
2545         st = kmalloc(sizeof(*st), GFP_KERNEL);
2546         if (st == NULL)
2547                 return -ENOMEM;
2548
2549 rebuild_st:
2550         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2551                 kfree(st);
2552                 return -ENOMEM;
2553         }
2554
2555         /*
2556          * Get the list of pages out of our struct file.  They'll be pinned
2557          * at this point until we release them.
2558          *
2559          * Fail silently without starting the shrinker
2560          */
2561         mapping = obj->base.filp->f_mapping;
2562         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2563         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2564
2565         sg = st->sgl;
2566         st->nents = 0;
2567         sg_page_sizes = 0;
2568         for (i = 0; i < page_count; i++) {
2569                 const unsigned int shrink[] = {
2570                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2571                         0,
2572                 }, *s = shrink;
2573                 gfp_t gfp = noreclaim;
2574
2575                 do {
2576                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2577                         if (likely(!IS_ERR(page)))
2578                                 break;
2579
2580                         if (!*s) {
2581                                 ret = PTR_ERR(page);
2582                                 goto err_sg;
2583                         }
2584
2585                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2586                         cond_resched();
2587
2588                         /*
2589                          * We've tried hard to allocate the memory by reaping
2590                          * our own buffer, now let the real VM do its job and
2591                          * go down in flames if truly OOM.
2592                          *
2593                          * However, since graphics tend to be disposable,
2594                          * defer the oom here by reporting the ENOMEM back
2595                          * to userspace.
2596                          */
2597                         if (!*s) {
2598                                 /* reclaim and warn, but no oom */
2599                                 gfp = mapping_gfp_mask(mapping);
2600
2601                                 /*
2602                                  * Our bo are always dirty and so we require
2603                                  * kswapd to reclaim our pages (direct reclaim
2604                                  * does not effectively begin pageout of our
2605                                  * buffers on its own). However, direct reclaim
2606                                  * only waits for kswapd when under allocation
2607                                  * congestion. So as a result __GFP_RECLAIM is
2608                                  * unreliable and fails to actually reclaim our
2609                                  * dirty pages -- unless you try over and over
2610                                  * again with !__GFP_NORETRY. However, we still
2611                                  * want to fail this allocation rather than
2612                                  * trigger the out-of-memory killer and for
2613                                  * this we want __GFP_RETRY_MAYFAIL.
2614                                  */
2615                                 gfp |= __GFP_RETRY_MAYFAIL;
2616                         }
2617                 } while (1);
2618
2619                 if (!i ||
2620                     sg->length >= max_segment ||
2621                     page_to_pfn(page) != last_pfn + 1) {
2622                         if (i) {
2623                                 sg_page_sizes |= sg->length;
2624                                 sg = sg_next(sg);
2625                         }
2626                         st->nents++;
2627                         sg_set_page(sg, page, PAGE_SIZE, 0);
2628                 } else {
2629                         sg->length += PAGE_SIZE;
2630                 }
2631                 last_pfn = page_to_pfn(page);
2632
2633                 /* Check that the i965g/gm workaround works. */
2634                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2635         }
2636         if (sg) { /* loop terminated early; short sg table */
2637                 sg_page_sizes |= sg->length;
2638                 sg_mark_end(sg);
2639         }
2640
2641         /* Trim unused sg entries to avoid wasting memory. */
2642         i915_sg_trim(st);
2643
2644         ret = i915_gem_gtt_prepare_pages(obj, st);
2645         if (ret) {
2646                 /*
2647                  * DMA remapping failed? One possible cause is that
2648                  * it could not reserve enough large entries, asking
2649                  * for PAGE_SIZE chunks instead may be helpful.
2650                  */
2651                 if (max_segment > PAGE_SIZE) {
2652                         for_each_sgt_page(page, sgt_iter, st)
2653                                 put_page(page);
2654                         sg_free_table(st);
2655
2656                         max_segment = PAGE_SIZE;
2657                         goto rebuild_st;
2658                 } else {
2659                         dev_warn(&dev_priv->drm.pdev->dev,
2660                                  "Failed to DMA remap %lu pages\n",
2661                                  page_count);
2662                         goto err_pages;
2663                 }
2664         }
2665
2666         if (i915_gem_object_needs_bit17_swizzle(obj))
2667                 i915_gem_object_do_bit_17_swizzle(obj, st);
2668
2669         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2670
2671         return 0;
2672
2673 err_sg:
2674         sg_mark_end(sg);
2675 err_pages:
2676         for_each_sgt_page(page, sgt_iter, st)
2677                 put_page(page);
2678         sg_free_table(st);
2679         kfree(st);
2680
2681         /*
2682          * shmemfs first checks if there is enough memory to allocate the page
2683          * and reports ENOSPC should there be insufficient, along with the usual
2684          * ENOMEM for a genuine allocation failure.
2685          *
2686          * We use ENOSPC in our driver to mean that we have run out of aperture
2687          * space and so want to translate the error from shmemfs back to our
2688          * usual understanding of ENOMEM.
2689          */
2690         if (ret == -ENOSPC)
2691                 ret = -ENOMEM;
2692
2693         return ret;
2694 }
2695
2696 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2697                                  struct sg_table *pages,
2698                                  unsigned int sg_page_sizes)
2699 {
2700         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2701         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2702         int i;
2703
2704         lockdep_assert_held(&obj->mm.lock);
2705
2706         obj->mm.get_page.sg_pos = pages->sgl;
2707         obj->mm.get_page.sg_idx = 0;
2708
2709         obj->mm.pages = pages;
2710
2711         if (i915_gem_object_is_tiled(obj) &&
2712             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2713                 GEM_BUG_ON(obj->mm.quirked);
2714                 __i915_gem_object_pin_pages(obj);
2715                 obj->mm.quirked = true;
2716         }
2717
2718         GEM_BUG_ON(!sg_page_sizes);
2719         obj->mm.page_sizes.phys = sg_page_sizes;
2720
2721         /*
2722          * Calculate the supported page-sizes which fit into the given
2723          * sg_page_sizes. This will give us the page-sizes which we may be able
2724          * to use opportunistically when later inserting into the GTT. For
2725          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2726          * 64K or 4K pages, although in practice this will depend on a number of
2727          * other factors.
2728          */
2729         obj->mm.page_sizes.sg = 0;
2730         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2731                 if (obj->mm.page_sizes.phys & ~0u << i)
2732                         obj->mm.page_sizes.sg |= BIT(i);
2733         }
2734         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2735
2736         spin_lock(&i915->mm.obj_lock);
2737         list_add(&obj->mm.link, &i915->mm.unbound_list);
2738         spin_unlock(&i915->mm.obj_lock);
2739 }
2740
2741 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2742 {
2743         int err;
2744
2745         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2746                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2747                 return -EFAULT;
2748         }
2749
2750         err = obj->ops->get_pages(obj);
2751         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2752
2753         return err;
2754 }
2755
2756 /* Ensure that the associated pages are gathered from the backing storage
2757  * and pinned into our object. i915_gem_object_pin_pages() may be called
2758  * multiple times before they are released by a single call to
2759  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2760  * either as a result of memory pressure (reaping pages under the shrinker)
2761  * or as the object is itself released.
2762  */
2763 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2764 {
2765         int err;
2766
2767         err = mutex_lock_interruptible(&obj->mm.lock);
2768         if (err)
2769                 return err;
2770
2771         if (unlikely(!i915_gem_object_has_pages(obj))) {
2772                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2773
2774                 err = ____i915_gem_object_get_pages(obj);
2775                 if (err)
2776                         goto unlock;
2777
2778                 smp_mb__before_atomic();
2779         }
2780         atomic_inc(&obj->mm.pages_pin_count);
2781
2782 unlock:
2783         mutex_unlock(&obj->mm.lock);
2784         return err;
2785 }
2786
2787 /* The 'mapping' part of i915_gem_object_pin_map() below */
2788 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2789                                  enum i915_map_type type)
2790 {
2791         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2792         struct sg_table *sgt = obj->mm.pages;
2793         struct sgt_iter sgt_iter;
2794         struct page *page;
2795         struct page *stack_pages[32];
2796         struct page **pages = stack_pages;
2797         unsigned long i = 0;
2798         pgprot_t pgprot;
2799         void *addr;
2800
2801         /* A single page can always be kmapped */
2802         if (n_pages == 1 && type == I915_MAP_WB)
2803                 return kmap(sg_page(sgt->sgl));
2804
2805         if (n_pages > ARRAY_SIZE(stack_pages)) {
2806                 /* Too big for stack -- allocate temporary array instead */
2807                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2808                 if (!pages)
2809                         return NULL;
2810         }
2811
2812         for_each_sgt_page(page, sgt_iter, sgt)
2813                 pages[i++] = page;
2814
2815         /* Check that we have the expected number of pages */
2816         GEM_BUG_ON(i != n_pages);
2817
2818         switch (type) {
2819         default:
2820                 MISSING_CASE(type);
2821                 /* fallthrough to use PAGE_KERNEL anyway */
2822         case I915_MAP_WB:
2823                 pgprot = PAGE_KERNEL;
2824                 break;
2825         case I915_MAP_WC:
2826                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2827                 break;
2828         }
2829         addr = vmap(pages, n_pages, 0, pgprot);
2830
2831         if (pages != stack_pages)
2832                 kvfree(pages);
2833
2834         return addr;
2835 }
2836
2837 /* get, pin, and map the pages of the object into kernel space */
2838 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2839                               enum i915_map_type type)
2840 {
2841         enum i915_map_type has_type;
2842         bool pinned;
2843         void *ptr;
2844         int ret;
2845
2846         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2847                 return ERR_PTR(-ENXIO);
2848
2849         ret = mutex_lock_interruptible(&obj->mm.lock);
2850         if (ret)
2851                 return ERR_PTR(ret);
2852
2853         pinned = !(type & I915_MAP_OVERRIDE);
2854         type &= ~I915_MAP_OVERRIDE;
2855
2856         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2857                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2858                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2859
2860                         ret = ____i915_gem_object_get_pages(obj);
2861                         if (ret)
2862                                 goto err_unlock;
2863
2864                         smp_mb__before_atomic();
2865                 }
2866                 atomic_inc(&obj->mm.pages_pin_count);
2867                 pinned = false;
2868         }
2869         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2870
2871         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2872         if (ptr && has_type != type) {
2873                 if (pinned) {
2874                         ret = -EBUSY;
2875                         goto err_unpin;
2876                 }
2877
2878                 if (is_vmalloc_addr(ptr))
2879                         vunmap(ptr);
2880                 else
2881                         kunmap(kmap_to_page(ptr));
2882
2883                 ptr = obj->mm.mapping = NULL;
2884         }
2885
2886         if (!ptr) {
2887                 ptr = i915_gem_object_map(obj, type);
2888                 if (!ptr) {
2889                         ret = -ENOMEM;
2890                         goto err_unpin;
2891                 }
2892
2893                 obj->mm.mapping = page_pack_bits(ptr, type);
2894         }
2895
2896 out_unlock:
2897         mutex_unlock(&obj->mm.lock);
2898         return ptr;
2899
2900 err_unpin:
2901         atomic_dec(&obj->mm.pages_pin_count);
2902 err_unlock:
2903         ptr = ERR_PTR(ret);
2904         goto out_unlock;
2905 }
2906
2907 static int
2908 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2909                            const struct drm_i915_gem_pwrite *arg)
2910 {
2911         struct address_space *mapping = obj->base.filp->f_mapping;
2912         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2913         u64 remain, offset;
2914         unsigned int pg;
2915
2916         /* Before we instantiate/pin the backing store for our use, we
2917          * can prepopulate the shmemfs filp efficiently using a write into
2918          * the pagecache. We avoid the penalty of instantiating all the
2919          * pages, important if the user is just writing to a few and never
2920          * uses the object on the GPU, and using a direct write into shmemfs
2921          * allows it to avoid the cost of retrieving a page (either swapin
2922          * or clearing-before-use) before it is overwritten.
2923          */
2924         if (i915_gem_object_has_pages(obj))
2925                 return -ENODEV;
2926
2927         if (obj->mm.madv != I915_MADV_WILLNEED)
2928                 return -EFAULT;
2929
2930         /* Before the pages are instantiated the object is treated as being
2931          * in the CPU domain. The pages will be clflushed as required before
2932          * use, and we can freely write into the pages directly. If userspace
2933          * races pwrite with any other operation; corruption will ensue -
2934          * that is userspace's prerogative!
2935          */
2936
2937         remain = arg->size;
2938         offset = arg->offset;
2939         pg = offset_in_page(offset);
2940
2941         do {
2942                 unsigned int len, unwritten;
2943                 struct page *page;
2944                 void *data, *vaddr;
2945                 int err;
2946
2947                 len = PAGE_SIZE - pg;
2948                 if (len > remain)
2949                         len = remain;
2950
2951                 err = pagecache_write_begin(obj->base.filp, mapping,
2952                                             offset, len, 0,
2953                                             &page, &data);
2954                 if (err < 0)
2955                         return err;
2956
2957                 vaddr = kmap(page);
2958                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2959                 kunmap(page);
2960
2961                 err = pagecache_write_end(obj->base.filp, mapping,
2962                                           offset, len, len - unwritten,
2963                                           page, data);
2964                 if (err < 0)
2965                         return err;
2966
2967                 if (unwritten)
2968                         return -EFAULT;
2969
2970                 remain -= len;
2971                 user_data += len;
2972                 offset += len;
2973                 pg = 0;
2974         } while (remain);
2975
2976         return 0;
2977 }
2978
2979 static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
2980                                         const struct i915_gem_context *ctx)
2981 {
2982         unsigned int score;
2983         unsigned long prev_hang;
2984
2985         if (i915_gem_context_is_banned(ctx))
2986                 score = I915_CLIENT_SCORE_CONTEXT_BAN;
2987         else
2988                 score = 0;
2989
2990         prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
2991         if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
2992                 score += I915_CLIENT_SCORE_HANG_FAST;
2993
2994         if (score) {
2995                 atomic_add(score, &file_priv->ban_score);
2996
2997                 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
2998                                  ctx->name, score,
2999                                  atomic_read(&file_priv->ban_score));
3000         }
3001 }
3002
3003 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
3004 {
3005         unsigned int score;
3006         bool banned, bannable;
3007
3008         atomic_inc(&ctx->guilty_count);
3009
3010         bannable = i915_gem_context_is_bannable(ctx);
3011         score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
3012         banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
3013
3014         /* Cool contexts don't accumulate client ban score */
3015         if (!bannable)
3016                 return;
3017
3018         if (banned) {
3019                 DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
3020                                  ctx->name, atomic_read(&ctx->guilty_count),
3021                                  score);
3022                 i915_gem_context_set_banned(ctx);
3023         }
3024
3025         if (!IS_ERR_OR_NULL(ctx->file_priv))
3026                 i915_gem_client_mark_guilty(ctx->file_priv, ctx);
3027 }
3028
3029 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
3030 {
3031         atomic_inc(&ctx->active_count);
3032 }
3033
3034 struct i915_request *
3035 i915_gem_find_active_request(struct intel_engine_cs *engine)
3036 {
3037         struct i915_request *request, *active = NULL;
3038         unsigned long flags;
3039
3040         /*
3041          * We are called by the error capture, reset and to dump engine
3042          * state at random points in time. In particular, note that neither is
3043          * crucially ordered with an interrupt. After a hang, the GPU is dead
3044          * and we assume that no more writes can happen (we waited long enough
3045          * for all writes that were in transaction to be flushed) - adding an
3046          * extra delay for a recent interrupt is pointless. Hence, we do
3047          * not need an engine->irq_seqno_barrier() before the seqno reads.
3048          * At all other times, we must assume the GPU is still running, but
3049          * we only care about the snapshot of this moment.
3050          */
3051         spin_lock_irqsave(&engine->timeline.lock, flags);
3052         list_for_each_entry(request, &engine->timeline.requests, link) {
3053                 if (__i915_request_completed(request, request->global_seqno))
3054                         continue;
3055
3056                 active = request;
3057                 break;
3058         }
3059         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3060
3061         return active;
3062 }
3063
3064 /*
3065  * Ensure irq handler finishes, and not run again.
3066  * Also return the active request so that we only search for it once.
3067  */
3068 struct i915_request *
3069 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
3070 {
3071         struct i915_request *request;
3072
3073         /*
3074          * During the reset sequence, we must prevent the engine from
3075          * entering RC6. As the context state is undefined until we restart
3076          * the engine, if it does enter RC6 during the reset, the state
3077          * written to the powercontext is undefined and so we may lose
3078          * GPU state upon resume, i.e. fail to restart after a reset.
3079          */
3080         intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
3081
3082         request = engine->reset.prepare(engine);
3083         if (request && request->fence.error == -EIO)
3084                 request = ERR_PTR(-EIO); /* Previous reset failed! */
3085
3086         return request;
3087 }
3088
3089 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
3090 {
3091         struct intel_engine_cs *engine;
3092         struct i915_request *request;
3093         enum intel_engine_id id;
3094         int err = 0;
3095
3096         for_each_engine(engine, dev_priv, id) {
3097                 request = i915_gem_reset_prepare_engine(engine);
3098                 if (IS_ERR(request)) {
3099                         err = PTR_ERR(request);
3100                         continue;
3101                 }
3102
3103                 engine->hangcheck.active_request = request;
3104         }
3105
3106         i915_gem_revoke_fences(dev_priv);
3107         intel_uc_sanitize(dev_priv);
3108
3109         return err;
3110 }
3111
3112 static void engine_skip_context(struct i915_request *request)
3113 {
3114         struct intel_engine_cs *engine = request->engine;
3115         struct i915_gem_context *hung_ctx = request->gem_context;
3116         struct i915_timeline *timeline = request->timeline;
3117         unsigned long flags;
3118
3119         GEM_BUG_ON(timeline == &engine->timeline);
3120
3121         spin_lock_irqsave(&engine->timeline.lock, flags);
3122         spin_lock(&timeline->lock);
3123
3124         list_for_each_entry_continue(request, &engine->timeline.requests, link)
3125                 if (request->gem_context == hung_ctx)
3126                         i915_request_skip(request, -EIO);
3127
3128         list_for_each_entry(request, &timeline->requests, link)
3129                 i915_request_skip(request, -EIO);
3130
3131         spin_unlock(&timeline->lock);
3132         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3133 }
3134
3135 /* Returns the request if it was guilty of the hang */
3136 static struct i915_request *
3137 i915_gem_reset_request(struct intel_engine_cs *engine,
3138                        struct i915_request *request,
3139                        bool stalled)
3140 {
3141         /* The guilty request will get skipped on a hung engine.
3142          *
3143          * Users of client default contexts do not rely on logical
3144          * state preserved between batches so it is safe to execute
3145          * queued requests following the hang. Non default contexts
3146          * rely on preserved state, so skipping a batch loses the
3147          * evolution of the state and it needs to be considered corrupted.
3148          * Executing more queued batches on top of corrupted state is
3149          * risky. But we take the risk by trying to advance through
3150          * the queued requests in order to make the client behaviour
3151          * more predictable around resets, by not throwing away random
3152          * amount of batches it has prepared for execution. Sophisticated
3153          * clients can use gem_reset_stats_ioctl and dma fence status
3154          * (exported via sync_file info ioctl on explicit fences) to observe
3155          * when it loses the context state and should rebuild accordingly.
3156          *
3157          * The context ban, and ultimately the client ban, mechanism are safety
3158          * valves if client submission ends up resulting in nothing more than
3159          * subsequent hangs.
3160          */
3161
3162         if (i915_request_completed(request)) {
3163                 GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
3164                           engine->name, request->global_seqno,
3165                           request->fence.context, request->fence.seqno,
3166                           intel_engine_get_seqno(engine));
3167                 stalled = false;
3168         }
3169
3170         if (stalled) {
3171                 i915_gem_context_mark_guilty(request->gem_context);
3172                 i915_request_skip(request, -EIO);
3173
3174                 /* If this context is now banned, skip all pending requests. */
3175                 if (i915_gem_context_is_banned(request->gem_context))
3176                         engine_skip_context(request);
3177         } else {
3178                 /*
3179                  * Since this is not the hung engine, it may have advanced
3180                  * since the hang declaration. Double check by refinding
3181                  * the active request at the time of the reset.
3182                  */
3183                 request = i915_gem_find_active_request(engine);
3184                 if (request) {
3185                         unsigned long flags;
3186
3187                         i915_gem_context_mark_innocent(request->gem_context);
3188                         dma_fence_set_error(&request->fence, -EAGAIN);
3189
3190                         /* Rewind the engine to replay the incomplete rq */
3191                         spin_lock_irqsave(&engine->timeline.lock, flags);
3192                         request = list_prev_entry(request, link);
3193                         if (&request->link == &engine->timeline.requests)
3194                                 request = NULL;
3195                         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3196                 }
3197         }
3198
3199         return request;
3200 }
3201
3202 void i915_gem_reset_engine(struct intel_engine_cs *engine,
3203                            struct i915_request *request,
3204                            bool stalled)
3205 {
3206         /*
3207          * Make sure this write is visible before we re-enable the interrupt
3208          * handlers on another CPU, as tasklet_enable() resolves to just
3209          * a compiler barrier which is insufficient for our purpose here.
3210          */
3211         smp_store_mb(engine->irq_posted, 0);
3212
3213         if (request)
3214                 request = i915_gem_reset_request(engine, request, stalled);
3215
3216         /* Setup the CS to resume from the breadcrumb of the hung request */
3217         engine->reset.reset(engine, request);
3218 }
3219
3220 void i915_gem_reset(struct drm_i915_private *dev_priv,
3221                     unsigned int stalled_mask)
3222 {
3223         struct intel_engine_cs *engine;
3224         enum intel_engine_id id;
3225
3226         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3227
3228         i915_retire_requests(dev_priv);
3229
3230         for_each_engine(engine, dev_priv, id) {
3231                 struct intel_context *ce;
3232
3233                 i915_gem_reset_engine(engine,
3234                                       engine->hangcheck.active_request,
3235                                       stalled_mask & ENGINE_MASK(id));
3236                 ce = fetch_and_zero(&engine->last_retired_context);
3237                 if (ce)
3238                         intel_context_unpin(ce);
3239
3240                 /*
3241                  * Ostensibily, we always want a context loaded for powersaving,
3242                  * so if the engine is idle after the reset, send a request
3243                  * to load our scratch kernel_context.
3244                  *
3245                  * More mysteriously, if we leave the engine idle after a reset,
3246                  * the next userspace batch may hang, with what appears to be
3247                  * an incoherent read by the CS (presumably stale TLB). An
3248                  * empty request appears sufficient to paper over the glitch.
3249                  */
3250                 if (intel_engine_is_idle(engine)) {
3251                         struct i915_request *rq;
3252
3253                         rq = i915_request_alloc(engine,
3254                                                 dev_priv->kernel_context);
3255                         if (!IS_ERR(rq))
3256                                 i915_request_add(rq);
3257                 }
3258         }
3259
3260         i915_gem_restore_fences(dev_priv);
3261 }
3262
3263 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3264 {
3265         engine->reset.finish(engine);
3266
3267         intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3268 }
3269
3270 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3271 {
3272         struct intel_engine_cs *engine;
3273         enum intel_engine_id id;
3274
3275         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3276
3277         for_each_engine(engine, dev_priv, id) {
3278                 engine->hangcheck.active_request = NULL;
3279                 i915_gem_reset_finish_engine(engine);
3280         }
3281 }
3282
3283 static void nop_submit_request(struct i915_request *request)
3284 {
3285         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3286                   request->engine->name,
3287                   request->fence.context, request->fence.seqno);
3288         dma_fence_set_error(&request->fence, -EIO);
3289
3290         i915_request_submit(request);
3291 }
3292
3293 static void nop_complete_submit_request(struct i915_request *request)
3294 {
3295         unsigned long flags;
3296
3297         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3298                   request->engine->name,
3299                   request->fence.context, request->fence.seqno);
3300         dma_fence_set_error(&request->fence, -EIO);
3301
3302         spin_lock_irqsave(&request->engine->timeline.lock, flags);
3303         __i915_request_submit(request);
3304         intel_engine_init_global_seqno(request->engine, request->global_seqno);
3305         spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3306 }
3307
3308 void i915_gem_set_wedged(struct drm_i915_private *i915)
3309 {
3310         struct intel_engine_cs *engine;
3311         enum intel_engine_id id;
3312
3313         GEM_TRACE("start\n");
3314
3315         if (GEM_SHOW_DEBUG()) {
3316                 struct drm_printer p = drm_debug_printer(__func__);
3317
3318                 for_each_engine(engine, i915, id)
3319                         intel_engine_dump(engine, &p, "%s\n", engine->name);
3320         }
3321
3322         if (test_and_set_bit(I915_WEDGED, &i915->gpu_error.flags))
3323                 goto out;
3324
3325         /*
3326          * First, stop submission to hw, but do not yet complete requests by
3327          * rolling the global seqno forward (since this would complete requests
3328          * for which we haven't set the fence error to EIO yet).
3329          */
3330         for_each_engine(engine, i915, id) {
3331                 i915_gem_reset_prepare_engine(engine);
3332
3333                 engine->submit_request = nop_submit_request;
3334                 engine->schedule = NULL;
3335         }
3336         i915->caps.scheduler = 0;
3337
3338         /* Even if the GPU reset fails, it should still stop the engines */
3339         if (INTEL_GEN(i915) >= 5)
3340                 intel_gpu_reset(i915, ALL_ENGINES);
3341
3342         /*
3343          * Make sure no one is running the old callback before we proceed with
3344          * cancelling requests and resetting the completion tracking. Otherwise
3345          * we might submit a request to the hardware which never completes.
3346          */
3347         synchronize_rcu();
3348
3349         for_each_engine(engine, i915, id) {
3350                 /* Mark all executing requests as skipped */
3351                 engine->cancel_requests(engine);
3352
3353                 /*
3354                  * Only once we've force-cancelled all in-flight requests can we
3355                  * start to complete all requests.
3356                  */
3357                 engine->submit_request = nop_complete_submit_request;
3358         }
3359
3360         /*
3361          * Make sure no request can slip through without getting completed by
3362          * either this call here to intel_engine_init_global_seqno, or the one
3363          * in nop_complete_submit_request.
3364          */
3365         synchronize_rcu();
3366
3367         for_each_engine(engine, i915, id) {
3368                 unsigned long flags;
3369
3370                 /*
3371                  * Mark all pending requests as complete so that any concurrent
3372                  * (lockless) lookup doesn't try and wait upon the request as we
3373                  * reset it.
3374                  */
3375                 spin_lock_irqsave(&engine->timeline.lock, flags);
3376                 intel_engine_init_global_seqno(engine,
3377                                                intel_engine_last_submit(engine));
3378                 spin_unlock_irqrestore(&engine->timeline.lock, flags);
3379
3380                 i915_gem_reset_finish_engine(engine);
3381         }
3382
3383 out:
3384         GEM_TRACE("end\n");
3385
3386         wake_up_all(&i915->gpu_error.reset_queue);
3387 }
3388
3389 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3390 {
3391         struct i915_timeline *tl;
3392
3393         lockdep_assert_held(&i915->drm.struct_mutex);
3394         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3395                 return true;
3396
3397         GEM_TRACE("start\n");
3398
3399         /*
3400          * Before unwedging, make sure that all pending operations
3401          * are flushed and errored out - we may have requests waiting upon
3402          * third party fences. We marked all inflight requests as EIO, and
3403          * every execbuf since returned EIO, for consistency we want all
3404          * the currently pending requests to also be marked as EIO, which
3405          * is done inside our nop_submit_request - and so we must wait.
3406          *
3407          * No more can be submitted until we reset the wedged bit.
3408          */
3409         list_for_each_entry(tl, &i915->gt.timelines, link) {
3410                 struct i915_request *rq;
3411
3412                 rq = i915_gem_active_peek(&tl->last_request,
3413                                           &i915->drm.struct_mutex);
3414                 if (!rq)
3415                         continue;
3416
3417                 /*
3418                  * We can't use our normal waiter as we want to
3419                  * avoid recursively trying to handle the current
3420                  * reset. The basic dma_fence_default_wait() installs
3421                  * a callback for dma_fence_signal(), which is
3422                  * triggered by our nop handler (indirectly, the
3423                  * callback enables the signaler thread which is
3424                  * woken by the nop_submit_request() advancing the seqno
3425                  * and when the seqno passes the fence, the signaler
3426                  * then signals the fence waking us up).
3427                  */
3428                 if (dma_fence_default_wait(&rq->fence, true,
3429                                            MAX_SCHEDULE_TIMEOUT) < 0)
3430                         return false;
3431         }
3432         i915_retire_requests(i915);
3433         GEM_BUG_ON(i915->gt.active_requests);
3434
3435         if (!intel_gpu_reset(i915, ALL_ENGINES))
3436                 intel_engines_sanitize(i915);
3437
3438         /*
3439          * Undo nop_submit_request. We prevent all new i915 requests from
3440          * being queued (by disallowing execbuf whilst wedged) so having
3441          * waited for all active requests above, we know the system is idle
3442          * and do not have to worry about a thread being inside
3443          * engine->submit_request() as we swap over. So unlike installing
3444          * the nop_submit_request on reset, we can do this from normal
3445          * context and do not require stop_machine().
3446          */
3447         intel_engines_reset_default_submission(i915);
3448         i915_gem_contexts_lost(i915);
3449
3450         GEM_TRACE("end\n");
3451
3452         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3453         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3454
3455         return true;
3456 }
3457
3458 static void
3459 i915_gem_retire_work_handler(struct work_struct *work)
3460 {
3461         struct drm_i915_private *dev_priv =
3462                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3463         struct drm_device *dev = &dev_priv->drm;
3464
3465         /* Come back later if the device is busy... */
3466         if (mutex_trylock(&dev->struct_mutex)) {
3467                 i915_retire_requests(dev_priv);
3468                 mutex_unlock(&dev->struct_mutex);
3469         }
3470
3471         /*
3472          * Keep the retire handler running until we are finally idle.
3473          * We do not need to do this test under locking as in the worst-case
3474          * we queue the retire worker once too often.
3475          */
3476         if (READ_ONCE(dev_priv->gt.awake))
3477                 queue_delayed_work(dev_priv->wq,
3478                                    &dev_priv->gt.retire_work,
3479                                    round_jiffies_up_relative(HZ));
3480 }
3481
3482 static void shrink_caches(struct drm_i915_private *i915)
3483 {
3484         /*
3485          * kmem_cache_shrink() discards empty slabs and reorders partially
3486          * filled slabs to prioritise allocating from the mostly full slabs,
3487          * with the aim of reducing fragmentation.
3488          */
3489         kmem_cache_shrink(i915->priorities);
3490         kmem_cache_shrink(i915->dependencies);
3491         kmem_cache_shrink(i915->requests);
3492         kmem_cache_shrink(i915->luts);
3493         kmem_cache_shrink(i915->vmas);
3494         kmem_cache_shrink(i915->objects);
3495 }
3496
3497 struct sleep_rcu_work {
3498         union {
3499                 struct rcu_head rcu;
3500                 struct work_struct work;
3501         };
3502         struct drm_i915_private *i915;
3503         unsigned int epoch;
3504 };
3505
3506 static inline bool
3507 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3508 {
3509         /*
3510          * There is a small chance that the epoch wrapped since we started
3511          * sleeping. If we assume that epoch is at least a u32, then it will
3512          * take at least 2^32 * 100ms for it to wrap, or about 326 years.
3513          */
3514         return epoch == READ_ONCE(i915->gt.epoch);
3515 }
3516
3517 static void __sleep_work(struct work_struct *work)
3518 {
3519         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3520         struct drm_i915_private *i915 = s->i915;
3521         unsigned int epoch = s->epoch;
3522
3523         kfree(s);
3524         if (same_epoch(i915, epoch))
3525                 shrink_caches(i915);
3526 }
3527
3528 static void __sleep_rcu(struct rcu_head *rcu)
3529 {
3530         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3531         struct drm_i915_private *i915 = s->i915;
3532
3533         if (same_epoch(i915, s->epoch)) {
3534                 INIT_WORK(&s->work, __sleep_work);
3535                 queue_work(i915->wq, &s->work);
3536         } else {
3537                 kfree(s);
3538         }
3539 }
3540
3541 static inline bool
3542 new_requests_since_last_retire(const struct drm_i915_private *i915)
3543 {
3544         return (READ_ONCE(i915->gt.active_requests) ||
3545                 work_pending(&i915->gt.idle_work.work));
3546 }
3547
3548 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3549 {
3550         struct intel_engine_cs *engine;
3551         enum intel_engine_id id;
3552
3553         if (i915_terminally_wedged(&i915->gpu_error))
3554                 return;
3555
3556         GEM_BUG_ON(i915->gt.active_requests);
3557         for_each_engine(engine, i915, id) {
3558                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
3559                 GEM_BUG_ON(engine->last_retired_context !=
3560                            to_intel_context(i915->kernel_context, engine));
3561         }
3562 }
3563
3564 static void
3565 i915_gem_idle_work_handler(struct work_struct *work)
3566 {
3567         struct drm_i915_private *dev_priv =
3568                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3569         unsigned int epoch = I915_EPOCH_INVALID;
3570         bool rearm_hangcheck;
3571
3572         if (!READ_ONCE(dev_priv->gt.awake))
3573                 return;
3574
3575         if (READ_ONCE(dev_priv->gt.active_requests))
3576                 return;
3577
3578         /*
3579          * Flush out the last user context, leaving only the pinned
3580          * kernel context resident. When we are idling on the kernel_context,
3581          * no more new requests (with a context switch) are emitted and we
3582          * can finally rest. A consequence is that the idle work handler is
3583          * always called at least twice before idling (and if the system is
3584          * idle that implies a round trip through the retire worker).
3585          */
3586         mutex_lock(&dev_priv->drm.struct_mutex);
3587         i915_gem_switch_to_kernel_context(dev_priv);
3588         mutex_unlock(&dev_priv->drm.struct_mutex);
3589
3590         GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
3591                   READ_ONCE(dev_priv->gt.active_requests));
3592
3593         /*
3594          * Wait for last execlists context complete, but bail out in case a
3595          * new request is submitted. As we don't trust the hardware, we
3596          * continue on if the wait times out. This is necessary to allow
3597          * the machine to suspend even if the hardware dies, and we will
3598          * try to recover in resume (after depriving the hardware of power,
3599          * it may be in a better mmod).
3600          */
3601         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3602                    intel_engines_are_idle(dev_priv),
3603                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3604                    10, 500);
3605
3606         rearm_hangcheck =
3607                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3608
3609         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3610                 /* Currently busy, come back later */
3611                 mod_delayed_work(dev_priv->wq,
3612                                  &dev_priv->gt.idle_work,
3613                                  msecs_to_jiffies(50));
3614                 goto out_rearm;
3615         }
3616
3617         /*
3618          * New request retired after this work handler started, extend active
3619          * period until next instance of the work.
3620          */
3621         if (new_requests_since_last_retire(dev_priv))
3622                 goto out_unlock;
3623
3624         epoch = __i915_gem_park(dev_priv);
3625
3626         assert_kernel_context_is_current(dev_priv);
3627
3628         rearm_hangcheck = false;
3629 out_unlock:
3630         mutex_unlock(&dev_priv->drm.struct_mutex);
3631
3632 out_rearm:
3633         if (rearm_hangcheck) {
3634                 GEM_BUG_ON(!dev_priv->gt.awake);
3635                 i915_queue_hangcheck(dev_priv);
3636         }
3637
3638         /*
3639          * When we are idle, it is an opportune time to reap our caches.
3640          * However, we have many objects that utilise RCU and the ordered
3641          * i915->wq that this work is executing on. To try and flush any
3642          * pending frees now we are idle, we first wait for an RCU grace
3643          * period, and then queue a task (that will run last on the wq) to
3644          * shrink and re-optimize the caches.
3645          */
3646         if (same_epoch(dev_priv, epoch)) {
3647                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3648                 if (s) {
3649                         s->i915 = dev_priv;
3650                         s->epoch = epoch;
3651                         call_rcu(&s->rcu, __sleep_rcu);
3652                 }
3653         }
3654 }
3655
3656 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3657 {
3658         struct drm_i915_private *i915 = to_i915(gem->dev);
3659         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3660         struct drm_i915_file_private *fpriv = file->driver_priv;
3661         struct i915_lut_handle *lut, *ln;
3662
3663         mutex_lock(&i915->drm.struct_mutex);
3664
3665         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3666                 struct i915_gem_context *ctx = lut->ctx;
3667                 struct i915_vma *vma;
3668
3669                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3670                 if (ctx->file_priv != fpriv)
3671                         continue;
3672
3673                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3674                 GEM_BUG_ON(vma->obj != obj);
3675
3676                 /* We allow the process to have multiple handles to the same
3677                  * vma, in the same fd namespace, by virtue of flink/open.
3678                  */
3679                 GEM_BUG_ON(!vma->open_count);
3680                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3681                         i915_vma_close(vma);
3682
3683                 list_del(&lut->obj_link);
3684                 list_del(&lut->ctx_link);
3685
3686                 kmem_cache_free(i915->luts, lut);
3687                 __i915_gem_object_release_unless_active(obj);
3688         }
3689
3690         mutex_unlock(&i915->drm.struct_mutex);
3691 }
3692
3693 static unsigned long to_wait_timeout(s64 timeout_ns)
3694 {
3695         if (timeout_ns < 0)
3696                 return MAX_SCHEDULE_TIMEOUT;
3697
3698         if (timeout_ns == 0)
3699                 return 0;
3700
3701         return nsecs_to_jiffies_timeout(timeout_ns);
3702 }
3703
3704 /**
3705  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3706  * @dev: drm device pointer
3707  * @data: ioctl data blob
3708  * @file: drm file pointer
3709  *
3710  * Returns 0 if successful, else an error is returned with the remaining time in
3711  * the timeout parameter.
3712  *  -ETIME: object is still busy after timeout
3713  *  -ERESTARTSYS: signal interrupted the wait
3714  *  -ENONENT: object doesn't exist
3715  * Also possible, but rare:
3716  *  -EAGAIN: incomplete, restart syscall
3717  *  -ENOMEM: damn
3718  *  -ENODEV: Internal IRQ fail
3719  *  -E?: The add request failed
3720  *
3721  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3722  * non-zero timeout parameter the wait ioctl will wait for the given number of
3723  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3724  * without holding struct_mutex the object may become re-busied before this
3725  * function completes. A similar but shorter * race condition exists in the busy
3726  * ioctl
3727  */
3728 int
3729 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3730 {
3731         struct drm_i915_gem_wait *args = data;
3732         struct drm_i915_gem_object *obj;
3733         ktime_t start;
3734         long ret;
3735
3736         if (args->flags != 0)
3737                 return -EINVAL;
3738
3739         obj = i915_gem_object_lookup(file, args->bo_handle);
3740         if (!obj)
3741                 return -ENOENT;
3742
3743         start = ktime_get();
3744
3745         ret = i915_gem_object_wait(obj,
3746                                    I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
3747                                    to_wait_timeout(args->timeout_ns),
3748                                    to_rps_client(file));
3749
3750         if (args->timeout_ns > 0) {
3751                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3752                 if (args->timeout_ns < 0)
3753                         args->timeout_ns = 0;
3754
3755                 /*
3756                  * Apparently ktime isn't accurate enough and occasionally has a
3757                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3758                  * things up to make the test happy. We allow up to 1 jiffy.
3759                  *
3760                  * This is a regression from the timespec->ktime conversion.
3761                  */
3762                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3763                         args->timeout_ns = 0;
3764
3765                 /* Asked to wait beyond the jiffie/scheduler precision? */
3766                 if (ret == -ETIME && args->timeout_ns)
3767                         ret = -EAGAIN;
3768         }
3769
3770         i915_gem_object_put(obj);
3771         return ret;
3772 }
3773
3774 static long wait_for_timeline(struct i915_timeline *tl,
3775                               unsigned int flags, long timeout)
3776 {
3777         struct i915_request *rq;
3778
3779         rq = i915_gem_active_get_unlocked(&tl->last_request);
3780         if (!rq)
3781                 return timeout;
3782
3783         /*
3784          * "Race-to-idle".
3785          *
3786          * Switching to the kernel context is often used a synchronous
3787          * step prior to idling, e.g. in suspend for flushing all
3788          * current operations to memory before sleeping. These we
3789          * want to complete as quickly as possible to avoid prolonged
3790          * stalls, so allow the gpu to boost to maximum clocks.
3791          */
3792         if (flags & I915_WAIT_FOR_IDLE_BOOST)
3793                 gen6_rps_boost(rq, NULL);
3794
3795         timeout = i915_request_wait(rq, flags, timeout);
3796         i915_request_put(rq);
3797
3798         return timeout;
3799 }
3800
3801 static int wait_for_engines(struct drm_i915_private *i915)
3802 {
3803         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3804                 dev_err(i915->drm.dev,
3805                         "Failed to idle engines, declaring wedged!\n");
3806                 GEM_TRACE_DUMP();
3807                 i915_gem_set_wedged(i915);
3808                 return -EIO;
3809         }
3810
3811         return 0;
3812 }
3813
3814 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3815                            unsigned int flags, long timeout)
3816 {
3817         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3818                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3819                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3820
3821         /* If the device is asleep, we have no requests outstanding */
3822         if (!READ_ONCE(i915->gt.awake))
3823                 return 0;
3824
3825         if (flags & I915_WAIT_LOCKED) {
3826                 struct i915_timeline *tl;
3827                 int err;
3828
3829                 lockdep_assert_held(&i915->drm.struct_mutex);
3830
3831                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3832                         timeout = wait_for_timeline(tl, flags, timeout);
3833                         if (timeout < 0)
3834                                 return timeout;
3835                 }
3836                 if (GEM_SHOW_DEBUG() && !timeout) {
3837                         /* Presume that timeout was non-zero to begin with! */
3838                         dev_warn(&i915->drm.pdev->dev,
3839                                  "Missed idle-completion interrupt!\n");
3840                         GEM_TRACE_DUMP();
3841                 }
3842
3843                 err = wait_for_engines(i915);
3844                 if (err)
3845                         return err;
3846
3847                 i915_retire_requests(i915);
3848                 GEM_BUG_ON(i915->gt.active_requests);
3849         } else {
3850                 struct intel_engine_cs *engine;
3851                 enum intel_engine_id id;
3852
3853                 for_each_engine(engine, i915, id) {
3854                         struct i915_timeline *tl = &engine->timeline;
3855
3856                         timeout = wait_for_timeline(tl, flags, timeout);
3857                         if (timeout < 0)
3858                                 return timeout;
3859                 }
3860         }
3861
3862         return 0;
3863 }
3864
3865 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3866 {
3867         /*
3868          * We manually flush the CPU domain so that we can override and
3869          * force the flush for the display, and perform it asyncrhonously.
3870          */
3871         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3872         if (obj->cache_dirty)
3873                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3874         obj->write_domain = 0;
3875 }
3876
3877 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3878 {
3879         if (!READ_ONCE(obj->pin_global))
3880                 return;
3881
3882         mutex_lock(&obj->base.dev->struct_mutex);
3883         __i915_gem_object_flush_for_display(obj);
3884         mutex_unlock(&obj->base.dev->struct_mutex);
3885 }
3886
3887 /**
3888  * Moves a single object to the WC read, and possibly write domain.
3889  * @obj: object to act on
3890  * @write: ask for write access or read only
3891  *
3892  * This function returns when the move is complete, including waiting on
3893  * flushes to occur.
3894  */
3895 int
3896 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3897 {
3898         int ret;
3899
3900         lockdep_assert_held(&obj->base.dev->struct_mutex);
3901
3902         ret = i915_gem_object_wait(obj,
3903                                    I915_WAIT_INTERRUPTIBLE |
3904                                    I915_WAIT_LOCKED |
3905                                    (write ? I915_WAIT_ALL : 0),
3906                                    MAX_SCHEDULE_TIMEOUT,
3907                                    NULL);
3908         if (ret)
3909                 return ret;
3910
3911         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3912                 return 0;
3913
3914         /* Flush and acquire obj->pages so that we are coherent through
3915          * direct access in memory with previous cached writes through
3916          * shmemfs and that our cache domain tracking remains valid.
3917          * For example, if the obj->filp was moved to swap without us
3918          * being notified and releasing the pages, we would mistakenly
3919          * continue to assume that the obj remained out of the CPU cached
3920          * domain.
3921          */
3922         ret = i915_gem_object_pin_pages(obj);
3923         if (ret)
3924                 return ret;
3925
3926         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3927
3928         /* Serialise direct access to this object with the barriers for
3929          * coherent writes from the GPU, by effectively invalidating the
3930          * WC domain upon first access.
3931          */
3932         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3933                 mb();
3934
3935         /* It should now be out of any other write domains, and we can update
3936          * the domain values for our changes.
3937          */
3938         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3939         obj->read_domains |= I915_GEM_DOMAIN_WC;
3940         if (write) {
3941                 obj->read_domains = I915_GEM_DOMAIN_WC;
3942                 obj->write_domain = I915_GEM_DOMAIN_WC;
3943                 obj->mm.dirty = true;
3944         }
3945
3946         i915_gem_object_unpin_pages(obj);
3947         return 0;
3948 }
3949
3950 /**
3951  * Moves a single object to the GTT read, and possibly write domain.
3952  * @obj: object to act on
3953  * @write: ask for write access or read only
3954  *
3955  * This function returns when the move is complete, including waiting on
3956  * flushes to occur.
3957  */
3958 int
3959 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3960 {
3961         int ret;
3962
3963         lockdep_assert_held(&obj->base.dev->struct_mutex);
3964
3965         ret = i915_gem_object_wait(obj,
3966                                    I915_WAIT_INTERRUPTIBLE |
3967                                    I915_WAIT_LOCKED |
3968                                    (write ? I915_WAIT_ALL : 0),
3969                                    MAX_SCHEDULE_TIMEOUT,
3970                                    NULL);
3971         if (ret)
3972                 return ret;
3973
3974         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3975                 return 0;
3976
3977         /* Flush and acquire obj->pages so that we are coherent through
3978          * direct access in memory with previous cached writes through
3979          * shmemfs and that our cache domain tracking remains valid.
3980          * For example, if the obj->filp was moved to swap without us
3981          * being notified and releasing the pages, we would mistakenly
3982          * continue to assume that the obj remained out of the CPU cached
3983          * domain.
3984          */
3985         ret = i915_gem_object_pin_pages(obj);
3986         if (ret)
3987                 return ret;
3988
3989         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3990
3991         /* Serialise direct access to this object with the barriers for
3992          * coherent writes from the GPU, by effectively invalidating the
3993          * GTT domain upon first access.
3994          */
3995         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3996                 mb();
3997
3998         /* It should now be out of any other write domains, and we can update
3999          * the domain values for our changes.
4000          */
4001         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
4002         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4003         if (write) {
4004                 obj->read_domains = I915_GEM_DOMAIN_GTT;
4005                 obj->write_domain = I915_GEM_DOMAIN_GTT;
4006                 obj->mm.dirty = true;
4007         }
4008
4009         i915_gem_object_unpin_pages(obj);
4010         return 0;
4011 }
4012
4013 /**
4014  * Changes the cache-level of an object across all VMA.
4015  * @obj: object to act on
4016  * @cache_level: new cache level to set for the object
4017  *
4018  * After this function returns, the object will be in the new cache-level
4019  * across all GTT and the contents of the backing storage will be coherent,
4020  * with respect to the new cache-level. In order to keep the backing storage
4021  * coherent for all users, we only allow a single cache level to be set
4022  * globally on the object and prevent it from being changed whilst the
4023  * hardware is reading from the object. That is if the object is currently
4024  * on the scanout it will be set to uncached (or equivalent display
4025  * cache coherency) and all non-MOCS GPU access will also be uncached so
4026  * that all direct access to the scanout remains coherent.
4027  */
4028 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
4029                                     enum i915_cache_level cache_level)
4030 {
4031         struct i915_vma *vma;
4032         int ret;
4033
4034         lockdep_assert_held(&obj->base.dev->struct_mutex);
4035
4036         if (obj->cache_level == cache_level)
4037                 return 0;
4038
4039         /* Inspect the list of currently bound VMA and unbind any that would
4040          * be invalid given the new cache-level. This is principally to
4041          * catch the issue of the CS prefetch crossing page boundaries and
4042          * reading an invalid PTE on older architectures.
4043          */
4044 restart:
4045         list_for_each_entry(vma, &obj->vma_list, obj_link) {
4046                 if (!drm_mm_node_allocated(&vma->node))
4047                         continue;
4048
4049                 if (i915_vma_is_pinned(vma)) {
4050                         DRM_DEBUG("can not change the cache level of pinned objects\n");
4051                         return -EBUSY;
4052                 }
4053
4054                 if (!i915_vma_is_closed(vma) &&
4055                     i915_gem_valid_gtt_space(vma, cache_level))
4056                         continue;
4057
4058                 ret = i915_vma_unbind(vma);
4059                 if (ret)
4060                         return ret;
4061
4062                 /* As unbinding may affect other elements in the
4063                  * obj->vma_list (due to side-effects from retiring
4064                  * an active vma), play safe and restart the iterator.
4065                  */
4066                 goto restart;
4067         }
4068
4069         /* We can reuse the existing drm_mm nodes but need to change the
4070          * cache-level on the PTE. We could simply unbind them all and
4071          * rebind with the correct cache-level on next use. However since
4072          * we already have a valid slot, dma mapping, pages etc, we may as
4073          * rewrite the PTE in the belief that doing so tramples upon less
4074          * state and so involves less work.
4075          */
4076         if (obj->bind_count) {
4077                 /* Before we change the PTE, the GPU must not be accessing it.
4078                  * If we wait upon the object, we know that all the bound
4079                  * VMA are no longer active.
4080                  */
4081                 ret = i915_gem_object_wait(obj,
4082                                            I915_WAIT_INTERRUPTIBLE |
4083                                            I915_WAIT_LOCKED |
4084                                            I915_WAIT_ALL,
4085                                            MAX_SCHEDULE_TIMEOUT,
4086                                            NULL);
4087                 if (ret)
4088                         return ret;
4089
4090                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
4091                     cache_level != I915_CACHE_NONE) {
4092                         /* Access to snoopable pages through the GTT is
4093                          * incoherent and on some machines causes a hard
4094                          * lockup. Relinquish the CPU mmaping to force
4095                          * userspace to refault in the pages and we can
4096                          * then double check if the GTT mapping is still
4097                          * valid for that pointer access.
4098                          */
4099                         i915_gem_release_mmap(obj);
4100
4101                         /* As we no longer need a fence for GTT access,
4102                          * we can relinquish it now (and so prevent having
4103                          * to steal a fence from someone else on the next
4104                          * fence request). Note GPU activity would have
4105                          * dropped the fence as all snoopable access is
4106                          * supposed to be linear.
4107                          */
4108                         for_each_ggtt_vma(vma, obj) {
4109                                 ret = i915_vma_put_fence(vma);
4110                                 if (ret)
4111                                         return ret;
4112                         }
4113                 } else {
4114                         /* We either have incoherent backing store and
4115                          * so no GTT access or the architecture is fully
4116                          * coherent. In such cases, existing GTT mmaps
4117                          * ignore the cache bit in the PTE and we can
4118                          * rewrite it without confusing the GPU or having
4119                          * to force userspace to fault back in its mmaps.
4120                          */
4121                 }
4122
4123                 list_for_each_entry(vma, &obj->vma_list, obj_link) {
4124                         if (!drm_mm_node_allocated(&vma->node))
4125                                 continue;
4126
4127                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
4128                         if (ret)
4129                                 return ret;
4130                 }
4131         }
4132
4133         list_for_each_entry(vma, &obj->vma_list, obj_link)
4134                 vma->node.color = cache_level;
4135         i915_gem_object_set_cache_coherency(obj, cache_level);
4136         obj->cache_dirty = true; /* Always invalidate stale cachelines */
4137
4138         return 0;
4139 }
4140
4141 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
4142                                struct drm_file *file)
4143 {
4144         struct drm_i915_gem_caching *args = data;
4145         struct drm_i915_gem_object *obj;
4146         int err = 0;
4147
4148         rcu_read_lock();
4149         obj = i915_gem_object_lookup_rcu(file, args->handle);
4150         if (!obj) {
4151                 err = -ENOENT;
4152                 goto out;
4153         }
4154
4155         switch (obj->cache_level) {
4156         case I915_CACHE_LLC:
4157         case I915_CACHE_L3_LLC:
4158                 args->caching = I915_CACHING_CACHED;
4159                 break;
4160
4161         case I915_CACHE_WT:
4162                 args->caching = I915_CACHING_DISPLAY;
4163                 break;
4164
4165         default:
4166                 args->caching = I915_CACHING_NONE;
4167                 break;
4168         }
4169 out:
4170         rcu_read_unlock();
4171         return err;
4172 }
4173
4174 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4175                                struct drm_file *file)
4176 {
4177         struct drm_i915_private *i915 = to_i915(dev);
4178         struct drm_i915_gem_caching *args = data;
4179         struct drm_i915_gem_object *obj;
4180         enum i915_cache_level level;
4181         int ret = 0;
4182
4183         switch (args->caching) {
4184         case I915_CACHING_NONE:
4185                 level = I915_CACHE_NONE;
4186                 break;
4187         case I915_CACHING_CACHED:
4188                 /*
4189                  * Due to a HW issue on BXT A stepping, GPU stores via a
4190                  * snooped mapping may leave stale data in a corresponding CPU
4191                  * cacheline, whereas normally such cachelines would get
4192                  * invalidated.
4193                  */
4194                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4195                         return -ENODEV;
4196
4197                 level = I915_CACHE_LLC;
4198                 break;
4199         case I915_CACHING_DISPLAY:
4200                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4201                 break;
4202         default:
4203                 return -EINVAL;
4204         }
4205
4206         obj = i915_gem_object_lookup(file, args->handle);
4207         if (!obj)
4208                 return -ENOENT;
4209
4210         /*
4211          * The caching mode of proxy object is handled by its generator, and
4212          * not allowed to be changed by userspace.
4213          */
4214         if (i915_gem_object_is_proxy(obj)) {
4215                 ret = -ENXIO;
4216                 goto out;
4217         }
4218
4219         if (obj->cache_level == level)
4220                 goto out;
4221
4222         ret = i915_gem_object_wait(obj,
4223                                    I915_WAIT_INTERRUPTIBLE,
4224                                    MAX_SCHEDULE_TIMEOUT,
4225                                    to_rps_client(file));
4226         if (ret)
4227                 goto out;
4228
4229         ret = i915_mutex_lock_interruptible(dev);
4230         if (ret)
4231                 goto out;
4232
4233         ret = i915_gem_object_set_cache_level(obj, level);
4234         mutex_unlock(&dev->struct_mutex);
4235
4236 out:
4237         i915_gem_object_put(obj);
4238         return ret;
4239 }
4240
4241 /*
4242  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4243  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4244  * (for pageflips). We only flush the caches while preparing the buffer for
4245  * display, the callers are responsible for frontbuffer flush.
4246  */
4247 struct i915_vma *
4248 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4249                                      u32 alignment,
4250                                      const struct i915_ggtt_view *view,
4251                                      unsigned int flags)
4252 {
4253         struct i915_vma *vma;
4254         int ret;
4255
4256         lockdep_assert_held(&obj->base.dev->struct_mutex);
4257
4258         /* Mark the global pin early so that we account for the
4259          * display coherency whilst setting up the cache domains.
4260          */
4261         obj->pin_global++;
4262
4263         /* The display engine is not coherent with the LLC cache on gen6.  As
4264          * a result, we make sure that the pinning that is about to occur is
4265          * done with uncached PTEs. This is lowest common denominator for all
4266          * chipsets.
4267          *
4268          * However for gen6+, we could do better by using the GFDT bit instead
4269          * of uncaching, which would allow us to flush all the LLC-cached data
4270          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4271          */
4272         ret = i915_gem_object_set_cache_level(obj,
4273                                               HAS_WT(to_i915(obj->base.dev)) ?
4274                                               I915_CACHE_WT : I915_CACHE_NONE);
4275         if (ret) {
4276                 vma = ERR_PTR(ret);
4277                 goto err_unpin_global;
4278         }
4279
4280         /* As the user may map the buffer once pinned in the display plane
4281          * (e.g. libkms for the bootup splash), we have to ensure that we
4282          * always use map_and_fenceable for all scanout buffers. However,
4283          * it may simply be too big to fit into mappable, in which case
4284          * put it anyway and hope that userspace can cope (but always first
4285          * try to preserve the existing ABI).
4286          */
4287         vma = ERR_PTR(-ENOSPC);
4288         if ((flags & PIN_MAPPABLE) == 0 &&
4289             (!view || view->type == I915_GGTT_VIEW_NORMAL))
4290                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4291                                                flags |
4292                                                PIN_MAPPABLE |
4293                                                PIN_NONBLOCK);
4294         if (IS_ERR(vma))
4295                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4296         if (IS_ERR(vma))
4297                 goto err_unpin_global;
4298
4299         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4300
4301         __i915_gem_object_flush_for_display(obj);
4302
4303         /* It should now be out of any other write domains, and we can update
4304          * the domain values for our changes.
4305          */
4306         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4307
4308         return vma;
4309
4310 err_unpin_global:
4311         obj->pin_global--;
4312         return vma;
4313 }
4314
4315 void
4316 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4317 {
4318         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4319
4320         if (WARN_ON(vma->obj->pin_global == 0))
4321                 return;
4322
4323         if (--vma->obj->pin_global == 0)
4324                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4325
4326         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4327         i915_gem_object_bump_inactive_ggtt(vma->obj);
4328
4329         i915_vma_unpin(vma);
4330 }
4331
4332 /**
4333  * Moves a single object to the CPU read, and possibly write domain.
4334  * @obj: object to act on
4335  * @write: requesting write or read-only access
4336  *
4337  * This function returns when the move is complete, including waiting on
4338  * flushes to occur.
4339  */
4340 int
4341 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4342 {
4343         int ret;
4344
4345         lockdep_assert_held(&obj->base.dev->struct_mutex);
4346
4347         ret = i915_gem_object_wait(obj,
4348                                    I915_WAIT_INTERRUPTIBLE |
4349                                    I915_WAIT_LOCKED |
4350                                    (write ? I915_WAIT_ALL : 0),
4351                                    MAX_SCHEDULE_TIMEOUT,
4352                                    NULL);
4353         if (ret)
4354                 return ret;
4355
4356         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4357
4358         /* Flush the CPU cache if it's still invalid. */
4359         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4360                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4361                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4362         }
4363
4364         /* It should now be out of any other write domains, and we can update
4365          * the domain values for our changes.
4366          */
4367         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4368
4369         /* If we're writing through the CPU, then the GPU read domains will
4370          * need to be invalidated at next use.
4371          */
4372         if (write)
4373                 __start_cpu_write(obj);
4374
4375         return 0;
4376 }
4377
4378 /* Throttle our rendering by waiting until the ring has completed our requests
4379  * emitted over 20 msec ago.
4380  *
4381  * Note that if we were to use the current jiffies each time around the loop,
4382  * we wouldn't escape the function with any frames outstanding if the time to
4383  * render a frame was over 20ms.
4384  *
4385  * This should get us reasonable parallelism between CPU and GPU but also
4386  * relatively low latency when blocking on a particular request to finish.
4387  */
4388 static int
4389 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4390 {
4391         struct drm_i915_private *dev_priv = to_i915(dev);
4392         struct drm_i915_file_private *file_priv = file->driver_priv;
4393         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4394         struct i915_request *request, *target = NULL;
4395         long ret;
4396
4397         /* ABI: return -EIO if already wedged */
4398         if (i915_terminally_wedged(&dev_priv->gpu_error))
4399                 return -EIO;
4400
4401         spin_lock(&file_priv->mm.lock);
4402         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4403                 if (time_after_eq(request->emitted_jiffies, recent_enough))
4404                         break;
4405
4406                 if (target) {
4407                         list_del(&target->client_link);
4408                         target->file_priv = NULL;
4409                 }
4410
4411                 target = request;
4412         }
4413         if (target)
4414                 i915_request_get(target);
4415         spin_unlock(&file_priv->mm.lock);
4416
4417         if (target == NULL)
4418                 return 0;
4419
4420         ret = i915_request_wait(target,
4421                                 I915_WAIT_INTERRUPTIBLE,
4422                                 MAX_SCHEDULE_TIMEOUT);
4423         i915_request_put(target);
4424
4425         return ret < 0 ? ret : 0;
4426 }
4427
4428 struct i915_vma *
4429 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4430                          const struct i915_ggtt_view *view,
4431                          u64 size,
4432                          u64 alignment,
4433                          u64 flags)
4434 {
4435         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4436         struct i915_address_space *vm = &dev_priv->ggtt.vm;
4437         struct i915_vma *vma;
4438         int ret;
4439
4440         lockdep_assert_held(&obj->base.dev->struct_mutex);
4441
4442         if (flags & PIN_MAPPABLE &&
4443             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4444                 /* If the required space is larger than the available
4445                  * aperture, we will not able to find a slot for the
4446                  * object and unbinding the object now will be in
4447                  * vain. Worse, doing so may cause us to ping-pong
4448                  * the object in and out of the Global GTT and
4449                  * waste a lot of cycles under the mutex.
4450                  */
4451                 if (obj->base.size > dev_priv->ggtt.mappable_end)
4452                         return ERR_PTR(-E2BIG);
4453
4454                 /* If NONBLOCK is set the caller is optimistically
4455                  * trying to cache the full object within the mappable
4456                  * aperture, and *must* have a fallback in place for
4457                  * situations where we cannot bind the object. We
4458                  * can be a little more lax here and use the fallback
4459                  * more often to avoid costly migrations of ourselves
4460                  * and other objects within the aperture.
4461                  *
4462                  * Half-the-aperture is used as a simple heuristic.
4463                  * More interesting would to do search for a free
4464                  * block prior to making the commitment to unbind.
4465                  * That caters for the self-harm case, and with a
4466                  * little more heuristics (e.g. NOFAULT, NOEVICT)
4467                  * we could try to minimise harm to others.
4468                  */
4469                 if (flags & PIN_NONBLOCK &&
4470                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
4471                         return ERR_PTR(-ENOSPC);
4472         }
4473
4474         vma = i915_vma_instance(obj, vm, view);
4475         if (unlikely(IS_ERR(vma)))
4476                 return vma;
4477
4478         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4479                 if (flags & PIN_NONBLOCK) {
4480                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4481                                 return ERR_PTR(-ENOSPC);
4482
4483                         if (flags & PIN_MAPPABLE &&
4484                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4485                                 return ERR_PTR(-ENOSPC);
4486                 }
4487
4488                 WARN(i915_vma_is_pinned(vma),
4489                      "bo is already pinned in ggtt with incorrect alignment:"
4490                      " offset=%08x, req.alignment=%llx,"
4491                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4492                      i915_ggtt_offset(vma), alignment,
4493                      !!(flags & PIN_MAPPABLE),
4494                      i915_vma_is_map_and_fenceable(vma));
4495                 ret = i915_vma_unbind(vma);
4496                 if (ret)
4497                         return ERR_PTR(ret);
4498         }
4499
4500         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
4501         if (ret)
4502                 return ERR_PTR(ret);
4503
4504         return vma;
4505 }
4506
4507 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4508 {
4509         /* Note that we could alias engines in the execbuf API, but
4510          * that would be very unwise as it prevents userspace from
4511          * fine control over engine selection. Ahem.
4512          *
4513          * This should be something like EXEC_MAX_ENGINE instead of
4514          * I915_NUM_ENGINES.
4515          */
4516         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4517         return 0x10000 << id;
4518 }
4519
4520 static __always_inline unsigned int __busy_write_id(unsigned int id)
4521 {
4522         /* The uABI guarantees an active writer is also amongst the read
4523          * engines. This would be true if we accessed the activity tracking
4524          * under the lock, but as we perform the lookup of the object and
4525          * its activity locklessly we can not guarantee that the last_write
4526          * being active implies that we have set the same engine flag from
4527          * last_read - hence we always set both read and write busy for
4528          * last_write.
4529          */
4530         return id | __busy_read_flag(id);
4531 }
4532
4533 static __always_inline unsigned int
4534 __busy_set_if_active(const struct dma_fence *fence,
4535                      unsigned int (*flag)(unsigned int id))
4536 {
4537         struct i915_request *rq;
4538
4539         /* We have to check the current hw status of the fence as the uABI
4540          * guarantees forward progress. We could rely on the idle worker
4541          * to eventually flush us, but to minimise latency just ask the
4542          * hardware.
4543          *
4544          * Note we only report on the status of native fences.
4545          */
4546         if (!dma_fence_is_i915(fence))
4547                 return 0;
4548
4549         /* opencode to_request() in order to avoid const warnings */
4550         rq = container_of(fence, struct i915_request, fence);
4551         if (i915_request_completed(rq))
4552                 return 0;
4553
4554         return flag(rq->engine->uabi_id);
4555 }
4556
4557 static __always_inline unsigned int
4558 busy_check_reader(const struct dma_fence *fence)
4559 {
4560         return __busy_set_if_active(fence, __busy_read_flag);
4561 }
4562
4563 static __always_inline unsigned int
4564 busy_check_writer(const struct dma_fence *fence)
4565 {
4566         if (!fence)
4567                 return 0;
4568
4569         return __busy_set_if_active(fence, __busy_write_id);
4570 }
4571
4572 int
4573 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4574                     struct drm_file *file)
4575 {
4576         struct drm_i915_gem_busy *args = data;
4577         struct drm_i915_gem_object *obj;
4578         struct reservation_object_list *list;
4579         unsigned int seq;
4580         int err;
4581
4582         err = -ENOENT;
4583         rcu_read_lock();
4584         obj = i915_gem_object_lookup_rcu(file, args->handle);
4585         if (!obj)
4586                 goto out;
4587
4588         /* A discrepancy here is that we do not report the status of
4589          * non-i915 fences, i.e. even though we may report the object as idle,
4590          * a call to set-domain may still stall waiting for foreign rendering.
4591          * This also means that wait-ioctl may report an object as busy,
4592          * where busy-ioctl considers it idle.
4593          *
4594          * We trade the ability to warn of foreign fences to report on which
4595          * i915 engines are active for the object.
4596          *
4597          * Alternatively, we can trade that extra information on read/write
4598          * activity with
4599          *      args->busy =
4600          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4601          * to report the overall busyness. This is what the wait-ioctl does.
4602          *
4603          */
4604 retry:
4605         seq = raw_read_seqcount(&obj->resv->seq);
4606
4607         /* Translate the exclusive fence to the READ *and* WRITE engine */
4608         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4609
4610         /* Translate shared fences to READ set of engines */
4611         list = rcu_dereference(obj->resv->fence);
4612         if (list) {
4613                 unsigned int shared_count = list->shared_count, i;
4614
4615                 for (i = 0; i < shared_count; ++i) {
4616                         struct dma_fence *fence =
4617                                 rcu_dereference(list->shared[i]);
4618
4619                         args->busy |= busy_check_reader(fence);
4620                 }
4621         }
4622
4623         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4624                 goto retry;
4625
4626         err = 0;
4627 out:
4628         rcu_read_unlock();
4629         return err;
4630 }
4631
4632 int
4633 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4634                         struct drm_file *file_priv)
4635 {
4636         return i915_gem_ring_throttle(dev, file_priv);
4637 }
4638
4639 int
4640 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4641                        struct drm_file *file_priv)
4642 {
4643         struct drm_i915_private *dev_priv = to_i915(dev);
4644         struct drm_i915_gem_madvise *args = data;
4645         struct drm_i915_gem_object *obj;
4646         int err;
4647
4648         switch (args->madv) {
4649         case I915_MADV_DONTNEED:
4650         case I915_MADV_WILLNEED:
4651             break;
4652         default:
4653             return -EINVAL;
4654         }
4655
4656         obj = i915_gem_object_lookup(file_priv, args->handle);
4657         if (!obj)
4658                 return -ENOENT;
4659
4660         err = mutex_lock_interruptible(&obj->mm.lock);
4661         if (err)
4662                 goto out;
4663
4664         if (i915_gem_object_has_pages(obj) &&
4665             i915_gem_object_is_tiled(obj) &&
4666             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4667                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4668                         GEM_BUG_ON(!obj->mm.quirked);
4669                         __i915_gem_object_unpin_pages(obj);
4670                         obj->mm.quirked = false;
4671                 }
4672                 if (args->madv == I915_MADV_WILLNEED) {
4673                         GEM_BUG_ON(obj->mm.quirked);
4674                         __i915_gem_object_pin_pages(obj);
4675                         obj->mm.quirked = true;
4676                 }
4677         }
4678
4679         if (obj->mm.madv != __I915_MADV_PURGED)
4680                 obj->mm.madv = args->madv;
4681
4682         /* if the object is no longer attached, discard its backing storage */
4683         if (obj->mm.madv == I915_MADV_DONTNEED &&
4684             !i915_gem_object_has_pages(obj))
4685                 i915_gem_object_truncate(obj);
4686
4687         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4688         mutex_unlock(&obj->mm.lock);
4689
4690 out:
4691         i915_gem_object_put(obj);
4692         return err;
4693 }
4694
4695 static void
4696 frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4697 {
4698         struct drm_i915_gem_object *obj =
4699                 container_of(active, typeof(*obj), frontbuffer_write);
4700
4701         intel_fb_obj_flush(obj, ORIGIN_CS);
4702 }
4703
4704 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4705                           const struct drm_i915_gem_object_ops *ops)
4706 {
4707         mutex_init(&obj->mm.lock);
4708
4709         INIT_LIST_HEAD(&obj->vma_list);
4710         INIT_LIST_HEAD(&obj->lut_list);
4711         INIT_LIST_HEAD(&obj->batch_pool_link);
4712
4713         obj->ops = ops;
4714
4715         reservation_object_init(&obj->__builtin_resv);
4716         obj->resv = &obj->__builtin_resv;
4717
4718         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4719         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4720
4721         obj->mm.madv = I915_MADV_WILLNEED;
4722         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4723         mutex_init(&obj->mm.get_page.lock);
4724
4725         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4726 }
4727
4728 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4729         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4730                  I915_GEM_OBJECT_IS_SHRINKABLE,
4731
4732         .get_pages = i915_gem_object_get_pages_gtt,
4733         .put_pages = i915_gem_object_put_pages_gtt,
4734
4735         .pwrite = i915_gem_object_pwrite_gtt,
4736 };
4737
4738 static int i915_gem_object_create_shmem(struct drm_device *dev,
4739                                         struct drm_gem_object *obj,
4740                                         size_t size)
4741 {
4742         struct drm_i915_private *i915 = to_i915(dev);
4743         unsigned long flags = VM_NORESERVE;
4744         struct file *filp;
4745
4746         drm_gem_private_object_init(dev, obj, size);
4747
4748         if (i915->mm.gemfs)
4749                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4750                                                  flags);
4751         else
4752                 filp = shmem_file_setup("i915", size, flags);
4753
4754         if (IS_ERR(filp))
4755                 return PTR_ERR(filp);
4756
4757         obj->filp = filp;
4758
4759         return 0;
4760 }
4761
4762 struct drm_i915_gem_object *
4763 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4764 {
4765         struct drm_i915_gem_object *obj;
4766         struct address_space *mapping;
4767         unsigned int cache_level;
4768         gfp_t mask;
4769         int ret;
4770
4771         /* There is a prevalence of the assumption that we fit the object's
4772          * page count inside a 32bit _signed_ variable. Let's document this and
4773          * catch if we ever need to fix it. In the meantime, if you do spot
4774          * such a local variable, please consider fixing!
4775          */
4776         if (size >> PAGE_SHIFT > INT_MAX)
4777                 return ERR_PTR(-E2BIG);
4778
4779         if (overflows_type(size, obj->base.size))
4780                 return ERR_PTR(-E2BIG);
4781
4782         obj = i915_gem_object_alloc(dev_priv);
4783         if (obj == NULL)
4784                 return ERR_PTR(-ENOMEM);
4785
4786         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4787         if (ret)
4788                 goto fail;
4789
4790         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4791         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4792                 /* 965gm cannot relocate objects above 4GiB. */
4793                 mask &= ~__GFP_HIGHMEM;
4794                 mask |= __GFP_DMA32;
4795         }
4796
4797         mapping = obj->base.filp->f_mapping;
4798         mapping_set_gfp_mask(mapping, mask);
4799         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4800
4801         i915_gem_object_init(obj, &i915_gem_object_ops);
4802
4803         obj->write_domain = I915_GEM_DOMAIN_CPU;
4804         obj->read_domains = I915_GEM_DOMAIN_CPU;
4805
4806         if (HAS_LLC(dev_priv))
4807                 /* On some devices, we can have the GPU use the LLC (the CPU
4808                  * cache) for about a 10% performance improvement
4809                  * compared to uncached.  Graphics requests other than
4810                  * display scanout are coherent with the CPU in
4811                  * accessing this cache.  This means in this mode we
4812                  * don't need to clflush on the CPU side, and on the
4813                  * GPU side we only need to flush internal caches to
4814                  * get data visible to the CPU.
4815                  *
4816                  * However, we maintain the display planes as UC, and so
4817                  * need to rebind when first used as such.
4818                  */
4819                 cache_level = I915_CACHE_LLC;
4820         else
4821                 cache_level = I915_CACHE_NONE;
4822
4823         i915_gem_object_set_cache_coherency(obj, cache_level);
4824
4825         trace_i915_gem_object_create(obj);
4826
4827         return obj;
4828
4829 fail:
4830         i915_gem_object_free(obj);
4831         return ERR_PTR(ret);
4832 }
4833
4834 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4835 {
4836         /* If we are the last user of the backing storage (be it shmemfs
4837          * pages or stolen etc), we know that the pages are going to be
4838          * immediately released. In this case, we can then skip copying
4839          * back the contents from the GPU.
4840          */
4841
4842         if (obj->mm.madv != I915_MADV_WILLNEED)
4843                 return false;
4844
4845         if (obj->base.filp == NULL)
4846                 return true;
4847
4848         /* At first glance, this looks racy, but then again so would be
4849          * userspace racing mmap against close. However, the first external
4850          * reference to the filp can only be obtained through the
4851          * i915_gem_mmap_ioctl() which safeguards us against the user
4852          * acquiring such a reference whilst we are in the middle of
4853          * freeing the object.
4854          */
4855         return atomic_long_read(&obj->base.filp->f_count) == 1;
4856 }
4857
4858 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4859                                     struct llist_node *freed)
4860 {
4861         struct drm_i915_gem_object *obj, *on;
4862
4863         intel_runtime_pm_get(i915);
4864         llist_for_each_entry_safe(obj, on, freed, freed) {
4865                 struct i915_vma *vma, *vn;
4866
4867                 trace_i915_gem_object_destroy(obj);
4868
4869                 mutex_lock(&i915->drm.struct_mutex);
4870
4871                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4872                 list_for_each_entry_safe(vma, vn,
4873                                          &obj->vma_list, obj_link) {
4874                         GEM_BUG_ON(i915_vma_is_active(vma));
4875                         vma->flags &= ~I915_VMA_PIN_MASK;
4876                         i915_vma_destroy(vma);
4877                 }
4878                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4879                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4880
4881                 /* This serializes freeing with the shrinker. Since the free
4882                  * is delayed, first by RCU then by the workqueue, we want the
4883                  * shrinker to be able to free pages of unreferenced objects,
4884                  * or else we may oom whilst there are plenty of deferred
4885                  * freed objects.
4886                  */
4887                 if (i915_gem_object_has_pages(obj)) {
4888                         spin_lock(&i915->mm.obj_lock);
4889                         list_del_init(&obj->mm.link);
4890                         spin_unlock(&i915->mm.obj_lock);
4891                 }
4892
4893                 mutex_unlock(&i915->drm.struct_mutex);
4894
4895                 GEM_BUG_ON(obj->bind_count);
4896                 GEM_BUG_ON(obj->userfault_count);
4897                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4898                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4899
4900                 if (obj->ops->release)
4901                         obj->ops->release(obj);
4902
4903                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4904                         atomic_set(&obj->mm.pages_pin_count, 0);
4905                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4906                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4907
4908                 if (obj->base.import_attach)
4909                         drm_prime_gem_destroy(&obj->base, NULL);
4910
4911                 reservation_object_fini(&obj->__builtin_resv);
4912                 drm_gem_object_release(&obj->base);
4913                 i915_gem_info_remove_obj(i915, obj->base.size);
4914
4915                 kfree(obj->bit_17);
4916                 i915_gem_object_free(obj);
4917
4918                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4919                 atomic_dec(&i915->mm.free_count);
4920
4921                 if (on)
4922                         cond_resched();
4923         }
4924         intel_runtime_pm_put(i915);
4925 }
4926
4927 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4928 {
4929         struct llist_node *freed;
4930
4931         /* Free the oldest, most stale object to keep the free_list short */
4932         freed = NULL;
4933         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4934                 /* Only one consumer of llist_del_first() allowed */
4935                 spin_lock(&i915->mm.free_lock);
4936                 freed = llist_del_first(&i915->mm.free_list);
4937                 spin_unlock(&i915->mm.free_lock);
4938         }
4939         if (unlikely(freed)) {
4940                 freed->next = NULL;
4941                 __i915_gem_free_objects(i915, freed);
4942         }
4943 }
4944
4945 static void __i915_gem_free_work(struct work_struct *work)
4946 {
4947         struct drm_i915_private *i915 =
4948                 container_of(work, struct drm_i915_private, mm.free_work);
4949         struct llist_node *freed;
4950
4951         /*
4952          * All file-owned VMA should have been released by this point through
4953          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4954          * However, the object may also be bound into the global GTT (e.g.
4955          * older GPUs without per-process support, or for direct access through
4956          * the GTT either for the user or for scanout). Those VMA still need to
4957          * unbound now.
4958          */
4959
4960         spin_lock(&i915->mm.free_lock);
4961         while ((freed = llist_del_all(&i915->mm.free_list))) {
4962                 spin_unlock(&i915->mm.free_lock);
4963
4964                 __i915_gem_free_objects(i915, freed);
4965                 if (need_resched())
4966                         return;
4967
4968                 spin_lock(&i915->mm.free_lock);
4969         }
4970         spin_unlock(&i915->mm.free_lock);
4971 }
4972
4973 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4974 {
4975         struct drm_i915_gem_object *obj =
4976                 container_of(head, typeof(*obj), rcu);
4977         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4978
4979         /*
4980          * Since we require blocking on struct_mutex to unbind the freed
4981          * object from the GPU before releasing resources back to the
4982          * system, we can not do that directly from the RCU callback (which may
4983          * be a softirq context), but must instead then defer that work onto a
4984          * kthread. We use the RCU callback rather than move the freed object
4985          * directly onto the work queue so that we can mix between using the
4986          * worker and performing frees directly from subsequent allocations for
4987          * crude but effective memory throttling.
4988          */
4989         if (llist_add(&obj->freed, &i915->mm.free_list))
4990                 queue_work(i915->wq, &i915->mm.free_work);
4991 }
4992
4993 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4994 {
4995         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4996
4997         if (obj->mm.quirked)
4998                 __i915_gem_object_unpin_pages(obj);
4999
5000         if (discard_backing_storage(obj))
5001                 obj->mm.madv = I915_MADV_DONTNEED;
5002
5003         /*
5004          * Before we free the object, make sure any pure RCU-only
5005          * read-side critical sections are complete, e.g.
5006          * i915_gem_busy_ioctl(). For the corresponding synchronized
5007          * lookup see i915_gem_object_lookup_rcu().
5008          */
5009         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
5010         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
5011 }
5012
5013 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
5014 {
5015         lockdep_assert_held(&obj->base.dev->struct_mutex);
5016
5017         if (!i915_gem_object_has_active_reference(obj) &&
5018             i915_gem_object_is_active(obj))
5019                 i915_gem_object_set_active_reference(obj);
5020         else
5021                 i915_gem_object_put(obj);
5022 }
5023
5024 void i915_gem_sanitize(struct drm_i915_private *i915)
5025 {
5026         int err;
5027
5028         GEM_TRACE("\n");
5029
5030         mutex_lock(&i915->drm.struct_mutex);
5031
5032         intel_runtime_pm_get(i915);
5033         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5034
5035         /*
5036          * As we have just resumed the machine and woken the device up from
5037          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
5038          * back to defaults, recovering from whatever wedged state we left it
5039          * in and so worth trying to use the device once more.
5040          */
5041         if (i915_terminally_wedged(&i915->gpu_error))
5042                 i915_gem_unset_wedged(i915);
5043
5044         /*
5045          * If we inherit context state from the BIOS or earlier occupants
5046          * of the GPU, the GPU may be in an inconsistent state when we
5047          * try to take over. The only way to remove the earlier state
5048          * is by resetting. However, resetting on earlier gen is tricky as
5049          * it may impact the display and we are uncertain about the stability
5050          * of the reset, so this could be applied to even earlier gen.
5051          */
5052         err = -ENODEV;
5053         if (INTEL_GEN(i915) >= 5 && intel_has_gpu_reset(i915))
5054                 err = WARN_ON(intel_gpu_reset(i915, ALL_ENGINES));
5055         if (!err)
5056                 intel_engines_sanitize(i915);
5057
5058         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5059         intel_runtime_pm_put(i915);
5060
5061         i915_gem_contexts_lost(i915);
5062         mutex_unlock(&i915->drm.struct_mutex);
5063 }
5064
5065 int i915_gem_suspend(struct drm_i915_private *i915)
5066 {
5067         int ret;
5068
5069         GEM_TRACE("\n");
5070
5071         intel_runtime_pm_get(i915);
5072         intel_suspend_gt_powersave(i915);
5073
5074         mutex_lock(&i915->drm.struct_mutex);
5075
5076         /*
5077          * We have to flush all the executing contexts to main memory so
5078          * that they can saved in the hibernation image. To ensure the last
5079          * context image is coherent, we have to switch away from it. That
5080          * leaves the i915->kernel_context still active when
5081          * we actually suspend, and its image in memory may not match the GPU
5082          * state. Fortunately, the kernel_context is disposable and we do
5083          * not rely on its state.
5084          */
5085         if (!i915_terminally_wedged(&i915->gpu_error)) {
5086                 ret = i915_gem_switch_to_kernel_context(i915);
5087                 if (ret)
5088                         goto err_unlock;
5089
5090                 ret = i915_gem_wait_for_idle(i915,
5091                                              I915_WAIT_INTERRUPTIBLE |
5092                                              I915_WAIT_LOCKED |
5093                                              I915_WAIT_FOR_IDLE_BOOST,
5094                                              MAX_SCHEDULE_TIMEOUT);
5095                 if (ret && ret != -EIO)
5096                         goto err_unlock;
5097
5098                 assert_kernel_context_is_current(i915);
5099         }
5100         i915_retire_requests(i915); /* ensure we flush after wedging */
5101
5102         mutex_unlock(&i915->drm.struct_mutex);
5103
5104         intel_uc_suspend(i915);
5105
5106         cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
5107         cancel_delayed_work_sync(&i915->gt.retire_work);
5108
5109         /*
5110          * As the idle_work is rearming if it detects a race, play safe and
5111          * repeat the flush until it is definitely idle.
5112          */
5113         drain_delayed_work(&i915->gt.idle_work);
5114
5115         /*
5116          * Assert that we successfully flushed all the work and
5117          * reset the GPU back to its idle, low power state.
5118          */
5119         WARN_ON(i915->gt.awake);
5120         if (WARN_ON(!intel_engines_are_idle(i915)))
5121                 i915_gem_set_wedged(i915); /* no hope, discard everything */
5122
5123         intel_runtime_pm_put(i915);
5124         return 0;
5125
5126 err_unlock:
5127         mutex_unlock(&i915->drm.struct_mutex);
5128         intel_runtime_pm_put(i915);
5129         return ret;
5130 }
5131
5132 void i915_gem_suspend_late(struct drm_i915_private *i915)
5133 {
5134         struct drm_i915_gem_object *obj;
5135         struct list_head *phases[] = {
5136                 &i915->mm.unbound_list,
5137                 &i915->mm.bound_list,
5138                 NULL
5139         }, **phase;
5140
5141         /*
5142          * Neither the BIOS, ourselves or any other kernel
5143          * expects the system to be in execlists mode on startup,
5144          * so we need to reset the GPU back to legacy mode. And the only
5145          * known way to disable logical contexts is through a GPU reset.
5146          *
5147          * So in order to leave the system in a known default configuration,
5148          * always reset the GPU upon unload and suspend. Afterwards we then
5149          * clean up the GEM state tracking, flushing off the requests and
5150          * leaving the system in a known idle state.
5151          *
5152          * Note that is of the upmost importance that the GPU is idle and
5153          * all stray writes are flushed *before* we dismantle the backing
5154          * storage for the pinned objects.
5155          *
5156          * However, since we are uncertain that resetting the GPU on older
5157          * machines is a good idea, we don't - just in case it leaves the
5158          * machine in an unusable condition.
5159          */
5160
5161         mutex_lock(&i915->drm.struct_mutex);
5162         for (phase = phases; *phase; phase++) {
5163                 list_for_each_entry(obj, *phase, mm.link)
5164                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
5165         }
5166         mutex_unlock(&i915->drm.struct_mutex);
5167
5168         intel_uc_sanitize(i915);
5169         i915_gem_sanitize(i915);
5170 }
5171
5172 void i915_gem_resume(struct drm_i915_private *i915)
5173 {
5174         GEM_TRACE("\n");
5175
5176         WARN_ON(i915->gt.awake);
5177
5178         mutex_lock(&i915->drm.struct_mutex);
5179         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5180
5181         i915_gem_restore_gtt_mappings(i915);
5182         i915_gem_restore_fences(i915);
5183
5184         /*
5185          * As we didn't flush the kernel context before suspend, we cannot
5186          * guarantee that the context image is complete. So let's just reset
5187          * it and start again.
5188          */
5189         i915->gt.resume(i915);
5190
5191         if (i915_gem_init_hw(i915))
5192                 goto err_wedged;
5193
5194         intel_uc_resume(i915);
5195
5196         /* Always reload a context for powersaving. */
5197         if (i915_gem_switch_to_kernel_context(i915))
5198                 goto err_wedged;
5199
5200 out_unlock:
5201         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5202         mutex_unlock(&i915->drm.struct_mutex);
5203         return;
5204
5205 err_wedged:
5206         if (!i915_terminally_wedged(&i915->gpu_error)) {
5207                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5208                 i915_gem_set_wedged(i915);
5209         }
5210         goto out_unlock;
5211 }
5212
5213 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5214 {
5215         if (INTEL_GEN(dev_priv) < 5 ||
5216             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5217                 return;
5218
5219         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5220                                  DISP_TILE_SURFACE_SWIZZLING);
5221
5222         if (IS_GEN5(dev_priv))
5223                 return;
5224
5225         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5226         if (IS_GEN6(dev_priv))
5227                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5228         else if (IS_GEN7(dev_priv))
5229                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5230         else if (IS_GEN8(dev_priv))
5231                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5232         else
5233                 BUG();
5234 }
5235
5236 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5237 {
5238         I915_WRITE(RING_CTL(base), 0);
5239         I915_WRITE(RING_HEAD(base), 0);
5240         I915_WRITE(RING_TAIL(base), 0);
5241         I915_WRITE(RING_START(base), 0);
5242 }
5243
5244 static void init_unused_rings(struct drm_i915_private *dev_priv)
5245 {
5246         if (IS_I830(dev_priv)) {
5247                 init_unused_ring(dev_priv, PRB1_BASE);
5248                 init_unused_ring(dev_priv, SRB0_BASE);
5249                 init_unused_ring(dev_priv, SRB1_BASE);
5250                 init_unused_ring(dev_priv, SRB2_BASE);
5251                 init_unused_ring(dev_priv, SRB3_BASE);
5252         } else if (IS_GEN2(dev_priv)) {
5253                 init_unused_ring(dev_priv, SRB0_BASE);
5254                 init_unused_ring(dev_priv, SRB1_BASE);
5255         } else if (IS_GEN3(dev_priv)) {
5256                 init_unused_ring(dev_priv, PRB1_BASE);
5257                 init_unused_ring(dev_priv, PRB2_BASE);
5258         }
5259 }
5260
5261 static int __i915_gem_restart_engines(void *data)
5262 {
5263         struct drm_i915_private *i915 = data;
5264         struct intel_engine_cs *engine;
5265         enum intel_engine_id id;
5266         int err;
5267
5268         for_each_engine(engine, i915, id) {
5269                 err = engine->init_hw(engine);
5270                 if (err) {
5271                         DRM_ERROR("Failed to restart %s (%d)\n",
5272                                   engine->name, err);
5273                         return err;
5274                 }
5275         }
5276
5277         return 0;
5278 }
5279
5280 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5281 {
5282         int ret;
5283
5284         dev_priv->gt.last_init_time = ktime_get();
5285
5286         /* Double layer security blanket, see i915_gem_init() */
5287         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5288
5289         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5290                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5291
5292         if (IS_HASWELL(dev_priv))
5293                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5294                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5295
5296         if (HAS_PCH_NOP(dev_priv)) {
5297                 if (IS_IVYBRIDGE(dev_priv)) {
5298                         u32 temp = I915_READ(GEN7_MSG_CTL);
5299                         temp &= ~(WAIT_FOR_PCH_FLR_ACK | WAIT_FOR_PCH_RESET_ACK);
5300                         I915_WRITE(GEN7_MSG_CTL, temp);
5301                 } else if (INTEL_GEN(dev_priv) >= 7) {
5302                         u32 temp = I915_READ(HSW_NDE_RSTWRN_OPT);
5303                         temp &= ~RESET_PCH_HANDSHAKE_ENABLE;
5304                         I915_WRITE(HSW_NDE_RSTWRN_OPT, temp);
5305                 }
5306         }
5307
5308         intel_gt_workarounds_apply(dev_priv);
5309
5310         i915_gem_init_swizzling(dev_priv);
5311
5312         /*
5313          * At least 830 can leave some of the unused rings
5314          * "active" (ie. head != tail) after resume which
5315          * will prevent c3 entry. Makes sure all unused rings
5316          * are totally idle.
5317          */
5318         init_unused_rings(dev_priv);
5319
5320         BUG_ON(!dev_priv->kernel_context);
5321         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5322                 ret = -EIO;
5323                 goto out;
5324         }
5325
5326         ret = i915_ppgtt_init_hw(dev_priv);
5327         if (ret) {
5328                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5329                 goto out;
5330         }
5331
5332         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5333         if (ret) {
5334                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5335                 goto out;
5336         }
5337
5338         /* We can't enable contexts until all firmware is loaded */
5339         ret = intel_uc_init_hw(dev_priv);
5340         if (ret) {
5341                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5342                 goto out;
5343         }
5344
5345         intel_mocs_init_l3cc_table(dev_priv);
5346
5347         /* Only when the HW is re-initialised, can we replay the requests */
5348         ret = __i915_gem_restart_engines(dev_priv);
5349         if (ret)
5350                 goto cleanup_uc;
5351
5352         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5353
5354         return 0;
5355
5356 cleanup_uc:
5357         intel_uc_fini_hw(dev_priv);
5358 out:
5359         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5360
5361         return ret;
5362 }
5363
5364 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5365 {
5366         struct i915_gem_context *ctx;
5367         struct intel_engine_cs *engine;
5368         enum intel_engine_id id;
5369         int err;
5370
5371         /*
5372          * As we reset the gpu during very early sanitisation, the current
5373          * register state on the GPU should reflect its defaults values.
5374          * We load a context onto the hw (with restore-inhibit), then switch
5375          * over to a second context to save that default register state. We
5376          * can then prime every new context with that state so they all start
5377          * from the same default HW values.
5378          */
5379
5380         ctx = i915_gem_context_create_kernel(i915, 0);
5381         if (IS_ERR(ctx))
5382                 return PTR_ERR(ctx);
5383
5384         for_each_engine(engine, i915, id) {
5385                 struct i915_request *rq;
5386
5387                 rq = i915_request_alloc(engine, ctx);
5388                 if (IS_ERR(rq)) {
5389                         err = PTR_ERR(rq);
5390                         goto out_ctx;
5391                 }
5392
5393                 err = 0;
5394                 if (engine->init_context)
5395                         err = engine->init_context(rq);
5396
5397                 i915_request_add(rq);
5398                 if (err)
5399                         goto err_active;
5400         }
5401
5402         err = i915_gem_switch_to_kernel_context(i915);
5403         if (err)
5404                 goto err_active;
5405
5406         if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
5407                 i915_gem_set_wedged(i915);
5408                 err = -EIO; /* Caller will declare us wedged */
5409                 goto err_active;
5410         }
5411
5412         assert_kernel_context_is_current(i915);
5413
5414         /*
5415          * Immediately park the GPU so that we enable powersaving and
5416          * treat it as idle. The next time we issue a request, we will
5417          * unpark and start using the engine->pinned_default_state, otherwise
5418          * it is in limbo and an early reset may fail.
5419          */
5420         __i915_gem_park(i915);
5421
5422         for_each_engine(engine, i915, id) {
5423                 struct i915_vma *state;
5424                 void *vaddr;
5425
5426                 GEM_BUG_ON(to_intel_context(ctx, engine)->pin_count);
5427
5428                 state = to_intel_context(ctx, engine)->state;
5429                 if (!state)
5430                         continue;
5431
5432                 /*
5433                  * As we will hold a reference to the logical state, it will
5434                  * not be torn down with the context, and importantly the
5435                  * object will hold onto its vma (making it possible for a
5436                  * stray GTT write to corrupt our defaults). Unmap the vma
5437                  * from the GTT to prevent such accidents and reclaim the
5438                  * space.
5439                  */
5440                 err = i915_vma_unbind(state);
5441                 if (err)
5442                         goto err_active;
5443
5444                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5445                 if (err)
5446                         goto err_active;
5447
5448                 engine->default_state = i915_gem_object_get(state->obj);
5449
5450                 /* Check we can acquire the image of the context state */
5451                 vaddr = i915_gem_object_pin_map(engine->default_state,
5452                                                 I915_MAP_FORCE_WB);
5453                 if (IS_ERR(vaddr)) {
5454                         err = PTR_ERR(vaddr);
5455                         goto err_active;
5456                 }
5457
5458                 i915_gem_object_unpin_map(engine->default_state);
5459         }
5460
5461         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5462                 unsigned int found = intel_engines_has_context_isolation(i915);
5463
5464                 /*
5465                  * Make sure that classes with multiple engine instances all
5466                  * share the same basic configuration.
5467                  */
5468                 for_each_engine(engine, i915, id) {
5469                         unsigned int bit = BIT(engine->uabi_class);
5470                         unsigned int expected = engine->default_state ? bit : 0;
5471
5472                         if ((found & bit) != expected) {
5473                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5474                                           engine->uabi_class, engine->name);
5475                         }
5476                 }
5477         }
5478
5479 out_ctx:
5480         i915_gem_context_set_closed(ctx);
5481         i915_gem_context_put(ctx);
5482         return err;
5483
5484 err_active:
5485         /*
5486          * If we have to abandon now, we expect the engines to be idle
5487          * and ready to be torn-down. First try to flush any remaining
5488          * request, ensure we are pointing at the kernel context and
5489          * then remove it.
5490          */
5491         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5492                 goto out_ctx;
5493
5494         if (WARN_ON(i915_gem_wait_for_idle(i915,
5495                                            I915_WAIT_LOCKED,
5496                                            MAX_SCHEDULE_TIMEOUT)))
5497                 goto out_ctx;
5498
5499         i915_gem_contexts_lost(i915);
5500         goto out_ctx;
5501 }
5502
5503 int i915_gem_init(struct drm_i915_private *dev_priv)
5504 {
5505         int ret;
5506
5507         /* We need to fallback to 4K pages if host doesn't support huge gtt. */
5508         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5509                 mkwrite_device_info(dev_priv)->page_sizes =
5510                         I915_GTT_PAGE_SIZE_4K;
5511
5512         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5513
5514         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5515                 dev_priv->gt.resume = intel_lr_context_resume;
5516                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5517         } else {
5518                 dev_priv->gt.resume = intel_legacy_submission_resume;
5519                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5520         }
5521
5522         ret = i915_gem_init_userptr(dev_priv);
5523         if (ret)
5524                 return ret;
5525
5526         ret = intel_uc_init_misc(dev_priv);
5527         if (ret)
5528                 return ret;
5529
5530         ret = intel_wopcm_init(&dev_priv->wopcm);
5531         if (ret)
5532                 goto err_uc_misc;
5533
5534         /* This is just a security blanket to placate dragons.
5535          * On some systems, we very sporadically observe that the first TLBs
5536          * used by the CS may be stale, despite us poking the TLB reset. If
5537          * we hold the forcewake during initialisation these problems
5538          * just magically go away.
5539          */
5540         mutex_lock(&dev_priv->drm.struct_mutex);
5541         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5542
5543         ret = i915_gem_init_ggtt(dev_priv);
5544         if (ret) {
5545                 GEM_BUG_ON(ret == -EIO);
5546                 goto err_unlock;
5547         }
5548
5549         ret = i915_gem_contexts_init(dev_priv);
5550         if (ret) {
5551                 GEM_BUG_ON(ret == -EIO);
5552                 goto err_ggtt;
5553         }
5554
5555         ret = intel_engines_init(dev_priv);
5556         if (ret) {
5557                 GEM_BUG_ON(ret == -EIO);
5558                 goto err_context;
5559         }
5560
5561         intel_init_gt_powersave(dev_priv);
5562
5563         ret = intel_uc_init(dev_priv);
5564         if (ret)
5565                 goto err_pm;
5566
5567         ret = i915_gem_init_hw(dev_priv);
5568         if (ret)
5569                 goto err_uc_init;
5570
5571         /*
5572          * Despite its name intel_init_clock_gating applies both display
5573          * clock gating workarounds; GT mmio workarounds and the occasional
5574          * GT power context workaround. Worse, sometimes it includes a context
5575          * register workaround which we need to apply before we record the
5576          * default HW state for all contexts.
5577          *
5578          * FIXME: break up the workarounds and apply them at the right time!
5579          */
5580         intel_init_clock_gating(dev_priv);
5581
5582         ret = __intel_engines_record_defaults(dev_priv);
5583         if (ret)
5584                 goto err_init_hw;
5585
5586         if (i915_inject_load_failure()) {
5587                 ret = -ENODEV;
5588                 goto err_init_hw;
5589         }
5590
5591         if (i915_inject_load_failure()) {
5592                 ret = -EIO;
5593                 goto err_init_hw;
5594         }
5595
5596         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5597         mutex_unlock(&dev_priv->drm.struct_mutex);
5598
5599         return 0;
5600
5601         /*
5602          * Unwinding is complicated by that we want to handle -EIO to mean
5603          * disable GPU submission but keep KMS alive. We want to mark the
5604          * HW as irrevisibly wedged, but keep enough state around that the
5605          * driver doesn't explode during runtime.
5606          */
5607 err_init_hw:
5608         mutex_unlock(&dev_priv->drm.struct_mutex);
5609
5610         WARN_ON(i915_gem_suspend(dev_priv));
5611         i915_gem_suspend_late(dev_priv);
5612
5613         i915_gem_drain_workqueue(dev_priv);
5614
5615         mutex_lock(&dev_priv->drm.struct_mutex);
5616         intel_uc_fini_hw(dev_priv);
5617 err_uc_init:
5618         intel_uc_fini(dev_priv);
5619 err_pm:
5620         if (ret != -EIO) {
5621                 intel_cleanup_gt_powersave(dev_priv);
5622                 i915_gem_cleanup_engines(dev_priv);
5623         }
5624 err_context:
5625         if (ret != -EIO)
5626                 i915_gem_contexts_fini(dev_priv);
5627 err_ggtt:
5628 err_unlock:
5629         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5630         mutex_unlock(&dev_priv->drm.struct_mutex);
5631
5632 err_uc_misc:
5633         intel_uc_fini_misc(dev_priv);
5634
5635         if (ret != -EIO)
5636                 i915_gem_cleanup_userptr(dev_priv);
5637
5638         if (ret == -EIO) {
5639                 mutex_lock(&dev_priv->drm.struct_mutex);
5640
5641                 /*
5642                  * Allow engine initialisation to fail by marking the GPU as
5643                  * wedged. But we only want to do this where the GPU is angry,
5644                  * for all other failure, such as an allocation failure, bail.
5645                  */
5646                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5647                         i915_load_error(dev_priv,
5648                                         "Failed to initialize GPU, declaring it wedged!\n");
5649                         i915_gem_set_wedged(dev_priv);
5650                 }
5651
5652                 /* Minimal basic recovery for KMS */
5653                 ret = i915_ggtt_enable_hw(dev_priv);
5654                 i915_gem_restore_gtt_mappings(dev_priv);
5655                 i915_gem_restore_fences(dev_priv);
5656                 intel_init_clock_gating(dev_priv);
5657
5658                 mutex_unlock(&dev_priv->drm.struct_mutex);
5659         }
5660
5661         i915_gem_drain_freed_objects(dev_priv);
5662         return ret;
5663 }
5664
5665 void i915_gem_fini(struct drm_i915_private *dev_priv)
5666 {
5667         i915_gem_suspend_late(dev_priv);
5668         intel_disable_gt_powersave(dev_priv);
5669
5670         /* Flush any outstanding unpin_work. */
5671         i915_gem_drain_workqueue(dev_priv);
5672
5673         mutex_lock(&dev_priv->drm.struct_mutex);
5674         intel_uc_fini_hw(dev_priv);
5675         intel_uc_fini(dev_priv);
5676         i915_gem_cleanup_engines(dev_priv);
5677         i915_gem_contexts_fini(dev_priv);
5678         mutex_unlock(&dev_priv->drm.struct_mutex);
5679
5680         intel_cleanup_gt_powersave(dev_priv);
5681
5682         intel_uc_fini_misc(dev_priv);
5683         i915_gem_cleanup_userptr(dev_priv);
5684
5685         i915_gem_drain_freed_objects(dev_priv);
5686
5687         WARN_ON(!list_empty(&dev_priv->contexts.list));
5688 }
5689
5690 void i915_gem_init_mmio(struct drm_i915_private *i915)
5691 {
5692         i915_gem_sanitize(i915);
5693 }
5694
5695 void
5696 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5697 {
5698         struct intel_engine_cs *engine;
5699         enum intel_engine_id id;
5700
5701         for_each_engine(engine, dev_priv, id)
5702                 dev_priv->gt.cleanup_engine(engine);
5703 }
5704
5705 void
5706 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5707 {
5708         int i;
5709
5710         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5711             !IS_CHERRYVIEW(dev_priv))
5712                 dev_priv->num_fence_regs = 32;
5713         else if (INTEL_GEN(dev_priv) >= 4 ||
5714                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5715                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5716                 dev_priv->num_fence_regs = 16;
5717         else
5718                 dev_priv->num_fence_regs = 8;
5719
5720         if (intel_vgpu_active(dev_priv))
5721                 dev_priv->num_fence_regs =
5722                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5723
5724         /* Initialize fence registers to zero */
5725         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5726                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5727
5728                 fence->i915 = dev_priv;
5729                 fence->id = i;
5730                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5731         }
5732         i915_gem_restore_fences(dev_priv);
5733
5734         i915_gem_detect_bit_6_swizzle(dev_priv);
5735 }
5736
5737 static void i915_gem_init__mm(struct drm_i915_private *i915)
5738 {
5739         spin_lock_init(&i915->mm.object_stat_lock);
5740         spin_lock_init(&i915->mm.obj_lock);
5741         spin_lock_init(&i915->mm.free_lock);
5742
5743         init_llist_head(&i915->mm.free_list);
5744
5745         INIT_LIST_HEAD(&i915->mm.unbound_list);
5746         INIT_LIST_HEAD(&i915->mm.bound_list);
5747         INIT_LIST_HEAD(&i915->mm.fence_list);
5748         INIT_LIST_HEAD(&i915->mm.userfault_list);
5749
5750         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5751 }
5752
5753 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5754 {
5755         int err = -ENOMEM;
5756
5757         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5758         if (!dev_priv->objects)
5759                 goto err_out;
5760
5761         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5762         if (!dev_priv->vmas)
5763                 goto err_objects;
5764
5765         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5766         if (!dev_priv->luts)
5767                 goto err_vmas;
5768
5769         dev_priv->requests = KMEM_CACHE(i915_request,
5770                                         SLAB_HWCACHE_ALIGN |
5771                                         SLAB_RECLAIM_ACCOUNT |
5772                                         SLAB_TYPESAFE_BY_RCU);
5773         if (!dev_priv->requests)
5774                 goto err_luts;
5775
5776         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5777                                             SLAB_HWCACHE_ALIGN |
5778                                             SLAB_RECLAIM_ACCOUNT);
5779         if (!dev_priv->dependencies)
5780                 goto err_requests;
5781
5782         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5783         if (!dev_priv->priorities)
5784                 goto err_dependencies;
5785
5786         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5787         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5788         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5789
5790         i915_gem_init__mm(dev_priv);
5791
5792         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5793                           i915_gem_retire_work_handler);
5794         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5795                           i915_gem_idle_work_handler);
5796         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5797         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5798
5799         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5800
5801         spin_lock_init(&dev_priv->fb_tracking.lock);
5802
5803         err = i915_gemfs_init(dev_priv);
5804         if (err)
5805                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5806
5807         return 0;
5808
5809 err_dependencies:
5810         kmem_cache_destroy(dev_priv->dependencies);
5811 err_requests:
5812         kmem_cache_destroy(dev_priv->requests);
5813 err_luts:
5814         kmem_cache_destroy(dev_priv->luts);
5815 err_vmas:
5816         kmem_cache_destroy(dev_priv->vmas);
5817 err_objects:
5818         kmem_cache_destroy(dev_priv->objects);
5819 err_out:
5820         return err;
5821 }
5822
5823 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5824 {
5825         i915_gem_drain_freed_objects(dev_priv);
5826         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5827         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5828         WARN_ON(dev_priv->mm.object_count);
5829         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5830
5831         kmem_cache_destroy(dev_priv->priorities);
5832         kmem_cache_destroy(dev_priv->dependencies);
5833         kmem_cache_destroy(dev_priv->requests);
5834         kmem_cache_destroy(dev_priv->luts);
5835         kmem_cache_destroy(dev_priv->vmas);
5836         kmem_cache_destroy(dev_priv->objects);
5837
5838         /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
5839         rcu_barrier();
5840
5841         i915_gemfs_fini(dev_priv);
5842 }
5843
5844 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5845 {
5846         /* Discard all purgeable objects, let userspace recover those as
5847          * required after resuming.
5848          */
5849         i915_gem_shrink_all(dev_priv);
5850
5851         return 0;
5852 }
5853
5854 int i915_gem_freeze_late(struct drm_i915_private *i915)
5855 {
5856         struct drm_i915_gem_object *obj;
5857         struct list_head *phases[] = {
5858                 &i915->mm.unbound_list,
5859                 &i915->mm.bound_list,
5860                 NULL
5861         }, **phase;
5862
5863         /*
5864          * Called just before we write the hibernation image.
5865          *
5866          * We need to update the domain tracking to reflect that the CPU
5867          * will be accessing all the pages to create and restore from the
5868          * hibernation, and so upon restoration those pages will be in the
5869          * CPU domain.
5870          *
5871          * To make sure the hibernation image contains the latest state,
5872          * we update that state just before writing out the image.
5873          *
5874          * To try and reduce the hibernation image, we manually shrink
5875          * the objects as well, see i915_gem_freeze()
5876          */
5877
5878         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5879         i915_gem_drain_freed_objects(i915);
5880
5881         mutex_lock(&i915->drm.struct_mutex);
5882         for (phase = phases; *phase; phase++) {
5883                 list_for_each_entry(obj, *phase, mm.link)
5884                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5885         }
5886         mutex_unlock(&i915->drm.struct_mutex);
5887
5888         return 0;
5889 }
5890
5891 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5892 {
5893         struct drm_i915_file_private *file_priv = file->driver_priv;
5894         struct i915_request *request;
5895
5896         /* Clean up our request list when the client is going away, so that
5897          * later retire_requests won't dereference our soon-to-be-gone
5898          * file_priv.
5899          */
5900         spin_lock(&file_priv->mm.lock);
5901         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5902                 request->file_priv = NULL;
5903         spin_unlock(&file_priv->mm.lock);
5904 }
5905
5906 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5907 {
5908         struct drm_i915_file_private *file_priv;
5909         int ret;
5910
5911         DRM_DEBUG("\n");
5912
5913         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5914         if (!file_priv)
5915                 return -ENOMEM;
5916
5917         file->driver_priv = file_priv;
5918         file_priv->dev_priv = i915;
5919         file_priv->file = file;
5920
5921         spin_lock_init(&file_priv->mm.lock);
5922         INIT_LIST_HEAD(&file_priv->mm.request_list);
5923
5924         file_priv->bsd_engine = -1;
5925         file_priv->hang_timestamp = jiffies;
5926
5927         ret = i915_gem_context_open(i915, file);
5928         if (ret)
5929                 kfree(file_priv);
5930
5931         return ret;
5932 }
5933
5934 /**
5935  * i915_gem_track_fb - update frontbuffer tracking
5936  * @old: current GEM buffer for the frontbuffer slots
5937  * @new: new GEM buffer for the frontbuffer slots
5938  * @frontbuffer_bits: bitmask of frontbuffer slots
5939  *
5940  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5941  * from @old and setting them in @new. Both @old and @new can be NULL.
5942  */
5943 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5944                        struct drm_i915_gem_object *new,
5945                        unsigned frontbuffer_bits)
5946 {
5947         /* Control of individual bits within the mask are guarded by
5948          * the owning plane->mutex, i.e. we can never see concurrent
5949          * manipulation of individual bits. But since the bitfield as a whole
5950          * is updated using RMW, we need to use atomics in order to update
5951          * the bits.
5952          */
5953         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5954                      sizeof(atomic_t) * BITS_PER_BYTE);
5955
5956         if (old) {
5957                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5958                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5959         }
5960
5961         if (new) {
5962                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5963                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5964         }
5965 }
5966
5967 /* Allocate a new GEM object and fill it with the supplied data */
5968 struct drm_i915_gem_object *
5969 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5970                                  const void *data, size_t size)
5971 {
5972         struct drm_i915_gem_object *obj;
5973         struct file *file;
5974         size_t offset;
5975         int err;
5976
5977         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5978         if (IS_ERR(obj))
5979                 return obj;
5980
5981         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5982
5983         file = obj->base.filp;
5984         offset = 0;
5985         do {
5986                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5987                 struct page *page;
5988                 void *pgdata, *vaddr;
5989
5990                 err = pagecache_write_begin(file, file->f_mapping,
5991                                             offset, len, 0,
5992                                             &page, &pgdata);
5993                 if (err < 0)
5994                         goto fail;
5995
5996                 vaddr = kmap(page);
5997                 memcpy(vaddr, data, len);
5998                 kunmap(page);
5999
6000                 err = pagecache_write_end(file, file->f_mapping,
6001                                           offset, len, len,
6002                                           page, pgdata);
6003                 if (err < 0)
6004                         goto fail;
6005
6006                 size -= len;
6007                 data += len;
6008                 offset += len;
6009         } while (size);
6010
6011         return obj;
6012
6013 fail:
6014         i915_gem_object_put(obj);
6015         return ERR_PTR(err);
6016 }
6017
6018 struct scatterlist *
6019 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
6020                        unsigned int n,
6021                        unsigned int *offset)
6022 {
6023         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
6024         struct scatterlist *sg;
6025         unsigned int idx, count;
6026
6027         might_sleep();
6028         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
6029         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
6030
6031         /* As we iterate forward through the sg, we record each entry in a
6032          * radixtree for quick repeated (backwards) lookups. If we have seen
6033          * this index previously, we will have an entry for it.
6034          *
6035          * Initial lookup is O(N), but this is amortized to O(1) for
6036          * sequential page access (where each new request is consecutive
6037          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
6038          * i.e. O(1) with a large constant!
6039          */
6040         if (n < READ_ONCE(iter->sg_idx))
6041                 goto lookup;
6042
6043         mutex_lock(&iter->lock);
6044
6045         /* We prefer to reuse the last sg so that repeated lookup of this
6046          * (or the subsequent) sg are fast - comparing against the last
6047          * sg is faster than going through the radixtree.
6048          */
6049
6050         sg = iter->sg_pos;
6051         idx = iter->sg_idx;
6052         count = __sg_page_count(sg);
6053
6054         while (idx + count <= n) {
6055                 unsigned long exception, i;
6056                 int ret;
6057
6058                 /* If we cannot allocate and insert this entry, or the
6059                  * individual pages from this range, cancel updating the
6060                  * sg_idx so that on this lookup we are forced to linearly
6061                  * scan onwards, but on future lookups we will try the
6062                  * insertion again (in which case we need to be careful of
6063                  * the error return reporting that we have already inserted
6064                  * this index).
6065                  */
6066                 ret = radix_tree_insert(&iter->radix, idx, sg);
6067                 if (ret && ret != -EEXIST)
6068                         goto scan;
6069
6070                 exception =
6071                         RADIX_TREE_EXCEPTIONAL_ENTRY |
6072                         idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
6073                 for (i = 1; i < count; i++) {
6074                         ret = radix_tree_insert(&iter->radix, idx + i,
6075                                                 (void *)exception);
6076                         if (ret && ret != -EEXIST)
6077                                 goto scan;
6078                 }
6079
6080                 idx += count;
6081                 sg = ____sg_next(sg);
6082                 count = __sg_page_count(sg);
6083         }
6084
6085 scan:
6086         iter->sg_pos = sg;
6087         iter->sg_idx = idx;
6088
6089         mutex_unlock(&iter->lock);
6090
6091         if (unlikely(n < idx)) /* insertion completed by another thread */
6092                 goto lookup;
6093
6094         /* In case we failed to insert the entry into the radixtree, we need
6095          * to look beyond the current sg.
6096          */
6097         while (idx + count <= n) {
6098                 idx += count;
6099                 sg = ____sg_next(sg);
6100                 count = __sg_page_count(sg);
6101         }
6102
6103         *offset = n - idx;
6104         return sg;
6105
6106 lookup:
6107         rcu_read_lock();
6108
6109         sg = radix_tree_lookup(&iter->radix, n);
6110         GEM_BUG_ON(!sg);
6111
6112         /* If this index is in the middle of multi-page sg entry,
6113          * the radixtree will contain an exceptional entry that points
6114          * to the start of that range. We will return the pointer to
6115          * the base page and the offset of this page within the
6116          * sg entry's range.
6117          */
6118         *offset = 0;
6119         if (unlikely(radix_tree_exception(sg))) {
6120                 unsigned long base =
6121                         (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
6122
6123                 sg = radix_tree_lookup(&iter->radix, base);
6124                 GEM_BUG_ON(!sg);
6125
6126                 *offset = n - base;
6127         }
6128
6129         rcu_read_unlock();
6130
6131         return sg;
6132 }
6133
6134 struct page *
6135 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
6136 {
6137         struct scatterlist *sg;
6138         unsigned int offset;
6139
6140         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
6141
6142         sg = i915_gem_object_get_sg(obj, n, &offset);
6143         return nth_page(sg_page(sg), offset);
6144 }
6145
6146 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
6147 struct page *
6148 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
6149                                unsigned int n)
6150 {
6151         struct page *page;
6152
6153         page = i915_gem_object_get_page(obj, n);
6154         if (!obj->mm.dirty)
6155                 set_page_dirty(page);
6156
6157         return page;
6158 }
6159
6160 dma_addr_t
6161 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
6162                                 unsigned long n)
6163 {
6164         struct scatterlist *sg;
6165         unsigned int offset;
6166
6167         sg = i915_gem_object_get_sg(obj, n, &offset);
6168         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
6169 }
6170
6171 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
6172 {
6173         struct sg_table *pages;
6174         int err;
6175
6176         if (align > obj->base.size)
6177                 return -EINVAL;
6178
6179         if (obj->ops == &i915_gem_phys_ops)
6180                 return 0;
6181
6182         if (obj->ops != &i915_gem_object_ops)
6183                 return -EINVAL;
6184
6185         err = i915_gem_object_unbind(obj);
6186         if (err)
6187                 return err;
6188
6189         mutex_lock(&obj->mm.lock);
6190
6191         if (obj->mm.madv != I915_MADV_WILLNEED) {
6192                 err = -EFAULT;
6193                 goto err_unlock;
6194         }
6195
6196         if (obj->mm.quirked) {
6197                 err = -EFAULT;
6198                 goto err_unlock;
6199         }
6200
6201         if (obj->mm.mapping) {
6202                 err = -EBUSY;
6203                 goto err_unlock;
6204         }
6205
6206         pages = __i915_gem_object_unset_pages(obj);
6207
6208         obj->ops = &i915_gem_phys_ops;
6209
6210         err = ____i915_gem_object_get_pages(obj);
6211         if (err)
6212                 goto err_xfer;
6213
6214         /* Perma-pin (until release) the physical set of pages */
6215         __i915_gem_object_pin_pages(obj);
6216
6217         if (!IS_ERR_OR_NULL(pages))
6218                 i915_gem_object_ops.put_pages(obj, pages);
6219         mutex_unlock(&obj->mm.lock);
6220         return 0;
6221
6222 err_xfer:
6223         obj->ops = &i915_gem_object_ops;
6224         if (!IS_ERR_OR_NULL(pages)) {
6225                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
6226
6227                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
6228         }
6229 err_unlock:
6230         mutex_unlock(&obj->mm.lock);
6231         return err;
6232 }
6233
6234 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6235 #include "selftests/scatterlist.c"
6236 #include "selftests/mock_gem_device.c"
6237 #include "selftests/huge_gem_object.c"
6238 #include "selftests/huge_pages.c"
6239 #include "selftests/i915_gem_object.c"
6240 #include "selftests/i915_gem_coherency.c"
6241 #include "selftests/i915_gem.c"
6242 #endif