drivers/gpu/drm/i915/i915_gem.c

   1 /*
   2  * Copyright © 2008-2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include <drm/drm_vma_manager.h>
  29 #include <drm/i915_drm.h>
  30 #include "i915_drv.h"
  31 #include "i915_gem_clflush.h"
  32 #include "i915_vgpu.h"
  33 #include "i915_trace.h"
  34 #include "intel_drv.h"
  35 #include "intel_frontbuffer.h"
  36 #include "intel_mocs.h"
  37 #include "intel_workarounds.h"
  38 #include "i915_gemfs.h"
  39 #include <linux/dma-fence-array.h>
  40 #include <linux/kthread.h>
  41 #include <linux/reservation.h>
  42 #include <linux/shmem_fs.h>
  43 #include <linux/slab.h>
  44 #include <linux/stop_machine.h>
  45 #include <linux/swap.h>
  46 #include <linux/pci.h>
  47 #include <linux/dma-buf.h>
  48
  49 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
  50
  51 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
  52 {
  53         if (obj->cache_dirty)
  54                 return false;
  55
  56         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
  57                 return true;
  58
  59         return obj->pin_global; /* currently in use by HW, keep flushed */
  60 }
  61
  62 static int
  63 insert_mappable_node(struct i915_ggtt *ggtt,
  64                      struct drm_mm_node *node, u32 size)
  65 {
  66         memset(node, 0, sizeof(*node));
  67         return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
  68                                            size, 0, I915_COLOR_UNEVICTABLE,
  69                                            0, ggtt->mappable_end,
  70                                            DRM_MM_INSERT_LOW);
  71 }
  72
  73 static void
  74 remove_mappable_node(struct drm_mm_node *node)
  75 {
  76         drm_mm_remove_node(node);
  77 }
  78
  79 /* some bookkeeping */
  80 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
  81                                   u64 size)
  82 {
  83         spin_lock(&dev_priv->mm.object_stat_lock);
  84         dev_priv->mm.object_count++;
  85         dev_priv->mm.object_memory += size;
  86         spin_unlock(&dev_priv->mm.object_stat_lock);
  87 }
  88
  89 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
  90                                      u64 size)
  91 {
  92         spin_lock(&dev_priv->mm.object_stat_lock);
  93         dev_priv->mm.object_count--;
  94         dev_priv->mm.object_memory -= size;
  95         spin_unlock(&dev_priv->mm.object_stat_lock);
  96 }
  97
  98 static int
  99 i915_gem_wait_for_error(struct i915_gpu_error *error)
 100 {
 101         int ret;
 102
 103         might_sleep();
 104
 105         /*
 106          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 107          * userspace. If it takes that long something really bad is going on and
 108          * we should simply try to bail out and fail as gracefully as possible.
 109          */
 110         ret = wait_event_interruptible_timeout(error->reset_queue,
 111                                                !i915_reset_backoff(error),
 112                                                I915_RESET_TIMEOUT);
 113         if (ret == 0) {
 114                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 115                 return -EIO;
 116         } else if (ret < 0) {
 117                 return ret;
 118         } else {
 119                 return 0;
 120         }
 121 }
 122
 123 int i915_mutex_lock_interruptible(struct drm_device *dev)
 124 {
 125         struct drm_i915_private *dev_priv = to_i915(dev);
 126         int ret;
 127
 128         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
 129         if (ret)
 130                 return ret;
 131
 132         ret = mutex_lock_interruptible(&dev->struct_mutex);
 133         if (ret)
 134                 return ret;
 135
 136         return 0;
 137 }
 138
 139 static u32 __i915_gem_park(struct drm_i915_private *i915)
 140 {
 141         GEM_TRACE("\n");
 142
 143         lockdep_assert_held(&i915->drm.struct_mutex);
 144         GEM_BUG_ON(i915->gt.active_requests);
 145         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
 146
 147         if (!i915->gt.awake)
 148                 return I915_EPOCH_INVALID;
 149
 150         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
 151
 152         /*
 153          * Be paranoid and flush a concurrent interrupt to make sure
 154          * we don't reactivate any irq tasklets after parking.
 155          *
 156          * FIXME: Note that even though we have waited for execlists to be idle,
 157          * there may still be an in-flight interrupt even though the CSB
 158          * is now empty. synchronize_irq() makes sure that a residual interrupt
 159          * is completed before we continue, but it doesn't prevent the HW from
 160          * raising a spurious interrupt later. To complete the shield we should
 161          * coordinate disabling the CS irq with flushing the interrupts.
 162          */
 163         synchronize_irq(i915->drm.irq);
 164
 165         intel_engines_park(i915);
 166         i915_timelines_park(i915);
 167
 168         i915_pmu_gt_parked(i915);
 169         i915_vma_parked(i915);
 170
 171         i915->gt.awake = false;
 172
 173         if (INTEL_GEN(i915) >= 6)
 174                 gen6_rps_idle(i915);
 175
 176         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);
 177
 178         intel_runtime_pm_put(i915);
 179
 180         return i915->gt.epoch;
 181 }
 182
 183 void i915_gem_park(struct drm_i915_private *i915)
 184 {
 185         GEM_TRACE("\n");
 186
 187         lockdep_assert_held(&i915->drm.struct_mutex);
 188         GEM_BUG_ON(i915->gt.active_requests);
 189
 190         if (!i915->gt.awake)
 191                 return;
 192
 193         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
 194         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
 195 }
 196
 197 void i915_gem_unpark(struct drm_i915_private *i915)
 198 {
 199         GEM_TRACE("\n");
 200
 201         lockdep_assert_held(&i915->drm.struct_mutex);
 202         GEM_BUG_ON(!i915->gt.active_requests);
 203
 204         if (i915->gt.awake)
 205                 return;
 206
 207         intel_runtime_pm_get_noresume(i915);
 208
 209         /*
 210          * It seems that the DMC likes to transition between the DC states a lot
 211          * when there are no connected displays (no active power domains) during
 212          * command submission.
 213          *
 214          * This activity has negative impact on the performance of the chip with
 215          * huge latencies observed in the interrupt handler and elsewhere.
 216          *
 217          * Work around it by grabbing a GT IRQ power domain whilst there is any
 218          * GT activity, preventing any DC state transitions.
 219          */
 220         intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
 221
 222         i915->gt.awake = true;
 223         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
 224                 i915->gt.epoch = 1;
 225
 226         intel_enable_gt_powersave(i915);
 227         i915_update_gfx_val(i915);
 228         if (INTEL_GEN(i915) >= 6)
 229                 gen6_rps_busy(i915);
 230         i915_pmu_gt_unparked(i915);
 231
 232         intel_engines_unpark(i915);
 233
 234         i915_queue_hangcheck(i915);
 235
 236         queue_delayed_work(i915->wq,
 237                            &i915->gt.retire_work,
 238                            round_jiffies_up_relative(HZ));
 239 }
 240
 241 int
 242 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 243                             struct drm_file *file)
 244 {
 245         struct drm_i915_private *dev_priv = to_i915(dev);
 246         struct i915_ggtt *ggtt = &dev_priv->ggtt;
 247         struct drm_i915_gem_get_aperture *args = data;
 248         struct i915_vma *vma;
 249         u64 pinned;
 250
 251         pinned = ggtt->vm.reserved;
 252         mutex_lock(&dev->struct_mutex);
 253         list_for_each_entry(vma, &ggtt->vm.active_list, vm_link)
 254                 if (i915_vma_is_pinned(vma))
 255                         pinned += vma->node.size;
 256         list_for_each_entry(vma, &ggtt->vm.inactive_list, vm_link)
 257                 if (i915_vma_is_pinned(vma))
 258                         pinned += vma->node.size;
 259         mutex_unlock(&dev->struct_mutex);
 260
 261         args->aper_size = ggtt->vm.total;
 262         args->aper_available_size = args->aper_size - pinned;
 263
 264         return 0;
 265 }
 266
 267 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 268 {
 269         struct address_space *mapping = obj->base.filp->f_mapping;
 270         drm_dma_handle_t *phys;
 271         struct sg_table *st;
 272         struct scatterlist *sg;
 273         char *vaddr;
 274         int i;
 275         int err;
 276
 277         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
 278                 return -EINVAL;
 279
 280         /* Always aligning to the object size, allows a single allocation
 281          * to handle all possible callers, and given typical object sizes,
 282          * the alignment of the buddy allocation will naturally match.
 283          */
 284         phys = drm_pci_alloc(obj->base.dev,
 285                              roundup_pow_of_two(obj->base.size),
 286                              roundup_pow_of_two(obj->base.size));
 287         if (!phys)
 288                 return -ENOMEM;
 289
 290         vaddr = phys->vaddr;
 291         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 292                 struct page *page;
 293                 char *src;
 294
 295                 page = shmem_read_mapping_page(mapping, i);
 296                 if (IS_ERR(page)) {
 297                         err = PTR_ERR(page);
 298                         goto err_phys;
 299                 }
 300
 301                 src = kmap_atomic(page);
 302                 memcpy(vaddr, src, PAGE_SIZE);
 303                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
 304                 kunmap_atomic(src);
 305
 306                 put_page(page);
 307                 vaddr += PAGE_SIZE;
 308         }
 309
 310         i915_gem_chipset_flush(to_i915(obj->base.dev));
 311
 312         st = kmalloc(sizeof(*st), GFP_KERNEL);
 313         if (!st) {
 314                 err = -ENOMEM;
 315                 goto err_phys;
 316         }
 317
 318         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
 319                 kfree(st);
 320                 err = -ENOMEM;
 321                 goto err_phys;
 322         }
 323
 324         sg = st->sgl;
 325         sg->offset = 0;
 326         sg->length = obj->base.size;
 327
 328         sg_dma_address(sg) = phys->busaddr;
 329         sg_dma_len(sg) = obj->base.size;
 330
 331         obj->phys_handle = phys;
 332
 333         __i915_gem_object_set_pages(obj, st, sg->length);
 334
 335         return 0;
 336
 337 err_phys:
 338         drm_pci_free(obj->base.dev, phys);
 339
 340         return err;
 341 }
 342
 343 static void __start_cpu_write(struct drm_i915_gem_object *obj)
 344 {
 345         obj->read_domains = I915_GEM_DOMAIN_CPU;
 346         obj->write_domain = I915_GEM_DOMAIN_CPU;
 347         if (cpu_write_needs_clflush(obj))
 348                 obj->cache_dirty = true;
 349 }
 350
 351 static void
 352 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 353                                 struct sg_table *pages,
 354                                 bool needs_clflush)
 355 {
 356         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
 357
 358         if (obj->mm.madv == I915_MADV_DONTNEED)
 359                 obj->mm.dirty = false;
 360
 361         if (needs_clflush &&
 362             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
 363             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
 364                 drm_clflush_sg(pages);
 365
 366         __start_cpu_write(obj);
 367 }
 368
 369 static void
 370 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
 371                                struct sg_table *pages)
 372 {
 373         __i915_gem_object_release_shmem(obj, pages, false);
 374
 375         if (obj->mm.dirty) {
 376                 struct address_space *mapping = obj->base.filp->f_mapping;
 377                 char *vaddr = obj->phys_handle->vaddr;
 378                 int i;
 379
 380                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 381                         struct page *page;
 382                         char *dst;
 383
 384                         page = shmem_read_mapping_page(mapping, i);
 385                         if (IS_ERR(page))
 386                                 continue;
 387
 388                         dst = kmap_atomic(page);
 389                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
 390                         memcpy(dst, vaddr, PAGE_SIZE);
 391                         kunmap_atomic(dst);
 392
 393                         set_page_dirty(page);
 394                         if (obj->mm.madv == I915_MADV_WILLNEED)
 395                                 mark_page_accessed(page);
 396                         put_page(page);
 397                         vaddr += PAGE_SIZE;
 398                 }
 399                 obj->mm.dirty = false;
 400         }
 401
 402         sg_free_table(pages);
 403         kfree(pages);
 404
 405         drm_pci_free(obj->base.dev, obj->phys_handle);
 406 }
 407
 408 static void
 409 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
 410 {
 411         i915_gem_object_unpin_pages(obj);
 412 }
 413
 414 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
 415         .get_pages = i915_gem_object_get_pages_phys,
 416         .put_pages = i915_gem_object_put_pages_phys,
 417         .release = i915_gem_object_release_phys,
 418 };
 419
 420 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
 421
 422 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 423 {
 424         struct i915_vma *vma;
 425         LIST_HEAD(still_in_list);
 426         int ret;
 427
 428         lockdep_assert_held(&obj->base.dev->struct_mutex);
 429
 430         /* Closed vma are removed from the obj->vma_list - but they may
 431          * still have an active binding on the object. To remove those we
 432          * must wait for all rendering to complete to the object (as unbinding
 433          * must anyway), and retire the requests.
 434          */
 435         ret = i915_gem_object_set_to_cpu_domain(obj, false);
 436         if (ret)
 437                 return ret;
 438
 439         while ((vma = list_first_entry_or_null(&obj->vma_list,
 440                                                struct i915_vma,
 441                                                obj_link))) {
 442                 list_move_tail(&vma->obj_link, &still_in_list);
 443                 ret = i915_vma_unbind(vma);
 444                 if (ret)
 445                         break;
 446         }
 447         list_splice(&still_in_list, &obj->vma_list);
 448
 449         return ret;
 450 }
 451
 452 static long
 453 i915_gem_object_wait_fence(struct dma_fence *fence,
 454                            unsigned int flags,
 455                            long timeout,
 456                            struct intel_rps_client *rps_client)
 457 {
 458         struct i915_request *rq;
 459
 460         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
 461
 462         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 463                 return timeout;
 464
 465         if (!dma_fence_is_i915(fence))
 466                 return dma_fence_wait_timeout(fence,
 467                                               flags & I915_WAIT_INTERRUPTIBLE,
 468                                               timeout);
 469
 470         rq = to_request(fence);
 471         if (i915_request_completed(rq))
 472                 goto out;
 473
 474         /*
 475          * This client is about to stall waiting for the GPU. In many cases
 476          * this is undesirable and limits the throughput of the system, as
 477          * many clients cannot continue processing user input/output whilst
 478          * blocked. RPS autotuning may take tens of milliseconds to respond
 479          * to the GPU load and thus incurs additional latency for the client.
 480          * We can circumvent that by promoting the GPU frequency to maximum
 481          * before we wait. This makes the GPU throttle up much more quickly
 482          * (good for benchmarks and user experience, e.g. window animations),
 483          * but at a cost of spending more power processing the workload
 484          * (bad for battery). Not all clients even want their results
 485          * immediately and for them we should just let the GPU select its own
 486          * frequency to maximise efficiency. To prevent a single client from
 487          * forcing the clocks too high for the whole system, we only allow
 488          * each client to waitboost once in a busy period.
 489          */
 490         if (rps_client && !i915_request_started(rq)) {
 491                 if (INTEL_GEN(rq->i915) >= 6)
 492                         gen6_rps_boost(rq, rps_client);
 493         }
 494
 495         timeout = i915_request_wait(rq, flags, timeout);
 496
 497 out:
 498         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
 499                 i915_request_retire_upto(rq);
 500
 501         return timeout;
 502 }
 503
 504 static long
 505 i915_gem_object_wait_reservation(struct reservation_object *resv,
 506                                  unsigned int flags,
 507                                  long timeout,
 508                                  struct intel_rps_client *rps_client)
 509 {
 510         unsigned int seq = __read_seqcount_begin(&resv->seq);
 511         struct dma_fence *excl;
 512         bool prune_fences = false;
 513
 514         if (flags & I915_WAIT_ALL) {
 515                 struct dma_fence **shared;
 516                 unsigned int count, i;
 517                 int ret;
 518
 519                 ret = reservation_object_get_fences_rcu(resv,
 520                                                         &excl, &count, &shared);
 521                 if (ret)
 522                         return ret;
 523
 524                 for (i = 0; i < count; i++) {
 525                         timeout = i915_gem_object_wait_fence(shared[i],
 526                                                              flags, timeout,
 527                                                              rps_client);
 528                         if (timeout < 0)
 529                                 break;
 530
 531                         dma_fence_put(shared[i]);
 532                 }
 533
 534                 for (; i < count; i++)
 535                         dma_fence_put(shared[i]);
 536                 kfree(shared);
 537
 538                 /*
 539                  * If both shared fences and an exclusive fence exist,
 540                  * then by construction the shared fences must be later
 541                  * than the exclusive fence. If we successfully wait for
 542                  * all the shared fences, we know that the exclusive fence
 543                  * must all be signaled. If all the shared fences are
 544                  * signaled, we can prune the array and recover the
 545                  * floating references on the fences/requests.
 546                  */
 547                 prune_fences = count && timeout >= 0;
 548         } else {
 549                 excl = reservation_object_get_excl_rcu(resv);
 550         }
 551
 552         if (excl && timeout >= 0)
 553                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
 554                                                      rps_client);
 555
 556         dma_fence_put(excl);
 557
 558         /*
 559          * Opportunistically prune the fences iff we know they have *all* been
 560          * signaled and that the reservation object has not been changed (i.e.
 561          * no new fences have been added).
 562          */
 563         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
 564                 if (reservation_object_trylock(resv)) {
 565                         if (!__read_seqcount_retry(&resv->seq, seq))
 566                                 reservation_object_add_excl_fence(resv, NULL);
 567                         reservation_object_unlock(resv);
 568                 }
 569         }
 570
 571         return timeout;
 572 }
 573
 574 static void __fence_set_priority(struct dma_fence *fence,
 575                                  const struct i915_sched_attr *attr)
 576 {
 577         struct i915_request *rq;
 578         struct intel_engine_cs *engine;
 579
 580         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
 581                 return;
 582
 583         rq = to_request(fence);
 584         engine = rq->engine;
 585
 586         local_bh_disable();
 587         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
 588         if (engine->schedule)
 589                 engine->schedule(rq, attr);
 590         rcu_read_unlock();
 591         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
 592 }
 593
 594 static void fence_set_priority(struct dma_fence *fence,
 595                                const struct i915_sched_attr *attr)
 596 {
 597         /* Recurse once into a fence-array */
 598         if (dma_fence_is_array(fence)) {
 599                 struct dma_fence_array *array = to_dma_fence_array(fence);
 600                 int i;
 601
 602                 for (i = 0; i < array->num_fences; i++)
 603                         __fence_set_priority(array->fences[i], attr);
 604         } else {
 605                 __fence_set_priority(fence, attr);
 606         }
 607 }
 608
 609 int
 610 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
 611                               unsigned int flags,
 612                               const struct i915_sched_attr *attr)
 613 {
 614         struct dma_fence *excl;
 615
 616         if (flags & I915_WAIT_ALL) {
 617                 struct dma_fence **shared;
 618                 unsigned int count, i;
 619                 int ret;
 620
 621                 ret = reservation_object_get_fences_rcu(obj->resv,
 622                                                         &excl, &count, &shared);
 623                 if (ret)
 624                         return ret;
 625
 626                 for (i = 0; i < count; i++) {
 627                         fence_set_priority(shared[i], attr);
 628                         dma_fence_put(shared[i]);
 629                 }
 630
 631                 kfree(shared);
 632         } else {
 633                 excl = reservation_object_get_excl_rcu(obj->resv);
 634         }
 635
 636         if (excl) {
 637                 fence_set_priority(excl, attr);
 638                 dma_fence_put(excl);
 639         }
 640         return 0;
 641 }
 642
 643 /**
 644  * Waits for rendering to the object to be completed
 645  * @obj: i915 gem object
 646  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
 647  * @timeout: how long to wait
 648  * @rps_client: client (user process) to charge for any waitboosting
 649  */
 650 int
 651 i915_gem_object_wait(struct drm_i915_gem_object *obj,
 652                      unsigned int flags,
 653                      long timeout,
 654                      struct intel_rps_client *rps_client)
 655 {
 656         might_sleep();
 657 #if IS_ENABLED(CONFIG_LOCKDEP)
 658         GEM_BUG_ON(debug_locks &&
 659                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
 660                    !!(flags & I915_WAIT_LOCKED));
 661 #endif
 662         GEM_BUG_ON(timeout < 0);
 663
 664         timeout = i915_gem_object_wait_reservation(obj->resv,
 665                                                    flags, timeout,
 666                                                    rps_client);
 667         return timeout < 0 ? timeout : 0;
 668 }
 669
 670 static struct intel_rps_client *to_rps_client(struct drm_file *file)
 671 {
 672         struct drm_i915_file_private *fpriv = file->driver_priv;
 673
 674         return &fpriv->rps_client;
 675 }
 676
 677 static int
 678 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
 679                      struct drm_i915_gem_pwrite *args,
 680                      struct drm_file *file)
 681 {
 682         void *vaddr = obj->phys_handle->vaddr + args->offset;
 683         char __user *user_data = u64_to_user_ptr(args->data_ptr);
 684
 685         /* We manually control the domain here and pretend that it
 686          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
 687          */
 688         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 689         if (copy_from_user(vaddr, user_data, args->size))
 690                 return -EFAULT;
 691
 692         drm_clflush_virt_range(vaddr, args->size);
 693         i915_gem_chipset_flush(to_i915(obj->base.dev));
 694
 695         intel_fb_obj_flush(obj, ORIGIN_CPU);
 696         return 0;
 697 }
 698
 699 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
 700 {
 701         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
 702 }
 703
 704 void i915_gem_object_free(struct drm_i915_gem_object *obj)
 705 {
 706         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 707         kmem_cache_free(dev_priv->objects, obj);
 708 }
 709
 710 static int
 711 i915_gem_create(struct drm_file *file,
 712                 struct drm_i915_private *dev_priv,
 713                 uint64_t size,
 714                 uint32_t *handle_p)
 715 {
 716         struct drm_i915_gem_object *obj;
 717         int ret;
 718         u32 handle;
 719
 720         size = roundup(size, PAGE_SIZE);
 721         if (size == 0)
 722                 return -EINVAL;
 723
 724         /* Allocate the new object */
 725         obj = i915_gem_object_create(dev_priv, size);
 726         if (IS_ERR(obj))
 727                 return PTR_ERR(obj);
 728
 729         ret = drm_gem_handle_create(file, &obj->base, &handle);
 730         /* drop reference from allocate - handle holds it now */
 731         i915_gem_object_put(obj);
 732         if (ret)
 733                 return ret;
 734
 735         *handle_p = handle;
 736         return 0;
 737 }
 738
 739 int
 740 i915_gem_dumb_create(struct drm_file *file,
 741                      struct drm_device *dev,
 742                      struct drm_mode_create_dumb *args)
 743 {
 744         /* have to work out size/pitch and return them */
 745         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
 746         args->size = args->pitch * args->height;
 747         return i915_gem_create(file, to_i915(dev),
 748                                args->size, &args->handle);
 749 }
 750
 751 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 752 {
 753         return !(obj->cache_level == I915_CACHE_NONE ||
 754                  obj->cache_level == I915_CACHE_WT);
 755 }
 756
 757 /**
 758  * Creates a new mm object and returns a handle to it.
 759  * @dev: drm device pointer
 760  * @data: ioctl data blob
 761  * @file: drm file pointer
 762  */
 763 int
 764 i915_gem_create_ioctl(struct drm_device *dev, void *data,
 765                       struct drm_file *file)
 766 {
 767         struct drm_i915_private *dev_priv = to_i915(dev);
 768         struct drm_i915_gem_create *args = data;
 769
 770         i915_gem_flush_free_objects(dev_priv);
 771
 772         return i915_gem_create(file, dev_priv,
 773                                args->size, &args->handle);
 774 }
 775
 776 static inline enum fb_op_origin
 777 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
 778 {
 779         return (domain == I915_GEM_DOMAIN_GTT ?
 780                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
 781 }
 782
 783 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
 784 {
 785         /*
 786          * No actual flushing is required for the GTT write domain for reads
 787          * from the GTT domain. Writes to it "immediately" go to main memory
 788          * as far as we know, so there's no chipset flush. It also doesn't
 789          * land in the GPU render cache.
 790          *
 791          * However, we do have to enforce the order so that all writes through
 792          * the GTT land before any writes to the device, such as updates to
 793          * the GATT itself.
 794          *
 795          * We also have to wait a bit for the writes to land from the GTT.
 796          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
 797          * timing. This issue has only been observed when switching quickly
 798          * between GTT writes and CPU reads from inside the kernel on recent hw,
 799          * and it appears to only affect discrete GTT blocks (i.e. on LLC
 800          * system agents we cannot reproduce this behaviour, until Cannonlake
 801          * that was!).
 802          */
 803
 804         wmb();
 805
 806         if (INTEL_INFO(dev_priv)->has_coherent_ggtt)
 807                 return;
 808
 809         i915_gem_chipset_flush(dev_priv);
 810
 811         intel_runtime_pm_get(dev_priv);
 812         spin_lock_irq(&dev_priv->uncore.lock);
 813
 814         POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
 815
 816         spin_unlock_irq(&dev_priv->uncore.lock);
 817         intel_runtime_pm_put(dev_priv);
 818 }
 819
 820 static void
 821 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 822 {
 823         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 824         struct i915_vma *vma;
 825
 826         if (!(obj->write_domain & flush_domains))
 827                 return;
 828
 829         switch (obj->write_domain) {
 830         case I915_GEM_DOMAIN_GTT:
 831                 i915_gem_flush_ggtt_writes(dev_priv);
 832
 833                 intel_fb_obj_flush(obj,
 834                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
 835
 836                 for_each_ggtt_vma(vma, obj) {
 837                         if (vma->iomap)
 838                                 continue;
 839
 840                         i915_vma_unset_ggtt_write(vma);
 841                 }
 842                 break;
 843
 844         case I915_GEM_DOMAIN_WC:
 845                 wmb();
 846                 break;
 847
 848         case I915_GEM_DOMAIN_CPU:
 849                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 850                 break;
 851
 852         case I915_GEM_DOMAIN_RENDER:
 853                 if (gpu_write_needs_clflush(obj))
 854                         obj->cache_dirty = true;
 855                 break;
 856         }
 857
 858         obj->write_domain = 0;
 859 }
 860
 861 /*
 862  * Pins the specified object's pages and synchronizes the object with
 863  * GPU accesses. Sets needs_clflush to non-zero if the caller should
 864  * flush the object from the CPU cache.
 865  */
 866 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 867                                     unsigned int *needs_clflush)
 868 {
 869         int ret;
 870
 871         lockdep_assert_held(&obj->base.dev->struct_mutex);
 872
 873         *needs_clflush = 0;
 874         if (!i915_gem_object_has_struct_page(obj))
 875                 return -ENODEV;
 876
 877         ret = i915_gem_object_wait(obj,
 878                                    I915_WAIT_INTERRUPTIBLE |
 879                                    I915_WAIT_LOCKED,
 880                                    MAX_SCHEDULE_TIMEOUT,
 881                                    NULL);
 882         if (ret)
 883                 return ret;
 884
 885         ret = i915_gem_object_pin_pages(obj);
 886         if (ret)
 887                 return ret;
 888
 889         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
 890             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 891                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
 892                 if (ret)
 893                         goto err_unpin;
 894                 else
 895                         goto out;
 896         }
 897
 898         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 899
 900         /* If we're not in the cpu read domain, set ourself into the gtt
 901          * read domain and manually flush cachelines (if required). This
 902          * optimizes for the case when the gpu will dirty the data
 903          * anyway again before the next pread happens.
 904          */
 905         if (!obj->cache_dirty &&
 906             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
 907                 *needs_clflush = CLFLUSH_BEFORE;
 908
 909 out:
 910         /* return with the pages pinned */
 911         return 0;
 912
 913 err_unpin:
 914         i915_gem_object_unpin_pages(obj);
 915         return ret;
 916 }
 917
 918 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 919                                      unsigned int *needs_clflush)
 920 {
 921         int ret;
 922
 923         lockdep_assert_held(&obj->base.dev->struct_mutex);
 924
 925         *needs_clflush = 0;
 926         if (!i915_gem_object_has_struct_page(obj))
 927                 return -ENODEV;
 928
 929         ret = i915_gem_object_wait(obj,
 930                                    I915_WAIT_INTERRUPTIBLE |
 931                                    I915_WAIT_LOCKED |
 932                                    I915_WAIT_ALL,
 933                                    MAX_SCHEDULE_TIMEOUT,
 934                                    NULL);
 935         if (ret)
 936                 return ret;
 937
 938         ret = i915_gem_object_pin_pages(obj);
 939         if (ret)
 940                 return ret;
 941
 942         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
 943             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 944                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
 945                 if (ret)
 946                         goto err_unpin;
 947                 else
 948                         goto out;
 949         }
 950
 951         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 952
 953         /* If we're not in the cpu write domain, set ourself into the
 954          * gtt write domain and manually flush cachelines (as required).
 955          * This optimizes for the case when the gpu will use the data
 956          * right away and we therefore have to clflush anyway.
 957          */
 958         if (!obj->cache_dirty) {
 959                 *needs_clflush |= CLFLUSH_AFTER;
 960
 961                 /*
 962                  * Same trick applies to invalidate partially written
 963                  * cachelines read before writing.
 964                  */
 965                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
 966                         *needs_clflush |= CLFLUSH_BEFORE;
 967         }
 968
 969 out:
 970         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 971         obj->mm.dirty = true;
 972         /* return with the pages pinned */
 973         return 0;
 974
 975 err_unpin:
 976         i915_gem_object_unpin_pages(obj);
 977         return ret;
 978 }
 979
 980 static int
 981 shmem_pread(struct page *page, int offset, int len, char __user *user_data,
 982             bool needs_clflush)
 983 {
 984         char *vaddr;
 985         int ret;
 986
 987         vaddr = kmap(page);
 988
 989         if (needs_clflush)
 990                 drm_clflush_virt_range(vaddr + offset, len);
 991
 992         ret = __copy_to_user(user_data, vaddr + offset, len);
 993
 994         kunmap(page);
 995
 996         return ret ? -EFAULT : 0;
 997 }
 998
 999 static int
1000 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1001                      struct drm_i915_gem_pread *args)
1002 {
1003         char __user *user_data;
1004         u64 remain;
1005         unsigned int needs_clflush;
1006         unsigned int idx, offset;
1007         int ret;
1008
1009         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1010         if (ret)
1011                 return ret;
1012
1013         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1014         mutex_unlock(&obj->base.dev->struct_mutex);
1015         if (ret)
1016                 return ret;
1017
1018         remain = args->size;
1019         user_data = u64_to_user_ptr(args->data_ptr);
1020         offset = offset_in_page(args->offset);
1021         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1022                 struct page *page = i915_gem_object_get_page(obj, idx);
1023                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1024
1025                 ret = shmem_pread(page, offset, length, user_data,
1026                                   needs_clflush);
1027                 if (ret)
1028                         break;
1029
1030                 remain -= length;
1031                 user_data += length;
1032                 offset = 0;
1033         }
1034
1035         i915_gem_obj_finish_shmem_access(obj);
1036         return ret;
1037 }
1038
1039 static inline bool
1040 gtt_user_read(struct io_mapping *mapping,
1041               loff_t base, int offset,
1042               char __user *user_data, int length)
1043 {
1044         void __iomem *vaddr;
1045         unsigned long unwritten;
1046
1047         /* We can use the cpu mem copy function because this is X86. */
1048         vaddr = io_mapping_map_atomic_wc(mapping, base);
1049         unwritten = __copy_to_user_inatomic(user_data,
1050                                             (void __force *)vaddr + offset,
1051                                             length);
1052         io_mapping_unmap_atomic(vaddr);
1053         if (unwritten) {
1054                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1055                 unwritten = copy_to_user(user_data,
1056                                          (void __force *)vaddr + offset,
1057                                          length);
1058                 io_mapping_unmap(vaddr);
1059         }
1060         return unwritten;
1061 }
1062
1063 static int
1064 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1065                    const struct drm_i915_gem_pread *args)
1066 {
1067         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1068         struct i915_ggtt *ggtt = &i915->ggtt;
1069         struct drm_mm_node node;
1070         struct i915_vma *vma;
1071         void __user *user_data;
1072         u64 remain, offset;
1073         int ret;
1074
1075         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1076         if (ret)
1077                 return ret;
1078
1079         intel_runtime_pm_get(i915);
1080         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1081                                        PIN_MAPPABLE |
1082                                        PIN_NONFAULT |
1083                                        PIN_NONBLOCK);
1084         if (!IS_ERR(vma)) {
1085                 node.start = i915_ggtt_offset(vma);
1086                 node.allocated = false;
1087                 ret = i915_vma_put_fence(vma);
1088                 if (ret) {
1089                         i915_vma_unpin(vma);
1090                         vma = ERR_PTR(ret);
1091                 }
1092         }
1093         if (IS_ERR(vma)) {
1094                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1095                 if (ret)
1096                         goto out_unlock;
1097                 GEM_BUG_ON(!node.allocated);
1098         }
1099
1100         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1101         if (ret)
1102                 goto out_unpin;
1103
1104         mutex_unlock(&i915->drm.struct_mutex);
1105
1106         user_data = u64_to_user_ptr(args->data_ptr);
1107         remain = args->size;
1108         offset = args->offset;
1109
1110         while (remain > 0) {
1111                 /* Operation in this page
1112                  *
1113                  * page_base = page offset within aperture
1114                  * page_offset = offset within page
1115                  * page_length = bytes to copy for this page
1116                  */
1117                 u32 page_base = node.start;
1118                 unsigned page_offset = offset_in_page(offset);
1119                 unsigned page_length = PAGE_SIZE - page_offset;
1120                 page_length = remain < page_length ? remain : page_length;
1121                 if (node.allocated) {
1122                         wmb();
1123                         ggtt->vm.insert_page(&ggtt->vm,
1124                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1125                                              node.start, I915_CACHE_NONE, 0);
1126                         wmb();
1127                 } else {
1128                         page_base += offset & PAGE_MASK;
1129                 }
1130
1131                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1132                                   user_data, page_length)) {
1133                         ret = -EFAULT;
1134                         break;
1135                 }
1136
1137                 remain -= page_length;
1138                 user_data += page_length;
1139                 offset += page_length;
1140         }
1141
1142         mutex_lock(&i915->drm.struct_mutex);
1143 out_unpin:
1144         if (node.allocated) {
1145                 wmb();
1146                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1147                 remove_mappable_node(&node);
1148         } else {
1149                 i915_vma_unpin(vma);
1150         }
1151 out_unlock:
1152         intel_runtime_pm_put(i915);
1153         mutex_unlock(&i915->drm.struct_mutex);
1154
1155         return ret;
1156 }
1157
1158 /**
1159  * Reads data from the object referenced by handle.
1160  * @dev: drm device pointer
1161  * @data: ioctl data blob
1162  * @file: drm file pointer
1163  *
1164  * On error, the contents of *data are undefined.
1165  */
1166 int
1167 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1168                      struct drm_file *file)
1169 {
1170         struct drm_i915_gem_pread *args = data;
1171         struct drm_i915_gem_object *obj;
1172         int ret;
1173
1174         if (args->size == 0)
1175                 return 0;
1176
1177         if (!access_ok(u64_to_user_ptr(args->data_ptr),
1178                        args->size))
1179                 return -EFAULT;
1180
1181         obj = i915_gem_object_lookup(file, args->handle);
1182         if (!obj)
1183                 return -ENOENT;
1184
1185         /* Bounds check source.  */
1186         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1187                 ret = -EINVAL;
1188                 goto out;
1189         }
1190
1191         trace_i915_gem_object_pread(obj, args->offset, args->size);
1192
1193         ret = i915_gem_object_wait(obj,
1194                                    I915_WAIT_INTERRUPTIBLE,
1195                                    MAX_SCHEDULE_TIMEOUT,
1196                                    to_rps_client(file));
1197         if (ret)
1198                 goto out;
1199
1200         ret = i915_gem_object_pin_pages(obj);
1201         if (ret)
1202                 goto out;
1203
1204         ret = i915_gem_shmem_pread(obj, args);
1205         if (ret == -EFAULT || ret == -ENODEV)
1206                 ret = i915_gem_gtt_pread(obj, args);
1207
1208         i915_gem_object_unpin_pages(obj);
1209 out:
1210         i915_gem_object_put(obj);
1211         return ret;
1212 }
1213
1214 /* This is the fast write path which cannot handle
1215  * page faults in the source data
1216  */
1217
1218 static inline bool
1219 ggtt_write(struct io_mapping *mapping,
1220            loff_t base, int offset,
1221            char __user *user_data, int length)
1222 {
1223         void __iomem *vaddr;
1224         unsigned long unwritten;
1225
1226         /* We can use the cpu mem copy function because this is X86. */
1227         vaddr = io_mapping_map_atomic_wc(mapping, base);
1228         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1229                                                       user_data, length);
1230         io_mapping_unmap_atomic(vaddr);
1231         if (unwritten) {
1232                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1233                 unwritten = copy_from_user((void __force *)vaddr + offset,
1234                                            user_data, length);
1235                 io_mapping_unmap(vaddr);
1236         }
1237
1238         return unwritten;
1239 }
1240
1241 /**
1242  * This is the fast pwrite path, where we copy the data directly from the
1243  * user into the GTT, uncached.
1244  * @obj: i915 GEM object
1245  * @args: pwrite arguments structure
1246  */
1247 static int
1248 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1249                          const struct drm_i915_gem_pwrite *args)
1250 {
1251         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1252         struct i915_ggtt *ggtt = &i915->ggtt;
1253         struct drm_mm_node node;
1254         struct i915_vma *vma;
1255         u64 remain, offset;
1256         void __user *user_data;
1257         int ret;
1258
1259         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1260         if (ret)
1261                 return ret;
1262
1263         if (i915_gem_object_has_struct_page(obj)) {
1264                 /*
1265                  * Avoid waking the device up if we can fallback, as
1266                  * waking/resuming is very slow (worst-case 10-100 ms
1267                  * depending on PCI sleeps and our own resume time).
1268                  * This easily dwarfs any performance advantage from
1269                  * using the cache bypass of indirect GGTT access.
1270                  */
1271                 if (!intel_runtime_pm_get_if_in_use(i915)) {
1272                         ret = -EFAULT;
1273                         goto out_unlock;
1274                 }
1275         } else {
1276                 /* No backing pages, no fallback, we must force GGTT access */
1277                 intel_runtime_pm_get(i915);
1278         }
1279
1280         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1281                                        PIN_MAPPABLE |
1282                                        PIN_NONFAULT |
1283                                        PIN_NONBLOCK);
1284         if (!IS_ERR(vma)) {
1285                 node.start = i915_ggtt_offset(vma);
1286                 node.allocated = false;
1287                 ret = i915_vma_put_fence(vma);
1288                 if (ret) {
1289                         i915_vma_unpin(vma);
1290                         vma = ERR_PTR(ret);
1291                 }
1292         }
1293         if (IS_ERR(vma)) {
1294                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1295                 if (ret)
1296                         goto out_rpm;
1297                 GEM_BUG_ON(!node.allocated);
1298         }
1299
1300         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1301         if (ret)
1302                 goto out_unpin;
1303
1304         mutex_unlock(&i915->drm.struct_mutex);
1305
1306         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1307
1308         user_data = u64_to_user_ptr(args->data_ptr);
1309         offset = args->offset;
1310         remain = args->size;
1311         while (remain) {
1312                 /* Operation in this page
1313                  *
1314                  * page_base = page offset within aperture
1315                  * page_offset = offset within page
1316                  * page_length = bytes to copy for this page
1317                  */
1318                 u32 page_base = node.start;
1319                 unsigned int page_offset = offset_in_page(offset);
1320                 unsigned int page_length = PAGE_SIZE - page_offset;
1321                 page_length = remain < page_length ? remain : page_length;
1322                 if (node.allocated) {
1323                         wmb(); /* flush the write before we modify the GGTT */
1324                         ggtt->vm.insert_page(&ggtt->vm,
1325                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1326                                              node.start, I915_CACHE_NONE, 0);
1327                         wmb(); /* flush modifications to the GGTT (insert_page) */
1328                 } else {
1329                         page_base += offset & PAGE_MASK;
1330                 }
1331                 /* If we get a fault while copying data, then (presumably) our
1332                  * source page isn't available.  Return the error and we'll
1333                  * retry in the slow path.
1334                  * If the object is non-shmem backed, we retry again with the
1335                  * path that handles page fault.
1336                  */
1337                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1338                                user_data, page_length)) {
1339                         ret = -EFAULT;
1340                         break;
1341                 }
1342
1343                 remain -= page_length;
1344                 user_data += page_length;
1345                 offset += page_length;
1346         }
1347         intel_fb_obj_flush(obj, ORIGIN_CPU);
1348
1349         mutex_lock(&i915->drm.struct_mutex);
1350 out_unpin:
1351         if (node.allocated) {
1352                 wmb();
1353                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1354                 remove_mappable_node(&node);
1355         } else {
1356                 i915_vma_unpin(vma);
1357         }
1358 out_rpm:
1359         intel_runtime_pm_put(i915);
1360 out_unlock:
1361         mutex_unlock(&i915->drm.struct_mutex);
1362         return ret;
1363 }
1364
1365 /* Per-page copy function for the shmem pwrite fastpath.
1366  * Flushes invalid cachelines before writing to the target if
1367  * needs_clflush_before is set and flushes out any written cachelines after
1368  * writing if needs_clflush is set.
1369  */
1370 static int
1371 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1372              bool needs_clflush_before,
1373              bool needs_clflush_after)
1374 {
1375         char *vaddr;
1376         int ret;
1377
1378         vaddr = kmap(page);
1379
1380         if (needs_clflush_before)
1381                 drm_clflush_virt_range(vaddr + offset, len);
1382
1383         ret = __copy_from_user(vaddr + offset, user_data, len);
1384         if (!ret && needs_clflush_after)
1385                 drm_clflush_virt_range(vaddr + offset, len);
1386
1387         kunmap(page);
1388
1389         return ret ? -EFAULT : 0;
1390 }
1391
1392 static int
1393 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1394                       const struct drm_i915_gem_pwrite *args)
1395 {
1396         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1397         void __user *user_data;
1398         u64 remain;
1399         unsigned int partial_cacheline_write;
1400         unsigned int needs_clflush;
1401         unsigned int offset, idx;
1402         int ret;
1403
1404         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1405         if (ret)
1406                 return ret;
1407
1408         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1409         mutex_unlock(&i915->drm.struct_mutex);
1410         if (ret)
1411                 return ret;
1412
1413         /* If we don't overwrite a cacheline completely we need to be
1414          * careful to have up-to-date data by first clflushing. Don't
1415          * overcomplicate things and flush the entire patch.
1416          */
1417         partial_cacheline_write = 0;
1418         if (needs_clflush & CLFLUSH_BEFORE)
1419                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1420
1421         user_data = u64_to_user_ptr(args->data_ptr);
1422         remain = args->size;
1423         offset = offset_in_page(args->offset);
1424         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1425                 struct page *page = i915_gem_object_get_page(obj, idx);
1426                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1427
1428                 ret = shmem_pwrite(page, offset, length, user_data,
1429                                    (offset | length) & partial_cacheline_write,
1430                                    needs_clflush & CLFLUSH_AFTER);
1431                 if (ret)
1432                         break;
1433
1434                 remain -= length;
1435                 user_data += length;
1436                 offset = 0;
1437         }
1438
1439         intel_fb_obj_flush(obj, ORIGIN_CPU);
1440         i915_gem_obj_finish_shmem_access(obj);
1441         return ret;
1442 }
1443
1444 /**
1445  * Writes data to the object referenced by handle.
1446  * @dev: drm device
1447  * @data: ioctl data blob
1448  * @file: drm file
1449  *
1450  * On error, the contents of the buffer that were to be modified are undefined.
1451  */
1452 int
1453 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1454                       struct drm_file *file)
1455 {
1456         struct drm_i915_gem_pwrite *args = data;
1457         struct drm_i915_gem_object *obj;
1458         int ret;
1459
1460         if (args->size == 0)
1461                 return 0;
1462
1463         if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size))
1464                 return -EFAULT;
1465
1466         obj = i915_gem_object_lookup(file, args->handle);
1467         if (!obj)
1468                 return -ENOENT;
1469
1470         /* Bounds check destination. */
1471         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1472                 ret = -EINVAL;
1473                 goto err;
1474         }
1475
1476         /* Writes not allowed into this read-only object */
1477         if (i915_gem_object_is_readonly(obj)) {
1478                 ret = -EINVAL;
1479                 goto err;
1480         }
1481
1482         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1483
1484         ret = -ENODEV;
1485         if (obj->ops->pwrite)
1486                 ret = obj->ops->pwrite(obj, args);
1487         if (ret != -ENODEV)
1488                 goto err;
1489
1490         ret = i915_gem_object_wait(obj,
1491                                    I915_WAIT_INTERRUPTIBLE |
1492                                    I915_WAIT_ALL,
1493                                    MAX_SCHEDULE_TIMEOUT,
1494                                    to_rps_client(file));
1495         if (ret)
1496                 goto err;
1497
1498         ret = i915_gem_object_pin_pages(obj);
1499         if (ret)
1500                 goto err;
1501
1502         ret = -EFAULT;
1503         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1504          * it would end up going through the fenced access, and we'll get
1505          * different detiling behavior between reading and writing.
1506          * pread/pwrite currently are reading and writing from the CPU
1507          * perspective, requiring manual detiling by the client.
1508          */
1509         if (!i915_gem_object_has_struct_page(obj) ||
1510             cpu_write_needs_clflush(obj))
1511                 /* Note that the gtt paths might fail with non-page-backed user
1512                  * pointers (e.g. gtt mappings when moving data between
1513                  * textures). Fallback to the shmem path in that case.
1514                  */
1515                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1516
1517         if (ret == -EFAULT || ret == -ENOSPC) {
1518                 if (obj->phys_handle)
1519                         ret = i915_gem_phys_pwrite(obj, args, file);
1520                 else
1521                         ret = i915_gem_shmem_pwrite(obj, args);
1522         }
1523
1524         i915_gem_object_unpin_pages(obj);
1525 err:
1526         i915_gem_object_put(obj);
1527         return ret;
1528 }
1529
1530 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1531 {
1532         struct drm_i915_private *i915;
1533         struct list_head *list;
1534         struct i915_vma *vma;
1535
1536         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1537
1538         for_each_ggtt_vma(vma, obj) {
1539                 if (i915_vma_is_active(vma))
1540                         continue;
1541
1542                 if (!drm_mm_node_allocated(&vma->node))
1543                         continue;
1544
1545                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1546         }
1547
1548         i915 = to_i915(obj->base.dev);
1549         spin_lock(&i915->mm.obj_lock);
1550         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1551         list_move_tail(&obj->mm.link, list);
1552         spin_unlock(&i915->mm.obj_lock);
1553 }
1554
1555 /**
1556  * Called when user space prepares to use an object with the CPU, either
1557  * through the mmap ioctl's mapping or a GTT mapping.
1558  * @dev: drm device
1559  * @data: ioctl data blob
1560  * @file: drm file
1561  */
1562 int
1563 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1564                           struct drm_file *file)
1565 {
1566         struct drm_i915_gem_set_domain *args = data;
1567         struct drm_i915_gem_object *obj;
1568         uint32_t read_domains = args->read_domains;
1569         uint32_t write_domain = args->write_domain;
1570         int err;
1571
1572         /* Only handle setting domains to types used by the CPU. */
1573         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1574                 return -EINVAL;
1575
1576         /* Having something in the write domain implies it's in the read
1577          * domain, and only that read domain.  Enforce that in the request.
1578          */
1579         if (write_domain != 0 && read_domains != write_domain)
1580                 return -EINVAL;
1581
1582         obj = i915_gem_object_lookup(file, args->handle);
1583         if (!obj)
1584                 return -ENOENT;
1585
1586         /* Try to flush the object off the GPU without holding the lock.
1587          * We will repeat the flush holding the lock in the normal manner
1588          * to catch cases where we are gazumped.
1589          */
1590         err = i915_gem_object_wait(obj,
1591                                    I915_WAIT_INTERRUPTIBLE |
1592                                    I915_WAIT_PRIORITY |
1593                                    (write_domain ? I915_WAIT_ALL : 0),
1594                                    MAX_SCHEDULE_TIMEOUT,
1595                                    to_rps_client(file));
1596         if (err)
1597                 goto out;
1598
1599         /*
1600          * Proxy objects do not control access to the backing storage, ergo
1601          * they cannot be used as a means to manipulate the cache domain
1602          * tracking for that backing storage. The proxy object is always
1603          * considered to be outside of any cache domain.
1604          */
1605         if (i915_gem_object_is_proxy(obj)) {
1606                 err = -ENXIO;
1607                 goto out;
1608         }
1609
1610         /*
1611          * Flush and acquire obj->pages so that we are coherent through
1612          * direct access in memory with previous cached writes through
1613          * shmemfs and that our cache domain tracking remains valid.
1614          * For example, if the obj->filp was moved to swap without us
1615          * being notified and releasing the pages, we would mistakenly
1616          * continue to assume that the obj remained out of the CPU cached
1617          * domain.
1618          */
1619         err = i915_gem_object_pin_pages(obj);
1620         if (err)
1621                 goto out;
1622
1623         err = i915_mutex_lock_interruptible(dev);
1624         if (err)
1625                 goto out_unpin;
1626
1627         if (read_domains & I915_GEM_DOMAIN_WC)
1628                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1629         else if (read_domains & I915_GEM_DOMAIN_GTT)
1630                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1631         else
1632                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1633
1634         /* And bump the LRU for this access */
1635         i915_gem_object_bump_inactive_ggtt(obj);
1636
1637         mutex_unlock(&dev->struct_mutex);
1638
1639         if (write_domain != 0)
1640                 intel_fb_obj_invalidate(obj,
1641                                         fb_write_origin(obj, write_domain));
1642
1643 out_unpin:
1644         i915_gem_object_unpin_pages(obj);
1645 out:
1646         i915_gem_object_put(obj);
1647         return err;
1648 }
1649
1650 /**
1651  * Called when user space has done writes to this buffer
1652  * @dev: drm device
1653  * @data: ioctl data blob
1654  * @file: drm file
1655  */
1656 int
1657 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1658                          struct drm_file *file)
1659 {
1660         struct drm_i915_gem_sw_finish *args = data;
1661         struct drm_i915_gem_object *obj;
1662
1663         obj = i915_gem_object_lookup(file, args->handle);
1664         if (!obj)
1665                 return -ENOENT;
1666
1667         /*
1668          * Proxy objects are barred from CPU access, so there is no
1669          * need to ban sw_finish as it is a nop.
1670          */
1671
1672         /* Pinned buffers may be scanout, so flush the cache */
1673         i915_gem_object_flush_if_display(obj);
1674         i915_gem_object_put(obj);
1675
1676         return 0;
1677 }
1678
1679 /**
1680  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1681  *                       it is mapped to.
1682  * @dev: drm device
1683  * @data: ioctl data blob
1684  * @file: drm file
1685  *
1686  * While the mapping holds a reference on the contents of the object, it doesn't
1687  * imply a ref on the object itself.
1688  *
1689  * IMPORTANT:
1690  *
1691  * DRM driver writers who look a this function as an example for how to do GEM
1692  * mmap support, please don't implement mmap support like here. The modern way
1693  * to implement DRM mmap support is with an mmap offset ioctl (like
1694  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1695  * That way debug tooling like valgrind will understand what's going on, hiding
1696  * the mmap call in a driver private ioctl will break that. The i915 driver only
1697  * does cpu mmaps this way because we didn't know better.
1698  */
1699 int
1700 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1701                     struct drm_file *file)
1702 {
1703         struct drm_i915_gem_mmap *args = data;
1704         struct drm_i915_gem_object *obj;
1705         unsigned long addr;
1706
1707         if (args->flags & ~(I915_MMAP_WC))
1708                 return -EINVAL;
1709
1710         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1711                 return -ENODEV;
1712
1713         obj = i915_gem_object_lookup(file, args->handle);
1714         if (!obj)
1715                 return -ENOENT;
1716
1717         /* prime objects have no backing filp to GEM mmap
1718          * pages from.
1719          */
1720         if (!obj->base.filp) {
1721                 i915_gem_object_put(obj);
1722                 return -ENXIO;
1723         }
1724
1725         addr = vm_mmap(obj->base.filp, 0, args->size,
1726                        PROT_READ | PROT_WRITE, MAP_SHARED,
1727                        args->offset);
1728         if (args->flags & I915_MMAP_WC) {
1729                 struct mm_struct *mm = current->mm;
1730                 struct vm_area_struct *vma;
1731
1732                 if (down_write_killable(&mm->mmap_sem)) {
1733                         i915_gem_object_put(obj);
1734                         return -EINTR;
1735                 }
1736                 vma = find_vma(mm, addr);
1737                 if (vma)
1738                         vma->vm_page_prot =
1739                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1740                 else
1741                         addr = -ENOMEM;
1742                 up_write(&mm->mmap_sem);
1743
1744                 /* This may race, but that's ok, it only gets set */
1745                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1746         }
1747         i915_gem_object_put(obj);
1748         if (IS_ERR((void *)addr))
1749                 return addr;
1750
1751         args->addr_ptr = (uint64_t) addr;
1752
1753         return 0;
1754 }
1755
1756 static unsigned int tile_row_pages(const struct drm_i915_gem_object *obj)
1757 {
1758         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1759 }
1760
1761 /**
1762  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1763  *
1764  * A history of the GTT mmap interface:
1765  *
1766  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1767  *     aligned and suitable for fencing, and still fit into the available
1768  *     mappable space left by the pinned display objects. A classic problem
1769  *     we called the page-fault-of-doom where we would ping-pong between
1770  *     two objects that could not fit inside the GTT and so the memcpy
1771  *     would page one object in at the expense of the other between every
1772  *     single byte.
1773  *
1774  * 1 - Objects can be any size, and have any compatible fencing (X Y, or none
1775  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1776  *     object is too large for the available space (or simply too large
1777  *     for the mappable aperture!), a view is created instead and faulted
1778  *     into userspace. (This view is aligned and sized appropriately for
1779  *     fenced access.)
1780  *
1781  * 2 - Recognise WC as a separate cache domain so that we can flush the
1782  *     delayed writes via GTT before performing direct access via WC.
1783  *
1784  * Restrictions:
1785  *
1786  *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1787  *    hangs on some architectures, corruption on others. An attempt to service
1788  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1789  *
1790  *  * the object must be able to fit into RAM (physical memory, though no
1791  *    limited to the mappable aperture).
1792  *
1793  *
1794  * Caveats:
1795  *
1796  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1797  *    all data to system memory. Subsequent access will not be synchronized.
1798  *
1799  *  * all mappings are revoked on runtime device suspend.
1800  *
1801  *  * there are only 8, 16 or 32 fence registers to share between all users
1802  *    (older machines require fence register for display and blitter access
1803  *    as well). Contention of the fence registers will cause the previous users
1804  *    to be unmapped and any new access will generate new page faults.
1805  *
1806  *  * running out of memory while servicing a fault may generate a SIGBUS,
1807  *    rather than the expected SIGSEGV.
1808  */
1809 int i915_gem_mmap_gtt_version(void)
1810 {
1811         return 2;
1812 }
1813
1814 static inline struct i915_ggtt_view
1815 compute_partial_view(const struct drm_i915_gem_object *obj,
1816                      pgoff_t page_offset,
1817                      unsigned int chunk)
1818 {
1819         struct i915_ggtt_view view;
1820
1821         if (i915_gem_object_is_tiled(obj))
1822                 chunk = roundup(chunk, tile_row_pages(obj));
1823
1824         view.type = I915_GGTT_VIEW_PARTIAL;
1825         view.partial.offset = rounddown(page_offset, chunk);
1826         view.partial.size =
1827                 min_t(unsigned int, chunk,
1828                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1829
1830         /* If the partial covers the entire object, just create a normal VMA. */
1831         if (chunk >= obj->base.size >> PAGE_SHIFT)
1832                 view.type = I915_GGTT_VIEW_NORMAL;
1833
1834         return view;
1835 }
1836
1837 /**
1838  * i915_gem_fault - fault a page into the GTT
1839  * @vmf: fault info
1840  *
1841  * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
1842  * from userspace.  The fault handler takes care of binding the object to
1843  * the GTT (if needed), allocating and programming a fence register (again,
1844  * only if needed based on whether the old reg is still valid or the object
1845  * is tiled) and inserting a new PTE into the faulting process.
1846  *
1847  * Note that the faulting process may involve evicting existing objects
1848  * from the GTT and/or fence registers to make room.  So performance may
1849  * suffer if the GTT working set is large or there are few fence registers
1850  * left.
1851  *
1852  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1853  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1854  */
1855 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
1856 {
1857 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
1858         struct vm_area_struct *area = vmf->vma;
1859         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1860         struct drm_device *dev = obj->base.dev;
1861         struct drm_i915_private *dev_priv = to_i915(dev);
1862         struct i915_ggtt *ggtt = &dev_priv->ggtt;
1863         bool write = area->vm_flags & VM_WRITE;
1864         struct i915_vma *vma;
1865         pgoff_t page_offset;
1866         int ret;
1867
1868         /* Sanity check that we allow writing into this object */
1869         if (i915_gem_object_is_readonly(obj) && write)
1870                 return VM_FAULT_SIGBUS;
1871
1872         /* We don't use vmf->pgoff since that has the fake offset */
1873         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
1874
1875         trace_i915_gem_object_fault(obj, page_offset, true, write);
1876
1877         /* Try to flush the object off the GPU first without holding the lock.
1878          * Upon acquiring the lock, we will perform our sanity checks and then
1879          * repeat the flush holding the lock in the normal manner to catch cases
1880          * where we are gazumped.
1881          */
1882         ret = i915_gem_object_wait(obj,
1883                                    I915_WAIT_INTERRUPTIBLE,
1884                                    MAX_SCHEDULE_TIMEOUT,
1885                                    NULL);
1886         if (ret)
1887                 goto err;
1888
1889         ret = i915_gem_object_pin_pages(obj);
1890         if (ret)
1891                 goto err;
1892
1893         intel_runtime_pm_get(dev_priv);
1894
1895         ret = i915_mutex_lock_interruptible(dev);
1896         if (ret)
1897                 goto err_rpm;
1898
1899         /* Access to snoopable pages through the GTT is incoherent. */
1900         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
1901                 ret = -EFAULT;
1902                 goto err_unlock;
1903         }
1904
1905
1906         /* Now pin it into the GTT as needed */
1907         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1908                                        PIN_MAPPABLE |
1909                                        PIN_NONBLOCK |
1910                                        PIN_NONFAULT);
1911         if (IS_ERR(vma)) {
1912                 /* Use a partial view if it is bigger than available space */
1913                 struct i915_ggtt_view view =
1914                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
1915                 unsigned int flags;
1916
1917                 flags = PIN_MAPPABLE;
1918                 if (view.type == I915_GGTT_VIEW_NORMAL)
1919                         flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
1920
1921                 /*
1922                  * Userspace is now writing through an untracked VMA, abandon
1923                  * all hope that the hardware is able to track future writes.
1924                  */
1925                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
1926
1927                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1928                 if (IS_ERR(vma) && !view.type) {
1929                         flags = PIN_MAPPABLE;
1930                         view.type = I915_GGTT_VIEW_PARTIAL;
1931                         vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1932                 }
1933         }
1934         if (IS_ERR(vma)) {
1935                 ret = PTR_ERR(vma);
1936                 goto err_unlock;
1937         }
1938
1939         ret = i915_gem_object_set_to_gtt_domain(obj, write);
1940         if (ret)
1941                 goto err_unpin;
1942
1943         ret = i915_vma_pin_fence(vma);
1944         if (ret)
1945                 goto err_unpin;
1946
1947         /* Finally, remap it using the new GTT offset */
1948         ret = remap_io_mapping(area,
1949                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
1950                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
1951                                min_t(u64, vma->size, area->vm_end - area->vm_start),
1952                                &ggtt->iomap);
1953         if (ret)
1954                 goto err_fence;
1955
1956         /* Mark as being mmapped into userspace for later revocation */
1957         assert_rpm_wakelock_held(dev_priv);
1958         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
1959                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
1960         GEM_BUG_ON(!obj->userfault_count);
1961
1962         i915_vma_set_ggtt_write(vma);
1963
1964 err_fence:
1965         i915_vma_unpin_fence(vma);
1966 err_unpin:
1967         __i915_vma_unpin(vma);
1968 err_unlock:
1969         mutex_unlock(&dev->struct_mutex);
1970 err_rpm:
1971         intel_runtime_pm_put(dev_priv);
1972         i915_gem_object_unpin_pages(obj);
1973 err:
1974         switch (ret) {
1975         case -EIO:
1976                 /*
1977                  * We eat errors when the gpu is terminally wedged to avoid
1978                  * userspace unduly crashing (gl has no provisions for mmaps to
1979                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
1980                  * and so needs to be reported.
1981                  */
1982                 if (!i915_terminally_wedged(&dev_priv->gpu_error))
1983                         return VM_FAULT_SIGBUS;
1984                 /* else: fall through */
1985         case -EAGAIN:
1986                 /*
1987                  * EAGAIN means the gpu is hung and we'll wait for the error
1988                  * handler to reset everything when re-faulting in
1989                  * i915_mutex_lock_interruptible.
1990                  */
1991         case 0:
1992         case -ERESTARTSYS:
1993         case -EINTR:
1994         case -EBUSY:
1995                 /*
1996                  * EBUSY is ok: this just means that another thread
1997                  * already did the job.
1998                  */
1999                 return VM_FAULT_NOPAGE;
2000         case -ENOMEM:
2001                 return VM_FAULT_OOM;
2002         case -ENOSPC:
2003         case -EFAULT:
2004                 return VM_FAULT_SIGBUS;
2005         default:
2006                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2007                 return VM_FAULT_SIGBUS;
2008         }
2009 }
2010
2011 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2012 {
2013         struct i915_vma *vma;
2014
2015         GEM_BUG_ON(!obj->userfault_count);
2016
2017         obj->userfault_count = 0;
2018         list_del(&obj->userfault_link);
2019         drm_vma_node_unmap(&obj->base.vma_node,
2020                            obj->base.dev->anon_inode->i_mapping);
2021
2022         for_each_ggtt_vma(vma, obj)
2023                 i915_vma_unset_userfault(vma);
2024 }
2025
2026 /**
2027  * i915_gem_release_mmap - remove physical page mappings
2028  * @obj: obj in question
2029  *
2030  * Preserve the reservation of the mmapping with the DRM core code, but
2031  * relinquish ownership of the pages back to the system.
2032  *
2033  * It is vital that we remove the page mapping if we have mapped a tiled
2034  * object through the GTT and then lose the fence register due to
2035  * resource pressure. Similarly if the object has been moved out of the
2036  * aperture, than pages mapped into userspace must be revoked. Removing the
2037  * mapping will then trigger a page fault on the next user access, allowing
2038  * fixup by i915_gem_fault().
2039  */
2040 void
2041 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2042 {
2043         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2044
2045         /* Serialisation between user GTT access and our code depends upon
2046          * revoking the CPU's PTE whilst the mutex is held. The next user
2047          * pagefault then has to wait until we release the mutex.
2048          *
2049          * Note that RPM complicates somewhat by adding an additional
2050          * requirement that operations to the GGTT be made holding the RPM
2051          * wakeref.
2052          */
2053         lockdep_assert_held(&i915->drm.struct_mutex);
2054         intel_runtime_pm_get(i915);
2055
2056         if (!obj->userfault_count)
2057                 goto out;
2058
2059         __i915_gem_object_release_mmap(obj);
2060
2061         /* Ensure that the CPU's PTE are revoked and there are not outstanding
2062          * memory transactions from userspace before we return. The TLB
2063          * flushing implied above by changing the PTE above *should* be
2064          * sufficient, an extra barrier here just provides us with a bit
2065          * of paranoid documentation about our requirement to serialise
2066          * memory writes before touching registers / GSM.
2067          */
2068         wmb();
2069
2070 out:
2071         intel_runtime_pm_put(i915);
2072 }
2073
2074 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2075 {
2076         struct drm_i915_gem_object *obj, *on;
2077         int i;
2078
2079         /*
2080          * Only called during RPM suspend. All users of the userfault_list
2081          * must be holding an RPM wakeref to ensure that this can not
2082          * run concurrently with themselves (and use the struct_mutex for
2083          * protection between themselves).
2084          */
2085
2086         list_for_each_entry_safe(obj, on,
2087                                  &dev_priv->mm.userfault_list, userfault_link)
2088                 __i915_gem_object_release_mmap(obj);
2089
2090         /* The fence will be lost when the device powers down. If any were
2091          * in use by hardware (i.e. they are pinned), we should not be powering
2092          * down! All other fences will be reacquired by the user upon waking.
2093          */
2094         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2095                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2096
2097                 /* Ideally we want to assert that the fence register is not
2098                  * live at this point (i.e. that no piece of code will be
2099                  * trying to write through fence + GTT, as that both violates
2100                  * our tracking of activity and associated locking/barriers,
2101                  * but also is illegal given that the hw is powered down).
2102                  *
2103                  * Previously we used reg->pin_count as a "liveness" indicator.
2104                  * That is not sufficient, and we need a more fine-grained
2105                  * tool if we want to have a sanity check here.
2106                  */
2107
2108                 if (!reg->vma)
2109                         continue;
2110
2111                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2112                 reg->dirty = true;
2113         }
2114 }
2115
2116 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2117 {
2118         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2119         int err;
2120
2121         err = drm_gem_create_mmap_offset(&obj->base);
2122         if (likely(!err))
2123                 return 0;
2124
2125         /* Attempt to reap some mmap space from dead objects */
2126         do {
2127                 err = i915_gem_wait_for_idle(dev_priv,
2128                                              I915_WAIT_INTERRUPTIBLE,
2129                                              MAX_SCHEDULE_TIMEOUT);
2130                 if (err)
2131                         break;
2132
2133                 i915_gem_drain_freed_objects(dev_priv);
2134                 err = drm_gem_create_mmap_offset(&obj->base);
2135                 if (!err)
2136                         break;
2137
2138         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2139
2140         return err;
2141 }
2142
2143 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2144 {
2145         drm_gem_free_mmap_offset(&obj->base);
2146 }
2147
2148 int
2149 i915_gem_mmap_gtt(struct drm_file *file,
2150                   struct drm_device *dev,
2151                   uint32_t handle,
2152                   uint64_t *offset)
2153 {
2154         struct drm_i915_gem_object *obj;
2155         int ret;
2156
2157         obj = i915_gem_object_lookup(file, handle);
2158         if (!obj)
2159                 return -ENOENT;
2160
2161         ret = i915_gem_object_create_mmap_offset(obj);
2162         if (ret == 0)
2163                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2164
2165         i915_gem_object_put(obj);
2166         return ret;
2167 }
2168
2169 /**
2170  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2171  * @dev: DRM device
2172  * @data: GTT mapping ioctl data
2173  * @file: GEM object info
2174  *
2175  * Simply returns the fake offset to userspace so it can mmap it.
2176  * The mmap call will end up in drm_gem_mmap(), which will set things
2177  * up so we can get faults in the handler above.
2178  *
2179  * The fault handler will take care of binding the object into the GTT
2180  * (since it may have been evicted to make room for something), allocating
2181  * a fence register, and mapping the appropriate aperture address into
2182  * userspace.
2183  */
2184 int
2185 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2186                         struct drm_file *file)
2187 {
2188         struct drm_i915_gem_mmap_gtt *args = data;
2189
2190         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2191 }
2192
2193 /* Immediately discard the backing storage */
2194 static void
2195 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2196 {
2197         i915_gem_object_free_mmap_offset(obj);
2198
2199         if (obj->base.filp == NULL)
2200                 return;
2201
2202         /* Our goal here is to return as much of the memory as
2203          * is possible back to the system as we are called from OOM.
2204          * To do this we must instruct the shmfs to drop all of its
2205          * backing pages, *now*.
2206          */
2207         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2208         obj->mm.madv = __I915_MADV_PURGED;
2209         obj->mm.pages = ERR_PTR(-EFAULT);
2210 }
2211
2212 /* Try to discard unwanted pages */
2213 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2214 {
2215         struct address_space *mapping;
2216
2217         lockdep_assert_held(&obj->mm.lock);
2218         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2219
2220         switch (obj->mm.madv) {
2221         case I915_MADV_DONTNEED:
2222                 i915_gem_object_truncate(obj);
2223         case __I915_MADV_PURGED:
2224                 return;
2225         }
2226
2227         if (obj->base.filp == NULL)
2228                 return;
2229
2230         mapping = obj->base.filp->f_mapping,
2231         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2232 }
2233
2234 /*
2235  * Move pages to appropriate lru and release the pagevec, decrementing the
2236  * ref count of those pages.
2237  */
2238 static void check_release_pagevec(struct pagevec *pvec)
2239 {
2240         check_move_unevictable_pages(pvec);
2241         __pagevec_release(pvec);
2242         cond_resched();
2243 }
2244
2245 static void
2246 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2247                               struct sg_table *pages)
2248 {
2249         struct sgt_iter sgt_iter;
2250         struct pagevec pvec;
2251         struct page *page;
2252
2253         __i915_gem_object_release_shmem(obj, pages, true);
2254
2255         i915_gem_gtt_finish_pages(obj, pages);
2256
2257         if (i915_gem_object_needs_bit17_swizzle(obj))
2258                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2259
2260         mapping_clear_unevictable(file_inode(obj->base.filp)->i_mapping);
2261
2262         pagevec_init(&pvec);
2263         for_each_sgt_page(page, sgt_iter, pages) {
2264                 if (obj->mm.dirty)
2265                         set_page_dirty(page);
2266
2267                 if (obj->mm.madv == I915_MADV_WILLNEED)
2268                         mark_page_accessed(page);
2269
2270                 if (!pagevec_add(&pvec, page))
2271                         check_release_pagevec(&pvec);
2272         }
2273         if (pagevec_count(&pvec))
2274                 check_release_pagevec(&pvec);
2275         obj->mm.dirty = false;
2276
2277         sg_free_table(pages);
2278         kfree(pages);
2279 }
2280
2281 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2282 {
2283         struct radix_tree_iter iter;
2284         void __rcu **slot;
2285
2286         rcu_read_lock();
2287         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2288                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2289         rcu_read_unlock();
2290 }
2291
2292 static struct sg_table *
2293 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2294 {
2295         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2296         struct sg_table *pages;
2297
2298         pages = fetch_and_zero(&obj->mm.pages);
2299         if (!pages)
2300                 return NULL;
2301
2302         spin_lock(&i915->mm.obj_lock);
2303         list_del(&obj->mm.link);
2304         spin_unlock(&i915->mm.obj_lock);
2305
2306         if (obj->mm.mapping) {
2307                 void *ptr;
2308
2309                 ptr = page_mask_bits(obj->mm.mapping);
2310                 if (is_vmalloc_addr(ptr))
2311                         vunmap(ptr);
2312                 else
2313                         kunmap(kmap_to_page(ptr));
2314
2315                 obj->mm.mapping = NULL;
2316         }
2317
2318         __i915_gem_object_reset_page_iter(obj);
2319         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2320
2321         return pages;
2322 }
2323
2324 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2325                                  enum i915_mm_subclass subclass)
2326 {
2327         struct sg_table *pages;
2328
2329         if (i915_gem_object_has_pinned_pages(obj))
2330                 return;
2331
2332         GEM_BUG_ON(obj->bind_count);
2333         if (!i915_gem_object_has_pages(obj))
2334                 return;
2335
2336         /* May be called by shrinker from within get_pages() (on another bo) */
2337         mutex_lock_nested(&obj->mm.lock, subclass);
2338         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2339                 goto unlock;
2340
2341         /*
2342          * ->put_pages might need to allocate memory for the bit17 swizzle
2343          * array, hence protect them from being reaped by removing them from gtt
2344          * lists early.
2345          */
2346         pages = __i915_gem_object_unset_pages(obj);
2347         if (!IS_ERR(pages))
2348                 obj->ops->put_pages(obj, pages);
2349
2350 unlock:
2351         mutex_unlock(&obj->mm.lock);
2352 }
2353
2354 bool i915_sg_trim(struct sg_table *orig_st)
2355 {
2356         struct sg_table new_st;
2357         struct scatterlist *sg, *new_sg;
2358         unsigned int i;
2359
2360         if (orig_st->nents == orig_st->orig_nents)
2361                 return false;
2362
2363         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2364                 return false;
2365
2366         new_sg = new_st.sgl;
2367         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2368                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2369                 sg_dma_address(new_sg) = sg_dma_address(sg);
2370                 sg_dma_len(new_sg) = sg_dma_len(sg);
2371
2372                 new_sg = sg_next(new_sg);
2373         }
2374         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2375
2376         sg_free_table(orig_st);
2377
2378         *orig_st = new_st;
2379         return true;
2380 }
2381
2382 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2383 {
2384         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2385         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2386         unsigned long i;
2387         struct address_space *mapping;
2388         struct sg_table *st;
2389         struct scatterlist *sg;
2390         struct sgt_iter sgt_iter;
2391         struct page *page;
2392         unsigned long last_pfn = 0;     /* suppress gcc warning */
2393         unsigned int max_segment = i915_sg_segment_size();
2394         unsigned int sg_page_sizes;
2395         struct pagevec pvec;
2396         gfp_t noreclaim;
2397         int ret;
2398
2399         /*
2400          * Assert that the object is not currently in any GPU domain. As it
2401          * wasn't in the GTT, there shouldn't be any way it could have been in
2402          * a GPU cache
2403          */
2404         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2405         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2406
2407         /*
2408          * If there's no chance of allocating enough pages for the whole
2409          * object, bail early.
2410          */
2411         if (page_count > totalram_pages())
2412                 return -ENOMEM;
2413
2414         st = kmalloc(sizeof(*st), GFP_KERNEL);
2415         if (st == NULL)
2416                 return -ENOMEM;
2417
2418 rebuild_st:
2419         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2420                 kfree(st);
2421                 return -ENOMEM;
2422         }
2423
2424         /*
2425          * Get the list of pages out of our struct file.  They'll be pinned
2426          * at this point until we release them.
2427          *
2428          * Fail silently without starting the shrinker
2429          */
2430         mapping = obj->base.filp->f_mapping;
2431         mapping_set_unevictable(mapping);
2432         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2433         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2434
2435         sg = st->sgl;
2436         st->nents = 0;
2437         sg_page_sizes = 0;
2438         for (i = 0; i < page_count; i++) {
2439                 const unsigned int shrink[] = {
2440                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2441                         0,
2442                 }, *s = shrink;
2443                 gfp_t gfp = noreclaim;
2444
2445                 do {
2446                         cond_resched();
2447                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2448                         if (likely(!IS_ERR(page)))
2449                                 break;
2450
2451                         if (!*s) {
2452                                 ret = PTR_ERR(page);
2453                                 goto err_sg;
2454                         }
2455
2456                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2457
2458                         /*
2459                          * We've tried hard to allocate the memory by reaping
2460                          * our own buffer, now let the real VM do its job and
2461                          * go down in flames if truly OOM.
2462                          *
2463                          * However, since graphics tend to be disposable,
2464                          * defer the oom here by reporting the ENOMEM back
2465                          * to userspace.
2466                          */
2467                         if (!*s) {
2468                                 /* reclaim and warn, but no oom */
2469                                 gfp = mapping_gfp_mask(mapping);
2470
2471                                 /*
2472                                  * Our bo are always dirty and so we require
2473                                  * kswapd to reclaim our pages (direct reclaim
2474                                  * does not effectively begin pageout of our
2475                                  * buffers on its own). However, direct reclaim
2476                                  * only waits for kswapd when under allocation
2477                                  * congestion. So as a result __GFP_RECLAIM is
2478                                  * unreliable and fails to actually reclaim our
2479                                  * dirty pages -- unless you try over and over
2480                                  * again with !__GFP_NORETRY. However, we still
2481                                  * want to fail this allocation rather than
2482                                  * trigger the out-of-memory killer and for
2483                                  * this we want __GFP_RETRY_MAYFAIL.
2484                                  */
2485                                 gfp |= __GFP_RETRY_MAYFAIL;
2486                         }
2487                 } while (1);
2488
2489                 if (!i ||
2490                     sg->length >= max_segment ||
2491                     page_to_pfn(page) != last_pfn + 1) {
2492                         if (i) {
2493                                 sg_page_sizes |= sg->length;
2494                                 sg = sg_next(sg);
2495                         }
2496                         st->nents++;
2497                         sg_set_page(sg, page, PAGE_SIZE, 0);
2498                 } else {
2499                         sg->length += PAGE_SIZE;
2500                 }
2501                 last_pfn = page_to_pfn(page);
2502
2503                 /* Check that the i965g/gm workaround works. */
2504                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2505         }
2506         if (sg) { /* loop terminated early; short sg table */
2507                 sg_page_sizes |= sg->length;
2508                 sg_mark_end(sg);
2509         }
2510
2511         /* Trim unused sg entries to avoid wasting memory. */
2512         i915_sg_trim(st);
2513
2514         ret = i915_gem_gtt_prepare_pages(obj, st);
2515         if (ret) {
2516                 /*
2517                  * DMA remapping failed? One possible cause is that
2518                  * it could not reserve enough large entries, asking
2519                  * for PAGE_SIZE chunks instead may be helpful.
2520                  */
2521                 if (max_segment > PAGE_SIZE) {
2522                         for_each_sgt_page(page, sgt_iter, st)
2523                                 put_page(page);
2524                         sg_free_table(st);
2525
2526                         max_segment = PAGE_SIZE;
2527                         goto rebuild_st;
2528                 } else {
2529                         dev_warn(&dev_priv->drm.pdev->dev,
2530                                  "Failed to DMA remap %lu pages\n",
2531                                  page_count);
2532                         goto err_pages;
2533                 }
2534         }
2535
2536         if (i915_gem_object_needs_bit17_swizzle(obj))
2537                 i915_gem_object_do_bit_17_swizzle(obj, st);
2538
2539         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2540
2541         return 0;
2542
2543 err_sg:
2544         sg_mark_end(sg);
2545 err_pages:
2546         mapping_clear_unevictable(mapping);
2547         pagevec_init(&pvec);
2548         for_each_sgt_page(page, sgt_iter, st) {
2549                 if (!pagevec_add(&pvec, page))
2550                         check_release_pagevec(&pvec);
2551         }
2552         if (pagevec_count(&pvec))
2553                 check_release_pagevec(&pvec);
2554         sg_free_table(st);
2555         kfree(st);
2556
2557         /*
2558          * shmemfs first checks if there is enough memory to allocate the page
2559          * and reports ENOSPC should there be insufficient, along with the usual
2560          * ENOMEM for a genuine allocation failure.
2561          *
2562          * We use ENOSPC in our driver to mean that we have run out of aperture
2563          * space and so want to translate the error from shmemfs back to our
2564          * usual understanding of ENOMEM.
2565          */
2566         if (ret == -ENOSPC)
2567                 ret = -ENOMEM;
2568
2569         return ret;
2570 }
2571
2572 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2573                                  struct sg_table *pages,
2574                                  unsigned int sg_page_sizes)
2575 {
2576         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2577         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2578         int i;
2579
2580         lockdep_assert_held(&obj->mm.lock);
2581
2582         obj->mm.get_page.sg_pos = pages->sgl;
2583         obj->mm.get_page.sg_idx = 0;
2584
2585         obj->mm.pages = pages;
2586
2587         if (i915_gem_object_is_tiled(obj) &&
2588             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2589                 GEM_BUG_ON(obj->mm.quirked);
2590                 __i915_gem_object_pin_pages(obj);
2591                 obj->mm.quirked = true;
2592         }
2593
2594         GEM_BUG_ON(!sg_page_sizes);
2595         obj->mm.page_sizes.phys = sg_page_sizes;
2596
2597         /*
2598          * Calculate the supported page-sizes which fit into the given
2599          * sg_page_sizes. This will give us the page-sizes which we may be able
2600          * to use opportunistically when later inserting into the GTT. For
2601          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2602          * 64K or 4K pages, although in practice this will depend on a number of
2603          * other factors.
2604          */
2605         obj->mm.page_sizes.sg = 0;
2606         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2607                 if (obj->mm.page_sizes.phys & ~0u << i)
2608                         obj->mm.page_sizes.sg |= BIT(i);
2609         }
2610         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2611
2612         spin_lock(&i915->mm.obj_lock);
2613         list_add(&obj->mm.link, &i915->mm.unbound_list);
2614         spin_unlock(&i915->mm.obj_lock);
2615 }
2616
2617 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2618 {
2619         int err;
2620
2621         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2622                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2623                 return -EFAULT;
2624         }
2625
2626         err = obj->ops->get_pages(obj);
2627         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2628
2629         return err;
2630 }
2631
2632 /* Ensure that the associated pages are gathered from the backing storage
2633  * and pinned into our object. i915_gem_object_pin_pages() may be called
2634  * multiple times before they are released by a single call to
2635  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2636  * either as a result of memory pressure (reaping pages under the shrinker)
2637  * or as the object is itself released.
2638  */
2639 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2640 {
2641         int err;
2642
2643         err = mutex_lock_interruptible(&obj->mm.lock);
2644         if (err)
2645                 return err;
2646
2647         if (unlikely(!i915_gem_object_has_pages(obj))) {
2648                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2649
2650                 err = ____i915_gem_object_get_pages(obj);
2651                 if (err)
2652                         goto unlock;
2653
2654                 smp_mb__before_atomic();
2655         }
2656         atomic_inc(&obj->mm.pages_pin_count);
2657
2658 unlock:
2659         mutex_unlock(&obj->mm.lock);
2660         return err;
2661 }
2662
2663 /* The 'mapping' part of i915_gem_object_pin_map() below */
2664 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2665                                  enum i915_map_type type)
2666 {
2667         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2668         struct sg_table *sgt = obj->mm.pages;
2669         struct sgt_iter sgt_iter;
2670         struct page *page;
2671         struct page *stack_pages[32];
2672         struct page **pages = stack_pages;
2673         unsigned long i = 0;
2674         pgprot_t pgprot;
2675         void *addr;
2676
2677         /* A single page can always be kmapped */
2678         if (n_pages == 1 && type == I915_MAP_WB)
2679                 return kmap(sg_page(sgt->sgl));
2680
2681         if (n_pages > ARRAY_SIZE(stack_pages)) {
2682                 /* Too big for stack -- allocate temporary array instead */
2683                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2684                 if (!pages)
2685                         return NULL;
2686         }
2687
2688         for_each_sgt_page(page, sgt_iter, sgt)
2689                 pages[i++] = page;
2690
2691         /* Check that we have the expected number of pages */
2692         GEM_BUG_ON(i != n_pages);
2693
2694         switch (type) {
2695         default:
2696                 MISSING_CASE(type);
2697                 /* fallthrough to use PAGE_KERNEL anyway */
2698         case I915_MAP_WB:
2699                 pgprot = PAGE_KERNEL;
2700                 break;
2701         case I915_MAP_WC:
2702                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2703                 break;
2704         }
2705         addr = vmap(pages, n_pages, 0, pgprot);
2706
2707         if (pages != stack_pages)
2708                 kvfree(pages);
2709
2710         return addr;
2711 }
2712
2713 /* get, pin, and map the pages of the object into kernel space */
2714 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2715                               enum i915_map_type type)
2716 {
2717         enum i915_map_type has_type;
2718         bool pinned;
2719         void *ptr;
2720         int ret;
2721
2722         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2723                 return ERR_PTR(-ENXIO);
2724
2725         ret = mutex_lock_interruptible(&obj->mm.lock);
2726         if (ret)
2727                 return ERR_PTR(ret);
2728
2729         pinned = !(type & I915_MAP_OVERRIDE);
2730         type &= ~I915_MAP_OVERRIDE;
2731
2732         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2733                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2734                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2735
2736                         ret = ____i915_gem_object_get_pages(obj);
2737                         if (ret)
2738                                 goto err_unlock;
2739
2740                         smp_mb__before_atomic();
2741                 }
2742                 atomic_inc(&obj->mm.pages_pin_count);
2743                 pinned = false;
2744         }
2745         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2746
2747         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2748         if (ptr && has_type != type) {
2749                 if (pinned) {
2750                         ret = -EBUSY;
2751                         goto err_unpin;
2752                 }
2753
2754                 if (is_vmalloc_addr(ptr))
2755                         vunmap(ptr);
2756                 else
2757                         kunmap(kmap_to_page(ptr));
2758
2759                 ptr = obj->mm.mapping = NULL;
2760         }
2761
2762         if (!ptr) {
2763                 ptr = i915_gem_object_map(obj, type);
2764                 if (!ptr) {
2765                         ret = -ENOMEM;
2766                         goto err_unpin;
2767                 }
2768
2769                 obj->mm.mapping = page_pack_bits(ptr, type);
2770         }
2771
2772 out_unlock:
2773         mutex_unlock(&obj->mm.lock);
2774         return ptr;
2775
2776 err_unpin:
2777         atomic_dec(&obj->mm.pages_pin_count);
2778 err_unlock:
2779         ptr = ERR_PTR(ret);
2780         goto out_unlock;
2781 }
2782
2783 static int
2784 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2785                            const struct drm_i915_gem_pwrite *arg)
2786 {
2787         struct address_space *mapping = obj->base.filp->f_mapping;
2788         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2789         u64 remain, offset;
2790         unsigned int pg;
2791
2792         /* Before we instantiate/pin the backing store for our use, we
2793          * can prepopulate the shmemfs filp efficiently using a write into
2794          * the pagecache. We avoid the penalty of instantiating all the
2795          * pages, important if the user is just writing to a few and never
2796          * uses the object on the GPU, and using a direct write into shmemfs
2797          * allows it to avoid the cost of retrieving a page (either swapin
2798          * or clearing-before-use) before it is overwritten.
2799          */
2800         if (i915_gem_object_has_pages(obj))
2801                 return -ENODEV;
2802
2803         if (obj->mm.madv != I915_MADV_WILLNEED)
2804                 return -EFAULT;
2805
2806         /* Before the pages are instantiated the object is treated as being
2807          * in the CPU domain. The pages will be clflushed as required before
2808          * use, and we can freely write into the pages directly. If userspace
2809          * races pwrite with any other operation; corruption will ensue -
2810          * that is userspace's prerogative!
2811          */
2812
2813         remain = arg->size;
2814         offset = arg->offset;
2815         pg = offset_in_page(offset);
2816
2817         do {
2818                 unsigned int len, unwritten;
2819                 struct page *page;
2820                 void *data, *vaddr;
2821                 int err;
2822
2823                 len = PAGE_SIZE - pg;
2824                 if (len > remain)
2825                         len = remain;
2826
2827                 err = pagecache_write_begin(obj->base.filp, mapping,
2828                                             offset, len, 0,
2829                                             &page, &data);
2830                 if (err < 0)
2831                         return err;
2832
2833                 vaddr = kmap(page);
2834                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2835                 kunmap(page);
2836
2837                 err = pagecache_write_end(obj->base.filp, mapping,
2838                                           offset, len, len - unwritten,
2839                                           page, data);
2840                 if (err < 0)
2841                         return err;
2842
2843                 if (unwritten)
2844                         return -EFAULT;
2845
2846                 remain -= len;
2847                 user_data += len;
2848                 offset += len;
2849                 pg = 0;
2850         } while (remain);
2851
2852         return 0;
2853 }
2854
2855 static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
2856                                         const struct i915_gem_context *ctx)
2857 {
2858         unsigned int score;
2859         unsigned long prev_hang;
2860
2861         if (i915_gem_context_is_banned(ctx))
2862                 score = I915_CLIENT_SCORE_CONTEXT_BAN;
2863         else
2864                 score = 0;
2865
2866         prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
2867         if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
2868                 score += I915_CLIENT_SCORE_HANG_FAST;
2869
2870         if (score) {
2871                 atomic_add(score, &file_priv->ban_score);
2872
2873                 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
2874                                  ctx->name, score,
2875                                  atomic_read(&file_priv->ban_score));
2876         }
2877 }
2878
2879 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
2880 {
2881         unsigned int score;
2882         bool banned, bannable;
2883
2884         atomic_inc(&ctx->guilty_count);
2885
2886         bannable = i915_gem_context_is_bannable(ctx);
2887         score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
2888         banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
2889
2890         /* Cool contexts don't accumulate client ban score */
2891         if (!bannable)
2892                 return;
2893
2894         if (banned) {
2895                 DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
2896                                  ctx->name, atomic_read(&ctx->guilty_count),
2897                                  score);
2898                 i915_gem_context_set_banned(ctx);
2899         }
2900
2901         if (!IS_ERR_OR_NULL(ctx->file_priv))
2902                 i915_gem_client_mark_guilty(ctx->file_priv, ctx);
2903 }
2904
2905 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
2906 {
2907         atomic_inc(&ctx->active_count);
2908 }
2909
2910 struct i915_request *
2911 i915_gem_find_active_request(struct intel_engine_cs *engine)
2912 {
2913         struct i915_request *request, *active = NULL;
2914         unsigned long flags;
2915
2916         /*
2917          * We are called by the error capture, reset and to dump engine
2918          * state at random points in time. In particular, note that neither is
2919          * crucially ordered with an interrupt. After a hang, the GPU is dead
2920          * and we assume that no more writes can happen (we waited long enough
2921          * for all writes that were in transaction to be flushed) - adding an
2922          * extra delay for a recent interrupt is pointless. Hence, we do
2923          * not need an engine->irq_seqno_barrier() before the seqno reads.
2924          * At all other times, we must assume the GPU is still running, but
2925          * we only care about the snapshot of this moment.
2926          */
2927         spin_lock_irqsave(&engine->timeline.lock, flags);
2928         list_for_each_entry(request, &engine->timeline.requests, link) {
2929                 if (__i915_request_completed(request, request->global_seqno))
2930                         continue;
2931
2932                 active = request;
2933                 break;
2934         }
2935         spin_unlock_irqrestore(&engine->timeline.lock, flags);
2936
2937         return active;
2938 }
2939
2940 /*
2941  * Ensure irq handler finishes, and not run again.
2942  * Also return the active request so that we only search for it once.
2943  */
2944 struct i915_request *
2945 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
2946 {
2947         struct i915_request *request;
2948
2949         /*
2950          * During the reset sequence, we must prevent the engine from
2951          * entering RC6. As the context state is undefined until we restart
2952          * the engine, if it does enter RC6 during the reset, the state
2953          * written to the powercontext is undefined and so we may lose
2954          * GPU state upon resume, i.e. fail to restart after a reset.
2955          */
2956         intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
2957
2958         request = engine->reset.prepare(engine);
2959         if (request && request->fence.error == -EIO)
2960                 request = ERR_PTR(-EIO); /* Previous reset failed! */
2961
2962         return request;
2963 }
2964
2965 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
2966 {
2967         struct intel_engine_cs *engine;
2968         struct i915_request *request;
2969         enum intel_engine_id id;
2970         int err = 0;
2971
2972         for_each_engine(engine, dev_priv, id) {
2973                 request = i915_gem_reset_prepare_engine(engine);
2974                 if (IS_ERR(request)) {
2975                         err = PTR_ERR(request);
2976                         continue;
2977                 }
2978
2979                 engine->hangcheck.active_request = request;
2980         }
2981
2982         i915_gem_revoke_fences(dev_priv);
2983         intel_uc_sanitize(dev_priv);
2984
2985         return err;
2986 }
2987
2988 static void engine_skip_context(struct i915_request *request)
2989 {
2990         struct intel_engine_cs *engine = request->engine;
2991         struct i915_gem_context *hung_ctx = request->gem_context;
2992         struct i915_timeline *timeline = request->timeline;
2993         unsigned long flags;
2994
2995         GEM_BUG_ON(timeline == &engine->timeline);
2996
2997         spin_lock_irqsave(&engine->timeline.lock, flags);
2998         spin_lock(&timeline->lock);
2999
3000         list_for_each_entry_continue(request, &engine->timeline.requests, link)
3001                 if (request->gem_context == hung_ctx)
3002                         i915_request_skip(request, -EIO);
3003
3004         list_for_each_entry(request, &timeline->requests, link)
3005                 i915_request_skip(request, -EIO);
3006
3007         spin_unlock(&timeline->lock);
3008         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3009 }
3010
3011 /* Returns the request if it was guilty of the hang */
3012 static struct i915_request *
3013 i915_gem_reset_request(struct intel_engine_cs *engine,
3014                        struct i915_request *request,
3015                        bool stalled)
3016 {
3017         /* The guilty request will get skipped on a hung engine.
3018          *
3019          * Users of client default contexts do not rely on logical
3020          * state preserved between batches so it is safe to execute
3021          * queued requests following the hang. Non default contexts
3022          * rely on preserved state, so skipping a batch loses the
3023          * evolution of the state and it needs to be considered corrupted.
3024          * Executing more queued batches on top of corrupted state is
3025          * risky. But we take the risk by trying to advance through
3026          * the queued requests in order to make the client behaviour
3027          * more predictable around resets, by not throwing away random
3028          * amount of batches it has prepared for execution. Sophisticated
3029          * clients can use gem_reset_stats_ioctl and dma fence status
3030          * (exported via sync_file info ioctl on explicit fences) to observe
3031          * when it loses the context state and should rebuild accordingly.
3032          *
3033          * The context ban, and ultimately the client ban, mechanism are safety
3034          * valves if client submission ends up resulting in nothing more than
3035          * subsequent hangs.
3036          */
3037
3038         if (i915_request_completed(request)) {
3039                 GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
3040                           engine->name, request->global_seqno,
3041                           request->fence.context, request->fence.seqno,
3042                           intel_engine_get_seqno(engine));
3043                 stalled = false;
3044         }
3045
3046         if (stalled) {
3047                 i915_gem_context_mark_guilty(request->gem_context);
3048                 i915_request_skip(request, -EIO);
3049
3050                 /* If this context is now banned, skip all pending requests. */
3051                 if (i915_gem_context_is_banned(request->gem_context))
3052                         engine_skip_context(request);
3053         } else {
3054                 /*
3055                  * Since this is not the hung engine, it may have advanced
3056                  * since the hang declaration. Double check by refinding
3057                  * the active request at the time of the reset.
3058                  */
3059                 request = i915_gem_find_active_request(engine);
3060                 if (request) {
3061                         unsigned long flags;
3062
3063                         i915_gem_context_mark_innocent(request->gem_context);
3064                         dma_fence_set_error(&request->fence, -EAGAIN);
3065
3066                         /* Rewind the engine to replay the incomplete rq */
3067                         spin_lock_irqsave(&engine->timeline.lock, flags);
3068                         request = list_prev_entry(request, link);
3069                         if (&request->link == &engine->timeline.requests)
3070                                 request = NULL;
3071                         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3072                 }
3073         }
3074
3075         return request;
3076 }
3077
3078 void i915_gem_reset_engine(struct intel_engine_cs *engine,
3079                            struct i915_request *request,
3080                            bool stalled)
3081 {
3082         if (request)
3083                 request = i915_gem_reset_request(engine, request, stalled);
3084
3085         /* Setup the CS to resume from the breadcrumb of the hung request */
3086         engine->reset.reset(engine, request);
3087 }
3088
3089 void i915_gem_reset(struct drm_i915_private *dev_priv,
3090                     unsigned int stalled_mask)
3091 {
3092         struct intel_engine_cs *engine;
3093         enum intel_engine_id id;
3094
3095         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3096
3097         i915_retire_requests(dev_priv);
3098
3099         for_each_engine(engine, dev_priv, id) {
3100                 struct intel_context *ce;
3101
3102                 i915_gem_reset_engine(engine,
3103                                       engine->hangcheck.active_request,
3104                                       stalled_mask & ENGINE_MASK(id));
3105                 ce = fetch_and_zero(&engine->last_retired_context);
3106                 if (ce)
3107                         intel_context_unpin(ce);
3108
3109                 /*
3110                  * Ostensibily, we always want a context loaded for powersaving,
3111                  * so if the engine is idle after the reset, send a request
3112                  * to load our scratch kernel_context.
3113                  *
3114                  * More mysteriously, if we leave the engine idle after a reset,
3115                  * the next userspace batch may hang, with what appears to be
3116                  * an incoherent read by the CS (presumably stale TLB). An
3117                  * empty request appears sufficient to paper over the glitch.
3118                  */
3119                 if (intel_engine_is_idle(engine)) {
3120                         struct i915_request *rq;
3121
3122                         rq = i915_request_alloc(engine,
3123                                                 dev_priv->kernel_context);
3124                         if (!IS_ERR(rq))
3125                                 i915_request_add(rq);
3126                 }
3127         }
3128
3129         i915_gem_restore_fences(dev_priv);
3130 }
3131
3132 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3133 {
3134         engine->reset.finish(engine);
3135
3136         intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3137 }
3138
3139 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3140 {
3141         struct intel_engine_cs *engine;
3142         enum intel_engine_id id;
3143
3144         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3145
3146         for_each_engine(engine, dev_priv, id) {
3147                 engine->hangcheck.active_request = NULL;
3148                 i915_gem_reset_finish_engine(engine);
3149         }
3150 }
3151
3152 static void nop_submit_request(struct i915_request *request)
3153 {
3154         unsigned long flags;
3155
3156         GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
3157                   request->engine->name,
3158                   request->fence.context, request->fence.seqno);
3159         dma_fence_set_error(&request->fence, -EIO);
3160
3161         spin_lock_irqsave(&request->engine->timeline.lock, flags);
3162         __i915_request_submit(request);
3163         intel_engine_write_global_seqno(request->engine, request->global_seqno);
3164         spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3165 }
3166
3167 void i915_gem_set_wedged(struct drm_i915_private *i915)
3168 {
3169         struct intel_engine_cs *engine;
3170         enum intel_engine_id id;
3171
3172         GEM_TRACE("start\n");
3173
3174         if (GEM_SHOW_DEBUG()) {
3175                 struct drm_printer p = drm_debug_printer(__func__);
3176
3177                 for_each_engine(engine, i915, id)
3178                         intel_engine_dump(engine, &p, "%s\n", engine->name);
3179         }
3180
3181         if (test_and_set_bit(I915_WEDGED, &i915->gpu_error.flags))
3182                 goto out;
3183
3184         /*
3185          * First, stop submission to hw, but do not yet complete requests by
3186          * rolling the global seqno forward (since this would complete requests
3187          * for which we haven't set the fence error to EIO yet).
3188          */
3189         for_each_engine(engine, i915, id)
3190                 i915_gem_reset_prepare_engine(engine);
3191
3192         /* Even if the GPU reset fails, it should still stop the engines */
3193         if (INTEL_GEN(i915) >= 5)
3194                 intel_gpu_reset(i915, ALL_ENGINES);
3195
3196         for_each_engine(engine, i915, id) {
3197                 engine->submit_request = nop_submit_request;
3198                 engine->schedule = NULL;
3199         }
3200         i915->caps.scheduler = 0;
3201
3202         /*
3203          * Make sure no request can slip through without getting completed by
3204          * either this call here to intel_engine_write_global_seqno, or the one
3205          * in nop_submit_request.
3206          */
3207         synchronize_rcu();
3208
3209         /* Mark all executing requests as skipped */
3210         for_each_engine(engine, i915, id)
3211                 engine->cancel_requests(engine);
3212
3213         for_each_engine(engine, i915, id) {
3214                 i915_gem_reset_finish_engine(engine);
3215                 intel_engine_wakeup(engine);
3216         }
3217
3218 out:
3219         GEM_TRACE("end\n");
3220
3221         wake_up_all(&i915->gpu_error.reset_queue);
3222 }
3223
3224 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3225 {
3226         struct i915_timeline *tl;
3227
3228         lockdep_assert_held(&i915->drm.struct_mutex);
3229         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3230                 return true;
3231
3232         if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
3233                 return false;
3234
3235         GEM_TRACE("start\n");
3236
3237         /*
3238          * Before unwedging, make sure that all pending operations
3239          * are flushed and errored out - we may have requests waiting upon
3240          * third party fences. We marked all inflight requests as EIO, and
3241          * every execbuf since returned EIO, for consistency we want all
3242          * the currently pending requests to also be marked as EIO, which
3243          * is done inside our nop_submit_request - and so we must wait.
3244          *
3245          * No more can be submitted until we reset the wedged bit.
3246          */
3247         list_for_each_entry(tl, &i915->gt.timelines, link) {
3248                 struct i915_request *rq;
3249
3250                 rq = i915_gem_active_peek(&tl->last_request,
3251                                           &i915->drm.struct_mutex);
3252                 if (!rq)
3253                         continue;
3254
3255                 /*
3256                  * We can't use our normal waiter as we want to
3257                  * avoid recursively trying to handle the current
3258                  * reset. The basic dma_fence_default_wait() installs
3259                  * a callback for dma_fence_signal(), which is
3260                  * triggered by our nop handler (indirectly, the
3261                  * callback enables the signaler thread which is
3262                  * woken by the nop_submit_request() advancing the seqno
3263                  * and when the seqno passes the fence, the signaler
3264                  * then signals the fence waking us up).
3265                  */
3266                 if (dma_fence_default_wait(&rq->fence, true,
3267                                            MAX_SCHEDULE_TIMEOUT) < 0)
3268                         return false;
3269         }
3270         i915_retire_requests(i915);
3271         GEM_BUG_ON(i915->gt.active_requests);
3272
3273         intel_engines_sanitize(i915, false);
3274
3275         /*
3276          * Undo nop_submit_request. We prevent all new i915 requests from
3277          * being queued (by disallowing execbuf whilst wedged) so having
3278          * waited for all active requests above, we know the system is idle
3279          * and do not have to worry about a thread being inside
3280          * engine->submit_request() as we swap over. So unlike installing
3281          * the nop_submit_request on reset, we can do this from normal
3282          * context and do not require stop_machine().
3283          */
3284         intel_engines_reset_default_submission(i915);
3285         i915_gem_contexts_lost(i915);
3286
3287         GEM_TRACE("end\n");
3288
3289         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3290         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3291
3292         return true;
3293 }
3294
3295 static void
3296 i915_gem_retire_work_handler(struct work_struct *work)
3297 {
3298         struct drm_i915_private *dev_priv =
3299                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3300         struct drm_device *dev = &dev_priv->drm;
3301
3302         /* Come back later if the device is busy... */
3303         if (mutex_trylock(&dev->struct_mutex)) {
3304                 i915_retire_requests(dev_priv);
3305                 mutex_unlock(&dev->struct_mutex);
3306         }
3307
3308         /*
3309          * Keep the retire handler running until we are finally idle.
3310          * We do not need to do this test under locking as in the worst-case
3311          * we queue the retire worker once too often.
3312          */
3313         if (READ_ONCE(dev_priv->gt.awake))
3314                 queue_delayed_work(dev_priv->wq,
3315                                    &dev_priv->gt.retire_work,
3316                                    round_jiffies_up_relative(HZ));
3317 }
3318
3319 static void shrink_caches(struct drm_i915_private *i915)
3320 {
3321         /*
3322          * kmem_cache_shrink() discards empty slabs and reorders partially
3323          * filled slabs to prioritise allocating from the mostly full slabs,
3324          * with the aim of reducing fragmentation.
3325          */
3326         kmem_cache_shrink(i915->priorities);
3327         kmem_cache_shrink(i915->dependencies);
3328         kmem_cache_shrink(i915->requests);
3329         kmem_cache_shrink(i915->luts);
3330         kmem_cache_shrink(i915->vmas);
3331         kmem_cache_shrink(i915->objects);
3332 }
3333
3334 struct sleep_rcu_work {
3335         union {
3336                 struct rcu_head rcu;
3337                 struct work_struct work;
3338         };
3339         struct drm_i915_private *i915;
3340         unsigned int epoch;
3341 };
3342
3343 static inline bool
3344 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3345 {
3346         /*
3347          * There is a small chance that the epoch wrapped since we started
3348          * sleeping. If we assume that epoch is at least a u32, then it will
3349          * take at least 2^32 * 100ms for it to wrap, or about 326 years.
3350          */
3351         return epoch == READ_ONCE(i915->gt.epoch);
3352 }
3353
3354 static void __sleep_work(struct work_struct *work)
3355 {
3356         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3357         struct drm_i915_private *i915 = s->i915;
3358         unsigned int epoch = s->epoch;
3359
3360         kfree(s);
3361         if (same_epoch(i915, epoch))
3362                 shrink_caches(i915);
3363 }
3364
3365 static void __sleep_rcu(struct rcu_head *rcu)
3366 {
3367         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3368         struct drm_i915_private *i915 = s->i915;
3369
3370         destroy_rcu_head(&s->rcu);
3371
3372         if (same_epoch(i915, s->epoch)) {
3373                 INIT_WORK(&s->work, __sleep_work);
3374                 queue_work(i915->wq, &s->work);
3375         } else {
3376                 kfree(s);
3377         }
3378 }
3379
3380 static inline bool
3381 new_requests_since_last_retire(const struct drm_i915_private *i915)
3382 {
3383         return (READ_ONCE(i915->gt.active_requests) ||
3384                 work_pending(&i915->gt.idle_work.work));
3385 }
3386
3387 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3388 {
3389         struct intel_engine_cs *engine;
3390         enum intel_engine_id id;
3391
3392         if (i915_terminally_wedged(&i915->gpu_error))
3393                 return;
3394
3395         GEM_BUG_ON(i915->gt.active_requests);
3396         for_each_engine(engine, i915, id) {
3397                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
3398                 GEM_BUG_ON(engine->last_retired_context !=
3399                            to_intel_context(i915->kernel_context, engine));
3400         }
3401 }
3402
3403 static void
3404 i915_gem_idle_work_handler(struct work_struct *work)
3405 {
3406         struct drm_i915_private *dev_priv =
3407                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3408         unsigned int epoch = I915_EPOCH_INVALID;
3409         bool rearm_hangcheck;
3410
3411         if (!READ_ONCE(dev_priv->gt.awake))
3412                 return;
3413
3414         if (READ_ONCE(dev_priv->gt.active_requests))
3415                 return;
3416
3417         /*
3418          * Flush out the last user context, leaving only the pinned
3419          * kernel context resident. When we are idling on the kernel_context,
3420          * no more new requests (with a context switch) are emitted and we
3421          * can finally rest. A consequence is that the idle work handler is
3422          * always called at least twice before idling (and if the system is
3423          * idle that implies a round trip through the retire worker).
3424          */
3425         mutex_lock(&dev_priv->drm.struct_mutex);
3426         i915_gem_switch_to_kernel_context(dev_priv);
3427         mutex_unlock(&dev_priv->drm.struct_mutex);
3428
3429         GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
3430                   READ_ONCE(dev_priv->gt.active_requests));
3431
3432         /*
3433          * Wait for last execlists context complete, but bail out in case a
3434          * new request is submitted. As we don't trust the hardware, we
3435          * continue on if the wait times out. This is necessary to allow
3436          * the machine to suspend even if the hardware dies, and we will
3437          * try to recover in resume (after depriving the hardware of power,
3438          * it may be in a better mmod).
3439          */
3440         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3441                    intel_engines_are_idle(dev_priv),
3442                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3443                    10, 500);
3444
3445         rearm_hangcheck =
3446                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3447
3448         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3449                 /* Currently busy, come back later */
3450                 mod_delayed_work(dev_priv->wq,
3451                                  &dev_priv->gt.idle_work,
3452                                  msecs_to_jiffies(50));
3453                 goto out_rearm;
3454         }
3455
3456         /*
3457          * New request retired after this work handler started, extend active
3458          * period until next instance of the work.
3459          */
3460         if (new_requests_since_last_retire(dev_priv))
3461                 goto out_unlock;
3462
3463         epoch = __i915_gem_park(dev_priv);
3464
3465         assert_kernel_context_is_current(dev_priv);
3466
3467         rearm_hangcheck = false;
3468 out_unlock:
3469         mutex_unlock(&dev_priv->drm.struct_mutex);
3470
3471 out_rearm:
3472         if (rearm_hangcheck) {
3473                 GEM_BUG_ON(!dev_priv->gt.awake);
3474                 i915_queue_hangcheck(dev_priv);
3475         }
3476
3477         /*
3478          * When we are idle, it is an opportune time to reap our caches.
3479          * However, we have many objects that utilise RCU and the ordered
3480          * i915->wq that this work is executing on. To try and flush any
3481          * pending frees now we are idle, we first wait for an RCU grace
3482          * period, and then queue a task (that will run last on the wq) to
3483          * shrink and re-optimize the caches.
3484          */
3485         if (same_epoch(dev_priv, epoch)) {
3486                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3487                 if (s) {
3488                         init_rcu_head(&s->rcu);
3489                         s->i915 = dev_priv;
3490                         s->epoch = epoch;
3491                         call_rcu(&s->rcu, __sleep_rcu);
3492                 }
3493         }
3494 }
3495
3496 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3497 {
3498         struct drm_i915_private *i915 = to_i915(gem->dev);
3499         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3500         struct drm_i915_file_private *fpriv = file->driver_priv;
3501         struct i915_lut_handle *lut, *ln;
3502
3503         mutex_lock(&i915->drm.struct_mutex);
3504
3505         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3506                 struct i915_gem_context *ctx = lut->ctx;
3507                 struct i915_vma *vma;
3508
3509                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3510                 if (ctx->file_priv != fpriv)
3511                         continue;
3512
3513                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3514                 GEM_BUG_ON(vma->obj != obj);
3515
3516                 /* We allow the process to have multiple handles to the same
3517                  * vma, in the same fd namespace, by virtue of flink/open.
3518                  */
3519                 GEM_BUG_ON(!vma->open_count);
3520                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3521                         i915_vma_close(vma);
3522
3523                 list_del(&lut->obj_link);
3524                 list_del(&lut->ctx_link);
3525
3526                 kmem_cache_free(i915->luts, lut);
3527                 __i915_gem_object_release_unless_active(obj);
3528         }
3529
3530         mutex_unlock(&i915->drm.struct_mutex);
3531 }
3532
3533 static unsigned long to_wait_timeout(s64 timeout_ns)
3534 {
3535         if (timeout_ns < 0)
3536                 return MAX_SCHEDULE_TIMEOUT;
3537
3538         if (timeout_ns == 0)
3539                 return 0;
3540
3541         return nsecs_to_jiffies_timeout(timeout_ns);
3542 }
3543
3544 /**
3545  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3546  * @dev: drm device pointer
3547  * @data: ioctl data blob
3548  * @file: drm file pointer
3549  *
3550  * Returns 0 if successful, else an error is returned with the remaining time in
3551  * the timeout parameter.
3552  *  -ETIME: object is still busy after timeout
3553  *  -ERESTARTSYS: signal interrupted the wait
3554  *  -ENONENT: object doesn't exist
3555  * Also possible, but rare:
3556  *  -EAGAIN: incomplete, restart syscall
3557  *  -ENOMEM: damn
3558  *  -ENODEV: Internal IRQ fail
3559  *  -E?: The add request failed
3560  *
3561  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3562  * non-zero timeout parameter the wait ioctl will wait for the given number of
3563  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3564  * without holding struct_mutex the object may become re-busied before this
3565  * function completes. A similar but shorter * race condition exists in the busy
3566  * ioctl
3567  */
3568 int
3569 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3570 {
3571         struct drm_i915_gem_wait *args = data;
3572         struct drm_i915_gem_object *obj;
3573         ktime_t start;
3574         long ret;
3575
3576         if (args->flags != 0)
3577                 return -EINVAL;
3578
3579         obj = i915_gem_object_lookup(file, args->bo_handle);
3580         if (!obj)
3581                 return -ENOENT;
3582
3583         start = ktime_get();
3584
3585         ret = i915_gem_object_wait(obj,
3586                                    I915_WAIT_INTERRUPTIBLE |
3587                                    I915_WAIT_PRIORITY |
3588                                    I915_WAIT_ALL,
3589                                    to_wait_timeout(args->timeout_ns),
3590                                    to_rps_client(file));
3591
3592         if (args->timeout_ns > 0) {
3593                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3594                 if (args->timeout_ns < 0)
3595                         args->timeout_ns = 0;
3596
3597                 /*
3598                  * Apparently ktime isn't accurate enough and occasionally has a
3599                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3600                  * things up to make the test happy. We allow up to 1 jiffy.
3601                  *
3602                  * This is a regression from the timespec->ktime conversion.
3603                  */
3604                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3605                         args->timeout_ns = 0;
3606
3607                 /* Asked to wait beyond the jiffie/scheduler precision? */
3608                 if (ret == -ETIME && args->timeout_ns)
3609                         ret = -EAGAIN;
3610         }
3611
3612         i915_gem_object_put(obj);
3613         return ret;
3614 }
3615
3616 static long wait_for_timeline(struct i915_timeline *tl,
3617                               unsigned int flags, long timeout)
3618 {
3619         struct i915_request *rq;
3620
3621         rq = i915_gem_active_get_unlocked(&tl->last_request);
3622         if (!rq)
3623                 return timeout;
3624
3625         /*
3626          * "Race-to-idle".
3627          *
3628          * Switching to the kernel context is often used a synchronous
3629          * step prior to idling, e.g. in suspend for flushing all
3630          * current operations to memory before sleeping. These we
3631          * want to complete as quickly as possible to avoid prolonged
3632          * stalls, so allow the gpu to boost to maximum clocks.
3633          */
3634         if (flags & I915_WAIT_FOR_IDLE_BOOST)
3635                 gen6_rps_boost(rq, NULL);
3636
3637         timeout = i915_request_wait(rq, flags, timeout);
3638         i915_request_put(rq);
3639
3640         return timeout;
3641 }
3642
3643 static int wait_for_engines(struct drm_i915_private *i915)
3644 {
3645         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3646                 dev_err(i915->drm.dev,
3647                         "Failed to idle engines, declaring wedged!\n");
3648                 GEM_TRACE_DUMP();
3649                 i915_gem_set_wedged(i915);
3650                 return -EIO;
3651         }
3652
3653         return 0;
3654 }
3655
3656 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3657                            unsigned int flags, long timeout)
3658 {
3659         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3660                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3661                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3662
3663         /* If the device is asleep, we have no requests outstanding */
3664         if (!READ_ONCE(i915->gt.awake))
3665                 return 0;
3666
3667         if (flags & I915_WAIT_LOCKED) {
3668                 struct i915_timeline *tl;
3669                 int err;
3670
3671                 lockdep_assert_held(&i915->drm.struct_mutex);
3672
3673                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3674                         timeout = wait_for_timeline(tl, flags, timeout);
3675                         if (timeout < 0)
3676                                 return timeout;
3677                 }
3678                 if (GEM_SHOW_DEBUG() && !timeout) {
3679                         /* Presume that timeout was non-zero to begin with! */
3680                         dev_warn(&i915->drm.pdev->dev,
3681                                  "Missed idle-completion interrupt!\n");
3682                         GEM_TRACE_DUMP();
3683                 }
3684
3685                 err = wait_for_engines(i915);
3686                 if (err)
3687                         return err;
3688
3689                 i915_retire_requests(i915);
3690                 GEM_BUG_ON(i915->gt.active_requests);
3691         } else {
3692                 struct intel_engine_cs *engine;
3693                 enum intel_engine_id id;
3694
3695                 for_each_engine(engine, i915, id) {
3696                         struct i915_timeline *tl = &engine->timeline;
3697
3698                         timeout = wait_for_timeline(tl, flags, timeout);
3699                         if (timeout < 0)
3700                                 return timeout;
3701                 }
3702         }
3703
3704         return 0;
3705 }
3706
3707 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3708 {
3709         /*
3710          * We manually flush the CPU domain so that we can override and
3711          * force the flush for the display, and perform it asyncrhonously.
3712          */
3713         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3714         if (obj->cache_dirty)
3715                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3716         obj->write_domain = 0;
3717 }
3718
3719 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3720 {
3721         if (!READ_ONCE(obj->pin_global))
3722                 return;
3723
3724         mutex_lock(&obj->base.dev->struct_mutex);
3725         __i915_gem_object_flush_for_display(obj);
3726         mutex_unlock(&obj->base.dev->struct_mutex);
3727 }
3728
3729 /**
3730  * Moves a single object to the WC read, and possibly write domain.
3731  * @obj: object to act on
3732  * @write: ask for write access or read only
3733  *
3734  * This function returns when the move is complete, including waiting on
3735  * flushes to occur.
3736  */
3737 int
3738 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3739 {
3740         int ret;
3741
3742         lockdep_assert_held(&obj->base.dev->struct_mutex);
3743
3744         ret = i915_gem_object_wait(obj,
3745                                    I915_WAIT_INTERRUPTIBLE |
3746                                    I915_WAIT_LOCKED |
3747                                    (write ? I915_WAIT_ALL : 0),
3748                                    MAX_SCHEDULE_TIMEOUT,
3749                                    NULL);
3750         if (ret)
3751                 return ret;
3752
3753         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3754                 return 0;
3755
3756         /* Flush and acquire obj->pages so that we are coherent through
3757          * direct access in memory with previous cached writes through
3758          * shmemfs and that our cache domain tracking remains valid.
3759          * For example, if the obj->filp was moved to swap without us
3760          * being notified and releasing the pages, we would mistakenly
3761          * continue to assume that the obj remained out of the CPU cached
3762          * domain.
3763          */
3764         ret = i915_gem_object_pin_pages(obj);
3765         if (ret)
3766                 return ret;
3767
3768         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3769
3770         /* Serialise direct access to this object with the barriers for
3771          * coherent writes from the GPU, by effectively invalidating the
3772          * WC domain upon first access.
3773          */
3774         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3775                 mb();
3776
3777         /* It should now be out of any other write domains, and we can update
3778          * the domain values for our changes.
3779          */
3780         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3781         obj->read_domains |= I915_GEM_DOMAIN_WC;
3782         if (write) {
3783                 obj->read_domains = I915_GEM_DOMAIN_WC;
3784                 obj->write_domain = I915_GEM_DOMAIN_WC;
3785                 obj->mm.dirty = true;
3786         }
3787
3788         i915_gem_object_unpin_pages(obj);
3789         return 0;
3790 }
3791
3792 /**
3793  * Moves a single object to the GTT read, and possibly write domain.
3794  * @obj: object to act on
3795  * @write: ask for write access or read only
3796  *
3797  * This function returns when the move is complete, including waiting on
3798  * flushes to occur.
3799  */
3800 int
3801 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3802 {
3803         int ret;
3804
3805         lockdep_assert_held(&obj->base.dev->struct_mutex);
3806
3807         ret = i915_gem_object_wait(obj,
3808                                    I915_WAIT_INTERRUPTIBLE |
3809                                    I915_WAIT_LOCKED |
3810                                    (write ? I915_WAIT_ALL : 0),
3811                                    MAX_SCHEDULE_TIMEOUT,
3812                                    NULL);
3813         if (ret)
3814                 return ret;
3815
3816         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3817                 return 0;
3818
3819         /* Flush and acquire obj->pages so that we are coherent through
3820          * direct access in memory with previous cached writes through
3821          * shmemfs and that our cache domain tracking remains valid.
3822          * For example, if the obj->filp was moved to swap without us
3823          * being notified and releasing the pages, we would mistakenly
3824          * continue to assume that the obj remained out of the CPU cached
3825          * domain.
3826          */
3827         ret = i915_gem_object_pin_pages(obj);
3828         if (ret)
3829                 return ret;
3830
3831         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3832
3833         /* Serialise direct access to this object with the barriers for
3834          * coherent writes from the GPU, by effectively invalidating the
3835          * GTT domain upon first access.
3836          */
3837         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3838                 mb();
3839
3840         /* It should now be out of any other write domains, and we can update
3841          * the domain values for our changes.
3842          */
3843         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3844         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3845         if (write) {
3846                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3847                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3848                 obj->mm.dirty = true;
3849         }
3850
3851         i915_gem_object_unpin_pages(obj);
3852         return 0;
3853 }
3854
3855 /**
3856  * Changes the cache-level of an object across all VMA.
3857  * @obj: object to act on
3858  * @cache_level: new cache level to set for the object
3859  *
3860  * After this function returns, the object will be in the new cache-level
3861  * across all GTT and the contents of the backing storage will be coherent,
3862  * with respect to the new cache-level. In order to keep the backing storage
3863  * coherent for all users, we only allow a single cache level to be set
3864  * globally on the object and prevent it from being changed whilst the
3865  * hardware is reading from the object. That is if the object is currently
3866  * on the scanout it will be set to uncached (or equivalent display
3867  * cache coherency) and all non-MOCS GPU access will also be uncached so
3868  * that all direct access to the scanout remains coherent.
3869  */
3870 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3871                                     enum i915_cache_level cache_level)
3872 {
3873         struct i915_vma *vma;
3874         int ret;
3875
3876         lockdep_assert_held(&obj->base.dev->struct_mutex);
3877
3878         if (obj->cache_level == cache_level)
3879                 return 0;
3880
3881         /* Inspect the list of currently bound VMA and unbind any that would
3882          * be invalid given the new cache-level. This is principally to
3883          * catch the issue of the CS prefetch crossing page boundaries and
3884          * reading an invalid PTE on older architectures.
3885          */
3886 restart:
3887         list_for_each_entry(vma, &obj->vma_list, obj_link) {
3888                 if (!drm_mm_node_allocated(&vma->node))
3889                         continue;
3890
3891                 if (i915_vma_is_pinned(vma)) {
3892                         DRM_DEBUG("can not change the cache level of pinned objects\n");
3893                         return -EBUSY;
3894                 }
3895
3896                 if (!i915_vma_is_closed(vma) &&
3897                     i915_gem_valid_gtt_space(vma, cache_level))
3898                         continue;
3899
3900                 ret = i915_vma_unbind(vma);
3901                 if (ret)
3902                         return ret;
3903
3904                 /* As unbinding may affect other elements in the
3905                  * obj->vma_list (due to side-effects from retiring
3906                  * an active vma), play safe and restart the iterator.
3907                  */
3908                 goto restart;
3909         }
3910
3911         /* We can reuse the existing drm_mm nodes but need to change the
3912          * cache-level on the PTE. We could simply unbind them all and
3913          * rebind with the correct cache-level on next use. However since
3914          * we already have a valid slot, dma mapping, pages etc, we may as
3915          * rewrite the PTE in the belief that doing so tramples upon less
3916          * state and so involves less work.
3917          */
3918         if (obj->bind_count) {
3919                 /* Before we change the PTE, the GPU must not be accessing it.
3920                  * If we wait upon the object, we know that all the bound
3921                  * VMA are no longer active.
3922                  */
3923                 ret = i915_gem_object_wait(obj,
3924                                            I915_WAIT_INTERRUPTIBLE |
3925                                            I915_WAIT_LOCKED |
3926                                            I915_WAIT_ALL,
3927                                            MAX_SCHEDULE_TIMEOUT,
3928                                            NULL);
3929                 if (ret)
3930                         return ret;
3931
3932                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
3933                     cache_level != I915_CACHE_NONE) {
3934                         /* Access to snoopable pages through the GTT is
3935                          * incoherent and on some machines causes a hard
3936                          * lockup. Relinquish the CPU mmaping to force
3937                          * userspace to refault in the pages and we can
3938                          * then double check if the GTT mapping is still
3939                          * valid for that pointer access.
3940                          */
3941                         i915_gem_release_mmap(obj);
3942
3943                         /* As we no longer need a fence for GTT access,
3944                          * we can relinquish it now (and so prevent having
3945                          * to steal a fence from someone else on the next
3946                          * fence request). Note GPU activity would have
3947                          * dropped the fence as all snoopable access is
3948                          * supposed to be linear.
3949                          */
3950                         for_each_ggtt_vma(vma, obj) {
3951                                 ret = i915_vma_put_fence(vma);
3952                                 if (ret)
3953                                         return ret;
3954                         }
3955                 } else {
3956                         /* We either have incoherent backing store and
3957                          * so no GTT access or the architecture is fully
3958                          * coherent. In such cases, existing GTT mmaps
3959                          * ignore the cache bit in the PTE and we can
3960                          * rewrite it without confusing the GPU or having
3961                          * to force userspace to fault back in its mmaps.
3962                          */
3963                 }
3964
3965                 list_for_each_entry(vma, &obj->vma_list, obj_link) {
3966                         if (!drm_mm_node_allocated(&vma->node))
3967                                 continue;
3968
3969                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
3970                         if (ret)
3971                                 return ret;
3972                 }
3973         }
3974
3975         list_for_each_entry(vma, &obj->vma_list, obj_link)
3976                 vma->node.color = cache_level;
3977         i915_gem_object_set_cache_coherency(obj, cache_level);
3978         obj->cache_dirty = true; /* Always invalidate stale cachelines */
3979
3980         return 0;
3981 }
3982
3983 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
3984                                struct drm_file *file)
3985 {
3986         struct drm_i915_gem_caching *args = data;
3987         struct drm_i915_gem_object *obj;
3988         int err = 0;
3989
3990         rcu_read_lock();
3991         obj = i915_gem_object_lookup_rcu(file, args->handle);
3992         if (!obj) {
3993                 err = -ENOENT;
3994                 goto out;
3995         }
3996
3997         switch (obj->cache_level) {
3998         case I915_CACHE_LLC:
3999         case I915_CACHE_L3_LLC:
4000                 args->caching = I915_CACHING_CACHED;
4001                 break;
4002
4003         case I915_CACHE_WT:
4004                 args->caching = I915_CACHING_DISPLAY;
4005                 break;
4006
4007         default:
4008                 args->caching = I915_CACHING_NONE;
4009                 break;
4010         }
4011 out:
4012         rcu_read_unlock();
4013         return err;
4014 }
4015
4016 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4017                                struct drm_file *file)
4018 {
4019         struct drm_i915_private *i915 = to_i915(dev);
4020         struct drm_i915_gem_caching *args = data;
4021         struct drm_i915_gem_object *obj;
4022         enum i915_cache_level level;
4023         int ret = 0;
4024
4025         switch (args->caching) {
4026         case I915_CACHING_NONE:
4027                 level = I915_CACHE_NONE;
4028                 break;
4029         case I915_CACHING_CACHED:
4030                 /*
4031                  * Due to a HW issue on BXT A stepping, GPU stores via a
4032                  * snooped mapping may leave stale data in a corresponding CPU
4033                  * cacheline, whereas normally such cachelines would get
4034                  * invalidated.
4035                  */
4036                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4037                         return -ENODEV;
4038
4039                 level = I915_CACHE_LLC;
4040                 break;
4041         case I915_CACHING_DISPLAY:
4042                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4043                 break;
4044         default:
4045                 return -EINVAL;
4046         }
4047
4048         obj = i915_gem_object_lookup(file, args->handle);
4049         if (!obj)
4050                 return -ENOENT;
4051
4052         /*
4053          * The caching mode of proxy object is handled by its generator, and
4054          * not allowed to be changed by userspace.
4055          */
4056         if (i915_gem_object_is_proxy(obj)) {
4057                 ret = -ENXIO;
4058                 goto out;
4059         }
4060
4061         if (obj->cache_level == level)
4062                 goto out;
4063
4064         ret = i915_gem_object_wait(obj,
4065                                    I915_WAIT_INTERRUPTIBLE,
4066                                    MAX_SCHEDULE_TIMEOUT,
4067                                    to_rps_client(file));
4068         if (ret)
4069                 goto out;
4070
4071         ret = i915_mutex_lock_interruptible(dev);
4072         if (ret)
4073                 goto out;
4074
4075         ret = i915_gem_object_set_cache_level(obj, level);
4076         mutex_unlock(&dev->struct_mutex);
4077
4078 out:
4079         i915_gem_object_put(obj);
4080         return ret;
4081 }
4082
4083 /*
4084  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4085  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4086  * (for pageflips). We only flush the caches while preparing the buffer for
4087  * display, the callers are responsible for frontbuffer flush.
4088  */
4089 struct i915_vma *
4090 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4091                                      u32 alignment,
4092                                      const struct i915_ggtt_view *view,
4093                                      unsigned int flags)
4094 {
4095         struct i915_vma *vma;
4096         int ret;
4097
4098         lockdep_assert_held(&obj->base.dev->struct_mutex);
4099
4100         /* Mark the global pin early so that we account for the
4101          * display coherency whilst setting up the cache domains.
4102          */
4103         obj->pin_global++;
4104
4105         /* The display engine is not coherent with the LLC cache on gen6.  As
4106          * a result, we make sure that the pinning that is about to occur is
4107          * done with uncached PTEs. This is lowest common denominator for all
4108          * chipsets.
4109          *
4110          * However for gen6+, we could do better by using the GFDT bit instead
4111          * of uncaching, which would allow us to flush all the LLC-cached data
4112          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4113          */
4114         ret = i915_gem_object_set_cache_level(obj,
4115                                               HAS_WT(to_i915(obj->base.dev)) ?
4116                                               I915_CACHE_WT : I915_CACHE_NONE);
4117         if (ret) {
4118                 vma = ERR_PTR(ret);
4119                 goto err_unpin_global;
4120         }
4121
4122         /* As the user may map the buffer once pinned in the display plane
4123          * (e.g. libkms for the bootup splash), we have to ensure that we
4124          * always use map_and_fenceable for all scanout buffers. However,
4125          * it may simply be too big to fit into mappable, in which case
4126          * put it anyway and hope that userspace can cope (but always first
4127          * try to preserve the existing ABI).
4128          */
4129         vma = ERR_PTR(-ENOSPC);
4130         if ((flags & PIN_MAPPABLE) == 0 &&
4131             (!view || view->type == I915_GGTT_VIEW_NORMAL))
4132                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4133                                                flags |
4134                                                PIN_MAPPABLE |
4135                                                PIN_NONBLOCK);
4136         if (IS_ERR(vma))
4137                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4138         if (IS_ERR(vma))
4139                 goto err_unpin_global;
4140
4141         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4142
4143         __i915_gem_object_flush_for_display(obj);
4144
4145         /* It should now be out of any other write domains, and we can update
4146          * the domain values for our changes.
4147          */
4148         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4149
4150         return vma;
4151
4152 err_unpin_global:
4153         obj->pin_global--;
4154         return vma;
4155 }
4156
4157 void
4158 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4159 {
4160         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4161
4162         if (WARN_ON(vma->obj->pin_global == 0))
4163                 return;
4164
4165         if (--vma->obj->pin_global == 0)
4166                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4167
4168         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4169         i915_gem_object_bump_inactive_ggtt(vma->obj);
4170
4171         i915_vma_unpin(vma);
4172 }
4173
4174 /**
4175  * Moves a single object to the CPU read, and possibly write domain.
4176  * @obj: object to act on
4177  * @write: requesting write or read-only access
4178  *
4179  * This function returns when the move is complete, including waiting on
4180  * flushes to occur.
4181  */
4182 int
4183 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4184 {
4185         int ret;
4186
4187         lockdep_assert_held(&obj->base.dev->struct_mutex);
4188
4189         ret = i915_gem_object_wait(obj,
4190                                    I915_WAIT_INTERRUPTIBLE |
4191                                    I915_WAIT_LOCKED |
4192                                    (write ? I915_WAIT_ALL : 0),
4193                                    MAX_SCHEDULE_TIMEOUT,
4194                                    NULL);
4195         if (ret)
4196                 return ret;
4197
4198         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4199
4200         /* Flush the CPU cache if it's still invalid. */
4201         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4202                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4203                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4204         }
4205
4206         /* It should now be out of any other write domains, and we can update
4207          * the domain values for our changes.
4208          */
4209         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4210
4211         /* If we're writing through the CPU, then the GPU read domains will
4212          * need to be invalidated at next use.
4213          */
4214         if (write)
4215                 __start_cpu_write(obj);
4216
4217         return 0;
4218 }
4219
4220 /* Throttle our rendering by waiting until the ring has completed our requests
4221  * emitted over 20 msec ago.
4222  *
4223  * Note that if we were to use the current jiffies each time around the loop,
4224  * we wouldn't escape the function with any frames outstanding if the time to
4225  * render a frame was over 20ms.
4226  *
4227  * This should get us reasonable parallelism between CPU and GPU but also
4228  * relatively low latency when blocking on a particular request to finish.
4229  */
4230 static int
4231 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4232 {
4233         struct drm_i915_private *dev_priv = to_i915(dev);
4234         struct drm_i915_file_private *file_priv = file->driver_priv;
4235         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4236         struct i915_request *request, *target = NULL;
4237         long ret;
4238
4239         /* ABI: return -EIO if already wedged */
4240         if (i915_terminally_wedged(&dev_priv->gpu_error))
4241                 return -EIO;
4242
4243         spin_lock(&file_priv->mm.lock);
4244         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4245                 if (time_after_eq(request->emitted_jiffies, recent_enough))
4246                         break;
4247
4248                 if (target) {
4249                         list_del(&target->client_link);
4250                         target->file_priv = NULL;
4251                 }
4252
4253                 target = request;
4254         }
4255         if (target)
4256                 i915_request_get(target);
4257         spin_unlock(&file_priv->mm.lock);
4258
4259         if (target == NULL)
4260                 return 0;
4261
4262         ret = i915_request_wait(target,
4263                                 I915_WAIT_INTERRUPTIBLE,
4264                                 MAX_SCHEDULE_TIMEOUT);
4265         i915_request_put(target);
4266
4267         return ret < 0 ? ret : 0;
4268 }
4269
4270 struct i915_vma *
4271 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4272                          const struct i915_ggtt_view *view,
4273                          u64 size,
4274                          u64 alignment,
4275                          u64 flags)
4276 {
4277         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4278         struct i915_address_space *vm = &dev_priv->ggtt.vm;
4279         struct i915_vma *vma;
4280         int ret;
4281
4282         lockdep_assert_held(&obj->base.dev->struct_mutex);
4283
4284         if (flags & PIN_MAPPABLE &&
4285             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4286                 /* If the required space is larger than the available
4287                  * aperture, we will not able to find a slot for the
4288                  * object and unbinding the object now will be in
4289                  * vain. Worse, doing so may cause us to ping-pong
4290                  * the object in and out of the Global GTT and
4291                  * waste a lot of cycles under the mutex.
4292                  */
4293                 if (obj->base.size > dev_priv->ggtt.mappable_end)
4294                         return ERR_PTR(-E2BIG);
4295
4296                 /* If NONBLOCK is set the caller is optimistically
4297                  * trying to cache the full object within the mappable
4298                  * aperture, and *must* have a fallback in place for
4299                  * situations where we cannot bind the object. We
4300                  * can be a little more lax here and use the fallback
4301                  * more often to avoid costly migrations of ourselves
4302                  * and other objects within the aperture.
4303                  *
4304                  * Half-the-aperture is used as a simple heuristic.
4305                  * More interesting would to do search for a free
4306                  * block prior to making the commitment to unbind.
4307                  * That caters for the self-harm case, and with a
4308                  * little more heuristics (e.g. NOFAULT, NOEVICT)
4309                  * we could try to minimise harm to others.
4310                  */
4311                 if (flags & PIN_NONBLOCK &&
4312                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
4313                         return ERR_PTR(-ENOSPC);
4314         }
4315
4316         vma = i915_vma_instance(obj, vm, view);
4317         if (unlikely(IS_ERR(vma)))
4318                 return vma;
4319
4320         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4321                 if (flags & PIN_NONBLOCK) {
4322                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4323                                 return ERR_PTR(-ENOSPC);
4324
4325                         if (flags & PIN_MAPPABLE &&
4326                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4327                                 return ERR_PTR(-ENOSPC);
4328                 }
4329
4330                 WARN(i915_vma_is_pinned(vma),
4331                      "bo is already pinned in ggtt with incorrect alignment:"
4332                      " offset=%08x, req.alignment=%llx,"
4333                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4334                      i915_ggtt_offset(vma), alignment,
4335                      !!(flags & PIN_MAPPABLE),
4336                      i915_vma_is_map_and_fenceable(vma));
4337                 ret = i915_vma_unbind(vma);
4338                 if (ret)
4339                         return ERR_PTR(ret);
4340         }
4341
4342         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
4343         if (ret)
4344                 return ERR_PTR(ret);
4345
4346         return vma;
4347 }
4348
4349 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4350 {
4351         /* Note that we could alias engines in the execbuf API, but
4352          * that would be very unwise as it prevents userspace from
4353          * fine control over engine selection. Ahem.
4354          *
4355          * This should be something like EXEC_MAX_ENGINE instead of
4356          * I915_NUM_ENGINES.
4357          */
4358         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4359         return 0x10000 << id;
4360 }
4361
4362 static __always_inline unsigned int __busy_write_id(unsigned int id)
4363 {
4364         /* The uABI guarantees an active writer is also amongst the read
4365          * engines. This would be true if we accessed the activity tracking
4366          * under the lock, but as we perform the lookup of the object and
4367          * its activity locklessly we can not guarantee that the last_write
4368          * being active implies that we have set the same engine flag from
4369          * last_read - hence we always set both read and write busy for
4370          * last_write.
4371          */
4372         return id | __busy_read_flag(id);
4373 }
4374
4375 static __always_inline unsigned int
4376 __busy_set_if_active(const struct dma_fence *fence,
4377                      unsigned int (*flag)(unsigned int id))
4378 {
4379         struct i915_request *rq;
4380
4381         /* We have to check the current hw status of the fence as the uABI
4382          * guarantees forward progress. We could rely on the idle worker
4383          * to eventually flush us, but to minimise latency just ask the
4384          * hardware.
4385          *
4386          * Note we only report on the status of native fences.
4387          */
4388         if (!dma_fence_is_i915(fence))
4389                 return 0;
4390
4391         /* opencode to_request() in order to avoid const warnings */
4392         rq = container_of(fence, struct i915_request, fence);
4393         if (i915_request_completed(rq))
4394                 return 0;
4395
4396         return flag(rq->engine->uabi_id);
4397 }
4398
4399 static __always_inline unsigned int
4400 busy_check_reader(const struct dma_fence *fence)
4401 {
4402         return __busy_set_if_active(fence, __busy_read_flag);
4403 }
4404
4405 static __always_inline unsigned int
4406 busy_check_writer(const struct dma_fence *fence)
4407 {
4408         if (!fence)
4409                 return 0;
4410
4411         return __busy_set_if_active(fence, __busy_write_id);
4412 }
4413
4414 int
4415 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4416                     struct drm_file *file)
4417 {
4418         struct drm_i915_gem_busy *args = data;
4419         struct drm_i915_gem_object *obj;
4420         struct reservation_object_list *list;
4421         unsigned int seq;
4422         int err;
4423
4424         err = -ENOENT;
4425         rcu_read_lock();
4426         obj = i915_gem_object_lookup_rcu(file, args->handle);
4427         if (!obj)
4428                 goto out;
4429
4430         /* A discrepancy here is that we do not report the status of
4431          * non-i915 fences, i.e. even though we may report the object as idle,
4432          * a call to set-domain may still stall waiting for foreign rendering.
4433          * This also means that wait-ioctl may report an object as busy,
4434          * where busy-ioctl considers it idle.
4435          *
4436          * We trade the ability to warn of foreign fences to report on which
4437          * i915 engines are active for the object.
4438          *
4439          * Alternatively, we can trade that extra information on read/write
4440          * activity with
4441          *      args->busy =
4442          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4443          * to report the overall busyness. This is what the wait-ioctl does.
4444          *
4445          */
4446 retry:
4447         seq = raw_read_seqcount(&obj->resv->seq);
4448
4449         /* Translate the exclusive fence to the READ *and* WRITE engine */
4450         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4451
4452         /* Translate shared fences to READ set of engines */
4453         list = rcu_dereference(obj->resv->fence);
4454         if (list) {
4455                 unsigned int shared_count = list->shared_count, i;
4456
4457                 for (i = 0; i < shared_count; ++i) {
4458                         struct dma_fence *fence =
4459                                 rcu_dereference(list->shared[i]);
4460
4461                         args->busy |= busy_check_reader(fence);
4462                 }
4463         }
4464
4465         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4466                 goto retry;
4467
4468         err = 0;
4469 out:
4470         rcu_read_unlock();
4471         return err;
4472 }
4473
4474 int
4475 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4476                         struct drm_file *file_priv)
4477 {
4478         return i915_gem_ring_throttle(dev, file_priv);
4479 }
4480
4481 int
4482 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4483                        struct drm_file *file_priv)
4484 {
4485         struct drm_i915_private *dev_priv = to_i915(dev);
4486         struct drm_i915_gem_madvise *args = data;
4487         struct drm_i915_gem_object *obj;
4488         int err;
4489
4490         switch (args->madv) {
4491         case I915_MADV_DONTNEED:
4492         case I915_MADV_WILLNEED:
4493             break;
4494         default:
4495             return -EINVAL;
4496         }
4497
4498         obj = i915_gem_object_lookup(file_priv, args->handle);
4499         if (!obj)
4500                 return -ENOENT;
4501
4502         err = mutex_lock_interruptible(&obj->mm.lock);
4503         if (err)
4504                 goto out;
4505
4506         if (i915_gem_object_has_pages(obj) &&
4507             i915_gem_object_is_tiled(obj) &&
4508             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4509                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4510                         GEM_BUG_ON(!obj->mm.quirked);
4511                         __i915_gem_object_unpin_pages(obj);
4512                         obj->mm.quirked = false;
4513                 }
4514                 if (args->madv == I915_MADV_WILLNEED) {
4515                         GEM_BUG_ON(obj->mm.quirked);
4516                         __i915_gem_object_pin_pages(obj);
4517                         obj->mm.quirked = true;
4518                 }
4519         }
4520
4521         if (obj->mm.madv != __I915_MADV_PURGED)
4522                 obj->mm.madv = args->madv;
4523
4524         /* if the object is no longer attached, discard its backing storage */
4525         if (obj->mm.madv == I915_MADV_DONTNEED &&
4526             !i915_gem_object_has_pages(obj))
4527                 i915_gem_object_truncate(obj);
4528
4529         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4530         mutex_unlock(&obj->mm.lock);
4531
4532 out:
4533         i915_gem_object_put(obj);
4534         return err;
4535 }
4536
4537 static void
4538 frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4539 {
4540         struct drm_i915_gem_object *obj =
4541                 container_of(active, typeof(*obj), frontbuffer_write);
4542
4543         intel_fb_obj_flush(obj, ORIGIN_CS);
4544 }
4545
4546 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4547                           const struct drm_i915_gem_object_ops *ops)
4548 {
4549         mutex_init(&obj->mm.lock);
4550
4551         INIT_LIST_HEAD(&obj->vma_list);
4552         INIT_LIST_HEAD(&obj->lut_list);
4553         INIT_LIST_HEAD(&obj->batch_pool_link);
4554
4555         init_rcu_head(&obj->rcu);
4556
4557         obj->ops = ops;
4558
4559         reservation_object_init(&obj->__builtin_resv);
4560         obj->resv = &obj->__builtin_resv;
4561
4562         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4563         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4564
4565         obj->mm.madv = I915_MADV_WILLNEED;
4566         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4567         mutex_init(&obj->mm.get_page.lock);
4568
4569         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4570 }
4571
4572 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4573         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4574                  I915_GEM_OBJECT_IS_SHRINKABLE,
4575
4576         .get_pages = i915_gem_object_get_pages_gtt,
4577         .put_pages = i915_gem_object_put_pages_gtt,
4578
4579         .pwrite = i915_gem_object_pwrite_gtt,
4580 };
4581
4582 static int i915_gem_object_create_shmem(struct drm_device *dev,
4583                                         struct drm_gem_object *obj,
4584                                         size_t size)
4585 {
4586         struct drm_i915_private *i915 = to_i915(dev);
4587         unsigned long flags = VM_NORESERVE;
4588         struct file *filp;
4589
4590         drm_gem_private_object_init(dev, obj, size);
4591
4592         if (i915->mm.gemfs)
4593                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4594                                                  flags);
4595         else
4596                 filp = shmem_file_setup("i915", size, flags);
4597
4598         if (IS_ERR(filp))
4599                 return PTR_ERR(filp);
4600
4601         obj->filp = filp;
4602
4603         return 0;
4604 }
4605
4606 struct drm_i915_gem_object *
4607 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4608 {
4609         struct drm_i915_gem_object *obj;
4610         struct address_space *mapping;
4611         unsigned int cache_level;
4612         gfp_t mask;
4613         int ret;
4614
4615         /* There is a prevalence of the assumption that we fit the object's
4616          * page count inside a 32bit _signed_ variable. Let's document this and
4617          * catch if we ever need to fix it. In the meantime, if you do spot
4618          * such a local variable, please consider fixing!
4619          */
4620         if (size >> PAGE_SHIFT > INT_MAX)
4621                 return ERR_PTR(-E2BIG);
4622
4623         if (overflows_type(size, obj->base.size))
4624                 return ERR_PTR(-E2BIG);
4625
4626         obj = i915_gem_object_alloc(dev_priv);
4627         if (obj == NULL)
4628                 return ERR_PTR(-ENOMEM);
4629
4630         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4631         if (ret)
4632                 goto fail;
4633
4634         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4635         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4636                 /* 965gm cannot relocate objects above 4GiB. */
4637                 mask &= ~__GFP_HIGHMEM;
4638                 mask |= __GFP_DMA32;
4639         }
4640
4641         mapping = obj->base.filp->f_mapping;
4642         mapping_set_gfp_mask(mapping, mask);
4643         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4644
4645         i915_gem_object_init(obj, &i915_gem_object_ops);
4646
4647         obj->write_domain = I915_GEM_DOMAIN_CPU;
4648         obj->read_domains = I915_GEM_DOMAIN_CPU;
4649
4650         if (HAS_LLC(dev_priv))
4651                 /* On some devices, we can have the GPU use the LLC (the CPU
4652                  * cache) for about a 10% performance improvement
4653                  * compared to uncached.  Graphics requests other than
4654                  * display scanout are coherent with the CPU in
4655                  * accessing this cache.  This means in this mode we
4656                  * don't need to clflush on the CPU side, and on the
4657                  * GPU side we only need to flush internal caches to
4658                  * get data visible to the CPU.
4659                  *
4660                  * However, we maintain the display planes as UC, and so
4661                  * need to rebind when first used as such.
4662                  */
4663                 cache_level = I915_CACHE_LLC;
4664         else
4665                 cache_level = I915_CACHE_NONE;
4666
4667         i915_gem_object_set_cache_coherency(obj, cache_level);
4668
4669         trace_i915_gem_object_create(obj);
4670
4671         return obj;
4672
4673 fail:
4674         i915_gem_object_free(obj);
4675         return ERR_PTR(ret);
4676 }
4677
4678 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4679 {
4680         /* If we are the last user of the backing storage (be it shmemfs
4681          * pages or stolen etc), we know that the pages are going to be
4682          * immediately released. In this case, we can then skip copying
4683          * back the contents from the GPU.
4684          */
4685
4686         if (obj->mm.madv != I915_MADV_WILLNEED)
4687                 return false;
4688
4689         if (obj->base.filp == NULL)
4690                 return true;
4691
4692         /* At first glance, this looks racy, but then again so would be
4693          * userspace racing mmap against close. However, the first external
4694          * reference to the filp can only be obtained through the
4695          * i915_gem_mmap_ioctl() which safeguards us against the user
4696          * acquiring such a reference whilst we are in the middle of
4697          * freeing the object.
4698          */
4699         return atomic_long_read(&obj->base.filp->f_count) == 1;
4700 }
4701
4702 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4703                                     struct llist_node *freed)
4704 {
4705         struct drm_i915_gem_object *obj, *on;
4706
4707         intel_runtime_pm_get(i915);
4708         llist_for_each_entry_safe(obj, on, freed, freed) {
4709                 struct i915_vma *vma, *vn;
4710
4711                 trace_i915_gem_object_destroy(obj);
4712
4713                 mutex_lock(&i915->drm.struct_mutex);
4714
4715                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4716                 list_for_each_entry_safe(vma, vn,
4717                                          &obj->vma_list, obj_link) {
4718                         GEM_BUG_ON(i915_vma_is_active(vma));
4719                         vma->flags &= ~I915_VMA_PIN_MASK;
4720                         i915_vma_destroy(vma);
4721                 }
4722                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4723                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4724
4725                 /* This serializes freeing with the shrinker. Since the free
4726                  * is delayed, first by RCU then by the workqueue, we want the
4727                  * shrinker to be able to free pages of unreferenced objects,
4728                  * or else we may oom whilst there are plenty of deferred
4729                  * freed objects.
4730                  */
4731                 if (i915_gem_object_has_pages(obj)) {
4732                         spin_lock(&i915->mm.obj_lock);
4733                         list_del_init(&obj->mm.link);
4734                         spin_unlock(&i915->mm.obj_lock);
4735                 }
4736
4737                 mutex_unlock(&i915->drm.struct_mutex);
4738
4739                 GEM_BUG_ON(obj->bind_count);
4740                 GEM_BUG_ON(obj->userfault_count);
4741                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4742                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4743
4744                 if (obj->ops->release)
4745                         obj->ops->release(obj);
4746
4747                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4748                         atomic_set(&obj->mm.pages_pin_count, 0);
4749                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4750                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4751
4752                 if (obj->base.import_attach)
4753                         drm_prime_gem_destroy(&obj->base, NULL);
4754
4755                 reservation_object_fini(&obj->__builtin_resv);
4756                 drm_gem_object_release(&obj->base);
4757                 i915_gem_info_remove_obj(i915, obj->base.size);
4758
4759                 kfree(obj->bit_17);
4760                 i915_gem_object_free(obj);
4761
4762                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4763                 atomic_dec(&i915->mm.free_count);
4764
4765                 if (on)
4766                         cond_resched();
4767         }
4768         intel_runtime_pm_put(i915);
4769 }
4770
4771 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4772 {
4773         struct llist_node *freed;
4774
4775         /* Free the oldest, most stale object to keep the free_list short */
4776         freed = NULL;
4777         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4778                 /* Only one consumer of llist_del_first() allowed */
4779                 spin_lock(&i915->mm.free_lock);
4780                 freed = llist_del_first(&i915->mm.free_list);
4781                 spin_unlock(&i915->mm.free_lock);
4782         }
4783         if (unlikely(freed)) {
4784                 freed->next = NULL;
4785                 __i915_gem_free_objects(i915, freed);
4786         }
4787 }
4788
4789 static void __i915_gem_free_work(struct work_struct *work)
4790 {
4791         struct drm_i915_private *i915 =
4792                 container_of(work, struct drm_i915_private, mm.free_work);
4793         struct llist_node *freed;
4794
4795         /*
4796          * All file-owned VMA should have been released by this point through
4797          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4798          * However, the object may also be bound into the global GTT (e.g.
4799          * older GPUs without per-process support, or for direct access through
4800          * the GTT either for the user or for scanout). Those VMA still need to
4801          * unbound now.
4802          */
4803
4804         spin_lock(&i915->mm.free_lock);
4805         while ((freed = llist_del_all(&i915->mm.free_list))) {
4806                 spin_unlock(&i915->mm.free_lock);
4807
4808                 __i915_gem_free_objects(i915, freed);
4809                 if (need_resched())
4810                         return;
4811
4812                 spin_lock(&i915->mm.free_lock);
4813         }
4814         spin_unlock(&i915->mm.free_lock);
4815 }
4816
4817 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4818 {
4819         struct drm_i915_gem_object *obj =
4820                 container_of(head, typeof(*obj), rcu);
4821         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4822
4823         /*
4824          * We reuse obj->rcu for the freed list, so we had better not treat
4825          * it like a rcu_head from this point forwards. And we expect all
4826          * objects to be freed via this path.
4827          */
4828         destroy_rcu_head(&obj->rcu);
4829
4830         /*
4831          * Since we require blocking on struct_mutex to unbind the freed
4832          * object from the GPU before releasing resources back to the
4833          * system, we can not do that directly from the RCU callback (which may
4834          * be a softirq context), but must instead then defer that work onto a
4835          * kthread. We use the RCU callback rather than move the freed object
4836          * directly onto the work queue so that we can mix between using the
4837          * worker and performing frees directly from subsequent allocations for
4838          * crude but effective memory throttling.
4839          */
4840         if (llist_add(&obj->freed, &i915->mm.free_list))
4841                 queue_work(i915->wq, &i915->mm.free_work);
4842 }
4843
4844 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4845 {
4846         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4847
4848         if (obj->mm.quirked)
4849                 __i915_gem_object_unpin_pages(obj);
4850
4851         if (discard_backing_storage(obj))
4852                 obj->mm.madv = I915_MADV_DONTNEED;
4853
4854         /*
4855          * Before we free the object, make sure any pure RCU-only
4856          * read-side critical sections are complete, e.g.
4857          * i915_gem_busy_ioctl(). For the corresponding synchronized
4858          * lookup see i915_gem_object_lookup_rcu().
4859          */
4860         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4861         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4862 }
4863
4864 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4865 {
4866         lockdep_assert_held(&obj->base.dev->struct_mutex);
4867
4868         if (!i915_gem_object_has_active_reference(obj) &&
4869             i915_gem_object_is_active(obj))
4870                 i915_gem_object_set_active_reference(obj);
4871         else
4872                 i915_gem_object_put(obj);
4873 }
4874
4875 void i915_gem_sanitize(struct drm_i915_private *i915)
4876 {
4877         GEM_TRACE("\n");
4878
4879         mutex_lock(&i915->drm.struct_mutex);
4880
4881         intel_runtime_pm_get(i915);
4882         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
4883
4884         /*
4885          * As we have just resumed the machine and woken the device up from
4886          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
4887          * back to defaults, recovering from whatever wedged state we left it
4888          * in and so worth trying to use the device once more.
4889          */
4890         if (i915_terminally_wedged(&i915->gpu_error))
4891                 i915_gem_unset_wedged(i915);
4892
4893         /*
4894          * If we inherit context state from the BIOS or earlier occupants
4895          * of the GPU, the GPU may be in an inconsistent state when we
4896          * try to take over. The only way to remove the earlier state
4897          * is by resetting. However, resetting on earlier gen is tricky as
4898          * it may impact the display and we are uncertain about the stability
4899          * of the reset, so this could be applied to even earlier gen.
4900          */
4901         intel_engines_sanitize(i915, false);
4902
4903         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
4904         intel_runtime_pm_put(i915);
4905
4906         i915_gem_contexts_lost(i915);
4907         mutex_unlock(&i915->drm.struct_mutex);
4908 }
4909
4910 int i915_gem_suspend(struct drm_i915_private *i915)
4911 {
4912         int ret;
4913
4914         GEM_TRACE("\n");
4915
4916         intel_runtime_pm_get(i915);
4917         intel_suspend_gt_powersave(i915);
4918
4919         mutex_lock(&i915->drm.struct_mutex);
4920
4921         /*
4922          * We have to flush all the executing contexts to main memory so
4923          * that they can saved in the hibernation image. To ensure the last
4924          * context image is coherent, we have to switch away from it. That
4925          * leaves the i915->kernel_context still active when
4926          * we actually suspend, and its image in memory may not match the GPU
4927          * state. Fortunately, the kernel_context is disposable and we do
4928          * not rely on its state.
4929          */
4930         if (!i915_terminally_wedged(&i915->gpu_error)) {
4931                 ret = i915_gem_switch_to_kernel_context(i915);
4932                 if (ret)
4933                         goto err_unlock;
4934
4935                 ret = i915_gem_wait_for_idle(i915,
4936                                              I915_WAIT_INTERRUPTIBLE |
4937                                              I915_WAIT_LOCKED |
4938                                              I915_WAIT_FOR_IDLE_BOOST,
4939                                              MAX_SCHEDULE_TIMEOUT);
4940                 if (ret && ret != -EIO)
4941                         goto err_unlock;
4942
4943                 assert_kernel_context_is_current(i915);
4944         }
4945         i915_retire_requests(i915); /* ensure we flush after wedging */
4946
4947         mutex_unlock(&i915->drm.struct_mutex);
4948
4949         intel_uc_suspend(i915);
4950
4951         cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
4952         cancel_delayed_work_sync(&i915->gt.retire_work);
4953
4954         /*
4955          * As the idle_work is rearming if it detects a race, play safe and
4956          * repeat the flush until it is definitely idle.
4957          */
4958         drain_delayed_work(&i915->gt.idle_work);
4959
4960         /*
4961          * Assert that we successfully flushed all the work and
4962          * reset the GPU back to its idle, low power state.
4963          */
4964         WARN_ON(i915->gt.awake);
4965         if (WARN_ON(!intel_engines_are_idle(i915)))
4966                 i915_gem_set_wedged(i915); /* no hope, discard everything */
4967
4968         intel_runtime_pm_put(i915);
4969         return 0;
4970
4971 err_unlock:
4972         mutex_unlock(&i915->drm.struct_mutex);
4973         intel_runtime_pm_put(i915);
4974         return ret;
4975 }
4976
4977 void i915_gem_suspend_late(struct drm_i915_private *i915)
4978 {
4979         struct drm_i915_gem_object *obj;
4980         struct list_head *phases[] = {
4981                 &i915->mm.unbound_list,
4982                 &i915->mm.bound_list,
4983                 NULL
4984         }, **phase;
4985
4986         /*
4987          * Neither the BIOS, ourselves or any other kernel
4988          * expects the system to be in execlists mode on startup,
4989          * so we need to reset the GPU back to legacy mode. And the only
4990          * known way to disable logical contexts is through a GPU reset.
4991          *
4992          * So in order to leave the system in a known default configuration,
4993          * always reset the GPU upon unload and suspend. Afterwards we then
4994          * clean up the GEM state tracking, flushing off the requests and
4995          * leaving the system in a known idle state.
4996          *
4997          * Note that is of the upmost importance that the GPU is idle and
4998          * all stray writes are flushed *before* we dismantle the backing
4999          * storage for the pinned objects.
5000          *
5001          * However, since we are uncertain that resetting the GPU on older
5002          * machines is a good idea, we don't - just in case it leaves the
5003          * machine in an unusable condition.
5004          */
5005
5006         mutex_lock(&i915->drm.struct_mutex);
5007         for (phase = phases; *phase; phase++) {
5008                 list_for_each_entry(obj, *phase, mm.link)
5009                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
5010         }
5011         mutex_unlock(&i915->drm.struct_mutex);
5012
5013         intel_uc_sanitize(i915);
5014         i915_gem_sanitize(i915);
5015 }
5016
5017 void i915_gem_resume(struct drm_i915_private *i915)
5018 {
5019         GEM_TRACE("\n");
5020
5021         WARN_ON(i915->gt.awake);
5022
5023         mutex_lock(&i915->drm.struct_mutex);
5024         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5025
5026         i915_gem_restore_gtt_mappings(i915);
5027         i915_gem_restore_fences(i915);
5028
5029         /*
5030          * As we didn't flush the kernel context before suspend, we cannot
5031          * guarantee that the context image is complete. So let's just reset
5032          * it and start again.
5033          */
5034         i915->gt.resume(i915);
5035
5036         if (i915_gem_init_hw(i915))
5037                 goto err_wedged;
5038
5039         intel_uc_resume(i915);
5040
5041         /* Always reload a context for powersaving. */
5042         if (i915_gem_switch_to_kernel_context(i915))
5043                 goto err_wedged;
5044
5045 out_unlock:
5046         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5047         mutex_unlock(&i915->drm.struct_mutex);
5048         return;
5049
5050 err_wedged:
5051         if (!i915_terminally_wedged(&i915->gpu_error)) {
5052                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5053                 i915_gem_set_wedged(i915);
5054         }
5055         goto out_unlock;
5056 }
5057
5058 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5059 {
5060         if (INTEL_GEN(dev_priv) < 5 ||
5061             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5062                 return;
5063
5064         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5065                                  DISP_TILE_SURFACE_SWIZZLING);
5066
5067         if (IS_GEN(dev_priv, 5))
5068                 return;
5069
5070         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5071         if (IS_GEN(dev_priv, 6))
5072                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5073         else if (IS_GEN(dev_priv, 7))
5074                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5075         else if (IS_GEN(dev_priv, 8))
5076                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5077         else
5078                 BUG();
5079 }
5080
5081 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5082 {
5083         I915_WRITE(RING_CTL(base), 0);
5084         I915_WRITE(RING_HEAD(base), 0);
5085         I915_WRITE(RING_TAIL(base), 0);
5086         I915_WRITE(RING_START(base), 0);
5087 }
5088
5089 static void init_unused_rings(struct drm_i915_private *dev_priv)
5090 {
5091         if (IS_I830(dev_priv)) {
5092                 init_unused_ring(dev_priv, PRB1_BASE);
5093                 init_unused_ring(dev_priv, SRB0_BASE);
5094                 init_unused_ring(dev_priv, SRB1_BASE);
5095                 init_unused_ring(dev_priv, SRB2_BASE);
5096                 init_unused_ring(dev_priv, SRB3_BASE);
5097         } else if (IS_GEN(dev_priv, 2)) {
5098                 init_unused_ring(dev_priv, SRB0_BASE);
5099                 init_unused_ring(dev_priv, SRB1_BASE);
5100         } else if (IS_GEN(dev_priv, 3)) {
5101                 init_unused_ring(dev_priv, PRB1_BASE);
5102                 init_unused_ring(dev_priv, PRB2_BASE);
5103         }
5104 }
5105
5106 static int __i915_gem_restart_engines(void *data)
5107 {
5108         struct drm_i915_private *i915 = data;
5109         struct intel_engine_cs *engine;
5110         enum intel_engine_id id;
5111         int err;
5112
5113         for_each_engine(engine, i915, id) {
5114                 err = engine->init_hw(engine);
5115                 if (err) {
5116                         DRM_ERROR("Failed to restart %s (%d)\n",
5117                                   engine->name, err);
5118                         return err;
5119                 }
5120         }
5121
5122         return 0;
5123 }
5124
5125 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5126 {
5127         int ret;
5128
5129         dev_priv->gt.last_init_time = ktime_get();
5130
5131         /* Double layer security blanket, see i915_gem_init() */
5132         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5133
5134         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5135                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5136
5137         if (IS_HASWELL(dev_priv))
5138                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5139                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5140
5141         /* Apply the GT workarounds... */
5142         intel_gt_apply_workarounds(dev_priv);
5143         /* ...and determine whether they are sticking. */
5144         intel_gt_verify_workarounds(dev_priv, "init");
5145
5146         i915_gem_init_swizzling(dev_priv);
5147
5148         /*
5149          * At least 830 can leave some of the unused rings
5150          * "active" (ie. head != tail) after resume which
5151          * will prevent c3 entry. Makes sure all unused rings
5152          * are totally idle.
5153          */
5154         init_unused_rings(dev_priv);
5155
5156         BUG_ON(!dev_priv->kernel_context);
5157         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5158                 ret = -EIO;
5159                 goto out;
5160         }
5161
5162         ret = i915_ppgtt_init_hw(dev_priv);
5163         if (ret) {
5164                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5165                 goto out;
5166         }
5167
5168         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5169         if (ret) {
5170                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5171                 goto out;
5172         }
5173
5174         /* We can't enable contexts until all firmware is loaded */
5175         ret = intel_uc_init_hw(dev_priv);
5176         if (ret) {
5177                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5178                 goto out;
5179         }
5180
5181         intel_mocs_init_l3cc_table(dev_priv);
5182
5183         /* Only when the HW is re-initialised, can we replay the requests */
5184         ret = __i915_gem_restart_engines(dev_priv);
5185         if (ret)
5186                 goto cleanup_uc;
5187
5188         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5189
5190         return 0;
5191
5192 cleanup_uc:
5193         intel_uc_fini_hw(dev_priv);
5194 out:
5195         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5196
5197         return ret;
5198 }
5199
5200 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5201 {
5202         struct i915_gem_context *ctx;
5203         struct intel_engine_cs *engine;
5204         enum intel_engine_id id;
5205         int err;
5206
5207         /*
5208          * As we reset the gpu during very early sanitisation, the current
5209          * register state on the GPU should reflect its defaults values.
5210          * We load a context onto the hw (with restore-inhibit), then switch
5211          * over to a second context to save that default register state. We
5212          * can then prime every new context with that state so they all start
5213          * from the same default HW values.
5214          */
5215
5216         ctx = i915_gem_context_create_kernel(i915, 0);
5217         if (IS_ERR(ctx))
5218                 return PTR_ERR(ctx);
5219
5220         for_each_engine(engine, i915, id) {
5221                 struct i915_request *rq;
5222
5223                 rq = i915_request_alloc(engine, ctx);
5224                 if (IS_ERR(rq)) {
5225                         err = PTR_ERR(rq);
5226                         goto out_ctx;
5227                 }
5228
5229                 err = 0;
5230                 if (engine->init_context)
5231                         err = engine->init_context(rq);
5232
5233                 i915_request_add(rq);
5234                 if (err)
5235                         goto err_active;
5236         }
5237
5238         err = i915_gem_switch_to_kernel_context(i915);
5239         if (err)
5240                 goto err_active;
5241
5242         if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
5243                 i915_gem_set_wedged(i915);
5244                 err = -EIO; /* Caller will declare us wedged */
5245                 goto err_active;
5246         }
5247
5248         assert_kernel_context_is_current(i915);
5249
5250         /*
5251          * Immediately park the GPU so that we enable powersaving and
5252          * treat it as idle. The next time we issue a request, we will
5253          * unpark and start using the engine->pinned_default_state, otherwise
5254          * it is in limbo and an early reset may fail.
5255          */
5256         __i915_gem_park(i915);
5257
5258         for_each_engine(engine, i915, id) {
5259                 struct i915_vma *state;
5260                 void *vaddr;
5261
5262                 GEM_BUG_ON(to_intel_context(ctx, engine)->pin_count);
5263
5264                 state = to_intel_context(ctx, engine)->state;
5265                 if (!state)
5266                         continue;
5267
5268                 /*
5269                  * As we will hold a reference to the logical state, it will
5270                  * not be torn down with the context, and importantly the
5271                  * object will hold onto its vma (making it possible for a
5272                  * stray GTT write to corrupt our defaults). Unmap the vma
5273                  * from the GTT to prevent such accidents and reclaim the
5274                  * space.
5275                  */
5276                 err = i915_vma_unbind(state);
5277                 if (err)
5278                         goto err_active;
5279
5280                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5281                 if (err)
5282                         goto err_active;
5283
5284                 engine->default_state = i915_gem_object_get(state->obj);
5285
5286                 /* Check we can acquire the image of the context state */
5287                 vaddr = i915_gem_object_pin_map(engine->default_state,
5288                                                 I915_MAP_FORCE_WB);
5289                 if (IS_ERR(vaddr)) {
5290                         err = PTR_ERR(vaddr);
5291                         goto err_active;
5292                 }
5293
5294                 i915_gem_object_unpin_map(engine->default_state);
5295         }
5296
5297         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5298                 unsigned int found = intel_engines_has_context_isolation(i915);
5299
5300                 /*
5301                  * Make sure that classes with multiple engine instances all
5302                  * share the same basic configuration.
5303                  */
5304                 for_each_engine(engine, i915, id) {
5305                         unsigned int bit = BIT(engine->uabi_class);
5306                         unsigned int expected = engine->default_state ? bit : 0;
5307
5308                         if ((found & bit) != expected) {
5309                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5310                                           engine->uabi_class, engine->name);
5311                         }
5312                 }
5313         }
5314
5315 out_ctx:
5316         i915_gem_context_set_closed(ctx);
5317         i915_gem_context_put(ctx);
5318         return err;
5319
5320 err_active:
5321         /*
5322          * If we have to abandon now, we expect the engines to be idle
5323          * and ready to be torn-down. First try to flush any remaining
5324          * request, ensure we are pointing at the kernel context and
5325          * then remove it.
5326          */
5327         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5328                 goto out_ctx;
5329
5330         if (WARN_ON(i915_gem_wait_for_idle(i915,
5331                                            I915_WAIT_LOCKED,
5332                                            MAX_SCHEDULE_TIMEOUT)))
5333                 goto out_ctx;
5334
5335         i915_gem_contexts_lost(i915);
5336         goto out_ctx;
5337 }
5338
5339 static int
5340 i915_gem_init_scratch(struct drm_i915_private *i915, unsigned int size)
5341 {
5342         struct drm_i915_gem_object *obj;
5343         struct i915_vma *vma;
5344         int ret;
5345
5346         obj = i915_gem_object_create_stolen(i915, size);
5347         if (!obj)
5348                 obj = i915_gem_object_create_internal(i915, size);
5349         if (IS_ERR(obj)) {
5350                 DRM_ERROR("Failed to allocate scratch page\n");
5351                 return PTR_ERR(obj);
5352         }
5353
5354         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
5355         if (IS_ERR(vma)) {
5356                 ret = PTR_ERR(vma);
5357                 goto err_unref;
5358         }
5359
5360         ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
5361         if (ret)
5362                 goto err_unref;
5363
5364         i915->gt.scratch = vma;
5365         return 0;
5366
5367 err_unref:
5368         i915_gem_object_put(obj);
5369         return ret;
5370 }
5371
5372 static void i915_gem_fini_scratch(struct drm_i915_private *i915)
5373 {
5374         i915_vma_unpin_and_release(&i915->gt.scratch, 0);
5375 }
5376
5377 int i915_gem_init(struct drm_i915_private *dev_priv)
5378 {
5379         int ret;
5380
5381         /* We need to fallback to 4K pages if host doesn't support huge gtt. */
5382         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5383                 mkwrite_device_info(dev_priv)->page_sizes =
5384                         I915_GTT_PAGE_SIZE_4K;
5385
5386         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5387
5388         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5389                 dev_priv->gt.resume = intel_lr_context_resume;
5390                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5391         } else {
5392                 dev_priv->gt.resume = intel_legacy_submission_resume;
5393                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5394         }
5395
5396         ret = i915_gem_init_userptr(dev_priv);
5397         if (ret)
5398                 return ret;
5399
5400         ret = intel_uc_init_misc(dev_priv);
5401         if (ret)
5402                 return ret;
5403
5404         ret = intel_wopcm_init(&dev_priv->wopcm);
5405         if (ret)
5406                 goto err_uc_misc;
5407
5408         /* This is just a security blanket to placate dragons.
5409          * On some systems, we very sporadically observe that the first TLBs
5410          * used by the CS may be stale, despite us poking the TLB reset. If
5411          * we hold the forcewake during initialisation these problems
5412          * just magically go away.
5413          */
5414         mutex_lock(&dev_priv->drm.struct_mutex);
5415         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5416
5417         ret = i915_gem_init_ggtt(dev_priv);
5418         if (ret) {
5419                 GEM_BUG_ON(ret == -EIO);
5420                 goto err_unlock;
5421         }
5422
5423         ret = i915_gem_init_scratch(dev_priv,
5424                                     IS_GEN(dev_priv, 2) ? SZ_256K : PAGE_SIZE);
5425         if (ret) {
5426                 GEM_BUG_ON(ret == -EIO);
5427                 goto err_ggtt;
5428         }
5429
5430         ret = i915_gem_contexts_init(dev_priv);
5431         if (ret) {
5432                 GEM_BUG_ON(ret == -EIO);
5433                 goto err_scratch;
5434         }
5435
5436         ret = intel_engines_init(dev_priv);
5437         if (ret) {
5438                 GEM_BUG_ON(ret == -EIO);
5439                 goto err_context;
5440         }
5441
5442         intel_init_gt_powersave(dev_priv);
5443
5444         ret = intel_uc_init(dev_priv);
5445         if (ret)
5446                 goto err_pm;
5447
5448         ret = i915_gem_init_hw(dev_priv);
5449         if (ret)
5450                 goto err_uc_init;
5451
5452         /*
5453          * Despite its name intel_init_clock_gating applies both display
5454          * clock gating workarounds; GT mmio workarounds and the occasional
5455          * GT power context workaround. Worse, sometimes it includes a context
5456          * register workaround which we need to apply before we record the
5457          * default HW state for all contexts.
5458          *
5459          * FIXME: break up the workarounds and apply them at the right time!
5460          */
5461         intel_init_clock_gating(dev_priv);
5462
5463         ret = __intel_engines_record_defaults(dev_priv);
5464         if (ret)
5465                 goto err_init_hw;
5466
5467         if (i915_inject_load_failure()) {
5468                 ret = -ENODEV;
5469                 goto err_init_hw;
5470         }
5471
5472         if (i915_inject_load_failure()) {
5473                 ret = -EIO;
5474                 goto err_init_hw;
5475         }
5476
5477         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5478         mutex_unlock(&dev_priv->drm.struct_mutex);
5479
5480         return 0;
5481
5482         /*
5483          * Unwinding is complicated by that we want to handle -EIO to mean
5484          * disable GPU submission but keep KMS alive. We want to mark the
5485          * HW as irrevisibly wedged, but keep enough state around that the
5486          * driver doesn't explode during runtime.
5487          */
5488 err_init_hw:
5489         mutex_unlock(&dev_priv->drm.struct_mutex);
5490
5491         WARN_ON(i915_gem_suspend(dev_priv));
5492         i915_gem_suspend_late(dev_priv);
5493
5494         i915_gem_drain_workqueue(dev_priv);
5495
5496         mutex_lock(&dev_priv->drm.struct_mutex);
5497         intel_uc_fini_hw(dev_priv);
5498 err_uc_init:
5499         intel_uc_fini(dev_priv);
5500 err_pm:
5501         if (ret != -EIO) {
5502                 intel_cleanup_gt_powersave(dev_priv);
5503                 i915_gem_cleanup_engines(dev_priv);
5504         }
5505 err_context:
5506         if (ret != -EIO)
5507                 i915_gem_contexts_fini(dev_priv);
5508 err_scratch:
5509         i915_gem_fini_scratch(dev_priv);
5510 err_ggtt:
5511 err_unlock:
5512         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5513         mutex_unlock(&dev_priv->drm.struct_mutex);
5514
5515 err_uc_misc:
5516         intel_uc_fini_misc(dev_priv);
5517
5518         if (ret != -EIO)
5519                 i915_gem_cleanup_userptr(dev_priv);
5520
5521         if (ret == -EIO) {
5522                 mutex_lock(&dev_priv->drm.struct_mutex);
5523
5524                 /*
5525                  * Allow engine initialisation to fail by marking the GPU as
5526                  * wedged. But we only want to do this where the GPU is angry,
5527                  * for all other failure, such as an allocation failure, bail.
5528                  */
5529                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5530                         i915_load_error(dev_priv,
5531                                         "Failed to initialize GPU, declaring it wedged!\n");
5532                         i915_gem_set_wedged(dev_priv);
5533                 }
5534
5535                 /* Minimal basic recovery for KMS */
5536                 ret = i915_ggtt_enable_hw(dev_priv);
5537                 i915_gem_restore_gtt_mappings(dev_priv);
5538                 i915_gem_restore_fences(dev_priv);
5539                 intel_init_clock_gating(dev_priv);
5540
5541                 mutex_unlock(&dev_priv->drm.struct_mutex);
5542         }
5543
5544         i915_gem_drain_freed_objects(dev_priv);
5545         return ret;
5546 }
5547
5548 void i915_gem_fini(struct drm_i915_private *dev_priv)
5549 {
5550         i915_gem_suspend_late(dev_priv);
5551         intel_disable_gt_powersave(dev_priv);
5552
5553         /* Flush any outstanding unpin_work. */
5554         i915_gem_drain_workqueue(dev_priv);
5555
5556         mutex_lock(&dev_priv->drm.struct_mutex);
5557         intel_uc_fini_hw(dev_priv);
5558         intel_uc_fini(dev_priv);
5559         i915_gem_cleanup_engines(dev_priv);
5560         i915_gem_contexts_fini(dev_priv);
5561         i915_gem_fini_scratch(dev_priv);
5562         mutex_unlock(&dev_priv->drm.struct_mutex);
5563
5564         intel_wa_list_free(&dev_priv->gt_wa_list);
5565
5566         intel_cleanup_gt_powersave(dev_priv);
5567
5568         intel_uc_fini_misc(dev_priv);
5569         i915_gem_cleanup_userptr(dev_priv);
5570
5571         i915_gem_drain_freed_objects(dev_priv);
5572
5573         WARN_ON(!list_empty(&dev_priv->contexts.list));
5574 }
5575
5576 void i915_gem_init_mmio(struct drm_i915_private *i915)
5577 {
5578         i915_gem_sanitize(i915);
5579 }
5580
5581 void
5582 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5583 {
5584         struct intel_engine_cs *engine;
5585         enum intel_engine_id id;
5586
5587         for_each_engine(engine, dev_priv, id)
5588                 dev_priv->gt.cleanup_engine(engine);
5589 }
5590
5591 void
5592 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5593 {
5594         int i;
5595
5596         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5597             !IS_CHERRYVIEW(dev_priv))
5598                 dev_priv->num_fence_regs = 32;
5599         else if (INTEL_GEN(dev_priv) >= 4 ||
5600                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5601                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5602                 dev_priv->num_fence_regs = 16;
5603         else
5604                 dev_priv->num_fence_regs = 8;
5605
5606         if (intel_vgpu_active(dev_priv))
5607                 dev_priv->num_fence_regs =
5608                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5609
5610         /* Initialize fence registers to zero */
5611         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5612                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5613
5614                 fence->i915 = dev_priv;
5615                 fence->id = i;
5616                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5617         }
5618         i915_gem_restore_fences(dev_priv);
5619
5620         i915_gem_detect_bit_6_swizzle(dev_priv);
5621 }
5622
5623 static void i915_gem_init__mm(struct drm_i915_private *i915)
5624 {
5625         spin_lock_init(&i915->mm.object_stat_lock);
5626         spin_lock_init(&i915->mm.obj_lock);
5627         spin_lock_init(&i915->mm.free_lock);
5628
5629         init_llist_head(&i915->mm.free_list);
5630
5631         INIT_LIST_HEAD(&i915->mm.unbound_list);
5632         INIT_LIST_HEAD(&i915->mm.bound_list);
5633         INIT_LIST_HEAD(&i915->mm.fence_list);
5634         INIT_LIST_HEAD(&i915->mm.userfault_list);
5635
5636         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5637 }
5638
5639 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5640 {
5641         int err = -ENOMEM;
5642
5643         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5644         if (!dev_priv->objects)
5645                 goto err_out;
5646
5647         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5648         if (!dev_priv->vmas)
5649                 goto err_objects;
5650
5651         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5652         if (!dev_priv->luts)
5653                 goto err_vmas;
5654
5655         dev_priv->requests = KMEM_CACHE(i915_request,
5656                                         SLAB_HWCACHE_ALIGN |
5657                                         SLAB_RECLAIM_ACCOUNT |
5658                                         SLAB_TYPESAFE_BY_RCU);
5659         if (!dev_priv->requests)
5660                 goto err_luts;
5661
5662         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5663                                             SLAB_HWCACHE_ALIGN |
5664                                             SLAB_RECLAIM_ACCOUNT);
5665         if (!dev_priv->dependencies)
5666                 goto err_requests;
5667
5668         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5669         if (!dev_priv->priorities)
5670                 goto err_dependencies;
5671
5672         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5673         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5674         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5675
5676         i915_gem_init__mm(dev_priv);
5677
5678         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5679                           i915_gem_retire_work_handler);
5680         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5681                           i915_gem_idle_work_handler);
5682         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5683         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5684
5685         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5686
5687         spin_lock_init(&dev_priv->fb_tracking.lock);
5688
5689         err = i915_gemfs_init(dev_priv);
5690         if (err)
5691                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5692
5693         return 0;
5694
5695 err_dependencies:
5696         kmem_cache_destroy(dev_priv->dependencies);
5697 err_requests:
5698         kmem_cache_destroy(dev_priv->requests);
5699 err_luts:
5700         kmem_cache_destroy(dev_priv->luts);
5701 err_vmas:
5702         kmem_cache_destroy(dev_priv->vmas);
5703 err_objects:
5704         kmem_cache_destroy(dev_priv->objects);
5705 err_out:
5706         return err;
5707 }
5708
5709 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5710 {
5711         i915_gem_drain_freed_objects(dev_priv);
5712         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5713         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5714         WARN_ON(dev_priv->mm.object_count);
5715         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5716
5717         kmem_cache_destroy(dev_priv->priorities);
5718         kmem_cache_destroy(dev_priv->dependencies);
5719         kmem_cache_destroy(dev_priv->requests);
5720         kmem_cache_destroy(dev_priv->luts);
5721         kmem_cache_destroy(dev_priv->vmas);
5722         kmem_cache_destroy(dev_priv->objects);
5723
5724         /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
5725         rcu_barrier();
5726
5727         i915_gemfs_fini(dev_priv);
5728 }
5729
5730 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5731 {
5732         /* Discard all purgeable objects, let userspace recover those as
5733          * required after resuming.
5734          */
5735         i915_gem_shrink_all(dev_priv);
5736
5737         return 0;
5738 }
5739
5740 int i915_gem_freeze_late(struct drm_i915_private *i915)
5741 {
5742         struct drm_i915_gem_object *obj;
5743         struct list_head *phases[] = {
5744                 &i915->mm.unbound_list,
5745                 &i915->mm.bound_list,
5746                 NULL
5747         }, **phase;
5748
5749         /*
5750          * Called just before we write the hibernation image.
5751          *
5752          * We need to update the domain tracking to reflect that the CPU
5753          * will be accessing all the pages to create and restore from the
5754          * hibernation, and so upon restoration those pages will be in the
5755          * CPU domain.
5756          *
5757          * To make sure the hibernation image contains the latest state,
5758          * we update that state just before writing out the image.
5759          *
5760          * To try and reduce the hibernation image, we manually shrink
5761          * the objects as well, see i915_gem_freeze()
5762          */
5763
5764         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5765         i915_gem_drain_freed_objects(i915);
5766
5767         mutex_lock(&i915->drm.struct_mutex);
5768         for (phase = phases; *phase; phase++) {
5769                 list_for_each_entry(obj, *phase, mm.link)
5770                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5771         }
5772         mutex_unlock(&i915->drm.struct_mutex);
5773
5774         return 0;
5775 }
5776
5777 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5778 {
5779         struct drm_i915_file_private *file_priv = file->driver_priv;
5780         struct i915_request *request;
5781
5782         /* Clean up our request list when the client is going away, so that
5783          * later retire_requests won't dereference our soon-to-be-gone
5784          * file_priv.
5785          */
5786         spin_lock(&file_priv->mm.lock);
5787         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5788                 request->file_priv = NULL;
5789         spin_unlock(&file_priv->mm.lock);
5790 }
5791
5792 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5793 {
5794         struct drm_i915_file_private *file_priv;
5795         int ret;
5796
5797         DRM_DEBUG("\n");
5798
5799         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5800         if (!file_priv)
5801                 return -ENOMEM;
5802
5803         file->driver_priv = file_priv;
5804         file_priv->dev_priv = i915;
5805         file_priv->file = file;
5806
5807         spin_lock_init(&file_priv->mm.lock);
5808         INIT_LIST_HEAD(&file_priv->mm.request_list);
5809
5810         file_priv->bsd_engine = -1;
5811         file_priv->hang_timestamp = jiffies;
5812
5813         ret = i915_gem_context_open(i915, file);
5814         if (ret)
5815                 kfree(file_priv);
5816
5817         return ret;
5818 }
5819
5820 /**
5821  * i915_gem_track_fb - update frontbuffer tracking
5822  * @old: current GEM buffer for the frontbuffer slots
5823  * @new: new GEM buffer for the frontbuffer slots
5824  * @frontbuffer_bits: bitmask of frontbuffer slots
5825  *
5826  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5827  * from @old and setting them in @new. Both @old and @new can be NULL.
5828  */
5829 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5830                        struct drm_i915_gem_object *new,
5831                        unsigned frontbuffer_bits)
5832 {
5833         /* Control of individual bits within the mask are guarded by
5834          * the owning plane->mutex, i.e. we can never see concurrent
5835          * manipulation of individual bits. But since the bitfield as a whole
5836          * is updated using RMW, we need to use atomics in order to update
5837          * the bits.
5838          */
5839         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5840                      BITS_PER_TYPE(atomic_t));
5841
5842         if (old) {
5843                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5844                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5845         }
5846
5847         if (new) {
5848                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5849                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5850         }
5851 }
5852
5853 /* Allocate a new GEM object and fill it with the supplied data */
5854 struct drm_i915_gem_object *
5855 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5856                                  const void *data, size_t size)
5857 {
5858         struct drm_i915_gem_object *obj;
5859         struct file *file;
5860         size_t offset;
5861         int err;
5862
5863         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5864         if (IS_ERR(obj))
5865                 return obj;
5866
5867         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5868
5869         file = obj->base.filp;
5870         offset = 0;
5871         do {
5872                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5873                 struct page *page;
5874                 void *pgdata, *vaddr;
5875
5876                 err = pagecache_write_begin(file, file->f_mapping,
5877                                             offset, len, 0,
5878                                             &page, &pgdata);
5879                 if (err < 0)
5880                         goto fail;
5881
5882                 vaddr = kmap(page);
5883                 memcpy(vaddr, data, len);
5884                 kunmap(page);
5885
5886                 err = pagecache_write_end(file, file->f_mapping,
5887                                           offset, len, len,
5888                                           page, pgdata);
5889                 if (err < 0)
5890                         goto fail;
5891
5892                 size -= len;
5893                 data += len;
5894                 offset += len;
5895         } while (size);
5896
5897         return obj;
5898
5899 fail:
5900         i915_gem_object_put(obj);
5901         return ERR_PTR(err);
5902 }
5903
5904 struct scatterlist *
5905 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5906                        unsigned int n,
5907                        unsigned int *offset)
5908 {
5909         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5910         struct scatterlist *sg;
5911         unsigned int idx, count;
5912
5913         might_sleep();
5914         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5915         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5916
5917         /* As we iterate forward through the sg, we record each entry in a
5918          * radixtree for quick repeated (backwards) lookups. If we have seen
5919          * this index previously, we will have an entry for it.
5920          *
5921          * Initial lookup is O(N), but this is amortized to O(1) for
5922          * sequential page access (where each new request is consecutive
5923          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5924          * i.e. O(1) with a large constant!
5925          */
5926         if (n < READ_ONCE(iter->sg_idx))
5927                 goto lookup;
5928
5929         mutex_lock(&iter->lock);
5930
5931         /* We prefer to reuse the last sg so that repeated lookup of this
5932          * (or the subsequent) sg are fast - comparing against the last
5933          * sg is faster than going through the radixtree.
5934          */
5935
5936         sg = iter->sg_pos;
5937         idx = iter->sg_idx;
5938         count = __sg_page_count(sg);
5939
5940         while (idx + count <= n) {
5941                 void *entry;
5942                 unsigned long i;
5943                 int ret;
5944
5945                 /* If we cannot allocate and insert this entry, or the
5946                  * individual pages from this range, cancel updating the
5947                  * sg_idx so that on this lookup we are forced to linearly
5948                  * scan onwards, but on future lookups we will try the
5949                  * insertion again (in which case we need to be careful of
5950                  * the error return reporting that we have already inserted
5951                  * this index).
5952                  */
5953                 ret = radix_tree_insert(&iter->radix, idx, sg);
5954                 if (ret && ret != -EEXIST)
5955                         goto scan;
5956
5957                 entry = xa_mk_value(idx);
5958                 for (i = 1; i < count; i++) {
5959                         ret = radix_tree_insert(&iter->radix, idx + i, entry);
5960                         if (ret && ret != -EEXIST)
5961                                 goto scan;
5962                 }
5963
5964                 idx += count;
5965                 sg = ____sg_next(sg);
5966                 count = __sg_page_count(sg);
5967         }
5968
5969 scan:
5970         iter->sg_pos = sg;
5971         iter->sg_idx = idx;
5972
5973         mutex_unlock(&iter->lock);
5974
5975         if (unlikely(n < idx)) /* insertion completed by another thread */
5976                 goto lookup;
5977
5978         /* In case we failed to insert the entry into the radixtree, we need
5979          * to look beyond the current sg.
5980          */
5981         while (idx + count <= n) {
5982                 idx += count;
5983                 sg = ____sg_next(sg);
5984                 count = __sg_page_count(sg);
5985         }
5986
5987         *offset = n - idx;
5988         return sg;
5989
5990 lookup:
5991         rcu_read_lock();
5992
5993         sg = radix_tree_lookup(&iter->radix, n);
5994         GEM_BUG_ON(!sg);
5995
5996         /* If this index is in the middle of multi-page sg entry,
5997          * the radix tree will contain a value entry that points
5998          * to the start of that range. We will return the pointer to
5999          * the base page and the offset of this page within the
6000          * sg entry's range.
6001          */
6002         *offset = 0;
6003         if (unlikely(xa_is_value(sg))) {
6004                 unsigned long base = xa_to_value(sg);
6005
6006                 sg = radix_tree_lookup(&iter->radix, base);
6007                 GEM_BUG_ON(!sg);
6008
6009                 *offset = n - base;
6010         }
6011
6012         rcu_read_unlock();
6013
6014         return sg;
6015 }
6016
6017 struct page *
6018 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
6019 {
6020         struct scatterlist *sg;
6021         unsigned int offset;
6022
6023         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
6024
6025         sg = i915_gem_object_get_sg(obj, n, &offset);
6026         return nth_page(sg_page(sg), offset);
6027 }
6028
6029 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
6030 struct page *
6031 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
6032                                unsigned int n)
6033 {
6034         struct page *page;
6035
6036         page = i915_gem_object_get_page(obj, n);
6037         if (!obj->mm.dirty)
6038                 set_page_dirty(page);
6039
6040         return page;
6041 }
6042
6043 dma_addr_t
6044 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
6045                                 unsigned long n)
6046 {
6047         struct scatterlist *sg;
6048         unsigned int offset;
6049
6050         sg = i915_gem_object_get_sg(obj, n, &offset);
6051         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
6052 }
6053
6054 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
6055 {
6056         struct sg_table *pages;
6057         int err;
6058
6059         if (align > obj->base.size)
6060                 return -EINVAL;
6061
6062         if (obj->ops == &i915_gem_phys_ops)
6063                 return 0;
6064
6065         if (obj->ops != &i915_gem_object_ops)
6066                 return -EINVAL;
6067
6068         err = i915_gem_object_unbind(obj);
6069         if (err)
6070                 return err;
6071
6072         mutex_lock(&obj->mm.lock);
6073
6074         if (obj->mm.madv != I915_MADV_WILLNEED) {
6075                 err = -EFAULT;
6076                 goto err_unlock;
6077         }
6078
6079         if (obj->mm.quirked) {
6080                 err = -EFAULT;
6081                 goto err_unlock;
6082         }
6083
6084         if (obj->mm.mapping) {
6085                 err = -EBUSY;
6086                 goto err_unlock;
6087         }
6088
6089         pages = __i915_gem_object_unset_pages(obj);
6090
6091         obj->ops = &i915_gem_phys_ops;
6092
6093         err = ____i915_gem_object_get_pages(obj);
6094         if (err)
6095                 goto err_xfer;
6096
6097         /* Perma-pin (until release) the physical set of pages */
6098         __i915_gem_object_pin_pages(obj);
6099
6100         if (!IS_ERR_OR_NULL(pages))
6101                 i915_gem_object_ops.put_pages(obj, pages);
6102         mutex_unlock(&obj->mm.lock);
6103         return 0;
6104
6105 err_xfer:
6106         obj->ops = &i915_gem_object_ops;
6107         if (!IS_ERR_OR_NULL(pages)) {
6108                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
6109
6110                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
6111         }
6112 err_unlock:
6113         mutex_unlock(&obj->mm.lock);
6114         return err;
6115 }
6116
6117 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6118 #include "selftests/scatterlist.c"
6119 #include "selftests/mock_gem_device.c"
6120 #include "selftests/huge_gem_object.c"
6121 #include "selftests/huge_pages.c"
6122 #include "selftests/i915_gem_object.c"
6123 #include "selftests/i915_gem_coherency.c"
6124 #include "selftests/i915_gem.c"
6125 #endif