Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
[sfrench/cifs-2.6.git] / drivers / gpu / drm / i915 / i915_gem.c
1 /*
2  * Copyright © 2008-2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27
28 #include <drm/drmP.h>
29 #include <drm/drm_vma_manager.h>
30 #include <drm/i915_drm.h>
31 #include "i915_drv.h"
32 #include "i915_gem_clflush.h"
33 #include "i915_vgpu.h"
34 #include "i915_trace.h"
35 #include "intel_drv.h"
36 #include "intel_frontbuffer.h"
37 #include "intel_mocs.h"
38 #include "intel_workarounds.h"
39 #include "i915_gemfs.h"
40 #include <linux/dma-fence-array.h>
41 #include <linux/kthread.h>
42 #include <linux/reservation.h>
43 #include <linux/shmem_fs.h>
44 #include <linux/slab.h>
45 #include <linux/stop_machine.h>
46 #include <linux/swap.h>
47 #include <linux/pci.h>
48 #include <linux/dma-buf.h>
49
50 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
51
52 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
53 {
54         if (obj->cache_dirty)
55                 return false;
56
57         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
58                 return true;
59
60         return obj->pin_global; /* currently in use by HW, keep flushed */
61 }
62
63 static int
64 insert_mappable_node(struct i915_ggtt *ggtt,
65                      struct drm_mm_node *node, u32 size)
66 {
67         memset(node, 0, sizeof(*node));
68         return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
69                                            size, 0, I915_COLOR_UNEVICTABLE,
70                                            0, ggtt->mappable_end,
71                                            DRM_MM_INSERT_LOW);
72 }
73
74 static void
75 remove_mappable_node(struct drm_mm_node *node)
76 {
77         drm_mm_remove_node(node);
78 }
79
80 /* some bookkeeping */
81 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
82                                   u64 size)
83 {
84         spin_lock(&dev_priv->mm.object_stat_lock);
85         dev_priv->mm.object_count++;
86         dev_priv->mm.object_memory += size;
87         spin_unlock(&dev_priv->mm.object_stat_lock);
88 }
89
90 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
91                                      u64 size)
92 {
93         spin_lock(&dev_priv->mm.object_stat_lock);
94         dev_priv->mm.object_count--;
95         dev_priv->mm.object_memory -= size;
96         spin_unlock(&dev_priv->mm.object_stat_lock);
97 }
98
99 static int
100 i915_gem_wait_for_error(struct i915_gpu_error *error)
101 {
102         int ret;
103
104         might_sleep();
105
106         /*
107          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
108          * userspace. If it takes that long something really bad is going on and
109          * we should simply try to bail out and fail as gracefully as possible.
110          */
111         ret = wait_event_interruptible_timeout(error->reset_queue,
112                                                !i915_reset_backoff(error),
113                                                I915_RESET_TIMEOUT);
114         if (ret == 0) {
115                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
116                 return -EIO;
117         } else if (ret < 0) {
118                 return ret;
119         } else {
120                 return 0;
121         }
122 }
123
124 int i915_mutex_lock_interruptible(struct drm_device *dev)
125 {
126         struct drm_i915_private *dev_priv = to_i915(dev);
127         int ret;
128
129         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
130         if (ret)
131                 return ret;
132
133         ret = mutex_lock_interruptible(&dev->struct_mutex);
134         if (ret)
135                 return ret;
136
137         return 0;
138 }
139
140 static u32 __i915_gem_park(struct drm_i915_private *i915)
141 {
142         GEM_TRACE("\n");
143
144         lockdep_assert_held(&i915->drm.struct_mutex);
145         GEM_BUG_ON(i915->gt.active_requests);
146         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
147
148         if (!i915->gt.awake)
149                 return I915_EPOCH_INVALID;
150
151         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
152
153         /*
154          * Be paranoid and flush a concurrent interrupt to make sure
155          * we don't reactivate any irq tasklets after parking.
156          *
157          * FIXME: Note that even though we have waited for execlists to be idle,
158          * there may still be an in-flight interrupt even though the CSB
159          * is now empty. synchronize_irq() makes sure that a residual interrupt
160          * is completed before we continue, but it doesn't prevent the HW from
161          * raising a spurious interrupt later. To complete the shield we should
162          * coordinate disabling the CS irq with flushing the interrupts.
163          */
164         synchronize_irq(i915->drm.irq);
165
166         intel_engines_park(i915);
167         i915_timelines_park(i915);
168
169         i915_pmu_gt_parked(i915);
170         i915_vma_parked(i915);
171
172         i915->gt.awake = false;
173
174         if (INTEL_GEN(i915) >= 6)
175                 gen6_rps_idle(i915);
176
177         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);
178
179         intel_runtime_pm_put(i915);
180
181         return i915->gt.epoch;
182 }
183
184 void i915_gem_park(struct drm_i915_private *i915)
185 {
186         GEM_TRACE("\n");
187
188         lockdep_assert_held(&i915->drm.struct_mutex);
189         GEM_BUG_ON(i915->gt.active_requests);
190
191         if (!i915->gt.awake)
192                 return;
193
194         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
195         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
196 }
197
198 void i915_gem_unpark(struct drm_i915_private *i915)
199 {
200         GEM_TRACE("\n");
201
202         lockdep_assert_held(&i915->drm.struct_mutex);
203         GEM_BUG_ON(!i915->gt.active_requests);
204
205         if (i915->gt.awake)
206                 return;
207
208         intel_runtime_pm_get_noresume(i915);
209
210         /*
211          * It seems that the DMC likes to transition between the DC states a lot
212          * when there are no connected displays (no active power domains) during
213          * command submission.
214          *
215          * This activity has negative impact on the performance of the chip with
216          * huge latencies observed in the interrupt handler and elsewhere.
217          *
218          * Work around it by grabbing a GT IRQ power domain whilst there is any
219          * GT activity, preventing any DC state transitions.
220          */
221         intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
222
223         i915->gt.awake = true;
224         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
225                 i915->gt.epoch = 1;
226
227         intel_enable_gt_powersave(i915);
228         i915_update_gfx_val(i915);
229         if (INTEL_GEN(i915) >= 6)
230                 gen6_rps_busy(i915);
231         i915_pmu_gt_unparked(i915);
232
233         intel_engines_unpark(i915);
234
235         i915_queue_hangcheck(i915);
236
237         queue_delayed_work(i915->wq,
238                            &i915->gt.retire_work,
239                            round_jiffies_up_relative(HZ));
240 }
241
242 int
243 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
244                             struct drm_file *file)
245 {
246         struct drm_i915_private *dev_priv = to_i915(dev);
247         struct i915_ggtt *ggtt = &dev_priv->ggtt;
248         struct drm_i915_gem_get_aperture *args = data;
249         struct i915_vma *vma;
250         u64 pinned;
251
252         pinned = ggtt->vm.reserved;
253         mutex_lock(&dev->struct_mutex);
254         list_for_each_entry(vma, &ggtt->vm.active_list, vm_link)
255                 if (i915_vma_is_pinned(vma))
256                         pinned += vma->node.size;
257         list_for_each_entry(vma, &ggtt->vm.inactive_list, vm_link)
258                 if (i915_vma_is_pinned(vma))
259                         pinned += vma->node.size;
260         mutex_unlock(&dev->struct_mutex);
261
262         args->aper_size = ggtt->vm.total;
263         args->aper_available_size = args->aper_size - pinned;
264
265         return 0;
266 }
267
268 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
269 {
270         struct address_space *mapping = obj->base.filp->f_mapping;
271         drm_dma_handle_t *phys;
272         struct sg_table *st;
273         struct scatterlist *sg;
274         char *vaddr;
275         int i;
276         int err;
277
278         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
279                 return -EINVAL;
280
281         /* Always aligning to the object size, allows a single allocation
282          * to handle all possible callers, and given typical object sizes,
283          * the alignment of the buddy allocation will naturally match.
284          */
285         phys = drm_pci_alloc(obj->base.dev,
286                              roundup_pow_of_two(obj->base.size),
287                              roundup_pow_of_two(obj->base.size));
288         if (!phys)
289                 return -ENOMEM;
290
291         vaddr = phys->vaddr;
292         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
293                 struct page *page;
294                 char *src;
295
296                 page = shmem_read_mapping_page(mapping, i);
297                 if (IS_ERR(page)) {
298                         err = PTR_ERR(page);
299                         goto err_phys;
300                 }
301
302                 src = kmap_atomic(page);
303                 memcpy(vaddr, src, PAGE_SIZE);
304                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
305                 kunmap_atomic(src);
306
307                 put_page(page);
308                 vaddr += PAGE_SIZE;
309         }
310
311         i915_gem_chipset_flush(to_i915(obj->base.dev));
312
313         st = kmalloc(sizeof(*st), GFP_KERNEL);
314         if (!st) {
315                 err = -ENOMEM;
316                 goto err_phys;
317         }
318
319         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
320                 kfree(st);
321                 err = -ENOMEM;
322                 goto err_phys;
323         }
324
325         sg = st->sgl;
326         sg->offset = 0;
327         sg->length = obj->base.size;
328
329         sg_dma_address(sg) = phys->busaddr;
330         sg_dma_len(sg) = obj->base.size;
331
332         obj->phys_handle = phys;
333
334         __i915_gem_object_set_pages(obj, st, sg->length);
335
336         return 0;
337
338 err_phys:
339         drm_pci_free(obj->base.dev, phys);
340
341         return err;
342 }
343
344 static void __start_cpu_write(struct drm_i915_gem_object *obj)
345 {
346         obj->read_domains = I915_GEM_DOMAIN_CPU;
347         obj->write_domain = I915_GEM_DOMAIN_CPU;
348         if (cpu_write_needs_clflush(obj))
349                 obj->cache_dirty = true;
350 }
351
352 static void
353 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
354                                 struct sg_table *pages,
355                                 bool needs_clflush)
356 {
357         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
358
359         if (obj->mm.madv == I915_MADV_DONTNEED)
360                 obj->mm.dirty = false;
361
362         if (needs_clflush &&
363             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
364             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
365                 drm_clflush_sg(pages);
366
367         __start_cpu_write(obj);
368 }
369
370 static void
371 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
372                                struct sg_table *pages)
373 {
374         __i915_gem_object_release_shmem(obj, pages, false);
375
376         if (obj->mm.dirty) {
377                 struct address_space *mapping = obj->base.filp->f_mapping;
378                 char *vaddr = obj->phys_handle->vaddr;
379                 int i;
380
381                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
382                         struct page *page;
383                         char *dst;
384
385                         page = shmem_read_mapping_page(mapping, i);
386                         if (IS_ERR(page))
387                                 continue;
388
389                         dst = kmap_atomic(page);
390                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
391                         memcpy(dst, vaddr, PAGE_SIZE);
392                         kunmap_atomic(dst);
393
394                         set_page_dirty(page);
395                         if (obj->mm.madv == I915_MADV_WILLNEED)
396                                 mark_page_accessed(page);
397                         put_page(page);
398                         vaddr += PAGE_SIZE;
399                 }
400                 obj->mm.dirty = false;
401         }
402
403         sg_free_table(pages);
404         kfree(pages);
405
406         drm_pci_free(obj->base.dev, obj->phys_handle);
407 }
408
409 static void
410 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
411 {
412         i915_gem_object_unpin_pages(obj);
413 }
414
415 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
416         .get_pages = i915_gem_object_get_pages_phys,
417         .put_pages = i915_gem_object_put_pages_phys,
418         .release = i915_gem_object_release_phys,
419 };
420
421 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
422
423 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
424 {
425         struct i915_vma *vma;
426         LIST_HEAD(still_in_list);
427         int ret;
428
429         lockdep_assert_held(&obj->base.dev->struct_mutex);
430
431         /* Closed vma are removed from the obj->vma_list - but they may
432          * still have an active binding on the object. To remove those we
433          * must wait for all rendering to complete to the object (as unbinding
434          * must anyway), and retire the requests.
435          */
436         ret = i915_gem_object_set_to_cpu_domain(obj, false);
437         if (ret)
438                 return ret;
439
440         while ((vma = list_first_entry_or_null(&obj->vma_list,
441                                                struct i915_vma,
442                                                obj_link))) {
443                 list_move_tail(&vma->obj_link, &still_in_list);
444                 ret = i915_vma_unbind(vma);
445                 if (ret)
446                         break;
447         }
448         list_splice(&still_in_list, &obj->vma_list);
449
450         return ret;
451 }
452
453 static long
454 i915_gem_object_wait_fence(struct dma_fence *fence,
455                            unsigned int flags,
456                            long timeout,
457                            struct intel_rps_client *rps_client)
458 {
459         struct i915_request *rq;
460
461         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
462
463         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
464                 return timeout;
465
466         if (!dma_fence_is_i915(fence))
467                 return dma_fence_wait_timeout(fence,
468                                               flags & I915_WAIT_INTERRUPTIBLE,
469                                               timeout);
470
471         rq = to_request(fence);
472         if (i915_request_completed(rq))
473                 goto out;
474
475         /*
476          * This client is about to stall waiting for the GPU. In many cases
477          * this is undesirable and limits the throughput of the system, as
478          * many clients cannot continue processing user input/output whilst
479          * blocked. RPS autotuning may take tens of milliseconds to respond
480          * to the GPU load and thus incurs additional latency for the client.
481          * We can circumvent that by promoting the GPU frequency to maximum
482          * before we wait. This makes the GPU throttle up much more quickly
483          * (good for benchmarks and user experience, e.g. window animations),
484          * but at a cost of spending more power processing the workload
485          * (bad for battery). Not all clients even want their results
486          * immediately and for them we should just let the GPU select its own
487          * frequency to maximise efficiency. To prevent a single client from
488          * forcing the clocks too high for the whole system, we only allow
489          * each client to waitboost once in a busy period.
490          */
491         if (rps_client && !i915_request_started(rq)) {
492                 if (INTEL_GEN(rq->i915) >= 6)
493                         gen6_rps_boost(rq, rps_client);
494         }
495
496         timeout = i915_request_wait(rq, flags, timeout);
497
498 out:
499         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
500                 i915_request_retire_upto(rq);
501
502         return timeout;
503 }
504
505 static long
506 i915_gem_object_wait_reservation(struct reservation_object *resv,
507                                  unsigned int flags,
508                                  long timeout,
509                                  struct intel_rps_client *rps_client)
510 {
511         unsigned int seq = __read_seqcount_begin(&resv->seq);
512         struct dma_fence *excl;
513         bool prune_fences = false;
514
515         if (flags & I915_WAIT_ALL) {
516                 struct dma_fence **shared;
517                 unsigned int count, i;
518                 int ret;
519
520                 ret = reservation_object_get_fences_rcu(resv,
521                                                         &excl, &count, &shared);
522                 if (ret)
523                         return ret;
524
525                 for (i = 0; i < count; i++) {
526                         timeout = i915_gem_object_wait_fence(shared[i],
527                                                              flags, timeout,
528                                                              rps_client);
529                         if (timeout < 0)
530                                 break;
531
532                         dma_fence_put(shared[i]);
533                 }
534
535                 for (; i < count; i++)
536                         dma_fence_put(shared[i]);
537                 kfree(shared);
538
539                 /*
540                  * If both shared fences and an exclusive fence exist,
541                  * then by construction the shared fences must be later
542                  * than the exclusive fence. If we successfully wait for
543                  * all the shared fences, we know that the exclusive fence
544                  * must all be signaled. If all the shared fences are
545                  * signaled, we can prune the array and recover the
546                  * floating references on the fences/requests.
547                  */
548                 prune_fences = count && timeout >= 0;
549         } else {
550                 excl = reservation_object_get_excl_rcu(resv);
551         }
552
553         if (excl && timeout >= 0)
554                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
555                                                      rps_client);
556
557         dma_fence_put(excl);
558
559         /*
560          * Opportunistically prune the fences iff we know they have *all* been
561          * signaled and that the reservation object has not been changed (i.e.
562          * no new fences have been added).
563          */
564         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
565                 if (reservation_object_trylock(resv)) {
566                         if (!__read_seqcount_retry(&resv->seq, seq))
567                                 reservation_object_add_excl_fence(resv, NULL);
568                         reservation_object_unlock(resv);
569                 }
570         }
571
572         return timeout;
573 }
574
575 static void __fence_set_priority(struct dma_fence *fence,
576                                  const struct i915_sched_attr *attr)
577 {
578         struct i915_request *rq;
579         struct intel_engine_cs *engine;
580
581         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
582                 return;
583
584         rq = to_request(fence);
585         engine = rq->engine;
586
587         local_bh_disable();
588         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
589         if (engine->schedule)
590                 engine->schedule(rq, attr);
591         rcu_read_unlock();
592         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
593 }
594
595 static void fence_set_priority(struct dma_fence *fence,
596                                const struct i915_sched_attr *attr)
597 {
598         /* Recurse once into a fence-array */
599         if (dma_fence_is_array(fence)) {
600                 struct dma_fence_array *array = to_dma_fence_array(fence);
601                 int i;
602
603                 for (i = 0; i < array->num_fences; i++)
604                         __fence_set_priority(array->fences[i], attr);
605         } else {
606                 __fence_set_priority(fence, attr);
607         }
608 }
609
610 int
611 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
612                               unsigned int flags,
613                               const struct i915_sched_attr *attr)
614 {
615         struct dma_fence *excl;
616
617         if (flags & I915_WAIT_ALL) {
618                 struct dma_fence **shared;
619                 unsigned int count, i;
620                 int ret;
621
622                 ret = reservation_object_get_fences_rcu(obj->resv,
623                                                         &excl, &count, &shared);
624                 if (ret)
625                         return ret;
626
627                 for (i = 0; i < count; i++) {
628                         fence_set_priority(shared[i], attr);
629                         dma_fence_put(shared[i]);
630                 }
631
632                 kfree(shared);
633         } else {
634                 excl = reservation_object_get_excl_rcu(obj->resv);
635         }
636
637         if (excl) {
638                 fence_set_priority(excl, attr);
639                 dma_fence_put(excl);
640         }
641         return 0;
642 }
643
644 /**
645  * Waits for rendering to the object to be completed
646  * @obj: i915 gem object
647  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
648  * @timeout: how long to wait
649  * @rps_client: client (user process) to charge for any waitboosting
650  */
651 int
652 i915_gem_object_wait(struct drm_i915_gem_object *obj,
653                      unsigned int flags,
654                      long timeout,
655                      struct intel_rps_client *rps_client)
656 {
657         might_sleep();
658 #if IS_ENABLED(CONFIG_LOCKDEP)
659         GEM_BUG_ON(debug_locks &&
660                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
661                    !!(flags & I915_WAIT_LOCKED));
662 #endif
663         GEM_BUG_ON(timeout < 0);
664
665         timeout = i915_gem_object_wait_reservation(obj->resv,
666                                                    flags, timeout,
667                                                    rps_client);
668         return timeout < 0 ? timeout : 0;
669 }
670
671 static struct intel_rps_client *to_rps_client(struct drm_file *file)
672 {
673         struct drm_i915_file_private *fpriv = file->driver_priv;
674
675         return &fpriv->rps_client;
676 }
677
678 static int
679 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
680                      struct drm_i915_gem_pwrite *args,
681                      struct drm_file *file)
682 {
683         void *vaddr = obj->phys_handle->vaddr + args->offset;
684         char __user *user_data = u64_to_user_ptr(args->data_ptr);
685
686         /* We manually control the domain here and pretend that it
687          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
688          */
689         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
690         if (copy_from_user(vaddr, user_data, args->size))
691                 return -EFAULT;
692
693         drm_clflush_virt_range(vaddr, args->size);
694         i915_gem_chipset_flush(to_i915(obj->base.dev));
695
696         intel_fb_obj_flush(obj, ORIGIN_CPU);
697         return 0;
698 }
699
700 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
701 {
702         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
703 }
704
705 void i915_gem_object_free(struct drm_i915_gem_object *obj)
706 {
707         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
708         kmem_cache_free(dev_priv->objects, obj);
709 }
710
711 static int
712 i915_gem_create(struct drm_file *file,
713                 struct drm_i915_private *dev_priv,
714                 uint64_t size,
715                 uint32_t *handle_p)
716 {
717         struct drm_i915_gem_object *obj;
718         int ret;
719         u32 handle;
720
721         size = roundup(size, PAGE_SIZE);
722         if (size == 0)
723                 return -EINVAL;
724
725         /* Allocate the new object */
726         obj = i915_gem_object_create(dev_priv, size);
727         if (IS_ERR(obj))
728                 return PTR_ERR(obj);
729
730         ret = drm_gem_handle_create(file, &obj->base, &handle);
731         /* drop reference from allocate - handle holds it now */
732         i915_gem_object_put(obj);
733         if (ret)
734                 return ret;
735
736         *handle_p = handle;
737         return 0;
738 }
739
740 int
741 i915_gem_dumb_create(struct drm_file *file,
742                      struct drm_device *dev,
743                      struct drm_mode_create_dumb *args)
744 {
745         /* have to work out size/pitch and return them */
746         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
747         args->size = args->pitch * args->height;
748         return i915_gem_create(file, to_i915(dev),
749                                args->size, &args->handle);
750 }
751
752 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
753 {
754         return !(obj->cache_level == I915_CACHE_NONE ||
755                  obj->cache_level == I915_CACHE_WT);
756 }
757
758 /**
759  * Creates a new mm object and returns a handle to it.
760  * @dev: drm device pointer
761  * @data: ioctl data blob
762  * @file: drm file pointer
763  */
764 int
765 i915_gem_create_ioctl(struct drm_device *dev, void *data,
766                       struct drm_file *file)
767 {
768         struct drm_i915_private *dev_priv = to_i915(dev);
769         struct drm_i915_gem_create *args = data;
770
771         i915_gem_flush_free_objects(dev_priv);
772
773         return i915_gem_create(file, dev_priv,
774                                args->size, &args->handle);
775 }
776
777 static inline enum fb_op_origin
778 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
779 {
780         return (domain == I915_GEM_DOMAIN_GTT ?
781                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
782 }
783
784 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
785 {
786         /*
787          * No actual flushing is required for the GTT write domain for reads
788          * from the GTT domain. Writes to it "immediately" go to main memory
789          * as far as we know, so there's no chipset flush. It also doesn't
790          * land in the GPU render cache.
791          *
792          * However, we do have to enforce the order so that all writes through
793          * the GTT land before any writes to the device, such as updates to
794          * the GATT itself.
795          *
796          * We also have to wait a bit for the writes to land from the GTT.
797          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
798          * timing. This issue has only been observed when switching quickly
799          * between GTT writes and CPU reads from inside the kernel on recent hw,
800          * and it appears to only affect discrete GTT blocks (i.e. on LLC
801          * system agents we cannot reproduce this behaviour, until Cannonlake
802          * that was!).
803          */
804
805         i915_gem_chipset_flush(dev_priv);
806
807         intel_runtime_pm_get(dev_priv);
808         spin_lock_irq(&dev_priv->uncore.lock);
809
810         POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
811
812         spin_unlock_irq(&dev_priv->uncore.lock);
813         intel_runtime_pm_put(dev_priv);
814 }
815
816 static void
817 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
818 {
819         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
820         struct i915_vma *vma;
821
822         if (!(obj->write_domain & flush_domains))
823                 return;
824
825         switch (obj->write_domain) {
826         case I915_GEM_DOMAIN_GTT:
827                 i915_gem_flush_ggtt_writes(dev_priv);
828
829                 intel_fb_obj_flush(obj,
830                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
831
832                 for_each_ggtt_vma(vma, obj) {
833                         if (vma->iomap)
834                                 continue;
835
836                         i915_vma_unset_ggtt_write(vma);
837                 }
838                 break;
839
840         case I915_GEM_DOMAIN_WC:
841                 wmb();
842                 break;
843
844         case I915_GEM_DOMAIN_CPU:
845                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
846                 break;
847
848         case I915_GEM_DOMAIN_RENDER:
849                 if (gpu_write_needs_clflush(obj))
850                         obj->cache_dirty = true;
851                 break;
852         }
853
854         obj->write_domain = 0;
855 }
856
857 static inline int
858 __copy_to_user_swizzled(char __user *cpu_vaddr,
859                         const char *gpu_vaddr, int gpu_offset,
860                         int length)
861 {
862         int ret, cpu_offset = 0;
863
864         while (length > 0) {
865                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
866                 int this_length = min(cacheline_end - gpu_offset, length);
867                 int swizzled_gpu_offset = gpu_offset ^ 64;
868
869                 ret = __copy_to_user(cpu_vaddr + cpu_offset,
870                                      gpu_vaddr + swizzled_gpu_offset,
871                                      this_length);
872                 if (ret)
873                         return ret + length;
874
875                 cpu_offset += this_length;
876                 gpu_offset += this_length;
877                 length -= this_length;
878         }
879
880         return 0;
881 }
882
883 static inline int
884 __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
885                           const char __user *cpu_vaddr,
886                           int length)
887 {
888         int ret, cpu_offset = 0;
889
890         while (length > 0) {
891                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
892                 int this_length = min(cacheline_end - gpu_offset, length);
893                 int swizzled_gpu_offset = gpu_offset ^ 64;
894
895                 ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
896                                        cpu_vaddr + cpu_offset,
897                                        this_length);
898                 if (ret)
899                         return ret + length;
900
901                 cpu_offset += this_length;
902                 gpu_offset += this_length;
903                 length -= this_length;
904         }
905
906         return 0;
907 }
908
909 /*
910  * Pins the specified object's pages and synchronizes the object with
911  * GPU accesses. Sets needs_clflush to non-zero if the caller should
912  * flush the object from the CPU cache.
913  */
914 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
915                                     unsigned int *needs_clflush)
916 {
917         int ret;
918
919         lockdep_assert_held(&obj->base.dev->struct_mutex);
920
921         *needs_clflush = 0;
922         if (!i915_gem_object_has_struct_page(obj))
923                 return -ENODEV;
924
925         ret = i915_gem_object_wait(obj,
926                                    I915_WAIT_INTERRUPTIBLE |
927                                    I915_WAIT_LOCKED,
928                                    MAX_SCHEDULE_TIMEOUT,
929                                    NULL);
930         if (ret)
931                 return ret;
932
933         ret = i915_gem_object_pin_pages(obj);
934         if (ret)
935                 return ret;
936
937         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
938             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
939                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
940                 if (ret)
941                         goto err_unpin;
942                 else
943                         goto out;
944         }
945
946         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
947
948         /* If we're not in the cpu read domain, set ourself into the gtt
949          * read domain and manually flush cachelines (if required). This
950          * optimizes for the case when the gpu will dirty the data
951          * anyway again before the next pread happens.
952          */
953         if (!obj->cache_dirty &&
954             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
955                 *needs_clflush = CLFLUSH_BEFORE;
956
957 out:
958         /* return with the pages pinned */
959         return 0;
960
961 err_unpin:
962         i915_gem_object_unpin_pages(obj);
963         return ret;
964 }
965
966 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
967                                      unsigned int *needs_clflush)
968 {
969         int ret;
970
971         lockdep_assert_held(&obj->base.dev->struct_mutex);
972
973         *needs_clflush = 0;
974         if (!i915_gem_object_has_struct_page(obj))
975                 return -ENODEV;
976
977         ret = i915_gem_object_wait(obj,
978                                    I915_WAIT_INTERRUPTIBLE |
979                                    I915_WAIT_LOCKED |
980                                    I915_WAIT_ALL,
981                                    MAX_SCHEDULE_TIMEOUT,
982                                    NULL);
983         if (ret)
984                 return ret;
985
986         ret = i915_gem_object_pin_pages(obj);
987         if (ret)
988                 return ret;
989
990         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
991             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
992                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
993                 if (ret)
994                         goto err_unpin;
995                 else
996                         goto out;
997         }
998
999         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
1000
1001         /* If we're not in the cpu write domain, set ourself into the
1002          * gtt write domain and manually flush cachelines (as required).
1003          * This optimizes for the case when the gpu will use the data
1004          * right away and we therefore have to clflush anyway.
1005          */
1006         if (!obj->cache_dirty) {
1007                 *needs_clflush |= CLFLUSH_AFTER;
1008
1009                 /*
1010                  * Same trick applies to invalidate partially written
1011                  * cachelines read before writing.
1012                  */
1013                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
1014                         *needs_clflush |= CLFLUSH_BEFORE;
1015         }
1016
1017 out:
1018         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1019         obj->mm.dirty = true;
1020         /* return with the pages pinned */
1021         return 0;
1022
1023 err_unpin:
1024         i915_gem_object_unpin_pages(obj);
1025         return ret;
1026 }
1027
1028 static void
1029 shmem_clflush_swizzled_range(char *addr, unsigned long length,
1030                              bool swizzled)
1031 {
1032         if (unlikely(swizzled)) {
1033                 unsigned long start = (unsigned long) addr;
1034                 unsigned long end = (unsigned long) addr + length;
1035
1036                 /* For swizzling simply ensure that we always flush both
1037                  * channels. Lame, but simple and it works. Swizzled
1038                  * pwrite/pread is far from a hotpath - current userspace
1039                  * doesn't use it at all. */
1040                 start = round_down(start, 128);
1041                 end = round_up(end, 128);
1042
1043                 drm_clflush_virt_range((void *)start, end - start);
1044         } else {
1045                 drm_clflush_virt_range(addr, length);
1046         }
1047
1048 }
1049
1050 /* Only difference to the fast-path function is that this can handle bit17
1051  * and uses non-atomic copy and kmap functions. */
1052 static int
1053 shmem_pread_slow(struct page *page, int offset, int length,
1054                  char __user *user_data,
1055                  bool page_do_bit17_swizzling, bool needs_clflush)
1056 {
1057         char *vaddr;
1058         int ret;
1059
1060         vaddr = kmap(page);
1061         if (needs_clflush)
1062                 shmem_clflush_swizzled_range(vaddr + offset, length,
1063                                              page_do_bit17_swizzling);
1064
1065         if (page_do_bit17_swizzling)
1066                 ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
1067         else
1068                 ret = __copy_to_user(user_data, vaddr + offset, length);
1069         kunmap(page);
1070
1071         return ret ? - EFAULT : 0;
1072 }
1073
1074 static int
1075 shmem_pread(struct page *page, int offset, int length, char __user *user_data,
1076             bool page_do_bit17_swizzling, bool needs_clflush)
1077 {
1078         int ret;
1079
1080         ret = -ENODEV;
1081         if (!page_do_bit17_swizzling) {
1082                 char *vaddr = kmap_atomic(page);
1083
1084                 if (needs_clflush)
1085                         drm_clflush_virt_range(vaddr + offset, length);
1086                 ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
1087                 kunmap_atomic(vaddr);
1088         }
1089         if (ret == 0)
1090                 return 0;
1091
1092         return shmem_pread_slow(page, offset, length, user_data,
1093                                 page_do_bit17_swizzling, needs_clflush);
1094 }
1095
1096 static int
1097 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1098                      struct drm_i915_gem_pread *args)
1099 {
1100         char __user *user_data;
1101         u64 remain;
1102         unsigned int obj_do_bit17_swizzling;
1103         unsigned int needs_clflush;
1104         unsigned int idx, offset;
1105         int ret;
1106
1107         obj_do_bit17_swizzling = 0;
1108         if (i915_gem_object_needs_bit17_swizzle(obj))
1109                 obj_do_bit17_swizzling = BIT(17);
1110
1111         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1112         if (ret)
1113                 return ret;
1114
1115         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1116         mutex_unlock(&obj->base.dev->struct_mutex);
1117         if (ret)
1118                 return ret;
1119
1120         remain = args->size;
1121         user_data = u64_to_user_ptr(args->data_ptr);
1122         offset = offset_in_page(args->offset);
1123         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1124                 struct page *page = i915_gem_object_get_page(obj, idx);
1125                 int length;
1126
1127                 length = remain;
1128                 if (offset + length > PAGE_SIZE)
1129                         length = PAGE_SIZE - offset;
1130
1131                 ret = shmem_pread(page, offset, length, user_data,
1132                                   page_to_phys(page) & obj_do_bit17_swizzling,
1133                                   needs_clflush);
1134                 if (ret)
1135                         break;
1136
1137                 remain -= length;
1138                 user_data += length;
1139                 offset = 0;
1140         }
1141
1142         i915_gem_obj_finish_shmem_access(obj);
1143         return ret;
1144 }
1145
1146 static inline bool
1147 gtt_user_read(struct io_mapping *mapping,
1148               loff_t base, int offset,
1149               char __user *user_data, int length)
1150 {
1151         void __iomem *vaddr;
1152         unsigned long unwritten;
1153
1154         /* We can use the cpu mem copy function because this is X86. */
1155         vaddr = io_mapping_map_atomic_wc(mapping, base);
1156         unwritten = __copy_to_user_inatomic(user_data,
1157                                             (void __force *)vaddr + offset,
1158                                             length);
1159         io_mapping_unmap_atomic(vaddr);
1160         if (unwritten) {
1161                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1162                 unwritten = copy_to_user(user_data,
1163                                          (void __force *)vaddr + offset,
1164                                          length);
1165                 io_mapping_unmap(vaddr);
1166         }
1167         return unwritten;
1168 }
1169
1170 static int
1171 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1172                    const struct drm_i915_gem_pread *args)
1173 {
1174         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1175         struct i915_ggtt *ggtt = &i915->ggtt;
1176         struct drm_mm_node node;
1177         struct i915_vma *vma;
1178         void __user *user_data;
1179         u64 remain, offset;
1180         int ret;
1181
1182         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1183         if (ret)
1184                 return ret;
1185
1186         intel_runtime_pm_get(i915);
1187         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1188                                        PIN_MAPPABLE |
1189                                        PIN_NONFAULT |
1190                                        PIN_NONBLOCK);
1191         if (!IS_ERR(vma)) {
1192                 node.start = i915_ggtt_offset(vma);
1193                 node.allocated = false;
1194                 ret = i915_vma_put_fence(vma);
1195                 if (ret) {
1196                         i915_vma_unpin(vma);
1197                         vma = ERR_PTR(ret);
1198                 }
1199         }
1200         if (IS_ERR(vma)) {
1201                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1202                 if (ret)
1203                         goto out_unlock;
1204                 GEM_BUG_ON(!node.allocated);
1205         }
1206
1207         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1208         if (ret)
1209                 goto out_unpin;
1210
1211         mutex_unlock(&i915->drm.struct_mutex);
1212
1213         user_data = u64_to_user_ptr(args->data_ptr);
1214         remain = args->size;
1215         offset = args->offset;
1216
1217         while (remain > 0) {
1218                 /* Operation in this page
1219                  *
1220                  * page_base = page offset within aperture
1221                  * page_offset = offset within page
1222                  * page_length = bytes to copy for this page
1223                  */
1224                 u32 page_base = node.start;
1225                 unsigned page_offset = offset_in_page(offset);
1226                 unsigned page_length = PAGE_SIZE - page_offset;
1227                 page_length = remain < page_length ? remain : page_length;
1228                 if (node.allocated) {
1229                         wmb();
1230                         ggtt->vm.insert_page(&ggtt->vm,
1231                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1232                                              node.start, I915_CACHE_NONE, 0);
1233                         wmb();
1234                 } else {
1235                         page_base += offset & PAGE_MASK;
1236                 }
1237
1238                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1239                                   user_data, page_length)) {
1240                         ret = -EFAULT;
1241                         break;
1242                 }
1243
1244                 remain -= page_length;
1245                 user_data += page_length;
1246                 offset += page_length;
1247         }
1248
1249         mutex_lock(&i915->drm.struct_mutex);
1250 out_unpin:
1251         if (node.allocated) {
1252                 wmb();
1253                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1254                 remove_mappable_node(&node);
1255         } else {
1256                 i915_vma_unpin(vma);
1257         }
1258 out_unlock:
1259         intel_runtime_pm_put(i915);
1260         mutex_unlock(&i915->drm.struct_mutex);
1261
1262         return ret;
1263 }
1264
1265 /**
1266  * Reads data from the object referenced by handle.
1267  * @dev: drm device pointer
1268  * @data: ioctl data blob
1269  * @file: drm file pointer
1270  *
1271  * On error, the contents of *data are undefined.
1272  */
1273 int
1274 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1275                      struct drm_file *file)
1276 {
1277         struct drm_i915_gem_pread *args = data;
1278         struct drm_i915_gem_object *obj;
1279         int ret;
1280
1281         if (args->size == 0)
1282                 return 0;
1283
1284         if (!access_ok(VERIFY_WRITE,
1285                        u64_to_user_ptr(args->data_ptr),
1286                        args->size))
1287                 return -EFAULT;
1288
1289         obj = i915_gem_object_lookup(file, args->handle);
1290         if (!obj)
1291                 return -ENOENT;
1292
1293         /* Bounds check source.  */
1294         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1295                 ret = -EINVAL;
1296                 goto out;
1297         }
1298
1299         trace_i915_gem_object_pread(obj, args->offset, args->size);
1300
1301         ret = i915_gem_object_wait(obj,
1302                                    I915_WAIT_INTERRUPTIBLE,
1303                                    MAX_SCHEDULE_TIMEOUT,
1304                                    to_rps_client(file));
1305         if (ret)
1306                 goto out;
1307
1308         ret = i915_gem_object_pin_pages(obj);
1309         if (ret)
1310                 goto out;
1311
1312         ret = i915_gem_shmem_pread(obj, args);
1313         if (ret == -EFAULT || ret == -ENODEV)
1314                 ret = i915_gem_gtt_pread(obj, args);
1315
1316         i915_gem_object_unpin_pages(obj);
1317 out:
1318         i915_gem_object_put(obj);
1319         return ret;
1320 }
1321
1322 /* This is the fast write path which cannot handle
1323  * page faults in the source data
1324  */
1325
1326 static inline bool
1327 ggtt_write(struct io_mapping *mapping,
1328            loff_t base, int offset,
1329            char __user *user_data, int length)
1330 {
1331         void __iomem *vaddr;
1332         unsigned long unwritten;
1333
1334         /* We can use the cpu mem copy function because this is X86. */
1335         vaddr = io_mapping_map_atomic_wc(mapping, base);
1336         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1337                                                       user_data, length);
1338         io_mapping_unmap_atomic(vaddr);
1339         if (unwritten) {
1340                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1341                 unwritten = copy_from_user((void __force *)vaddr + offset,
1342                                            user_data, length);
1343                 io_mapping_unmap(vaddr);
1344         }
1345
1346         return unwritten;
1347 }
1348
1349 /**
1350  * This is the fast pwrite path, where we copy the data directly from the
1351  * user into the GTT, uncached.
1352  * @obj: i915 GEM object
1353  * @args: pwrite arguments structure
1354  */
1355 static int
1356 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1357                          const struct drm_i915_gem_pwrite *args)
1358 {
1359         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1360         struct i915_ggtt *ggtt = &i915->ggtt;
1361         struct drm_mm_node node;
1362         struct i915_vma *vma;
1363         u64 remain, offset;
1364         void __user *user_data;
1365         int ret;
1366
1367         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1368         if (ret)
1369                 return ret;
1370
1371         if (i915_gem_object_has_struct_page(obj)) {
1372                 /*
1373                  * Avoid waking the device up if we can fallback, as
1374                  * waking/resuming is very slow (worst-case 10-100 ms
1375                  * depending on PCI sleeps and our own resume time).
1376                  * This easily dwarfs any performance advantage from
1377                  * using the cache bypass of indirect GGTT access.
1378                  */
1379                 if (!intel_runtime_pm_get_if_in_use(i915)) {
1380                         ret = -EFAULT;
1381                         goto out_unlock;
1382                 }
1383         } else {
1384                 /* No backing pages, no fallback, we must force GGTT access */
1385                 intel_runtime_pm_get(i915);
1386         }
1387
1388         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1389                                        PIN_MAPPABLE |
1390                                        PIN_NONFAULT |
1391                                        PIN_NONBLOCK);
1392         if (!IS_ERR(vma)) {
1393                 node.start = i915_ggtt_offset(vma);
1394                 node.allocated = false;
1395                 ret = i915_vma_put_fence(vma);
1396                 if (ret) {
1397                         i915_vma_unpin(vma);
1398                         vma = ERR_PTR(ret);
1399                 }
1400         }
1401         if (IS_ERR(vma)) {
1402                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1403                 if (ret)
1404                         goto out_rpm;
1405                 GEM_BUG_ON(!node.allocated);
1406         }
1407
1408         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1409         if (ret)
1410                 goto out_unpin;
1411
1412         mutex_unlock(&i915->drm.struct_mutex);
1413
1414         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1415
1416         user_data = u64_to_user_ptr(args->data_ptr);
1417         offset = args->offset;
1418         remain = args->size;
1419         while (remain) {
1420                 /* Operation in this page
1421                  *
1422                  * page_base = page offset within aperture
1423                  * page_offset = offset within page
1424                  * page_length = bytes to copy for this page
1425                  */
1426                 u32 page_base = node.start;
1427                 unsigned int page_offset = offset_in_page(offset);
1428                 unsigned int page_length = PAGE_SIZE - page_offset;
1429                 page_length = remain < page_length ? remain : page_length;
1430                 if (node.allocated) {
1431                         wmb(); /* flush the write before we modify the GGTT */
1432                         ggtt->vm.insert_page(&ggtt->vm,
1433                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1434                                              node.start, I915_CACHE_NONE, 0);
1435                         wmb(); /* flush modifications to the GGTT (insert_page) */
1436                 } else {
1437                         page_base += offset & PAGE_MASK;
1438                 }
1439                 /* If we get a fault while copying data, then (presumably) our
1440                  * source page isn't available.  Return the error and we'll
1441                  * retry in the slow path.
1442                  * If the object is non-shmem backed, we retry again with the
1443                  * path that handles page fault.
1444                  */
1445                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1446                                user_data, page_length)) {
1447                         ret = -EFAULT;
1448                         break;
1449                 }
1450
1451                 remain -= page_length;
1452                 user_data += page_length;
1453                 offset += page_length;
1454         }
1455         intel_fb_obj_flush(obj, ORIGIN_CPU);
1456
1457         mutex_lock(&i915->drm.struct_mutex);
1458 out_unpin:
1459         if (node.allocated) {
1460                 wmb();
1461                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1462                 remove_mappable_node(&node);
1463         } else {
1464                 i915_vma_unpin(vma);
1465         }
1466 out_rpm:
1467         intel_runtime_pm_put(i915);
1468 out_unlock:
1469         mutex_unlock(&i915->drm.struct_mutex);
1470         return ret;
1471 }
1472
1473 static int
1474 shmem_pwrite_slow(struct page *page, int offset, int length,
1475                   char __user *user_data,
1476                   bool page_do_bit17_swizzling,
1477                   bool needs_clflush_before,
1478                   bool needs_clflush_after)
1479 {
1480         char *vaddr;
1481         int ret;
1482
1483         vaddr = kmap(page);
1484         if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
1485                 shmem_clflush_swizzled_range(vaddr + offset, length,
1486                                              page_do_bit17_swizzling);
1487         if (page_do_bit17_swizzling)
1488                 ret = __copy_from_user_swizzled(vaddr, offset, user_data,
1489                                                 length);
1490         else
1491                 ret = __copy_from_user(vaddr + offset, user_data, length);
1492         if (needs_clflush_after)
1493                 shmem_clflush_swizzled_range(vaddr + offset, length,
1494                                              page_do_bit17_swizzling);
1495         kunmap(page);
1496
1497         return ret ? -EFAULT : 0;
1498 }
1499
1500 /* Per-page copy function for the shmem pwrite fastpath.
1501  * Flushes invalid cachelines before writing to the target if
1502  * needs_clflush_before is set and flushes out any written cachelines after
1503  * writing if needs_clflush is set.
1504  */
1505 static int
1506 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1507              bool page_do_bit17_swizzling,
1508              bool needs_clflush_before,
1509              bool needs_clflush_after)
1510 {
1511         int ret;
1512
1513         ret = -ENODEV;
1514         if (!page_do_bit17_swizzling) {
1515                 char *vaddr = kmap_atomic(page);
1516
1517                 if (needs_clflush_before)
1518                         drm_clflush_virt_range(vaddr + offset, len);
1519                 ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
1520                 if (needs_clflush_after)
1521                         drm_clflush_virt_range(vaddr + offset, len);
1522
1523                 kunmap_atomic(vaddr);
1524         }
1525         if (ret == 0)
1526                 return ret;
1527
1528         return shmem_pwrite_slow(page, offset, len, user_data,
1529                                  page_do_bit17_swizzling,
1530                                  needs_clflush_before,
1531                                  needs_clflush_after);
1532 }
1533
1534 static int
1535 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1536                       const struct drm_i915_gem_pwrite *args)
1537 {
1538         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1539         void __user *user_data;
1540         u64 remain;
1541         unsigned int obj_do_bit17_swizzling;
1542         unsigned int partial_cacheline_write;
1543         unsigned int needs_clflush;
1544         unsigned int offset, idx;
1545         int ret;
1546
1547         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1548         if (ret)
1549                 return ret;
1550
1551         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1552         mutex_unlock(&i915->drm.struct_mutex);
1553         if (ret)
1554                 return ret;
1555
1556         obj_do_bit17_swizzling = 0;
1557         if (i915_gem_object_needs_bit17_swizzle(obj))
1558                 obj_do_bit17_swizzling = BIT(17);
1559
1560         /* If we don't overwrite a cacheline completely we need to be
1561          * careful to have up-to-date data by first clflushing. Don't
1562          * overcomplicate things and flush the entire span being written.
1563          */
1564         partial_cacheline_write = 0;
1565         if (needs_clflush & CLFLUSH_BEFORE)
1566                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
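        /*
         * Illustrative worked example (values assumed, not taken from this
         * code): with a 64-byte cacheline, partial_cacheline_write == 63.
         * A write at offset=128, length=64 gives (128 | 64) & 63 == 0, so
         * both ends are cacheline aligned and no clflush is needed before
         * writing; offset=130 gives (130 | 64) & 63 == 2, a partial
         * cacheline, so the range is clflushed first.
         */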
1567
1568         user_data = u64_to_user_ptr(args->data_ptr);
1569         remain = args->size;
1570         offset = offset_in_page(args->offset);
1571         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1572                 struct page *page = i915_gem_object_get_page(obj, idx);
1573                 int length;
1574
1575                 length = remain;
1576                 if (offset + length > PAGE_SIZE)
1577                         length = PAGE_SIZE - offset;
1578
1579                 ret = shmem_pwrite(page, offset, length, user_data,
1580                                    page_to_phys(page) & obj_do_bit17_swizzling,
1581                                    (offset | length) & partial_cacheline_write,
1582                                    needs_clflush & CLFLUSH_AFTER);
1583                 if (ret)
1584                         break;
1585
1586                 remain -= length;
1587                 user_data += length;
1588                 offset = 0;
1589         }
1590
1591         intel_fb_obj_flush(obj, ORIGIN_CPU);
1592         i915_gem_obj_finish_shmem_access(obj);
1593         return ret;
1594 }
1595
1596 /**
1597  * i915_gem_pwrite_ioctl - Writes data to the object referenced by handle.
1598  * @dev: drm device
1599  * @data: ioctl data blob
1600  * @file: drm file
1601  *
1602  * On error, the contents of the buffer that were to be modified are undefined.
1603  */
1604 int
1605 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1606                       struct drm_file *file)
1607 {
1608         struct drm_i915_gem_pwrite *args = data;
1609         struct drm_i915_gem_object *obj;
1610         int ret;
1611
1612         if (args->size == 0)
1613                 return 0;
1614
1615         if (!access_ok(VERIFY_READ,
1616                        u64_to_user_ptr(args->data_ptr),
1617                        args->size))
1618                 return -EFAULT;
1619
1620         obj = i915_gem_object_lookup(file, args->handle);
1621         if (!obj)
1622                 return -ENOENT;
1623
1624         /* Bounds check destination. */
1625         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1626                 ret = -EINVAL;
1627                 goto err;
1628         }
1629
1630         /* Writes not allowed into this read-only object */
1631         if (i915_gem_object_is_readonly(obj)) {
1632                 ret = -EINVAL;
1633                 goto err;
1634         }
1635
1636         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1637
1638         ret = -ENODEV;
1639         if (obj->ops->pwrite)
1640                 ret = obj->ops->pwrite(obj, args);
1641         if (ret != -ENODEV)
1642                 goto err;
1643
1644         ret = i915_gem_object_wait(obj,
1645                                    I915_WAIT_INTERRUPTIBLE |
1646                                    I915_WAIT_ALL,
1647                                    MAX_SCHEDULE_TIMEOUT,
1648                                    to_rps_client(file));
1649         if (ret)
1650                 goto err;
1651
1652         ret = i915_gem_object_pin_pages(obj);
1653         if (ret)
1654                 goto err;
1655
1656         ret = -EFAULT;
1657         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1658          * it would end up going through the fenced access, and we'll get
1659          * different detiling behavior between reading and writing.
1660          * pread/pwrite currently are reading and writing from the CPU
1661          * perspective, requiring manual detiling by the client.
1662          */
1663         if (!i915_gem_object_has_struct_page(obj) ||
1664             cpu_write_needs_clflush(obj))
1665                 /* Note that the gtt paths might fail with non-page-backed user
1666                  * pointers (e.g. gtt mappings when moving data between
1667                  * textures). Fallback to the shmem path in that case.
1668                  */
1669                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1670
1671         if (ret == -EFAULT || ret == -ENOSPC) {
1672                 if (obj->phys_handle)
1673                         ret = i915_gem_phys_pwrite(obj, args, file);
1674                 else
1675                         ret = i915_gem_shmem_pwrite(obj, args);
1676         }
1677
1678         i915_gem_object_unpin_pages(obj);
1679 err:
1680         i915_gem_object_put(obj);
1681         return ret;
1682 }
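
/*
 * Hedged userspace sketch (illustration only, not part of the driver): roughly
 * how a client might invoke this ioctl through libdrm; fd, handle and buf are
 * assumptions for illustration.
 *
 *	struct drm_i915_gem_pwrite pwrite = {
 *		.handle = handle,
 *		.offset = 0,
 *		.size = sizeof(buf),
 *		.data_ptr = (uintptr_t)buf,
 *	};
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite))
 *		perror("pwrite");
 *
 * A zero-length write returns 0 immediately, and writes beyond the object's
 * size fail with -EINVAL, as checked above.
 */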
1683
1684 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1685 {
1686         struct drm_i915_private *i915;
1687         struct list_head *list;
1688         struct i915_vma *vma;
1689
1690         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1691
1692         for_each_ggtt_vma(vma, obj) {
1693                 if (i915_vma_is_active(vma))
1694                         continue;
1695
1696                 if (!drm_mm_node_allocated(&vma->node))
1697                         continue;
1698
1699                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1700         }
1701
1702         i915 = to_i915(obj->base.dev);
1703         spin_lock(&i915->mm.obj_lock);
1704         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1705         list_move_tail(&obj->mm.link, list);
1706         spin_unlock(&i915->mm.obj_lock);
1707 }
1708
1709 /**
1710  * i915_gem_set_domain_ioctl - Called when user space prepares to use an object
1711  * with the CPU, either through the mmap ioctl's mapping or a GTT mapping.
1712  * @dev: drm device
1713  * @data: ioctl data blob
1714  * @file: drm file
1715  */
1716 int
1717 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1718                           struct drm_file *file)
1719 {
1720         struct drm_i915_gem_set_domain *args = data;
1721         struct drm_i915_gem_object *obj;
1722         uint32_t read_domains = args->read_domains;
1723         uint32_t write_domain = args->write_domain;
1724         int err;
1725
1726         /* Only handle setting domains to types used by the CPU. */
1727         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1728                 return -EINVAL;
1729
1730         /* Having something in the write domain implies it's in the read
1731          * domain, and only that read domain.  Enforce that in the request.
1732          */
1733         if (write_domain != 0 && read_domains != write_domain)
1734                 return -EINVAL;
1735
1736         obj = i915_gem_object_lookup(file, args->handle);
1737         if (!obj)
1738                 return -ENOENT;
1739
1740         /* Try to flush the object off the GPU without holding the lock.
1741          * We will repeat the flush holding the lock in the normal manner
1742          * to catch cases where we are gazumped.
1743          */
1744         err = i915_gem_object_wait(obj,
1745                                    I915_WAIT_INTERRUPTIBLE |
1746                                    (write_domain ? I915_WAIT_ALL : 0),
1747                                    MAX_SCHEDULE_TIMEOUT,
1748                                    to_rps_client(file));
1749         if (err)
1750                 goto out;
1751
1752         /*
1753          * Proxy objects do not control access to the backing storage, ergo
1754          * they cannot be used as a means to manipulate the cache domain
1755          * tracking for that backing storage. The proxy object is always
1756          * considered to be outside of any cache domain.
1757          */
1758         if (i915_gem_object_is_proxy(obj)) {
1759                 err = -ENXIO;
1760                 goto out;
1761         }
1762
1763         /*
1764          * Flush and acquire obj->pages so that we are coherent through
1765          * direct access in memory with previous cached writes through
1766          * shmemfs and that our cache domain tracking remains valid.
1767          * For example, if the obj->filp was moved to swap without us
1768          * being notified and releasing the pages, we would mistakenly
1769          * continue to assume that the obj remained out of the CPU cached
1770          * domain.
1771          */
1772         err = i915_gem_object_pin_pages(obj);
1773         if (err)
1774                 goto out;
1775
1776         err = i915_mutex_lock_interruptible(dev);
1777         if (err)
1778                 goto out_unpin;
1779
1780         if (read_domains & I915_GEM_DOMAIN_WC)
1781                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1782         else if (read_domains & I915_GEM_DOMAIN_GTT)
1783                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1784         else
1785                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1786
1787         /* And bump the LRU for this access */
1788         i915_gem_object_bump_inactive_ggtt(obj);
1789
1790         mutex_unlock(&dev->struct_mutex);
1791
1792         if (write_domain != 0)
1793                 intel_fb_obj_invalidate(obj,
1794                                         fb_write_origin(obj, write_domain));
1795
1796 out_unpin:
1797         i915_gem_object_unpin_pages(obj);
1798 out:
1799         i915_gem_object_put(obj);
1800         return err;
1801 }
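
/*
 * Hedged userspace sketch (illustration only, not part of the driver): moving
 * an object into the GTT domain for writing before touching a GTT mmap; fd and
 * handle are assumed to come from earlier setup.
 *
 *	struct drm_i915_gem_set_domain sd = {
 *		.handle = handle,
 *		.read_domains = I915_GEM_DOMAIN_GTT,
 *		.write_domain = I915_GEM_DOMAIN_GTT,
 *	};
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd);
 *
 * Note that a non-zero write_domain must equal read_domains, and GPU-only
 * domains are rejected with -EINVAL, as enforced above.
 */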
1802
1803 /**
1804  * i915_gem_sw_finish_ioctl - Called when user space has done writes to this buffer
1805  * @dev: drm device
1806  * @data: ioctl data blob
1807  * @file: drm file
1808  */
1809 int
1810 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1811                          struct drm_file *file)
1812 {
1813         struct drm_i915_gem_sw_finish *args = data;
1814         struct drm_i915_gem_object *obj;
1815
1816         obj = i915_gem_object_lookup(file, args->handle);
1817         if (!obj)
1818                 return -ENOENT;
1819
1820         /*
1821          * Proxy objects are barred from CPU access, so there is no
1822          * need to ban sw_finish as it is a nop.
1823          */
1824
1825         /* Pinned buffers may be scanout, so flush the cache */
1826         i915_gem_object_flush_if_display(obj);
1827         i915_gem_object_put(obj);
1828
1829         return 0;
1830 }
1831
1832 /**
1833  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1834  *                       it is mapped to.
1835  * @dev: drm device
1836  * @data: ioctl data blob
1837  * @file: drm file
1838  *
1839  * While the mapping holds a reference on the contents of the object, it doesn't
1840  * imply a ref on the object itself.
1841  *
1842  * IMPORTANT:
1843  *
1844  * DRM driver writers who look at this function as an example for how to do GEM
1845  * mmap support, please don't implement mmap support like here. The modern way
1846  * to implement DRM mmap support is with an mmap offset ioctl (like
1847  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1848  * That way debug tooling like valgrind will understand what's going on; hiding
1849  * the mmap call in a driver private ioctl will break that. The i915 driver only
1850  * does cpu mmaps this way because we didn't know better.
1851  */
1852 int
1853 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1854                     struct drm_file *file)
1855 {
1856         struct drm_i915_gem_mmap *args = data;
1857         struct drm_i915_gem_object *obj;
1858         unsigned long addr;
1859
1860         if (args->flags & ~(I915_MMAP_WC))
1861                 return -EINVAL;
1862
1863         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1864                 return -ENODEV;
1865
1866         obj = i915_gem_object_lookup(file, args->handle);
1867         if (!obj)
1868                 return -ENOENT;
1869
1870         /* prime objects have no backing filp to GEM mmap
1871          * pages from.
1872          */
1873         if (!obj->base.filp) {
1874                 i915_gem_object_put(obj);
1875                 return -ENXIO;
1876         }
1877
1878         addr = vm_mmap(obj->base.filp, 0, args->size,
1879                        PROT_READ | PROT_WRITE, MAP_SHARED,
1880                        args->offset);
1881         if (args->flags & I915_MMAP_WC) {
1882                 struct mm_struct *mm = current->mm;
1883                 struct vm_area_struct *vma;
1884
1885                 if (down_write_killable(&mm->mmap_sem)) {
1886                         i915_gem_object_put(obj);
1887                         return -EINTR;
1888                 }
1889                 vma = find_vma(mm, addr);
1890                 if (vma)
1891                         vma->vm_page_prot =
1892                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1893                 else
1894                         addr = -ENOMEM;
1895                 up_write(&mm->mmap_sem);
1896
1897                 /* This may race, but that's ok, it only gets set */
1898                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1899         }
1900         i915_gem_object_put(obj);
1901         if (IS_ERR((void *)addr))
1902                 return addr;
1903
1904         args->addr_ptr = (uint64_t) addr;
1905
1906         return 0;
1907 }
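
/*
 * Hedged userspace sketch (illustration only): a CPU mmap of a GEM object via
 * this ioctl, optionally requesting write-combining; fd, handle, size and data
 * are assumptions for illustration.
 *
 *	struct drm_i915_gem_mmap mm = {
 *		.handle = handle,
 *		.size = size,
 *		.flags = I915_MMAP_WC,
 *	};
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP, &mm) == 0)
 *		memcpy((void *)(uintptr_t)mm.addr_ptr, data, size);
 *
 * I915_MMAP_WC is rejected with -ENODEV when the CPU lacks PAT, as checked
 * above; and as the comment above warns, new drivers should expose an mmap
 * offset and let userspace mmap() the DRM fd directly instead.
 */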
1908
1909 static unsigned int tile_row_pages(struct drm_i915_gem_object *obj)
1910 {
1911         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1912 }
1913
1914 /**
1915  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1916  *
1917  * A history of the GTT mmap interface:
1918  *
1919  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to be
1920  *     aligned and suitable for fencing, and still fit into the available
1921  *     mappable space left by the pinned display objects. A classic problem
1922  *     we called the page-fault-of-doom where we would ping-pong between
1923  *     two objects that could not fit inside the GTT and so the memcpy
1924  *     would page one object in at the expense of the other between every
1925  *     single byte.
1926  *
1927  * 1 - Objects can be any size, and have any compatible fencing (X, Y, or none
1928  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1929  *     object is too large for the available space (or simply too large
1930  *     for the mappable aperture!), a view is created instead and faulted
1931  *     into userspace. (This view is aligned and sized appropriately for
1932  *     fenced access.)
1933  *
1934  * 2 - Recognise WC as a separate cache domain so that we can flush the
1935  *     delayed writes via GTT before performing direct access via WC.
1936  *
1937  * Restrictions:
1938  *
1939  *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1940  *    hangs on some architectures, corruption on others. An attempt to service
1941  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1942  *
1943  *  * the object must be able to fit into RAM (physical memory, though not
1944  *    limited to the mappable aperture).
1945  *
1947  * Caveats:
1948  *
1949  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1950  *    all data to system memory. Subsequent access will not be synchronized.
1951  *
1952  *  * all mappings are revoked on runtime device suspend.
1953  *
1954  *  * there are only 8, 16 or 32 fence registers to share between all users
1955  *    (older machines require a fence register for display and blitter access
1956  *    as well). Contention of the fence registers will cause the previous users
1957  *    to be unmapped and any new access will generate new page faults.
1958  *
1959  *  * running out of memory while servicing a fault may generate a SIGBUS,
1960  *    rather than the expected SIGSEGV.
1961  */
1962 int i915_gem_mmap_gtt_version(void)
1963 {
1964         return 2;
1965 }
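
/*
 * Hedged userspace sketch (illustration only): querying the GTT mmap feature
 * level reported above via GETPARAM; fd is assumed to be an open DRM fd.
 *
 *	int gtt_version = 0;
 *	struct drm_i915_getparam gp = {
 *		.param = I915_PARAM_MMAP_GTT_VERSION,
 *		.value = &gtt_version,
 *	};
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0 && gtt_version >= 1)
 *		;	/* partial views supported, objects may exceed the aperture */
 */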
1966
1967 static inline struct i915_ggtt_view
1968 compute_partial_view(struct drm_i915_gem_object *obj,
1969                      pgoff_t page_offset,
1970                      unsigned int chunk)
1971 {
1972         struct i915_ggtt_view view;
1973
1974         if (i915_gem_object_is_tiled(obj))
1975                 chunk = roundup(chunk, tile_row_pages(obj));
1976
1977         view.type = I915_GGTT_VIEW_PARTIAL;
1978         view.partial.offset = rounddown(page_offset, chunk);
1979         view.partial.size =
1980                 min_t(unsigned int, chunk,
1981                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1982
1983         /* If the partial covers the entire object, just create a normal VMA. */
1984         if (chunk >= obj->base.size >> PAGE_SHIFT)
1985                 view.type = I915_GGTT_VIEW_NORMAL;
1986
1987         return view;
1988 }
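
/*
 * Worked example (numbers assumed for illustration): with 4KiB pages,
 * MIN_CHUNK_PAGES is 256. Faulting page 1000 of an untiled 2048-page object
 * yields partial.offset = rounddown(1000, 256) = 768 and partial.size =
 * min(256, 2048 - 768) = 256; since the chunk does not cover the whole
 * object, the view stays I915_GGTT_VIEW_PARTIAL.
 */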
1989
1990 /**
1991  * i915_gem_fault - fault a page into the GTT
1992  * @vmf: fault info
1993  *
1994  * The fault handler is set up by drm_gem_mmap() when an object is GTT mapped
1995  * from userspace.  The fault handler takes care of binding the object to
1996  * the GTT (if needed), allocating and programming a fence register (again,
1997  * only if needed based on whether the old reg is still valid or the object
1998  * is tiled) and inserting a new PTE into the faulting process.
1999  *
2000  * Note that the faulting process may involve evicting existing objects
2001  * from the GTT and/or fence registers to make room.  So performance may
2002  * suffer if the GTT working set is large or there are few fence registers
2003  * left.
2004  *
2005  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
2006  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
2007  */
2008 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
2009 {
2010 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
2011         struct vm_area_struct *area = vmf->vma;
2012         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
2013         struct drm_device *dev = obj->base.dev;
2014         struct drm_i915_private *dev_priv = to_i915(dev);
2015         struct i915_ggtt *ggtt = &dev_priv->ggtt;
2016         bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
2017         struct i915_vma *vma;
2018         pgoff_t page_offset;
2019         int ret;
2020
2021         /* Sanity check that we allow writing into this object */
2022         if (i915_gem_object_is_readonly(obj) && write)
2023                 return VM_FAULT_SIGBUS;
2024
2025         /* We don't use vmf->pgoff since that has the fake offset */
2026         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
2027
2028         trace_i915_gem_object_fault(obj, page_offset, true, write);
2029
2030         /* Try to flush the object off the GPU first without holding the lock.
2031          * Upon acquiring the lock, we will perform our sanity checks and then
2032          * repeat the flush holding the lock in the normal manner to catch cases
2033          * where we are gazumped.
2034          */
2035         ret = i915_gem_object_wait(obj,
2036                                    I915_WAIT_INTERRUPTIBLE,
2037                                    MAX_SCHEDULE_TIMEOUT,
2038                                    NULL);
2039         if (ret)
2040                 goto err;
2041
2042         ret = i915_gem_object_pin_pages(obj);
2043         if (ret)
2044                 goto err;
2045
2046         intel_runtime_pm_get(dev_priv);
2047
2048         ret = i915_mutex_lock_interruptible(dev);
2049         if (ret)
2050                 goto err_rpm;
2051
2052         /* Access to snoopable pages through the GTT is incoherent. */
2053         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
2054                 ret = -EFAULT;
2055                 goto err_unlock;
2056         }
2057
2059         /* Now pin it into the GTT as needed */
2060         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
2061                                        PIN_MAPPABLE |
2062                                        PIN_NONBLOCK |
2063                                        PIN_NONFAULT);
2064         if (IS_ERR(vma)) {
2065                 /* Use a partial view if it is bigger than available space */
2066                 struct i915_ggtt_view view =
2067                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
2068                 unsigned int flags;
2069
2070                 flags = PIN_MAPPABLE;
2071                 if (view.type == I915_GGTT_VIEW_NORMAL)
2072                         flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
2073
2074                 /*
2075                  * Userspace is now writing through an untracked VMA, abandon
2076                  * all hope that the hardware is able to track future writes.
2077                  */
2078                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
2079
2080                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2081                 if (IS_ERR(vma) && !view.type) {
2082                         flags = PIN_MAPPABLE;
2083                         view.type = I915_GGTT_VIEW_PARTIAL;
2084                         vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2085                 }
2086         }
2087         if (IS_ERR(vma)) {
2088                 ret = PTR_ERR(vma);
2089                 goto err_unlock;
2090         }
2091
2092         ret = i915_gem_object_set_to_gtt_domain(obj, write);
2093         if (ret)
2094                 goto err_unpin;
2095
2096         ret = i915_vma_pin_fence(vma);
2097         if (ret)
2098                 goto err_unpin;
2099
2100         /* Finally, remap it using the new GTT offset */
2101         ret = remap_io_mapping(area,
2102                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
2103                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
2104                                min_t(u64, vma->size, area->vm_end - area->vm_start),
2105                                &ggtt->iomap);
2106         if (ret)
2107                 goto err_fence;
2108
2109         /* Mark as being mmapped into userspace for later revocation */
2110         assert_rpm_wakelock_held(dev_priv);
2111         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
2112                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
2113         GEM_BUG_ON(!obj->userfault_count);
2114
2115         i915_vma_set_ggtt_write(vma);
2116
2117 err_fence:
2118         i915_vma_unpin_fence(vma);
2119 err_unpin:
2120         __i915_vma_unpin(vma);
2121 err_unlock:
2122         mutex_unlock(&dev->struct_mutex);
2123 err_rpm:
2124         intel_runtime_pm_put(dev_priv);
2125         i915_gem_object_unpin_pages(obj);
2126 err:
2127         switch (ret) {
2128         case -EIO:
2129                 /*
2130                  * We eat errors when the gpu is terminally wedged to avoid
2131                  * userspace unduly crashing (gl has no provisions for mmaps to
2132                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
2133                  * and so needs to be reported.
2134                  */
2135                 if (!i915_terminally_wedged(&dev_priv->gpu_error))
2136                         return VM_FAULT_SIGBUS;
2137                 /* else: fall through */
2138         case -EAGAIN:
2139                 /*
2140                  * EAGAIN means the gpu is hung and we'll wait for the error
2141                  * handler to reset everything when re-faulting in
2142                  * i915_mutex_lock_interruptible.
2143                  */
2144         case 0:
2145         case -ERESTARTSYS:
2146         case -EINTR:
2147         case -EBUSY:
2148                 /*
2149                  * EBUSY is ok: this just means that another thread
2150                  * already did the job.
2151                  */
2152                 return VM_FAULT_NOPAGE;
2153         case -ENOMEM:
2154                 return VM_FAULT_OOM;
2155         case -ENOSPC:
2156         case -EFAULT:
2157                 return VM_FAULT_SIGBUS;
2158         default:
2159                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2160                 return VM_FAULT_SIGBUS;
2161         }
2162 }
2163
2164 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2165 {
2166         struct i915_vma *vma;
2167
2168         GEM_BUG_ON(!obj->userfault_count);
2169
2170         obj->userfault_count = 0;
2171         list_del(&obj->userfault_link);
2172         drm_vma_node_unmap(&obj->base.vma_node,
2173                            obj->base.dev->anon_inode->i_mapping);
2174
2175         for_each_ggtt_vma(vma, obj)
2176                 i915_vma_unset_userfault(vma);
2177 }
2178
2179 /**
2180  * i915_gem_release_mmap - remove physical page mappings
2181  * @obj: obj in question
2182  *
2183  * Preserve the reservation of the mmapping with the DRM core code, but
2184  * relinquish ownership of the pages back to the system.
2185  *
2186  * It is vital that we remove the page mapping if we have mapped a tiled
2187  * object through the GTT and then lose the fence register due to
2188  * resource pressure. Similarly if the object has been moved out of the
2189  * aperture, then pages mapped into userspace must be revoked. Removing the
2190  * mapping will then trigger a page fault on the next user access, allowing
2191  * fixup by i915_gem_fault().
2192  */
2193 void
2194 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2195 {
2196         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2197
2198         /* Serialisation between user GTT access and our code depends upon
2199          * revoking the CPU's PTE whilst the mutex is held. The next user
2200          * pagefault then has to wait until we release the mutex.
2201          *
2202          * Note that RPM complicates this somewhat by adding an additional
2203          * requirement that operations to the GGTT be made holding the RPM
2204          * wakeref.
2205          */
2206         lockdep_assert_held(&i915->drm.struct_mutex);
2207         intel_runtime_pm_get(i915);
2208
2209         if (!obj->userfault_count)
2210                 goto out;
2211
2212         __i915_gem_object_release_mmap(obj);
2213
2214         /* Ensure that the CPU's PTEs are revoked and there are no outstanding
2215          * memory transactions from userspace before we return. The TLB
2216          * flushing implied by changing the PTEs above *should* be
2217          * sufficient; an extra barrier here just provides us with a bit
2218          * of paranoid documentation about our requirement to serialise
2219          * memory writes before touching registers / GSM.
2220          */
2221         wmb();
2222
2223 out:
2224         intel_runtime_pm_put(i915);
2225 }
2226
2227 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2228 {
2229         struct drm_i915_gem_object *obj, *on;
2230         int i;
2231
2232         /*
2233          * Only called during RPM suspend. All users of the userfault_list
2234          * must be holding an RPM wakeref to ensure that this can not
2235          * run concurrently with themselves (and use the struct_mutex for
2236          * protection between themselves).
2237          */
2238
2239         list_for_each_entry_safe(obj, on,
2240                                  &dev_priv->mm.userfault_list, userfault_link)
2241                 __i915_gem_object_release_mmap(obj);
2242
2243         /* The fence will be lost when the device powers down. If any were
2244          * in use by hardware (i.e. they are pinned), we should not be powering
2245          * down! All other fences will be reacquired by the user upon waking.
2246          */
2247         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2248                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2249
2250                 /* Ideally we want to assert that the fence register is not
2251                  * live at this point (i.e. that no piece of code will be
2252                  * trying to write through fence + GTT, as that both violates
2253                  * trying to write through fence + GTT, as that both violates
2254                  * our tracking of activity and associated locking/barriers
2255                  * and is also illegal given that the hw is powered down).
2256                  * Previously we used reg->pin_count as a "liveness" indicator.
2257                  * That is not sufficient, and we need a more fine-grained
2258                  * tool if we want to have a sanity check here.
2259                  */
2260
2261                 if (!reg->vma)
2262                         continue;
2263
2264                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2265                 reg->dirty = true;
2266         }
2267 }
2268
2269 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2270 {
2271         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2272         int err;
2273
2274         err = drm_gem_create_mmap_offset(&obj->base);
2275         if (likely(!err))
2276                 return 0;
2277
2278         /* Attempt to reap some mmap space from dead objects */
2279         do {
2280                 err = i915_gem_wait_for_idle(dev_priv,
2281                                              I915_WAIT_INTERRUPTIBLE,
2282                                              MAX_SCHEDULE_TIMEOUT);
2283                 if (err)
2284                         break;
2285
2286                 i915_gem_drain_freed_objects(dev_priv);
2287                 err = drm_gem_create_mmap_offset(&obj->base);
2288                 if (!err)
2289                         break;
2290
2291         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2292
2293         return err;
2294 }
2295
2296 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2297 {
2298         drm_gem_free_mmap_offset(&obj->base);
2299 }
2300
2301 int
2302 i915_gem_mmap_gtt(struct drm_file *file,
2303                   struct drm_device *dev,
2304                   uint32_t handle,
2305                   uint64_t *offset)
2306 {
2307         struct drm_i915_gem_object *obj;
2308         int ret;
2309
2310         obj = i915_gem_object_lookup(file, handle);
2311         if (!obj)
2312                 return -ENOENT;
2313
2314         ret = i915_gem_object_create_mmap_offset(obj);
2315         if (ret == 0)
2316                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2317
2318         i915_gem_object_put(obj);
2319         return ret;
2320 }
2321
2322 /**
2323  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2324  * @dev: DRM device
2325  * @data: GTT mapping ioctl data
2326  * @file: GEM object info
2327  *
2328  * Simply returns the fake offset to userspace so it can mmap it.
2329  * The mmap call will end up in drm_gem_mmap(), which will set things
2330  * up so we can get faults in the handler above.
2331  *
2332  * The fault handler will take care of binding the object into the GTT
2333  * (since it may have been evicted to make room for something), allocating
2334  * a fence register, and mapping the appropriate aperture address into
2335  * userspace.
2336  */
2337 int
2338 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2339                         struct drm_file *file)
2340 {
2341         struct drm_i915_gem_mmap_gtt *args = data;
2342
2343         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2344 }
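
/*
 * Hedged userspace sketch (illustration only): obtaining the fake offset and
 * then mmap()ing the DRM fd with it, which routes faults to i915_gem_fault()
 * above; fd, handle and size are assumptions for illustration.
 *
 *	struct drm_i915_gem_mmap_gtt mg = { .handle = handle };
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mg) == 0) {
 *		void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *				 MAP_SHARED, fd, mg.offset);
 *		...
 *	}
 */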
2345
2346 /* Immediately discard the backing storage */
2347 static void
2348 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2349 {
2350         i915_gem_object_free_mmap_offset(obj);
2351
2352         if (obj->base.filp == NULL)
2353                 return;
2354
2355         /* Our goal here is to return as much of the memory as
2356          * possible back to the system as we are called from OOM.
2357          * To do this we must instruct the shmfs to drop all of its
2358          * backing pages, *now*.
2359          */
2360         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2361         obj->mm.madv = __I915_MADV_PURGED;
2362         obj->mm.pages = ERR_PTR(-EFAULT);
2363 }
2364
2365 /* Try to discard unwanted pages */
2366 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2367 {
2368         struct address_space *mapping;
2369
2370         lockdep_assert_held(&obj->mm.lock);
2371         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2372
2373         switch (obj->mm.madv) {
2374         case I915_MADV_DONTNEED:
2375                 i915_gem_object_truncate(obj);
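                /* fall through */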
2376         case __I915_MADV_PURGED:
2377                 return;
2378         }
2379
2380         if (obj->base.filp == NULL)
2381                 return;
2382
2383         mapping = obj->base.filp->f_mapping;
2384         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2385 }
2386
2387 static void
2388 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2389                               struct sg_table *pages)
2390 {
2391         struct sgt_iter sgt_iter;
2392         struct page *page;
2393
2394         __i915_gem_object_release_shmem(obj, pages, true);
2395
2396         i915_gem_gtt_finish_pages(obj, pages);
2397
2398         if (i915_gem_object_needs_bit17_swizzle(obj))
2399                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2400
2401         for_each_sgt_page(page, sgt_iter, pages) {
2402                 if (obj->mm.dirty)
2403                         set_page_dirty(page);
2404
2405                 if (obj->mm.madv == I915_MADV_WILLNEED)
2406                         mark_page_accessed(page);
2407
2408                 put_page(page);
2409         }
2410         obj->mm.dirty = false;
2411
2412         sg_free_table(pages);
2413         kfree(pages);
2414 }
2415
2416 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2417 {
2418         struct radix_tree_iter iter;
2419         void __rcu **slot;
2420
2421         rcu_read_lock();
2422         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2423                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2424         rcu_read_unlock();
2425 }
2426
2427 static struct sg_table *
2428 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2429 {
2430         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2431         struct sg_table *pages;
2432
2433         pages = fetch_and_zero(&obj->mm.pages);
2434         if (!pages)
2435                 return NULL;
2436
2437         spin_lock(&i915->mm.obj_lock);
2438         list_del(&obj->mm.link);
2439         spin_unlock(&i915->mm.obj_lock);
2440
2441         if (obj->mm.mapping) {
2442                 void *ptr;
2443
2444                 ptr = page_mask_bits(obj->mm.mapping);
2445                 if (is_vmalloc_addr(ptr))
2446                         vunmap(ptr);
2447                 else
2448                         kunmap(kmap_to_page(ptr));
2449
2450                 obj->mm.mapping = NULL;
2451         }
2452
2453         __i915_gem_object_reset_page_iter(obj);
2454         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2455
2456         return pages;
2457 }
2458
2459 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2460                                  enum i915_mm_subclass subclass)
2461 {
2462         struct sg_table *pages;
2463
2464         if (i915_gem_object_has_pinned_pages(obj))
2465                 return;
2466
2467         GEM_BUG_ON(obj->bind_count);
2468         if (!i915_gem_object_has_pages(obj))
2469                 return;
2470
2471         /* May be called by shrinker from within get_pages() (on another bo) */
2472         mutex_lock_nested(&obj->mm.lock, subclass);
2473         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2474                 goto unlock;
2475
2476         /*
2477          * ->put_pages might need to allocate memory for the bit17 swizzle
2478          * array, hence protect them from being reaped by removing them from gtt
2479          * lists early.
2480          */
2481         pages = __i915_gem_object_unset_pages(obj);
2482         if (!IS_ERR(pages))
2483                 obj->ops->put_pages(obj, pages);
2484
2485 unlock:
2486         mutex_unlock(&obj->mm.lock);
2487 }
2488
2489 static bool i915_sg_trim(struct sg_table *orig_st)
2490 {
2491         struct sg_table new_st;
2492         struct scatterlist *sg, *new_sg;
2493         unsigned int i;
2494
2495         if (orig_st->nents == orig_st->orig_nents)
2496                 return false;
2497
2498         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2499                 return false;
2500
2501         new_sg = new_st.sgl;
2502         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2503                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2504                 /* called before being DMA mapped, no need to copy sg->dma_* */
2505                 new_sg = sg_next(new_sg);
2506         }
2507         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2508
2509         sg_free_table(orig_st);
2510
2511         *orig_st = new_st;
2512         return true;
2513 }
2514
2515 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2516 {
2517         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2518         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2519         unsigned long i;
2520         struct address_space *mapping;
2521         struct sg_table *st;
2522         struct scatterlist *sg;
2523         struct sgt_iter sgt_iter;
2524         struct page *page;
2525         unsigned long last_pfn = 0;     /* suppress gcc warning */
2526         unsigned int max_segment = i915_sg_segment_size();
2527         unsigned int sg_page_sizes;
2528         gfp_t noreclaim;
2529         int ret;
2530
2531         /* Assert that the object is not currently in any GPU domain. As it
2532          * wasn't in the GTT, there shouldn't be any way it could have been in
2533          * a GPU cache
2534          */
2535         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2536         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2537
2538         st = kmalloc(sizeof(*st), GFP_KERNEL);
2539         if (st == NULL)
2540                 return -ENOMEM;
2541
2542 rebuild_st:
2543         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2544                 kfree(st);
2545                 return -ENOMEM;
2546         }
2547
2548         /* Get the list of pages out of our struct file.  They'll be pinned
2549          * at this point until we release them.
2550          *
2551          * Fail silently without starting the shrinker
2552          */
2553         mapping = obj->base.filp->f_mapping;
2554         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2555         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2556
2557         sg = st->sgl;
2558         st->nents = 0;
2559         sg_page_sizes = 0;
2560         for (i = 0; i < page_count; i++) {
2561                 const unsigned int shrink[] = {
2562                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2563                         0,
2564                 }, *s = shrink;
2565                 gfp_t gfp = noreclaim;
2566
2567                 do {
2568                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2569                         if (likely(!IS_ERR(page)))
2570                                 break;
2571
2572                         if (!*s) {
2573                                 ret = PTR_ERR(page);
2574                                 goto err_sg;
2575                         }
2576
2577                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2578                         cond_resched();
2579
2580                         /* We've tried hard to allocate the memory by reaping
2581                          * our own buffers; now let the real VM do its job and
2582                          * go down in flames if truly OOM.
2583                          *
2584                          * However, since graphics tend to be disposable,
2585                          * defer the oom here by reporting the ENOMEM back
2586                          * to userspace.
2587                          */
2588                         if (!*s) {
2589                                 /* reclaim and warn, but no oom */
2590                                 gfp = mapping_gfp_mask(mapping);
2591
2592                                 /* Our bo are always dirty and so we require
2593                                  * kswapd to reclaim our pages (direct reclaim
2594                                  * does not effectively begin pageout of our
2595                                  * buffers on its own). However, direct reclaim
2596                                  * only waits for kswapd when under allocation
2597                                  * congestion. So as a result __GFP_RECLAIM is
2598                                  * unreliable and fails to actually reclaim our
2599                                  * dirty pages -- unless you try over and over
2600                                  * again with !__GFP_NORETRY. However, we still
2601                                  * want to fail this allocation rather than
2602                                  * trigger the out-of-memory killer and for
2603                                  * this we want __GFP_RETRY_MAYFAIL.
2604                                  */
2605                                 gfp |= __GFP_RETRY_MAYFAIL;
2606                         }
2607                 } while (1);
2608
2609                 if (!i ||
2610                     sg->length >= max_segment ||
2611                     page_to_pfn(page) != last_pfn + 1) {
2612                         if (i) {
2613                                 sg_page_sizes |= sg->length;
2614                                 sg = sg_next(sg);
2615                         }
2616                         st->nents++;
2617                         sg_set_page(sg, page, PAGE_SIZE, 0);
2618                 } else {
2619                         sg->length += PAGE_SIZE;
2620                 }
2621                 last_pfn = page_to_pfn(page);
2622
2623                 /* Check that the i965g/gm workaround works. */
2624                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2625         }
2626         if (sg) { /* loop terminated early; short sg table */
2627                 sg_page_sizes |= sg->length;
2628                 sg_mark_end(sg);
2629         }
2630
2631         /* Trim unused sg entries to avoid wasting memory. */
2632         i915_sg_trim(st);
2633
2634         ret = i915_gem_gtt_prepare_pages(obj, st);
2635         if (ret) {
2636                 /* DMA remapping failed? One possible cause is that
2637                  * it could not reserve enough large entries; asking
2638                  * for PAGE_SIZE chunks instead may be helpful.
2639                  */
2640                 if (max_segment > PAGE_SIZE) {
2641                         for_each_sgt_page(page, sgt_iter, st)
2642                                 put_page(page);
2643                         sg_free_table(st);
2644
2645                         max_segment = PAGE_SIZE;
2646                         goto rebuild_st;
2647                 } else {
2648                         dev_warn(&dev_priv->drm.pdev->dev,
2649                                  "Failed to DMA remap %lu pages\n",
2650                                  page_count);
2651                         goto err_pages;
2652                 }
2653         }
2654
2655         if (i915_gem_object_needs_bit17_swizzle(obj))
2656                 i915_gem_object_do_bit_17_swizzle(obj, st);
2657
2658         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2659
2660         return 0;
2661
2662 err_sg:
2663         sg_mark_end(sg);
2664 err_pages:
2665         for_each_sgt_page(page, sgt_iter, st)
2666                 put_page(page);
2667         sg_free_table(st);
2668         kfree(st);
2669
2670         /* shmemfs first checks if there is enough memory to allocate the page
2671          * and reports ENOSPC should there be insufficient, along with the usual
2672          * ENOMEM for a genuine allocation failure.
2673          *
2674          * We use ENOSPC in our driver to mean that we have run out of aperture
2675          * space and so want to translate the error from shmemfs back to our
2676          * usual understanding of ENOMEM.
2677          */
2678         if (ret == -ENOSPC)
2679                 ret = -ENOMEM;
2680
2681         return ret;
2682 }
2683
2684 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2685                                  struct sg_table *pages,
2686                                  unsigned int sg_page_sizes)
2687 {
2688         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2689         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2690         int i;
2691
2692         lockdep_assert_held(&obj->mm.lock);
2693
2694         obj->mm.get_page.sg_pos = pages->sgl;
2695         obj->mm.get_page.sg_idx = 0;
2696
2697         obj->mm.pages = pages;
2698
2699         if (i915_gem_object_is_tiled(obj) &&
2700             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2701                 GEM_BUG_ON(obj->mm.quirked);
2702                 __i915_gem_object_pin_pages(obj);
2703                 obj->mm.quirked = true;
2704         }
2705
2706         GEM_BUG_ON(!sg_page_sizes);
2707         obj->mm.page_sizes.phys = sg_page_sizes;
2708
2709         /*
2710          * Calculate the supported page-sizes which fit into the given
2711          * sg_page_sizes. This will give us the page-sizes which we may be able
2712          * to use opportunistically when later inserting into the GTT. For
2713          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2714          * 64K or 4K pages, although in practice this will depend on a number of
2715          * other factors.
2716          */
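        /*
         * Worked example (sizes assumed for illustration): if phys reports
         * 2M | 4K chunks and the platform supports 4K, 64K and 2M, then each
         * supported bit with any physical chunk at or above it is set below,
         * giving sg = 4K | 64K | 2M -- a 2M-contiguous run can also be mapped
         * with 64K pages.
         */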
2717         obj->mm.page_sizes.sg = 0;
2718         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2719                 if (obj->mm.page_sizes.phys & ~0u << i)
2720                         obj->mm.page_sizes.sg |= BIT(i);
2721         }
2722         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2723
2724         spin_lock(&i915->mm.obj_lock);
2725         list_add(&obj->mm.link, &i915->mm.unbound_list);
2726         spin_unlock(&i915->mm.obj_lock);
2727 }
2728
2729 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2730 {
2731         int err;
2732
2733         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2734                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2735                 return -EFAULT;
2736         }
2737
2738         err = obj->ops->get_pages(obj);
2739         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2740
2741         return err;
2742 }
2743
2744 /* Ensure that the associated pages are gathered from the backing storage
2745  * and pinned into our object. i915_gem_object_pin_pages() may be called
2746  * multiple times before they are released by a single call to
2747  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2748  * either as a result of memory pressure (reaping pages under the shrinker)
2749  * or as the object is itself released.
2750  */
2751 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2752 {
2753         int err;
2754
2755         err = mutex_lock_interruptible(&obj->mm.lock);
2756         if (err)
2757                 return err;
2758
2759         if (unlikely(!i915_gem_object_has_pages(obj))) {
2760                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2761
2762                 err = ____i915_gem_object_get_pages(obj);
2763                 if (err)
2764                         goto unlock;
2765
2766                 smp_mb__before_atomic();
2767         }
2768         atomic_inc(&obj->mm.pages_pin_count);
2769
2770 unlock:
2771         mutex_unlock(&obj->mm.lock);
2772         return err;
2773 }
2774
2775 /* The 'mapping' part of i915_gem_object_pin_map() below */
2776 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2777                                  enum i915_map_type type)
2778 {
2779         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2780         struct sg_table *sgt = obj->mm.pages;
2781         struct sgt_iter sgt_iter;
2782         struct page *page;
2783         struct page *stack_pages[32];
2784         struct page **pages = stack_pages;
2785         unsigned long i = 0;
2786         pgprot_t pgprot;
2787         void *addr;
2788
2789         /* A single page can always be kmapped */
2790         if (n_pages == 1 && type == I915_MAP_WB)
2791                 return kmap(sg_page(sgt->sgl));
2792
2793         if (n_pages > ARRAY_SIZE(stack_pages)) {
2794                 /* Too big for stack -- allocate temporary array instead */
2795                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2796                 if (!pages)
2797                         return NULL;
2798         }
2799
2800         for_each_sgt_page(page, sgt_iter, sgt)
2801                 pages[i++] = page;
2802
2803         /* Check that we have the expected number of pages */
2804         GEM_BUG_ON(i != n_pages);
2805
2806         switch (type) {
2807         default:
2808                 MISSING_CASE(type);
2809                 /* fallthrough to use PAGE_KERNEL anyway */
2810         case I915_MAP_WB:
2811                 pgprot = PAGE_KERNEL;
2812                 break;
2813         case I915_MAP_WC:
2814                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2815                 break;
2816         }
2817         addr = vmap(pages, n_pages, 0, pgprot);
2818
2819         if (pages != stack_pages)
2820                 kvfree(pages);
2821
2822         return addr;
2823 }
2824
2825 /* get, pin, and map the pages of the object into kernel space */
2826 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2827                               enum i915_map_type type)
2828 {
2829         enum i915_map_type has_type;
2830         bool pinned;
2831         void *ptr;
2832         int ret;
2833
2834         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2835                 return ERR_PTR(-ENXIO);
2836
2837         ret = mutex_lock_interruptible(&obj->mm.lock);
2838         if (ret)
2839                 return ERR_PTR(ret);
2840
2841         pinned = !(type & I915_MAP_OVERRIDE);
2842         type &= ~I915_MAP_OVERRIDE;
2843
2844         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2845                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2846                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2847
2848                         ret = ____i915_gem_object_get_pages(obj);
2849                         if (ret)
2850                                 goto err_unlock;
2851
2852                         smp_mb__before_atomic();
2853                 }
2854                 atomic_inc(&obj->mm.pages_pin_count);
2855                 pinned = false;
2856         }
2857         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2858
2859         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2860         if (ptr && has_type != type) {
2861                 if (pinned) {
2862                         ret = -EBUSY;
2863                         goto err_unpin;
2864                 }
2865
2866                 if (is_vmalloc_addr(ptr))
2867                         vunmap(ptr);
2868                 else
2869                         kunmap(kmap_to_page(ptr));
2870
2871                 ptr = obj->mm.mapping = NULL;
2872         }
2873
2874         if (!ptr) {
2875                 ptr = i915_gem_object_map(obj, type);
2876                 if (!ptr) {
2877                         ret = -ENOMEM;
2878                         goto err_unpin;
2879                 }
2880
2881                 obj->mm.mapping = page_pack_bits(ptr, type);
2882         }
2883
2884 out_unlock:
2885         mutex_unlock(&obj->mm.lock);
2886         return ptr;
2887
2888 err_unpin:
2889         atomic_dec(&obj->mm.pages_pin_count);
2890 err_unlock:
2891         ptr = ERR_PTR(ret);
2892         goto out_unlock;
2893 }
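
/*
 * Hedged in-kernel sketch (illustration only): a typical pin_map/unpin_map
 * pairing as used elsewhere in the driver; error handling trimmed.
 *
 *	void *vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
 *	if (IS_ERR(vaddr))
 *		return PTR_ERR(vaddr);
 *	memset(vaddr, 0, obj->base.size);
 *	i915_gem_object_unpin_map(obj);
 */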
2894
2895 static int
2896 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2897                            const struct drm_i915_gem_pwrite *arg)
2898 {
2899         struct address_space *mapping = obj->base.filp->f_mapping;
2900         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2901         u64 remain, offset;
2902         unsigned int pg;
2903
2904         /* Before we instantiate/pin the backing store for our use, we
2905          * can prepopulate the shmemfs filp efficiently using a write into
2906          * the pagecache. We avoid the penalty of instantiating all the
2907          * pages, important if the user is just writing to a few and never
2908          * uses the object on the GPU, and using a direct write into shmemfs
2909          * allows it to avoid the cost of retrieving a page (either swapin
2910          * or clearing-before-use) before it is overwritten.
2911          */
2912         if (i915_gem_object_has_pages(obj))
2913                 return -ENODEV;
2914
2915         if (obj->mm.madv != I915_MADV_WILLNEED)
2916                 return -EFAULT;
2917
2918         /* Before the pages are instantiated the object is treated as being
2919          * in the CPU domain. The pages will be clflushed as required before
2920          * use, and we can freely write into the pages directly. If userspace
2921          * races pwrite with any other operation, corruption will ensue -
2922          * that is userspace's prerogative!
2923          */
2924
2925         remain = arg->size;
2926         offset = arg->offset;
2927         pg = offset_in_page(offset);
2928
2929         do {
2930                 unsigned int len, unwritten;
2931                 struct page *page;
2932                 void *data, *vaddr;
2933                 int err;
2934
2935                 len = PAGE_SIZE - pg;
2936                 if (len > remain)
2937                         len = remain;
2938
2939                 err = pagecache_write_begin(obj->base.filp, mapping,
2940                                             offset, len, 0,
2941                                             &page, &data);
2942                 if (err < 0)
2943                         return err;
2944
2945                 vaddr = kmap(page);
2946                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2947                 kunmap(page);
2948
2949                 err = pagecache_write_end(obj->base.filp, mapping,
2950                                           offset, len, len - unwritten,
2951                                           page, data);
2952                 if (err < 0)
2953                         return err;
2954
2955                 if (unwritten)
2956                         return -EFAULT;
2957
2958                 remain -= len;
2959                 user_data += len;
2960                 offset += len;
2961                 pg = 0;
2962         } while (remain);
2963
2964         return 0;
2965 }
2966
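/*
 * Per-client ban accounting: a hang in an already-banned context, or hangs
 * arriving within I915_CLIENT_FAST_HANG_JIFFIES of each other, both add to
 * file_priv->ban_score. This is only invoked from
 * i915_gem_context_mark_guilty() below for bannable contexts.
 */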
2967 static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
2968                                         const struct i915_gem_context *ctx)
2969 {
2970         unsigned int score;
2971         unsigned long prev_hang;
2972
2973         if (i915_gem_context_is_banned(ctx))
2974                 score = I915_CLIENT_SCORE_CONTEXT_BAN;
2975         else
2976                 score = 0;
2977
2978         prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
2979         if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
2980                 score += I915_CLIENT_SCORE_HANG_FAST;
2981
2982         if (score) {
2983                 atomic_add(score, &file_priv->ban_score);
2984
2985                 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
2986                                  ctx->name, score,
2987                                  atomic_read(&file_priv->ban_score));
2988         }
2989 }
2990
2991 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
2992 {
2993         unsigned int score;
2994         bool banned, bannable;
2995
2996         atomic_inc(&ctx->guilty_count);
2997
2998         bannable = i915_gem_context_is_bannable(ctx);
2999         score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
3000         banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
3001
3002         /* Cool contexts don't accumulate client ban score */
3003         if (!bannable)
3004                 return;
3005
3006         if (banned) {
3007                 DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
3008                                  ctx->name, atomic_read(&ctx->guilty_count),
3009                                  score);
3010                 i915_gem_context_set_banned(ctx);
3011         }
3012
3013         if (!IS_ERR_OR_NULL(ctx->file_priv))
3014                 i915_gem_client_mark_guilty(ctx->file_priv, ctx);
3015 }
3016
3017 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
3018 {
3019         atomic_inc(&ctx->active_count);
3020 }
3021
3022 struct i915_request *
3023 i915_gem_find_active_request(struct intel_engine_cs *engine)
3024 {
3025         struct i915_request *request, *active = NULL;
3026         unsigned long flags;
3027
3028         /*
3029          * We are called by the error capture and reset, and to dump engine
3030          * state at random points in time. In particular, note that none of these is
3031          * crucially ordered with an interrupt. After a hang, the GPU is dead
3032          * and we assume that no more writes can happen (we waited long enough
3033          * for all writes that were in transit to be flushed) - adding an
3034          * extra delay for a recent interrupt is pointless. Hence, we do
3035          * not need an engine->irq_seqno_barrier() before the seqno reads.
3036          * At all other times, we must assume the GPU is still running, but
3037          * we only care about the snapshot of this moment.
3038          */
3039         spin_lock_irqsave(&engine->timeline.lock, flags);
3040         list_for_each_entry(request, &engine->timeline.requests, link) {
3041                 if (__i915_request_completed(request, request->global_seqno))
3042                         continue;
3043
3044                 active = request;
3045                 break;
3046         }
3047         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3048
3049         return active;
3050 }
3051
3052 /*
3053  * Ensure the irq handler finishes, and is not run again.
3054  * Also return the active request so that we only search for it once.
3055  */
3056 struct i915_request *
3057 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
3058 {
3059         struct i915_request *request;
3060
3061         /*
3062          * During the reset sequence, we must prevent the engine from
3063          * entering RC6. As the context state is undefined until we restart
3064          * the engine, if it does enter RC6 during the reset, the state
3065          * written to the powercontext is undefined and so we may lose
3066          * GPU state upon resume, i.e. fail to restart after a reset.
3067          */
3068         intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
3069
3070         request = engine->reset.prepare(engine);
3071         if (request && request->fence.error == -EIO)
3072                 request = ERR_PTR(-EIO); /* Previous reset failed! */
3073
3074         return request;
3075 }
3076
3077 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
3078 {
3079         struct intel_engine_cs *engine;
3080         struct i915_request *request;
3081         enum intel_engine_id id;
3082         int err = 0;
3083
3084         for_each_engine(engine, dev_priv, id) {
3085                 request = i915_gem_reset_prepare_engine(engine);
3086                 if (IS_ERR(request)) {
3087                         err = PTR_ERR(request);
3088                         continue;
3089                 }
3090
3091                 engine->hangcheck.active_request = request;
3092         }
3093
3094         i915_gem_revoke_fences(dev_priv);
3095         intel_uc_sanitize(dev_priv);
3096
3097         return err;
3098 }
3099
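/*
 * Complete with -EIO every later request on this engine that belongs to the
 * hung context, plus everything still queued on the hung request's own
 * timeline, so that a banned context does not get to run its backlog.
 */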
3100 static void engine_skip_context(struct i915_request *request)
3101 {
3102         struct intel_engine_cs *engine = request->engine;
3103         struct i915_gem_context *hung_ctx = request->gem_context;
3104         struct i915_timeline *timeline = request->timeline;
3105         unsigned long flags;
3106
3107         GEM_BUG_ON(timeline == &engine->timeline);
3108
3109         spin_lock_irqsave(&engine->timeline.lock, flags);
3110         spin_lock(&timeline->lock);
3111
3112         list_for_each_entry_continue(request, &engine->timeline.requests, link)
3113                 if (request->gem_context == hung_ctx)
3114                         i915_request_skip(request, -EIO);
3115
3116         list_for_each_entry(request, &timeline->requests, link)
3117                 i915_request_skip(request, -EIO);
3118
3119         spin_unlock(&timeline->lock);
3120         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3121 }
3122
3123 /* Returns the request if it was guilty of the hang */
3124 static struct i915_request *
3125 i915_gem_reset_request(struct intel_engine_cs *engine,
3126                        struct i915_request *request,
3127                        bool stalled)
3128 {
3129         /* The guilty request will get skipped on a hung engine.
3130          *
3131          * Users of client default contexts do not rely on logical
3132          * state preserved between batches so it is safe to execute
3133          * queued requests following the hang. Non-default contexts
3134          * rely on preserved state, so skipping a batch loses the
3135          * evolution of the state and it needs to be considered corrupted.
3136          * Executing more queued batches on top of corrupted state is
3137          * risky. But we take the risk by trying to advance through
3138          * the queued requests in order to make the client behaviour
3139          * more predictable around resets, by not throwing away a random
3140          * amount of batches it has prepared for execution. Sophisticated
3141          * clients can use gem_reset_stats_ioctl and dma fence status
3142          * (exported via sync_file info ioctl on explicit fences) to observe
3143          * when they lose the context state and should rebuild accordingly.
3144          *
3145          * The context ban, and ultimately the client ban, mechanism are safety
3146          * valves if client submission ends up resulting in nothing more than
3147          * subsequent hangs.
3148          */
3149
3150         if (i915_request_completed(request)) {
3151                 GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
3152                           engine->name, request->global_seqno,
3153                           request->fence.context, request->fence.seqno,
3154                           intel_engine_get_seqno(engine));
3155                 stalled = false;
3156         }
3157
3158         if (stalled) {
3159                 i915_gem_context_mark_guilty(request->gem_context);
3160                 i915_request_skip(request, -EIO);
3161
3162                 /* If this context is now banned, skip all pending requests. */
3163                 if (i915_gem_context_is_banned(request->gem_context))
3164                         engine_skip_context(request);
3165         } else {
3166                 /*
3167                  * Since this is not the hung engine, it may have advanced
3168                  * since the hang declaration. Double check by refinding
3169                  * the active request at the time of the reset.
3170                  */
3171                 request = i915_gem_find_active_request(engine);
3172                 if (request) {
3173                         unsigned long flags;
3174
3175                         i915_gem_context_mark_innocent(request->gem_context);
3176                         dma_fence_set_error(&request->fence, -EAGAIN);
3177
3178                         /* Rewind the engine to replay the incomplete rq */
3179                         spin_lock_irqsave(&engine->timeline.lock, flags);
3180                         request = list_prev_entry(request, link);
3181                         if (&request->link == &engine->timeline.requests)
3182                                 request = NULL;
3183                         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3184                 }
3185         }
3186
3187         return request;
3188 }
3189
3190 void i915_gem_reset_engine(struct intel_engine_cs *engine,
3191                            struct i915_request *request,
3192                            bool stalled)
3193 {
3194         /*
3195          * Make sure this write is visible before we re-enable the interrupt
3196          * handlers on another CPU, as tasklet_enable() resolves to just
3197          * a compiler barrier which is insufficient for our purpose here.
3198          */
3199         smp_store_mb(engine->irq_posted, 0);
3200
3201         if (request)
3202                 request = i915_gem_reset_request(engine, request, stalled);
3203
3204         /* Set up the CS to resume from the breadcrumb of the hung request */
3205         engine->reset.reset(engine, request);
3206 }
3207
3208 void i915_gem_reset(struct drm_i915_private *dev_priv,
3209                     unsigned int stalled_mask)
3210 {
3211         struct intel_engine_cs *engine;
3212         enum intel_engine_id id;
3213
3214         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3215
3216         i915_retire_requests(dev_priv);
3217
3218         for_each_engine(engine, dev_priv, id) {
3219                 struct intel_context *ce;
3220
3221                 i915_gem_reset_engine(engine,
3222                                       engine->hangcheck.active_request,
3223                                       stalled_mask & ENGINE_MASK(id));
3224                 ce = fetch_and_zero(&engine->last_retired_context);
3225                 if (ce)
3226                         intel_context_unpin(ce);
3227
3228                 /*
3229                  * Ostensibly, we always want a context loaded for powersaving,
3230                  * so if the engine is idle after the reset, send a request
3231                  * to load our scratch kernel_context.
3232                  *
3233                  * More mysteriously, if we leave the engine idle after a reset,
3234                  * the next userspace batch may hang, with what appears to be
3235                  * an incoherent read by the CS (presumably stale TLB). An
3236                  * empty request appears sufficient to paper over the glitch.
3237                  */
3238                 if (intel_engine_is_idle(engine)) {
3239                         struct i915_request *rq;
3240
3241                         rq = i915_request_alloc(engine,
3242                                                 dev_priv->kernel_context);
3243                         if (!IS_ERR(rq))
3244                                 i915_request_add(rq);
3245                 }
3246         }
3247
3248         i915_gem_restore_fences(dev_priv);
3249 }
3250
3251 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3252 {
3253         engine->reset.finish(engine);
3254
3255         intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3256 }
3257
3258 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3259 {
3260         struct intel_engine_cs *engine;
3261         enum intel_engine_id id;
3262
3263         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3264
3265         for_each_engine(engine, dev_priv, id) {
3266                 engine->hangcheck.active_request = NULL;
3267                 i915_gem_reset_finish_engine(engine);
3268         }
3269 }
3270
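/*
 * Fallback submission backends installed by i915_gem_set_wedged(): instead of
 * reaching the hardware, each request has its fence error set to -EIO. The
 * _complete variant additionally advances the global seqno so that, once all
 * in-flight requests have been cancelled, waiters observe the request as
 * signalled.
 */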
3271 static void nop_submit_request(struct i915_request *request)
3272 {
3273         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3274                   request->engine->name,
3275                   request->fence.context, request->fence.seqno);
3276         dma_fence_set_error(&request->fence, -EIO);
3277
3278         i915_request_submit(request);
3279 }
3280
3281 static void nop_complete_submit_request(struct i915_request *request)
3282 {
3283         unsigned long flags;
3284
3285         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3286                   request->engine->name,
3287                   request->fence.context, request->fence.seqno);
3288         dma_fence_set_error(&request->fence, -EIO);
3289
3290         spin_lock_irqsave(&request->engine->timeline.lock, flags);
3291         __i915_request_submit(request);
3292         intel_engine_init_global_seqno(request->engine, request->global_seqno);
3293         spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3294 }
3295
3296 void i915_gem_set_wedged(struct drm_i915_private *i915)
3297 {
3298         struct intel_engine_cs *engine;
3299         enum intel_engine_id id;
3300
3301         GEM_TRACE("start\n");
3302
3303         if (GEM_SHOW_DEBUG()) {
3304                 struct drm_printer p = drm_debug_printer(__func__);
3305
3306                 for_each_engine(engine, i915, id)
3307                         intel_engine_dump(engine, &p, "%s\n", engine->name);
3308         }
3309
3310         set_bit(I915_WEDGED, &i915->gpu_error.flags);
3311         smp_mb__after_atomic();
3312
3313         /*
3314          * First, stop submission to hw, but do not yet complete requests by
3315          * rolling the global seqno forward (since this would complete requests
3316          * for which we haven't set the fence error to EIO yet).
3317          */
3318         for_each_engine(engine, i915, id) {
3319                 i915_gem_reset_prepare_engine(engine);
3320
3321                 engine->submit_request = nop_submit_request;
3322                 engine->schedule = NULL;
3323         }
3324         i915->caps.scheduler = 0;
3325
3326         /* Even if the GPU reset fails, it should still stop the engines */
3327         intel_gpu_reset(i915, ALL_ENGINES);
3328
3329         /*
3330          * Make sure no one is running the old callback before we proceed with
3331          * cancelling requests and resetting the completion tracking. Otherwise
3332          * we might submit a request to the hardware which never completes.
3333          */
3334         synchronize_rcu();
3335
3336         for_each_engine(engine, i915, id) {
3337                 /* Mark all executing requests as skipped */
3338                 engine->cancel_requests(engine);
3339
3340                 /*
3341                  * Only once we've force-cancelled all in-flight requests can we
3342                  * start to complete all requests.
3343                  */
3344                 engine->submit_request = nop_complete_submit_request;
3345         }
3346
3347         /*
3348          * Make sure no request can slip through without getting completed by
3349          * either this call here to intel_engine_init_global_seqno, or the one
3350          * in nop_complete_submit_request.
3351          */
3352         synchronize_rcu();
3353
3354         for_each_engine(engine, i915, id) {
3355                 unsigned long flags;
3356
3357                 /*
3358                  * Mark all pending requests as complete so that any concurrent
3359                  * (lockless) lookup doesn't try and wait upon the request as we
3360                  * reset it.
3361                  */
3362                 spin_lock_irqsave(&engine->timeline.lock, flags);
3363                 intel_engine_init_global_seqno(engine,
3364                                                intel_engine_last_submit(engine));
3365                 spin_unlock_irqrestore(&engine->timeline.lock, flags);
3366
3367                 i915_gem_reset_finish_engine(engine);
3368         }
3369
3370         GEM_TRACE("end\n");
3371
3372         wake_up_all(&i915->gpu_error.reset_queue);
3373 }
3374
3375 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3376 {
3377         struct i915_timeline *tl;
3378
3379         lockdep_assert_held(&i915->drm.struct_mutex);
3380         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3381                 return true;
3382
3383         GEM_TRACE("start\n");
3384
3385         /*
3386          * Before unwedging, make sure that all pending operations
3387          * are flushed and errored out - we may have requests waiting upon
3388          * third party fences. We marked all inflight requests as EIO, and
3389          * every execbuf since has returned EIO; for consistency we want all
3390          * the currently pending requests to also be marked as EIO, which
3391          * is done inside our nop_submit_request - and so we must wait.
3392          *
3393          * No more can be submitted until we reset the wedged bit.
3394          */
3395         list_for_each_entry(tl, &i915->gt.timelines, link) {
3396                 struct i915_request *rq;
3397
3398                 rq = i915_gem_active_peek(&tl->last_request,
3399                                           &i915->drm.struct_mutex);
3400                 if (!rq)
3401                         continue;
3402
3403                 /*
3404                  * We can't use our normal waiter as we want to
3405                  * avoid recursively trying to handle the current
3406                  * reset. The basic dma_fence_default_wait() installs
3407                  * a callback for dma_fence_signal(), which is
3408                  * triggered by our nop handler (indirectly, the
3409                  * callback enables the signaler thread which is
3410                  * woken by the nop_submit_request() advancing the seqno
3411                  * and when the seqno passes the fence, the signaler
3412                  * then signals the fence waking us up).
3413                  */
3414                 if (dma_fence_default_wait(&rq->fence, true,
3415                                            MAX_SCHEDULE_TIMEOUT) < 0)
3416                         return false;
3417         }
3418         i915_retire_requests(i915);
3419         GEM_BUG_ON(i915->gt.active_requests);
3420
3421         /*
3422          * Undo nop_submit_request. We prevent all new i915 requests from
3423          * being queued (by disallowing execbuf whilst wedged) so having
3424          * waited for all active requests above, we know the system is idle
3425          * and do not have to worry about a thread being inside
3426          * engine->submit_request() as we swap over. So unlike installing
3427          * the nop_submit_request on reset, we can do this from normal
3428          * context and do not require stop_machine().
3429          */
3430         intel_engines_reset_default_submission(i915);
3431         i915_gem_contexts_lost(i915);
3432
3433         GEM_TRACE("end\n");
3434
3435         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3436         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3437
3438         return true;
3439 }
3440
3441 static void
3442 i915_gem_retire_work_handler(struct work_struct *work)
3443 {
3444         struct drm_i915_private *dev_priv =
3445                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3446         struct drm_device *dev = &dev_priv->drm;
3447
3448         /* Come back later if the device is busy... */
3449         if (mutex_trylock(&dev->struct_mutex)) {
3450                 i915_retire_requests(dev_priv);
3451                 mutex_unlock(&dev->struct_mutex);
3452         }
3453
3454         /*
3455          * Keep the retire handler running until we are finally idle.
3456          * We do not need to do this test under locking as in the worst-case
3457          * we queue the retire worker once too often.
3458          */
3459         if (READ_ONCE(dev_priv->gt.awake))
3460                 queue_delayed_work(dev_priv->wq,
3461                                    &dev_priv->gt.retire_work,
3462                                    round_jiffies_up_relative(HZ));
3463 }
3464
3465 static void shrink_caches(struct drm_i915_private *i915)
3466 {
3467         /*
3468          * kmem_cache_shrink() discards empty slabs and reorders partially
3469          * filled slabs to prioritise allocating from the mostly full slabs,
3470          * with the aim of reducing fragmentation.
3471          */
3472         kmem_cache_shrink(i915->priorities);
3473         kmem_cache_shrink(i915->dependencies);
3474         kmem_cache_shrink(i915->requests);
3475         kmem_cache_shrink(i915->luts);
3476         kmem_cache_shrink(i915->vmas);
3477         kmem_cache_shrink(i915->objects);
3478 }
3479
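/*
 * Deferred cache shrinking for when the GPU idles: __sleep_rcu waits for an
 * RCU grace period and then bounces onto the ordered i915->wq via
 * __sleep_work, so the shrink runs after any RCU-deferred frees already
 * queued there. The rcu_head and work_struct may share storage as the two
 * stages never overlap, and the epoch check aborts if the GPU woke up again
 * while we slept.
 */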
3480 struct sleep_rcu_work {
3481         union {
3482                 struct rcu_head rcu;
3483                 struct work_struct work;
3484         };
3485         struct drm_i915_private *i915;
3486         unsigned int epoch;
3487 };
3488
3489 static inline bool
3490 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3491 {
3492         /*
3493          * There is a small chance that the epoch wrapped since we started
3494          * sleeping. If we assume that epoch is at least a u32, then it will
3495          * take at least 2^32 * 100ms for it to wrap, or about 13.6 years.
3496          */
3497         return epoch == READ_ONCE(i915->gt.epoch);
3498 }
3499
3500 static void __sleep_work(struct work_struct *work)
3501 {
3502         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3503         struct drm_i915_private *i915 = s->i915;
3504         unsigned int epoch = s->epoch;
3505
3506         kfree(s);
3507         if (same_epoch(i915, epoch))
3508                 shrink_caches(i915);
3509 }
3510
3511 static void __sleep_rcu(struct rcu_head *rcu)
3512 {
3513         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3514         struct drm_i915_private *i915 = s->i915;
3515
3516         if (same_epoch(i915, s->epoch)) {
3517                 INIT_WORK(&s->work, __sleep_work);
3518                 queue_work(i915->wq, &s->work);
3519         } else {
3520                 kfree(s);
3521         }
3522 }
3523
3524 static inline bool
3525 new_requests_since_last_retire(const struct drm_i915_private *i915)
3526 {
3527         return (READ_ONCE(i915->gt.active_requests) ||
3528                 work_pending(&i915->gt.idle_work.work));
3529 }
3530
3531 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3532 {
3533         struct intel_engine_cs *engine;
3534         enum intel_engine_id id;
3535
3536         if (i915_terminally_wedged(&i915->gpu_error))
3537                 return;
3538
3539         GEM_BUG_ON(i915->gt.active_requests);
3540         for_each_engine(engine, i915, id) {
3541                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
3542                 GEM_BUG_ON(engine->last_retired_context !=
3543                            to_intel_context(i915->kernel_context, engine));
3544         }
3545 }
3546
3547 static void
3548 i915_gem_idle_work_handler(struct work_struct *work)
3549 {
3550         struct drm_i915_private *dev_priv =
3551                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3552         unsigned int epoch = I915_EPOCH_INVALID;
3553         bool rearm_hangcheck;
3554
3555         if (!READ_ONCE(dev_priv->gt.awake))
3556                 return;
3557
3558         if (READ_ONCE(dev_priv->gt.active_requests))
3559                 return;
3560
3561         /*
3562          * Flush out the last user context, leaving only the pinned
3563          * kernel context resident. When we are idling on the kernel_context,
3564          * no more new requests (with a context switch) are emitted and we
3565          * can finally rest. A consequence is that the idle work handler is
3566          * always called at least twice before idling (and if the system is
3567          * idle that implies a round trip through the retire worker).
3568          */
3569         mutex_lock(&dev_priv->drm.struct_mutex);
3570         i915_gem_switch_to_kernel_context(dev_priv);
3571         mutex_unlock(&dev_priv->drm.struct_mutex);
3572
3573         GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
3574                   READ_ONCE(dev_priv->gt.active_requests));
3575
3576         /*
3577          * Wait for last execlists context complete, but bail out in case a
3578          * new request is submitted. As we don't trust the hardware, we
3579          * continue on if the wait times out. This is necessary to allow
3580          * the machine to suspend even if the hardware dies, and we will
3581          * try to recover in resume (after depriving the hardware of power,
3582          * it may be in a better mood).
3583          */
3584         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3585                    intel_engines_are_idle(dev_priv),
3586                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3587                    10, 500);
3588
3589         rearm_hangcheck =
3590                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3591
3592         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3593                 /* Currently busy, come back later */
3594                 mod_delayed_work(dev_priv->wq,
3595                                  &dev_priv->gt.idle_work,
3596                                  msecs_to_jiffies(50));
3597                 goto out_rearm;
3598         }
3599
3600         /*
3601          * New request retired after this work handler started, extend active
3602          * period until next instance of the work.
3603          */
3604         if (new_requests_since_last_retire(dev_priv))
3605                 goto out_unlock;
3606
3607         epoch = __i915_gem_park(dev_priv);
3608
3609         assert_kernel_context_is_current(dev_priv);
3610
3611         rearm_hangcheck = false;
3612 out_unlock:
3613         mutex_unlock(&dev_priv->drm.struct_mutex);
3614
3615 out_rearm:
3616         if (rearm_hangcheck) {
3617                 GEM_BUG_ON(!dev_priv->gt.awake);
3618                 i915_queue_hangcheck(dev_priv);
3619         }
3620
3621         /*
3622          * When we are idle, it is an opportune time to reap our caches.
3623          * However, we have many objects that utilise RCU and the ordered
3624          * i915->wq that this work is executing on. To try and flush any
3625          * pending frees now we are idle, we first wait for an RCU grace
3626          * period, and then queue a task (that will run last on the wq) to
3627          * shrink and re-optimize the caches.
3628          */
3629         if (same_epoch(dev_priv, epoch)) {
3630                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3631                 if (s) {
3632                         s->i915 = dev_priv;
3633                         s->epoch = epoch;
3634                         call_rcu(&s->rcu, __sleep_rcu);
3635                 }
3636         }
3637 }
3638
3639 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3640 {
3641         struct drm_i915_private *i915 = to_i915(gem->dev);
3642         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3643         struct drm_i915_file_private *fpriv = file->driver_priv;
3644         struct i915_lut_handle *lut, *ln;
3645
3646         mutex_lock(&i915->drm.struct_mutex);
3647
3648         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3649                 struct i915_gem_context *ctx = lut->ctx;
3650                 struct i915_vma *vma;
3651
3652                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3653                 if (ctx->file_priv != fpriv)
3654                         continue;
3655
3656                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3657                 GEM_BUG_ON(vma->obj != obj);
3658
3659                 /* We allow the process to have multiple handles to the same
3660                  * vma, in the same fd namespace, by virtue of flink/open.
3661                  */
3662                 GEM_BUG_ON(!vma->open_count);
3663                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3664                         i915_vma_close(vma);
3665
3666                 list_del(&lut->obj_link);
3667                 list_del(&lut->ctx_link);
3668
3669                 kmem_cache_free(i915->luts, lut);
3670                 __i915_gem_object_release_unless_active(obj);
3671         }
3672
3673         mutex_unlock(&i915->drm.struct_mutex);
3674 }
3675
3676 static unsigned long to_wait_timeout(s64 timeout_ns)
3677 {
3678         if (timeout_ns < 0)
3679                 return MAX_SCHEDULE_TIMEOUT;
3680
3681         if (timeout_ns == 0)
3682                 return 0;
3683
3684         return nsecs_to_jiffies_timeout(timeout_ns);
3685 }
3686
3687 /**
3688  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3689  * @dev: drm device pointer
3690  * @data: ioctl data blob
3691  * @file: drm file pointer
3692  *
3693  * Returns 0 if successful, else an error is returned with the remaining time in
3694  * the timeout parameter.
3695  *  -ETIME: object is still busy after timeout
3696  *  -ERESTARTSYS: signal interrupted the wait
3697  *  -ENOENT: object doesn't exist
3698  * Also possible, but rare:
3699  *  -EAGAIN: incomplete, restart syscall
3700  *  -ENOMEM: damn
3701  *  -ENODEV: Internal IRQ fail
3702  *  -E?: The add request failed
3703  *
3704  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3705  * non-zero timeout parameter the wait ioctl will wait for the given number of
3706  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3707  * without holding struct_mutex the object may become re-busied before this
3708  * function completes. A similar but shorter * race condition exists in the busy
3709  * function completes. A similar but shorter race condition exists in the busy
3710  * ioctl. An illustrative userspace sketch follows this function.
3711 int
3712 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3713 {
3714         struct drm_i915_gem_wait *args = data;
3715         struct drm_i915_gem_object *obj;
3716         ktime_t start;
3717         long ret;
3718
3719         if (args->flags != 0)
3720                 return -EINVAL;
3721
3722         obj = i915_gem_object_lookup(file, args->bo_handle);
3723         if (!obj)
3724                 return -ENOENT;
3725
3726         start = ktime_get();
3727
3728         ret = i915_gem_object_wait(obj,
3729                                    I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
3730                                    to_wait_timeout(args->timeout_ns),
3731                                    to_rps_client(file));
3732
3733         if (args->timeout_ns > 0) {
3734                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3735                 if (args->timeout_ns < 0)
3736                         args->timeout_ns = 0;
3737
3738                 /*
3739                  * Apparently ktime isn't accurate enough and occasionally has a
3740                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3741                  * things up to make the test happy. We allow up to 1 jiffy.
3742                  *
3743                  * This is a regression from the timespec->ktime conversion.
3744                  */
3745                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3746                         args->timeout_ns = 0;
3747
3748                 /* Asked to wait beyond the jiffie/scheduler precision? */
3749                 if (ret == -ETIME && args->timeout_ns)
3750                         ret = -EAGAIN;
3751         }
3752
3753         i915_gem_object_put(obj);
3754         return ret;
3755 }
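
/*
 * Illustrative userspace sketch (not part of this file): it assumes libdrm's
 * drmIoctl() plus the uapi definitions from <drm/i915_drm.h>, and that "fd"
 * is an open i915 device and "handle" a GEM handle. It waits up to 10ms for
 * the object to go idle; on -ETIME the kernel writes the remaining time back
 * into timeout_ns.
 *
 *	struct drm_i915_gem_wait wait = {
 *		.bo_handle = handle,
 *		.flags = 0,
 *		.timeout_ns = 10 * 1000 * 1000,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_WAIT, &wait))
 *		fprintf(stderr, "wait: %s\n", strerror(errno));
 */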
3756
3757 static long wait_for_timeline(struct i915_timeline *tl,
3758                               unsigned int flags, long timeout)
3759 {
3760         struct i915_request *rq;
3761
3762         rq = i915_gem_active_get_unlocked(&tl->last_request);
3763         if (!rq)
3764                 return timeout;
3765
3766         /*
3767          * "Race-to-idle".
3768          *
3769          * Switching to the kernel context is often used as a synchronous
3770          * step prior to idling, e.g. in suspend for flushing all
3771          * current operations to memory before sleeping. These we
3772          * want to complete as quickly as possible to avoid prolonged
3773          * stalls, so allow the gpu to boost to maximum clocks.
3774          */
3775         if (flags & I915_WAIT_FOR_IDLE_BOOST)
3776                 gen6_rps_boost(rq, NULL);
3777
3778         timeout = i915_request_wait(rq, flags, timeout);
3779         i915_request_put(rq);
3780
3781         return timeout;
3782 }
3783
3784 static int wait_for_engines(struct drm_i915_private *i915)
3785 {
3786         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3787                 dev_err(i915->drm.dev,
3788                         "Failed to idle engines, declaring wedged!\n");
3789                 GEM_TRACE_DUMP();
3790                 i915_gem_set_wedged(i915);
3791                 return -EIO;
3792         }
3793
3794         return 0;
3795 }
3796
3797 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3798                            unsigned int flags, long timeout)
3799 {
3800         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3801                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3802                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3803
3804         /* If the device is asleep, we have no requests outstanding */
3805         if (!READ_ONCE(i915->gt.awake))
3806                 return 0;
3807
3808         if (flags & I915_WAIT_LOCKED) {
3809                 struct i915_timeline *tl;
3810                 int err;
3811
3812                 lockdep_assert_held(&i915->drm.struct_mutex);
3813
3814                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3815                         timeout = wait_for_timeline(tl, flags, timeout);
3816                         if (timeout < 0)
3817                                 return timeout;
3818                 }
3819
3820                 err = wait_for_engines(i915);
3821                 if (err)
3822                         return err;
3823
3824                 i915_retire_requests(i915);
3825                 GEM_BUG_ON(i915->gt.active_requests);
3826         } else {
3827                 struct intel_engine_cs *engine;
3828                 enum intel_engine_id id;
3829
3830                 for_each_engine(engine, i915, id) {
3831                         struct i915_timeline *tl = &engine->timeline;
3832
3833                         timeout = wait_for_timeline(tl, flags, timeout);
3834                         if (timeout < 0)
3835                                 return timeout;
3836                 }
3837         }
3838
3839         return 0;
3840 }
3841
3842 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3843 {
3844         /*
3845          * We manually flush the CPU domain so that we can override and
3846          * force the flush for the display, and perform it asynchronously.
3847          */
3848         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3849         if (obj->cache_dirty)
3850                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3851         obj->write_domain = 0;
3852 }
3853
3854 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3855 {
3856         if (!READ_ONCE(obj->pin_global))
3857                 return;
3858
3859         mutex_lock(&obj->base.dev->struct_mutex);
3860         __i915_gem_object_flush_for_display(obj);
3861         mutex_unlock(&obj->base.dev->struct_mutex);
3862 }
3863
3864 /**
3865  * Moves a single object to the WC read, and possibly write domain.
3866  * @obj: object to act on
3867  * @write: ask for write access or read only
3868  *
3869  * This function returns when the move is complete, including waiting on
3870  * flushes to occur.
3871  */
3872 int
3873 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3874 {
3875         int ret;
3876
3877         lockdep_assert_held(&obj->base.dev->struct_mutex);
3878
3879         ret = i915_gem_object_wait(obj,
3880                                    I915_WAIT_INTERRUPTIBLE |
3881                                    I915_WAIT_LOCKED |
3882                                    (write ? I915_WAIT_ALL : 0),
3883                                    MAX_SCHEDULE_TIMEOUT,
3884                                    NULL);
3885         if (ret)
3886                 return ret;
3887
3888         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3889                 return 0;
3890
3891         /* Flush and acquire obj->pages so that we are coherent through
3892          * direct access in memory with previous cached writes through
3893          * shmemfs and that our cache domain tracking remains valid.
3894          * For example, if the obj->filp was moved to swap without us
3895          * being notified and releasing the pages, we would mistakenly
3896          * continue to assume that the obj remained out of the CPU cached
3897          * domain.
3898          */
3899         ret = i915_gem_object_pin_pages(obj);
3900         if (ret)
3901                 return ret;
3902
3903         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3904
3905         /* Serialise direct access to this object with the barriers for
3906          * coherent writes from the GPU, by effectively invalidating the
3907          * WC domain upon first access.
3908          */
3909         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3910                 mb();
3911
3912         /* It should now be out of any other write domains, and we can update
3913          * the domain values for our changes.
3914          */
3915         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3916         obj->read_domains |= I915_GEM_DOMAIN_WC;
3917         if (write) {
3918                 obj->read_domains = I915_GEM_DOMAIN_WC;
3919                 obj->write_domain = I915_GEM_DOMAIN_WC;
3920                 obj->mm.dirty = true;
3921         }
3922
3923         i915_gem_object_unpin_pages(obj);
3924         return 0;
3925 }
3926
3927 /**
3928  * Moves a single object to the GTT read, and possibly write domain.
3929  * @obj: object to act on
3930  * @write: ask for write access or read only
3931  *
3932  * This function returns when the move is complete, including waiting on
3933  * flushes to occur.
3934  */
3935 int
3936 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3937 {
3938         int ret;
3939
3940         lockdep_assert_held(&obj->base.dev->struct_mutex);
3941
3942         ret = i915_gem_object_wait(obj,
3943                                    I915_WAIT_INTERRUPTIBLE |
3944                                    I915_WAIT_LOCKED |
3945                                    (write ? I915_WAIT_ALL : 0),
3946                                    MAX_SCHEDULE_TIMEOUT,
3947                                    NULL);
3948         if (ret)
3949                 return ret;
3950
3951         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3952                 return 0;
3953
3954         /* Flush and acquire obj->pages so that we are coherent through
3955          * direct access in memory with previous cached writes through
3956          * shmemfs and that our cache domain tracking remains valid.
3957          * For example, if the obj->filp was moved to swap without us
3958          * being notified and releasing the pages, we would mistakenly
3959          * continue to assume that the obj remained out of the CPU cached
3960          * domain.
3961          */
3962         ret = i915_gem_object_pin_pages(obj);
3963         if (ret)
3964                 return ret;
3965
3966         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3967
3968         /* Serialise direct access to this object with the barriers for
3969          * coherent writes from the GPU, by effectively invalidating the
3970          * GTT domain upon first access.
3971          */
3972         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3973                 mb();
3974
3975         /* It should now be out of any other write domains, and we can update
3976          * the domain values for our changes.
3977          */
3978         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3979         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3980         if (write) {
3981                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3982                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3983                 obj->mm.dirty = true;
3984         }
3985
3986         i915_gem_object_unpin_pages(obj);
3987         return 0;
3988 }
3989
3990 /**
3991  * Changes the cache-level of an object across all VMA.
3992  * @obj: object to act on
3993  * @cache_level: new cache level to set for the object
3994  *
3995  * After this function returns, the object will be in the new cache-level
3996  * across all GTT and the contents of the backing storage will be coherent,
3997  * with respect to the new cache-level. In order to keep the backing storage
3998  * coherent for all users, we only allow a single cache level to be set
3999  * globally on the object and prevent it from being changed whilst the
4000  * hardware is reading from the object. That is, if the object is currently
4001  * on the scanout it will be set to uncached (or equivalent display
4002  * cache coherency) and all non-MOCS GPU access will also be uncached so
4003  * that all direct access to the scanout remains coherent.
4004  */
4005 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
4006                                     enum i915_cache_level cache_level)
4007 {
4008         struct i915_vma *vma;
4009         int ret;
4010
4011         lockdep_assert_held(&obj->base.dev->struct_mutex);
4012
4013         if (obj->cache_level == cache_level)
4014                 return 0;
4015
4016         /* Inspect the list of currently bound VMA and unbind any that would
4017          * be invalid given the new cache-level. This is principally to
4018          * catch the issue of the CS prefetch crossing page boundaries and
4019          * reading an invalid PTE on older architectures.
4020          */
4021 restart:
4022         list_for_each_entry(vma, &obj->vma_list, obj_link) {
4023                 if (!drm_mm_node_allocated(&vma->node))
4024                         continue;
4025
4026                 if (i915_vma_is_pinned(vma)) {
4027                         DRM_DEBUG("can not change the cache level of pinned objects\n");
4028                         return -EBUSY;
4029                 }
4030
4031                 if (!i915_vma_is_closed(vma) &&
4032                     i915_gem_valid_gtt_space(vma, cache_level))
4033                         continue;
4034
4035                 ret = i915_vma_unbind(vma);
4036                 if (ret)
4037                         return ret;
4038
4039                 /* As unbinding may affect other elements in the
4040                  * obj->vma_list (due to side-effects from retiring
4041                  * an active vma), play safe and restart the iterator.
4042                  */
4043                 goto restart;
4044         }
4045
4046         /* We can reuse the existing drm_mm nodes but need to change the
4047          * cache-level on the PTE. We could simply unbind them all and
4048          * rebind with the correct cache-level on next use. However since
4049          * we already have a valid slot, dma mapping, pages etc, we may as well
4050          * rewrite the PTE in the belief that doing so tramples upon less
4051          * state and so involves less work.
4052          */
4053         if (obj->bind_count) {
4054                 /* Before we change the PTE, the GPU must not be accessing it.
4055                  * If we wait upon the object, we know that all the bound
4056                  * VMA are no longer active.
4057                  */
4058                 ret = i915_gem_object_wait(obj,
4059                                            I915_WAIT_INTERRUPTIBLE |
4060                                            I915_WAIT_LOCKED |
4061                                            I915_WAIT_ALL,
4062                                            MAX_SCHEDULE_TIMEOUT,
4063                                            NULL);
4064                 if (ret)
4065                         return ret;
4066
4067                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
4068                     cache_level != I915_CACHE_NONE) {
4069                         /* Access to snoopable pages through the GTT is
4070                          * incoherent and on some machines causes a hard
4071                          * lockup. Relinquish the CPU mmapping to force
4072                          * userspace to refault in the pages and we can
4073                          * then double check if the GTT mapping is still
4074                          * valid for that pointer access.
4075                          */
4076                         i915_gem_release_mmap(obj);
4077
4078                         /* As we no longer need a fence for GTT access,
4079                          * we can relinquish it now (and so prevent having
4080                          * to steal a fence from someone else on the next
4081                          * fence request). Note GPU activity would have
4082                          * dropped the fence as all snoopable access is
4083                          * supposed to be linear.
4084                          */
4085                         for_each_ggtt_vma(vma, obj) {
4086                                 ret = i915_vma_put_fence(vma);
4087                                 if (ret)
4088                                         return ret;
4089                         }
4090                 } else {
4091                         /* We either have incoherent backing store and
4092                          * so no GTT access or the architecture is fully
4093                          * coherent. In such cases, existing GTT mmaps
4094                          * ignore the cache bit in the PTE and we can
4095                          * rewrite it without confusing the GPU or having
4096                          * to force userspace to fault back in its mmaps.
4097                          */
4098                 }
4099
4100                 list_for_each_entry(vma, &obj->vma_list, obj_link) {
4101                         if (!drm_mm_node_allocated(&vma->node))
4102                                 continue;
4103
4104                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
4105                         if (ret)
4106                                 return ret;
4107                 }
4108         }
4109
4110         list_for_each_entry(vma, &obj->vma_list, obj_link)
4111                 vma->node.color = cache_level;
4112         i915_gem_object_set_cache_coherency(obj, cache_level);
4113         obj->cache_dirty = true; /* Always invalidate stale cachelines */
4114
4115         return 0;
4116 }
4117
4118 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
4119                                struct drm_file *file)
4120 {
4121         struct drm_i915_gem_caching *args = data;
4122         struct drm_i915_gem_object *obj;
4123         int err = 0;
4124
4125         rcu_read_lock();
4126         obj = i915_gem_object_lookup_rcu(file, args->handle);
4127         if (!obj) {
4128                 err = -ENOENT;
4129                 goto out;
4130         }
4131
4132         switch (obj->cache_level) {
4133         case I915_CACHE_LLC:
4134         case I915_CACHE_L3_LLC:
4135                 args->caching = I915_CACHING_CACHED;
4136                 break;
4137
4138         case I915_CACHE_WT:
4139                 args->caching = I915_CACHING_DISPLAY;
4140                 break;
4141
4142         default:
4143                 args->caching = I915_CACHING_NONE;
4144                 break;
4145         }
4146 out:
4147         rcu_read_unlock();
4148         return err;
4149 }
4150
4151 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4152                                struct drm_file *file)
4153 {
4154         struct drm_i915_private *i915 = to_i915(dev);
4155         struct drm_i915_gem_caching *args = data;
4156         struct drm_i915_gem_object *obj;
4157         enum i915_cache_level level;
4158         int ret = 0;
4159
4160         switch (args->caching) {
4161         case I915_CACHING_NONE:
4162                 level = I915_CACHE_NONE;
4163                 break;
4164         case I915_CACHING_CACHED:
4165                 /*
4166                  * Due to a HW issue on BXT A stepping, GPU stores via a
4167                  * snooped mapping may leave stale data in a corresponding CPU
4168                  * cacheline, whereas normally such cachelines would get
4169                  * invalidated.
4170                  */
4171                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4172                         return -ENODEV;
4173
4174                 level = I915_CACHE_LLC;
4175                 break;
4176         case I915_CACHING_DISPLAY:
4177                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4178                 break;
4179         default:
4180                 return -EINVAL;
4181         }
4182
4183         obj = i915_gem_object_lookup(file, args->handle);
4184         if (!obj)
4185                 return -ENOENT;
4186
4187         /*
4188          * The caching mode of a proxy object is handled by its generator, and
4189          * not allowed to be changed by userspace.
4190          */
4191         if (i915_gem_object_is_proxy(obj)) {
4192                 ret = -ENXIO;
4193                 goto out;
4194         }
4195
4196         if (obj->cache_level == level)
4197                 goto out;
4198
4199         ret = i915_gem_object_wait(obj,
4200                                    I915_WAIT_INTERRUPTIBLE,
4201                                    MAX_SCHEDULE_TIMEOUT,
4202                                    to_rps_client(file));
4203         if (ret)
4204                 goto out;
4205
4206         ret = i915_mutex_lock_interruptible(dev);
4207         if (ret)
4208                 goto out;
4209
4210         ret = i915_gem_object_set_cache_level(obj, level);
4211         mutex_unlock(&dev->struct_mutex);
4212
4213 out:
4214         i915_gem_object_put(obj);
4215         return ret;
4216 }
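
/*
 * Illustrative userspace sketch (not part of this file): it assumes libdrm's
 * drmIoctl(), an open i915 device "fd" and a GEM "handle". It requests
 * CPU-cached (snooped/LLC) backing for the object before mapping it for CPU
 * access.
 *
 *	struct drm_i915_gem_caching arg = {
 *		.handle = handle,
 *		.caching = I915_CACHING_CACHED,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg))
 *		fprintf(stderr, "set-caching: %s\n", strerror(errno));
 */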
4217
4218 /*
4219  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4220  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4221  * (for pageflips). We only flush the caches while preparing the buffer for
4222  * display, the callers are responsible for frontbuffer flush.
4223  */
4224 struct i915_vma *
4225 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4226                                      u32 alignment,
4227                                      const struct i915_ggtt_view *view,
4228                                      unsigned int flags)
4229 {
4230         struct i915_vma *vma;
4231         int ret;
4232
4233         lockdep_assert_held(&obj->base.dev->struct_mutex);
4234
4235         /* Mark the global pin early so that we account for the
4236          * display coherency whilst setting up the cache domains.
4237          */
4238         obj->pin_global++;
4239
4240         /* The display engine is not coherent with the LLC cache on gen6.  As
4241          * a result, we make sure that the pinning that is about to occur is
4242          * done with uncached PTEs. This is the lowest common denominator for all
4243          * chipsets.
4244          *
4245          * However for gen6+, we could do better by using the GFDT bit instead
4246          * of uncaching, which would allow us to flush all the LLC-cached data
4247          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4248          */
4249         ret = i915_gem_object_set_cache_level(obj,
4250                                               HAS_WT(to_i915(obj->base.dev)) ?
4251                                               I915_CACHE_WT : I915_CACHE_NONE);
4252         if (ret) {
4253                 vma = ERR_PTR(ret);
4254                 goto err_unpin_global;
4255         }
4256
4257         /* As the user may map the buffer once pinned in the display plane
4258          * (e.g. libkms for the bootup splash), we have to ensure that we
4259          * always use map_and_fenceable for all scanout buffers. However,
4260          * it may simply be too big to fit into mappable, in which case
4261          * put it anyway and hope that userspace can cope (but always first
4262          * try to preserve the existing ABI).
4263          */
4264         vma = ERR_PTR(-ENOSPC);
4265         if ((flags & PIN_MAPPABLE) == 0 &&
4266             (!view || view->type == I915_GGTT_VIEW_NORMAL))
4267                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4268                                                flags |
4269                                                PIN_MAPPABLE |
4270                                                PIN_NONBLOCK);
4271         if (IS_ERR(vma))
4272                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4273         if (IS_ERR(vma))
4274                 goto err_unpin_global;
4275
4276         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4277
4278         __i915_gem_object_flush_for_display(obj);
4279
4280         /* It should now be out of any other write domains, and we can update
4281          * the domain values for our changes.
4282          */
4283         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4284
4285         return vma;
4286
4287 err_unpin_global:
4288         obj->pin_global--;
4289         return vma;
4290 }
4291
4292 void
4293 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4294 {
4295         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4296
4297         if (WARN_ON(vma->obj->pin_global == 0))
4298                 return;
4299
4300         if (--vma->obj->pin_global == 0)
4301                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4302
4303         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4304         i915_gem_object_bump_inactive_ggtt(vma->obj);
4305
4306         i915_vma_unpin(vma);
4307 }
4308
4309 /**
4310  * Moves a single object to the CPU read, and possibly write domain.
4311  * @obj: object to act on
4312  * @write: requesting write or read-only access
4313  *
4314  * This function returns when the move is complete, including waiting on
4315  * flushes to occur.
4316  */
4317 int
4318 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4319 {
4320         int ret;
4321
4322         lockdep_assert_held(&obj->base.dev->struct_mutex);
4323
4324         ret = i915_gem_object_wait(obj,
4325                                    I915_WAIT_INTERRUPTIBLE |
4326                                    I915_WAIT_LOCKED |
4327                                    (write ? I915_WAIT_ALL : 0),
4328                                    MAX_SCHEDULE_TIMEOUT,
4329                                    NULL);
4330         if (ret)
4331                 return ret;
4332
4333         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4334
4335         /* Flush the CPU cache if it's still invalid. */
4336         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4337                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4338                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4339         }
4340
4341         /* It should now be out of any other write domains, and we can update
4342          * the domain values for our changes.
4343          */
4344         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4345
4346         /* If we're writing through the CPU, then the GPU read domains will
4347          * need to be invalidated at next use.
4348          */
4349         if (write)
4350                 __start_cpu_write(obj);
4351
4352         return 0;
4353 }
4354
4355 /* Throttle our rendering by waiting until the ring has completed our requests
4356  * emitted over 20 msec ago.
4357  *
4358  * Note that if we were to use the current jiffies each time around the loop,
4359  * we wouldn't escape the function with any frames outstanding if the time to
4360  * render a frame was over 20ms.
4361  *
4362  * This should get us reasonable parallelism between CPU and GPU but also
4363  * relatively low latency when blocking on a particular request to finish.
4364  */
4365 static int
4366 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4367 {
4368         struct drm_i915_private *dev_priv = to_i915(dev);
4369         struct drm_i915_file_private *file_priv = file->driver_priv;
4370         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4371         struct i915_request *request, *target = NULL;
4372         long ret;
4373
4374         /* ABI: return -EIO if already wedged */
4375         if (i915_terminally_wedged(&dev_priv->gpu_error))
4376                 return -EIO;
4377
4378         spin_lock(&file_priv->mm.lock);
4379         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4380                 if (time_after_eq(request->emitted_jiffies, recent_enough))
4381                         break;
4382
4383                 if (target) {
4384                         list_del(&target->client_link);
4385                         target->file_priv = NULL;
4386                 }
4387
4388                 target = request;
4389         }
4390         if (target)
4391                 i915_request_get(target);
4392         spin_unlock(&file_priv->mm.lock);
4393
4394         if (target == NULL)
4395                 return 0;
4396
4397         ret = i915_request_wait(target,
4398                                 I915_WAIT_INTERRUPTIBLE,
4399                                 MAX_SCHEDULE_TIMEOUT);
4400         i915_request_put(target);
4401
4402         return ret < 0 ? ret : 0;
4403 }
4404
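/**
 * i915_gem_object_ggtt_pin - pin an object into the global GTT
 * @obj: object to pin
 * @view: the GGTT view to map (NULL for the normal view)
 * @size: minimum size for the binding, in bytes (0 for the default)
 * @alignment: required alignment of the binding
 * @flags: PIN_* placement flags (PIN_GLOBAL is added internally)
 *
 * Looks up (or creates) the vma for @obj in the global GTT, unbinds it if it
 * is misplaced for the requested size/alignment/flags, and then pins it.
 * Mirroring the display-plane caller above, typical usage is:
 *
 *	vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
 *	if (IS_ERR(vma))
 *		return vma;
 *
 * Returns the pinned vma on success or an ERR_PTR() on failure.
 */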
4405 struct i915_vma *
4406 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4407                          const struct i915_ggtt_view *view,
4408                          u64 size,
4409                          u64 alignment,
4410                          u64 flags)
4411 {
4412         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4413         struct i915_address_space *vm = &dev_priv->ggtt.vm;
4414         struct i915_vma *vma;
4415         int ret;
4416
4417         lockdep_assert_held(&obj->base.dev->struct_mutex);
4418
4419         if (flags & PIN_MAPPABLE &&
4420             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4421                 /* If the required space is larger than the available
4422                  * aperture, we will not be able to find a slot for the
4423                  * object and unbinding the object now will be in
4424                  * vain. Worse, doing so may cause us to ping-pong
4425                  * the object in and out of the Global GTT and
4426                  * waste a lot of cycles under the mutex.
4427                  */
4428                 if (obj->base.size > dev_priv->ggtt.mappable_end)
4429                         return ERR_PTR(-E2BIG);
4430
4431                 /* If NONBLOCK is set the caller is optimistically
4432                  * trying to cache the full object within the mappable
4433                  * aperture, and *must* have a fallback in place for
4434                  * situations where we cannot bind the object. We
4435                  * can be a little more lax here and use the fallback
4436                  * more often to avoid costly migrations of ourselves
4437                  * and other objects within the aperture.
4438                  *
4439                  * Half-the-aperture is used as a simple heuristic.
4440                  * More interesting would be to search for a free
4441                  * block prior to making the commitment to unbind.
4442                  * That caters for the self-harm case, and with a
4443                  * little more heuristics (e.g. NOFAULT, NOEVICT)
4444                  * we could try to minimise harm to others.
4445                  */
4446                 if (flags & PIN_NONBLOCK &&
4447                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
4448                         return ERR_PTR(-ENOSPC);
4449         }
4450
4451         vma = i915_vma_instance(obj, vm, view);
4452         if (unlikely(IS_ERR(vma)))
4453                 return vma;
4454
4455         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4456                 if (flags & PIN_NONBLOCK) {
4457                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4458                                 return ERR_PTR(-ENOSPC);
4459
4460                         if (flags & PIN_MAPPABLE &&
4461                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4462                                 return ERR_PTR(-ENOSPC);
4463                 }
4464
4465                 WARN(i915_vma_is_pinned(vma),
4466                      "bo is already pinned in ggtt with incorrect alignment:"
4467                      " offset=%08x, req.alignment=%llx,"
4468                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4469                      i915_ggtt_offset(vma), alignment,
4470                      !!(flags & PIN_MAPPABLE),
4471                      i915_vma_is_map_and_fenceable(vma));
4472                 ret = i915_vma_unbind(vma);
4473                 if (ret)
4474                         return ERR_PTR(ret);
4475         }
4476
4477         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
4478         if (ret)
4479                 return ERR_PTR(ret);
4480
4481         return vma;
4482 }
4483
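/*
 * The busy-ioctl result packs the id of the last writer into the low 16 bits
 * and one read flag per engine into the high 16 bits, so a single u32 can
 * report both the active writer and every active reader.
 */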
4484 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4485 {
4486         /* Note that we could alias engines in the execbuf API, but
4487          * that would be very unwise as it prevents userspace from
4488          * having fine control over engine selection. Ahem.
4489          *
4490          * This should be something like EXEC_MAX_ENGINE instead of
4491          * I915_NUM_ENGINES.
4492          */
4493         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4494         return 0x10000 << id;
4495 }
4496
4497 static __always_inline unsigned int __busy_write_id(unsigned int id)
4498 {
4499         /* The uABI guarantees an active writer is also amongst the read
4500          * engines. This would be true if we accessed the activity tracking
4501          * under the lock, but as we perform the lookup of the object and
4502          * its activity locklessly we can not guarantee that the last_write
4503          * being active implies that we have set the same engine flag from
4504          * last_read - hence we always set both read and write busy for
4505          * last_write.
4506          */
4507         return id | __busy_read_flag(id);
4508 }
4509
4510 static __always_inline unsigned int
4511 __busy_set_if_active(const struct dma_fence *fence,
4512                      unsigned int (*flag)(unsigned int id))
4513 {
4514         struct i915_request *rq;
4515
4516         /* We have to check the current hw status of the fence as the uABI
4517          * guarantees forward progress. We could rely on the idle worker
4518          * to eventually flush us, but to minimise latency just ask the
4519          * hardware.
4520          *
4521          * Note we only report on the status of native fences.
4522          */
4523         if (!dma_fence_is_i915(fence))
4524                 return 0;
4525
4526         /* opencode to_request() in order to avoid const warnings */
4527         rq = container_of(fence, struct i915_request, fence);
4528         if (i915_request_completed(rq))
4529                 return 0;
4530
4531         return flag(rq->engine->uabi_id);
4532 }
4533
4534 static __always_inline unsigned int
4535 busy_check_reader(const struct dma_fence *fence)
4536 {
4537         return __busy_set_if_active(fence, __busy_read_flag);
4538 }
4539
4540 static __always_inline unsigned int
4541 busy_check_writer(const struct dma_fence *fence)
4542 {
4543         if (!fence)
4544                 return 0;
4545
4546         return __busy_set_if_active(fence, __busy_write_id);
4547 }
4548
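/*
 * Report which engines are busy on an object without taking struct_mutex:
 * the object is looked up under RCU and its reservation object is sampled
 * using a seqcount, with the snapshot retried if the fences changed while a
 * non-idle result was being assembled.
 */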
4549 int
4550 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4551                     struct drm_file *file)
4552 {
4553         struct drm_i915_gem_busy *args = data;
4554         struct drm_i915_gem_object *obj;
4555         struct reservation_object_list *list;
4556         unsigned int seq;
4557         int err;
4558
4559         err = -ENOENT;
4560         rcu_read_lock();
4561         obj = i915_gem_object_lookup_rcu(file, args->handle);
4562         if (!obj)
4563                 goto out;
4564
4565         /* A discrepancy here is that we do not report the status of
4566          * non-i915 fences, i.e. even though we may report the object as idle,
4567          * a call to set-domain may still stall waiting for foreign rendering.
4568          * This also means that wait-ioctl may report an object as busy,
4569          * where busy-ioctl considers it idle.
4570          *
4571          * We trade the ability to warn of foreign fences for reporting which
4572          * i915 engines are active for the object.
4573          *
4574          * Alternatively, we can trade that extra information on read/write
4575          * activity with
4576          *      args->busy =
4577          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4578          * to report the overall busyness. This is what the wait-ioctl does.
4579          *
4580          */
4581 retry:
4582         seq = raw_read_seqcount(&obj->resv->seq);
4583
4584         /* Translate the exclusive fence to the READ *and* WRITE engine */
4585         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4586
4587         /* Translate shared fences to READ set of engines */
4588         list = rcu_dereference(obj->resv->fence);
4589         if (list) {
4590                 unsigned int shared_count = list->shared_count, i;
4591
4592                 for (i = 0; i < shared_count; ++i) {
4593                         struct dma_fence *fence =
4594                                 rcu_dereference(list->shared[i]);
4595
4596                         args->busy |= busy_check_reader(fence);
4597                 }
4598         }
4599
4600         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4601                 goto retry;
4602
4603         err = 0;
4604 out:
4605         rcu_read_unlock();
4606         return err;
4607 }
4608
4609 int
4610 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4611                         struct drm_file *file_priv)
4612 {
4613         return i915_gem_ring_throttle(dev, file_priv);
4614 }
4615
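/*
 * madvise lets userspace mark an object as I915_MADV_DONTNEED so that its
 * backing storage may be discarded under memory pressure (or truncated
 * immediately if no pages are attached); args->retained reports whether the
 * backing store had not already been purged.
 */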
4616 int
4617 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4618                        struct drm_file *file_priv)
4619 {
4620         struct drm_i915_private *dev_priv = to_i915(dev);
4621         struct drm_i915_gem_madvise *args = data;
4622         struct drm_i915_gem_object *obj;
4623         int err;
4624
4625         switch (args->madv) {
4626         case I915_MADV_DONTNEED:
4627         case I915_MADV_WILLNEED:
4628             break;
4629         default:
4630             return -EINVAL;
4631         }
4632
4633         obj = i915_gem_object_lookup(file_priv, args->handle);
4634         if (!obj)
4635                 return -ENOENT;
4636
4637         err = mutex_lock_interruptible(&obj->mm.lock);
4638         if (err)
4639                 goto out;
4640
4641         if (i915_gem_object_has_pages(obj) &&
4642             i915_gem_object_is_tiled(obj) &&
4643             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4644                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4645                         GEM_BUG_ON(!obj->mm.quirked);
4646                         __i915_gem_object_unpin_pages(obj);
4647                         obj->mm.quirked = false;
4648                 }
4649                 if (args->madv == I915_MADV_WILLNEED) {
4650                         GEM_BUG_ON(obj->mm.quirked);
4651                         __i915_gem_object_pin_pages(obj);
4652                         obj->mm.quirked = true;
4653                 }
4654         }
4655
4656         if (obj->mm.madv != __I915_MADV_PURGED)
4657                 obj->mm.madv = args->madv;
4658
4659         /* if the object is no longer attached, discard its backing storage */
4660         if (obj->mm.madv == I915_MADV_DONTNEED &&
4661             !i915_gem_object_has_pages(obj))
4662                 i915_gem_object_truncate(obj);
4663
4664         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4665         mutex_unlock(&obj->mm.lock);
4666
4667 out:
4668         i915_gem_object_put(obj);
4669         return err;
4670 }
4671
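/*
 * Called when the last GPU write targeting a frontbuffer object retires, so
 * that the display code is told the frontbuffer contents have changed.
 */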
4672 static void
4673 frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4674 {
4675         struct drm_i915_gem_object *obj =
4676                 container_of(active, typeof(*obj), frontbuffer_write);
4677
4678         intel_fb_obj_flush(obj, ORIGIN_CS);
4679 }
4680
4681 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4682                           const struct drm_i915_gem_object_ops *ops)
4683 {
4684         mutex_init(&obj->mm.lock);
4685
4686         INIT_LIST_HEAD(&obj->vma_list);
4687         INIT_LIST_HEAD(&obj->lut_list);
4688         INIT_LIST_HEAD(&obj->batch_pool_link);
4689
4690         obj->ops = ops;
4691
4692         reservation_object_init(&obj->__builtin_resv);
4693         obj->resv = &obj->__builtin_resv;
4694
4695         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4696         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4697
4698         obj->mm.madv = I915_MADV_WILLNEED;
4699         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4700         mutex_init(&obj->mm.get_page.lock);
4701
4702         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4703 }
4704
4705 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4706         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4707                  I915_GEM_OBJECT_IS_SHRINKABLE,
4708
4709         .get_pages = i915_gem_object_get_pages_gtt,
4710         .put_pages = i915_gem_object_put_pages_gtt,
4711
4712         .pwrite = i915_gem_object_pwrite_gtt,
4713 };
4714
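/*
 * Back the GEM object with a shmemfs file, preferring the driver's private
 * gemfs mount (used for transparent hugepage support) when it was set up.
 */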
4715 static int i915_gem_object_create_shmem(struct drm_device *dev,
4716                                         struct drm_gem_object *obj,
4717                                         size_t size)
4718 {
4719         struct drm_i915_private *i915 = to_i915(dev);
4720         unsigned long flags = VM_NORESERVE;
4721         struct file *filp;
4722
4723         drm_gem_private_object_init(dev, obj, size);
4724
4725         if (i915->mm.gemfs)
4726                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4727                                                  flags);
4728         else
4729                 filp = shmem_file_setup("i915", size, flags);
4730
4731         if (IS_ERR(filp))
4732                 return PTR_ERR(filp);
4733
4734         obj->filp = filp;
4735
4736         return 0;
4737 }
4738
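/*
 * Allocate a new shmemfs-backed GEM object of @size bytes, choosing LLC
 * caching when the GPU shares the last-level cache with the CPU and
 * uncached otherwise.
 */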
4739 struct drm_i915_gem_object *
4740 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4741 {
4742         struct drm_i915_gem_object *obj;
4743         struct address_space *mapping;
4744         unsigned int cache_level;
4745         gfp_t mask;
4746         int ret;
4747
4748         /* There is a prevalence of the assumption that we fit the object's
4749          * page count inside a 32bit _signed_ variable. Let's document this and
4750          * catch if we ever need to fix it. In the meantime, if you do spot
4751          * such a local variable, please consider fixing!
4752          */
4753         if (size >> PAGE_SHIFT > INT_MAX)
4754                 return ERR_PTR(-E2BIG);
4755
4756         if (overflows_type(size, obj->base.size))
4757                 return ERR_PTR(-E2BIG);
4758
4759         obj = i915_gem_object_alloc(dev_priv);
4760         if (obj == NULL)
4761                 return ERR_PTR(-ENOMEM);
4762
4763         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4764         if (ret)
4765                 goto fail;
4766
4767         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4768         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4769                 /* 965gm cannot relocate objects above 4GiB. */
4770                 mask &= ~__GFP_HIGHMEM;
4771                 mask |= __GFP_DMA32;
4772         }
4773
4774         mapping = obj->base.filp->f_mapping;
4775         mapping_set_gfp_mask(mapping, mask);
4776         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4777
4778         i915_gem_object_init(obj, &i915_gem_object_ops);
4779
4780         obj->write_domain = I915_GEM_DOMAIN_CPU;
4781         obj->read_domains = I915_GEM_DOMAIN_CPU;
4782
4783         if (HAS_LLC(dev_priv))
4784                 /* On some devices, we can have the GPU use the LLC (the CPU
4785                  * cache) for about a 10% performance improvement
4786                  * compared to uncached.  Graphics requests other than
4787                  * display scanout are coherent with the CPU in
4788                  * accessing this cache.  This means in this mode we
4789                  * don't need to clflush on the CPU side, and on the
4790                  * GPU side we only need to flush internal caches to
4791                  * get data visible to the CPU.
4792                  *
4793                  * However, we maintain the display planes as UC, and so
4794                  * need to rebind when first used as such.
4795                  */
4796                 cache_level = I915_CACHE_LLC;
4797         else
4798                 cache_level = I915_CACHE_NONE;
4799
4800         i915_gem_object_set_cache_coherency(obj, cache_level);
4801
4802         trace_i915_gem_object_create(obj);
4803
4804         return obj;
4805
4806 fail:
4807         i915_gem_object_free(obj);
4808         return ERR_PTR(ret);
4809 }
4810
4811 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4812 {
4813         /* If we are the last user of the backing storage (be it shmemfs
4814          * pages or stolen etc), we know that the pages are going to be
4815          * immediately released. In this case, we can then skip copying
4816          * back the contents from the GPU.
4817          */
4818
4819         if (obj->mm.madv != I915_MADV_WILLNEED)
4820                 return false;
4821
4822         if (obj->base.filp == NULL)
4823                 return true;
4824
4825         /* At first glance, this looks racy, but then again so would be
4826          * userspace racing mmap against close. However, the first external
4827          * reference to the filp can only be obtained through the
4828          * i915_gem_mmap_ioctl() which safeguards us against the user
4829          * acquiring such a reference whilst we are in the middle of
4830          * freeing the object.
4831          */
4832         return atomic_long_read(&obj->base.filp->f_count) == 1;
4833 }
4834
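/*
 * Final release for a batch of objects taken off the freed llist: unbind and
 * destroy any remaining vma under struct_mutex, drop the pages and backing
 * storage, then release the GEM core state and return each object to the
 * slab cache.
 */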
4835 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4836                                     struct llist_node *freed)
4837 {
4838         struct drm_i915_gem_object *obj, *on;
4839
4840         intel_runtime_pm_get(i915);
4841         llist_for_each_entry_safe(obj, on, freed, freed) {
4842                 struct i915_vma *vma, *vn;
4843
4844                 trace_i915_gem_object_destroy(obj);
4845
4846                 mutex_lock(&i915->drm.struct_mutex);
4847
4848                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4849                 list_for_each_entry_safe(vma, vn,
4850                                          &obj->vma_list, obj_link) {
4851                         GEM_BUG_ON(i915_vma_is_active(vma));
4852                         vma->flags &= ~I915_VMA_PIN_MASK;
4853                         i915_vma_destroy(vma);
4854                 }
4855                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4856                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4857
4858                 /* This serializes freeing with the shrinker. Since the free
4859                  * is delayed, first by RCU then by the workqueue, we want the
4860                  * shrinker to be able to free pages of unreferenced objects,
4861                  * or else we may oom whilst there are plenty of deferred
4862                  * freed objects.
4863                  */
4864                 if (i915_gem_object_has_pages(obj)) {
4865                         spin_lock(&i915->mm.obj_lock);
4866                         list_del_init(&obj->mm.link);
4867                         spin_unlock(&i915->mm.obj_lock);
4868                 }
4869
4870                 mutex_unlock(&i915->drm.struct_mutex);
4871
4872                 GEM_BUG_ON(obj->bind_count);
4873                 GEM_BUG_ON(obj->userfault_count);
4874                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4875                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4876
4877                 if (obj->ops->release)
4878                         obj->ops->release(obj);
4879
4880                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4881                         atomic_set(&obj->mm.pages_pin_count, 0);
4882                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4883                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4884
4885                 if (obj->base.import_attach)
4886                         drm_prime_gem_destroy(&obj->base, NULL);
4887
4888                 reservation_object_fini(&obj->__builtin_resv);
4889                 drm_gem_object_release(&obj->base);
4890                 i915_gem_info_remove_obj(i915, obj->base.size);
4891
4892                 kfree(obj->bit_17);
4893                 i915_gem_object_free(obj);
4894
4895                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4896                 atomic_dec(&i915->mm.free_count);
4897
4898                 if (on)
4899                         cond_resched();
4900         }
4901         intel_runtime_pm_put(i915);
4902 }
4903
4904 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4905 {
4906         struct llist_node *freed;
4907
4908         /* Free the oldest, most stale object to keep the free_list short */
4909         freed = NULL;
4910         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4911                 /* Only one consumer of llist_del_first() allowed */
4912                 spin_lock(&i915->mm.free_lock);
4913                 freed = llist_del_first(&i915->mm.free_list);
4914                 spin_unlock(&i915->mm.free_lock);
4915         }
4916         if (unlikely(freed)) {
4917                 freed->next = NULL;
4918                 __i915_gem_free_objects(i915, freed);
4919         }
4920 }
4921
4922 static void __i915_gem_free_work(struct work_struct *work)
4923 {
4924         struct drm_i915_private *i915 =
4925                 container_of(work, struct drm_i915_private, mm.free_work);
4926         struct llist_node *freed;
4927
4928         /*
4929          * All file-owned VMA should have been released by this point through
4930          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4931          * However, the object may also be bound into the global GTT (e.g.
4932          * older GPUs without per-process support, or for direct access through
4933          * the GTT either for the user or for scanout). Those VMA still need to
4934          * be unbound now.
4935          */
4936
4937         spin_lock(&i915->mm.free_lock);
4938         while ((freed = llist_del_all(&i915->mm.free_list))) {
4939                 spin_unlock(&i915->mm.free_lock);
4940
4941                 __i915_gem_free_objects(i915, freed);
4942                 if (need_resched())
4943                         return;
4944
4945                 spin_lock(&i915->mm.free_lock);
4946         }
4947         spin_unlock(&i915->mm.free_lock);
4948 }
4949
4950 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4951 {
4952         struct drm_i915_gem_object *obj =
4953                 container_of(head, typeof(*obj), rcu);
4954         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4955
4956         /*
4957          * Since we require blocking on struct_mutex to unbind the freed
4958          * object from the GPU before releasing resources back to the
4959          * system, we cannot do that directly from the RCU callback (which may
4960          * be a softirq context), but must instead defer that work onto a
4961          * kthread. We use the RCU callback rather than move the freed object
4962          * directly onto the work queue so that we can mix between using the
4963          * worker and performing frees directly from subsequent allocations for
4964          * crude but effective memory throttling.
4965          */
4966         if (llist_add(&obj->freed, &i915->mm.free_list))
4967                 queue_work(i915->wq, &i915->mm.free_work);
4968 }
4969
4970 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4971 {
4972         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4973
4974         if (obj->mm.quirked)
4975                 __i915_gem_object_unpin_pages(obj);
4976
4977         if (discard_backing_storage(obj))
4978                 obj->mm.madv = I915_MADV_DONTNEED;
4979
4980         /*
4981          * Before we free the object, make sure any pure RCU-only
4982          * read-side critical sections are complete, e.g.
4983          * i915_gem_busy_ioctl(). For the corresponding synchronized
4984          * lookup see i915_gem_object_lookup_rcu().
4985          */
4986         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4987         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4988 }
4989
4990 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4991 {
4992         lockdep_assert_held(&obj->base.dev->struct_mutex);
4993
4994         if (!i915_gem_object_has_active_reference(obj) &&
4995             i915_gem_object_is_active(obj))
4996                 i915_gem_object_set_active_reference(obj);
4997         else
4998                 i915_gem_object_put(obj);
4999 }
5000
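/*
 * Return the GPU to a known state after resume or takeover: clear any stale
 * wedged status and, where the reset is trusted (gen5+ with reset support),
 * perform a full GPU reset so that no inherited context state survives.
 */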
5001 void i915_gem_sanitize(struct drm_i915_private *i915)
5002 {
5003         int err;
5004
5005         GEM_TRACE("\n");
5006
5007         mutex_lock(&i915->drm.struct_mutex);
5008
5009         intel_runtime_pm_get(i915);
5010         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5011
5012         /*
5013          * As we have just resumed the machine and woken the device up from
5014          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
5015          * back to defaults, recovering from whatever wedged state we left it
5016          * in and so worth trying to use the device once more.
5017          */
5018         if (i915_terminally_wedged(&i915->gpu_error))
5019                 i915_gem_unset_wedged(i915);
5020
5021         /*
5022          * If we inherit context state from the BIOS or earlier occupants
5023          * of the GPU, the GPU may be in an inconsistent state when we
5024          * try to take over. The only way to remove the earlier state
5025          * is by resetting. However, resetting on earlier gen is tricky as
5026          * it may impact the display and we are uncertain about the stability
5027          * of the reset, so this could be applied to even earlier gen.
5028          */
5029         err = -ENODEV;
5030         if (INTEL_GEN(i915) >= 5 && intel_has_gpu_reset(i915))
5031                 err = WARN_ON(intel_gpu_reset(i915, ALL_ENGINES));
5032         if (!err)
5033                 intel_engines_sanitize(i915);
5034
5035         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5036         intel_runtime_pm_put(i915);
5037
5038         i915_gem_contexts_lost(i915);
5039         mutex_unlock(&i915->drm.struct_mutex);
5040 }
5041
5042 int i915_gem_suspend(struct drm_i915_private *i915)
5043 {
5044         int ret;
5045
5046         GEM_TRACE("\n");
5047
5048         intel_runtime_pm_get(i915);
5049         intel_suspend_gt_powersave(i915);
5050
5051         mutex_lock(&i915->drm.struct_mutex);
5052
5053         /*
5054          * We have to flush all the executing contexts to main memory so
5055          * that they can be saved in the hibernation image. To ensure the last
5056          * context image is coherent, we have to switch away from it. That
5057          * leaves the i915->kernel_context still active when
5058          * we actually suspend, and its image in memory may not match the GPU
5059          * state. Fortunately, the kernel_context is disposable and we do
5060          * not rely on its state.
5061          */
5062         if (!i915_terminally_wedged(&i915->gpu_error)) {
5063                 ret = i915_gem_switch_to_kernel_context(i915);
5064                 if (ret)
5065                         goto err_unlock;
5066
5067                 ret = i915_gem_wait_for_idle(i915,
5068                                              I915_WAIT_INTERRUPTIBLE |
5069                                              I915_WAIT_LOCKED |
5070                                              I915_WAIT_FOR_IDLE_BOOST,
5071                                              MAX_SCHEDULE_TIMEOUT);
5072                 if (ret && ret != -EIO)
5073                         goto err_unlock;
5074
5075                 assert_kernel_context_is_current(i915);
5076         }
5077         i915_retire_requests(i915); /* ensure we flush after wedging */
5078
5079         mutex_unlock(&i915->drm.struct_mutex);
5080
5081         intel_uc_suspend(i915);
5082
5083         cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
5084         cancel_delayed_work_sync(&i915->gt.retire_work);
5085
5086         /*
5087          * As the idle_work will rearm itself if it detects a race, play safe and
5088          * repeat the flush until it is definitely idle.
5089          */
5090         drain_delayed_work(&i915->gt.idle_work);
5091
5092         /*
5093          * Assert that we successfully flushed all the work and
5094          * reset the GPU back to its idle, low power state.
5095          */
5096         WARN_ON(i915->gt.awake);
5097         if (WARN_ON(!intel_engines_are_idle(i915)))
5098                 i915_gem_set_wedged(i915); /* no hope, discard everything */
5099
5100         intel_runtime_pm_put(i915);
5101         return 0;
5102
5103 err_unlock:
5104         mutex_unlock(&i915->drm.struct_mutex);
5105         intel_runtime_pm_put(i915);
5106         return ret;
5107 }
5108
5109 void i915_gem_suspend_late(struct drm_i915_private *i915)
5110 {
5111         struct drm_i915_gem_object *obj;
5112         struct list_head *phases[] = {
5113                 &i915->mm.unbound_list,
5114                 &i915->mm.bound_list,
5115                 NULL
5116         }, **phase;
5117
5118         /*
5119          * Neither the BIOS, ourselves nor any other kernel
5120          * expects the system to be in execlists mode on startup,
5121          * so we need to reset the GPU back to legacy mode. And the only
5122          * known way to disable logical contexts is through a GPU reset.
5123          *
5124          * So in order to leave the system in a known default configuration,
5125          * always reset the GPU upon unload and suspend. Afterwards we then
5126          * clean up the GEM state tracking, flushing off the requests and
5127          * leaving the system in a known idle state.
5128          *
5129          * Note that it is of the utmost importance that the GPU is idle and
5130          * all stray writes are flushed *before* we dismantle the backing
5131          * storage for the pinned objects.
5132          *
5133          * However, since we are uncertain that resetting the GPU on older
5134          * machines is a good idea, we don't - just in case it leaves the
5135          * machine in an unusable condition.
5136          */
5137
5138         mutex_lock(&i915->drm.struct_mutex);
5139         for (phase = phases; *phase; phase++) {
5140                 list_for_each_entry(obj, *phase, mm.link)
5141                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
5142         }
5143         mutex_unlock(&i915->drm.struct_mutex);
5144
5145         intel_uc_sanitize(i915);
5146         i915_gem_sanitize(i915);
5147 }
5148
5149 void i915_gem_resume(struct drm_i915_private *i915)
5150 {
5151         GEM_TRACE("\n");
5152
5153         WARN_ON(i915->gt.awake);
5154
5155         mutex_lock(&i915->drm.struct_mutex);
5156         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5157
5158         i915_gem_restore_gtt_mappings(i915);
5159         i915_gem_restore_fences(i915);
5160
5161         /*
5162          * As we didn't flush the kernel context before suspend, we cannot
5163          * guarantee that the context image is complete. So let's just reset
5164          * it and start again.
5165          */
5166         i915->gt.resume(i915);
5167
5168         if (i915_gem_init_hw(i915))
5169                 goto err_wedged;
5170
5171         intel_uc_resume(i915);
5172
5173         /* Always reload a context for powersaving. */
5174         if (i915_gem_switch_to_kernel_context(i915))
5175                 goto err_wedged;
5176
5177 out_unlock:
5178         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5179         mutex_unlock(&i915->drm.struct_mutex);
5180         return;
5181
5182 err_wedged:
5183         if (!i915_terminally_wedged(&i915->gpu_error)) {
5184                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5185                 i915_gem_set_wedged(i915);
5186         }
5187         goto out_unlock;
5188 }
5189
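/*
 * Program the display and GT tiling/swizzle controls, but only on gen5+ and
 * only when bit-6 swizzling is actually in use.
 */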
5190 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5191 {
5192         if (INTEL_GEN(dev_priv) < 5 ||
5193             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5194                 return;
5195
5196         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5197                                  DISP_TILE_SURFACE_SWIZZLING);
5198
5199         if (IS_GEN5(dev_priv))
5200                 return;
5201
5202         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5203         if (IS_GEN6(dev_priv))
5204                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5205         else if (IS_GEN7(dev_priv))
5206                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5207         else if (IS_GEN8(dev_priv))
5208                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5209         else
5210                 BUG();
5211 }
5212
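/*
 * Quiesce an unused legacy ring by zeroing its control, head, tail and
 * start registers.
 */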
5213 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5214 {
5215         I915_WRITE(RING_CTL(base), 0);
5216         I915_WRITE(RING_HEAD(base), 0);
5217         I915_WRITE(RING_TAIL(base), 0);
5218         I915_WRITE(RING_START(base), 0);
5219 }
5220
5221 static void init_unused_rings(struct drm_i915_private *dev_priv)
5222 {
5223         if (IS_I830(dev_priv)) {
5224                 init_unused_ring(dev_priv, PRB1_BASE);
5225                 init_unused_ring(dev_priv, SRB0_BASE);
5226                 init_unused_ring(dev_priv, SRB1_BASE);
5227                 init_unused_ring(dev_priv, SRB2_BASE);
5228                 init_unused_ring(dev_priv, SRB3_BASE);
5229         } else if (IS_GEN2(dev_priv)) {
5230                 init_unused_ring(dev_priv, SRB0_BASE);
5231                 init_unused_ring(dev_priv, SRB1_BASE);
5232         } else if (IS_GEN3(dev_priv)) {
5233                 init_unused_ring(dev_priv, PRB1_BASE);
5234                 init_unused_ring(dev_priv, PRB2_BASE);
5235         }
5236 }
5237
5238 static int __i915_gem_restart_engines(void *data)
5239 {
5240         struct drm_i915_private *i915 = data;
5241         struct intel_engine_cs *engine;
5242         enum intel_engine_id id;
5243         int err;
5244
5245         for_each_engine(engine, i915, id) {
5246                 err = engine->init_hw(engine);
5247                 if (err) {
5248                         DRM_ERROR("Failed to restart %s (%d)\n",
5249                                   engine->name, err);
5250                         return err;
5251                 }
5252         }
5253
5254         return 0;
5255 }
5256
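/*
 * (Re)initialise the hardware: apply the GT workarounds, set up swizzling
 * and the unused legacy rings, enable PPGTT and WOPCM, load the uC firmware,
 * program the MOCS tables and finally restart the engines. Called both at
 * driver load and on resume.
 */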
5257 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5258 {
5259         int ret;
5260
5261         dev_priv->gt.last_init_time = ktime_get();
5262
5263         /* Double layer security blanket, see i915_gem_init() */
5264         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5265
5266         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5267                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5268
5269         if (IS_HASWELL(dev_priv))
5270                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5271                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5272
5273         if (HAS_PCH_NOP(dev_priv)) {
5274                 if (IS_IVYBRIDGE(dev_priv)) {
5275                         u32 temp = I915_READ(GEN7_MSG_CTL);
5276                         temp &= ~(WAIT_FOR_PCH_FLR_ACK | WAIT_FOR_PCH_RESET_ACK);
5277                         I915_WRITE(GEN7_MSG_CTL, temp);
5278                 } else if (INTEL_GEN(dev_priv) >= 7) {
5279                         u32 temp = I915_READ(HSW_NDE_RSTWRN_OPT);
5280                         temp &= ~RESET_PCH_HANDSHAKE_ENABLE;
5281                         I915_WRITE(HSW_NDE_RSTWRN_OPT, temp);
5282                 }
5283         }
5284
5285         intel_gt_workarounds_apply(dev_priv);
5286
5287         i915_gem_init_swizzling(dev_priv);
5288
5289         /*
5290          * At least 830 can leave some of the unused rings
5291          * "active" (ie. head != tail) after resume which
5292          * will prevent c3 entry. Make sure all unused rings
5293          * are totally idle.
5294          */
5295         init_unused_rings(dev_priv);
5296
5297         BUG_ON(!dev_priv->kernel_context);
5298         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5299                 ret = -EIO;
5300                 goto out;
5301         }
5302
5303         ret = i915_ppgtt_init_hw(dev_priv);
5304         if (ret) {
5305                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5306                 goto out;
5307         }
5308
5309         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5310         if (ret) {
5311                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5312                 goto out;
5313         }
5314
5315         /* We can't enable contexts until all firmware is loaded */
5316         ret = intel_uc_init_hw(dev_priv);
5317         if (ret) {
5318                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5319                 goto out;
5320         }
5321
5322         intel_mocs_init_l3cc_table(dev_priv);
5323
5324         /* Only when the HW is re-initialised, can we replay the requests */
5325         ret = __i915_gem_restart_engines(dev_priv);
5326         if (ret)
5327                 goto cleanup_uc;
5328
5329         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5330
5331         return 0;
5332
5333 cleanup_uc:
5334         intel_uc_fini_hw(dev_priv);
5335 out:
5336         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5337
5338         return ret;
5339 }
5340
5341 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5342 {
5343         struct i915_gem_context *ctx;
5344         struct intel_engine_cs *engine;
5345         enum intel_engine_id id;
5346         int err;
5347
5348         /*
5349          * As we reset the GPU during very early sanitisation, the current
5350          * register state on the GPU should reflect its default values.
5351          * We load a context onto the hw (with restore-inhibit), then switch
5352          * over to a second context to save that default register state. We
5353          * can then prime every new context with that state so they all start
5354          * from the same default HW values.
5355          */
5356
5357         ctx = i915_gem_context_create_kernel(i915, 0);
5358         if (IS_ERR(ctx))
5359                 return PTR_ERR(ctx);
5360
5361         for_each_engine(engine, i915, id) {
5362                 struct i915_request *rq;
5363
5364                 rq = i915_request_alloc(engine, ctx);
5365                 if (IS_ERR(rq)) {
5366                         err = PTR_ERR(rq);
5367                         goto out_ctx;
5368                 }
5369
5370                 err = 0;
5371                 if (engine->init_context)
5372                         err = engine->init_context(rq);
5373
5374                 i915_request_add(rq);
5375                 if (err)
5376                         goto err_active;
5377         }
5378
5379         err = i915_gem_switch_to_kernel_context(i915);
5380         if (err)
5381                 goto err_active;
5382
5383         if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
5384                 i915_gem_set_wedged(i915);
5385                 err = -EIO; /* Caller will declare us wedged */
5386                 goto err_active;
5387         }
5388
5389         assert_kernel_context_is_current(i915);
5390
5391         for_each_engine(engine, i915, id) {
5392                 struct i915_vma *state;
5393
5394                 state = to_intel_context(ctx, engine)->state;
5395                 if (!state)
5396                         continue;
5397
5398                 /*
5399                  * As we will hold a reference to the logical state, it will
5400                  * not be torn down with the context, and importantly the
5401                  * object will hold onto its vma (making it possible for a
5402                  * stray GTT write to corrupt our defaults). Unmap the vma
5403                  * from the GTT to prevent such accidents and reclaim the
5404                  * space.
5405                  */
5406                 err = i915_vma_unbind(state);
5407                 if (err)
5408                         goto err_active;
5409
5410                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5411                 if (err)
5412                         goto err_active;
5413
5414                 engine->default_state = i915_gem_object_get(state->obj);
5415         }
5416
5417         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5418                 unsigned int found = intel_engines_has_context_isolation(i915);
5419
5420                 /*
5421                  * Make sure that classes with multiple engine instances all
5422                  * share the same basic configuration.
5423                  */
5424                 for_each_engine(engine, i915, id) {
5425                         unsigned int bit = BIT(engine->uabi_class);
5426                         unsigned int expected = engine->default_state ? bit : 0;
5427
5428                         if ((found & bit) != expected) {
5429                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5430                                           engine->uabi_class, engine->name);
5431                         }
5432                 }
5433         }
5434
5435 out_ctx:
5436         i915_gem_context_set_closed(ctx);
5437         i915_gem_context_put(ctx);
5438         return err;
5439
5440 err_active:
5441         /*
5442          * and ready to be torn down. First try to flush any remaining
5443          * and ready to be torn-down. First try to flush any remaining
5444          * request, ensure we are pointing at the kernel context and
5445          * then remove it.
5446          */
5447         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5448                 goto out_ctx;
5449
5450         if (WARN_ON(i915_gem_wait_for_idle(i915,
5451                                            I915_WAIT_LOCKED,
5452                                            MAX_SCHEDULE_TIMEOUT)))
5453                 goto out_ctx;
5454
5455         i915_gem_contexts_lost(i915);
5456         goto out_ctx;
5457 }
5458
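/*
 * One-time GEM initialisation at driver load: set up userptr, uC and WOPCM
 * support, the global GTT, contexts and engines, bring up the hardware and
 * record the default context state. On -EIO the GPU is marked wedged but
 * KMS is kept alive.
 */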
5459 int i915_gem_init(struct drm_i915_private *dev_priv)
5460 {
5461         int ret;
5462
5463         /* We need to fall back to 4K pages if the host doesn't support huge gtt. */
5464         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5465                 mkwrite_device_info(dev_priv)->page_sizes =
5466                         I915_GTT_PAGE_SIZE_4K;
5467
5468         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5469
5470         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5471                 dev_priv->gt.resume = intel_lr_context_resume;
5472                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5473         } else {
5474                 dev_priv->gt.resume = intel_legacy_submission_resume;
5475                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5476         }
5477
5478         ret = i915_gem_init_userptr(dev_priv);
5479         if (ret)
5480                 return ret;
5481
5482         ret = intel_uc_init_misc(dev_priv);
5483         if (ret)
5484                 return ret;
5485
5486         ret = intel_wopcm_init(&dev_priv->wopcm);
5487         if (ret)
5488                 goto err_uc_misc;
5489
5490         /* This is just a security blanket to placate dragons.
5491          * On some systems, we very sporadically observe that the first TLBs
5492          * used by the CS may be stale, despite us poking the TLB reset. If
5493          * we hold the forcewake during initialisation these problems
5494          * just magically go away.
5495          */
5496         mutex_lock(&dev_priv->drm.struct_mutex);
5497         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5498
5499         ret = i915_gem_init_ggtt(dev_priv);
5500         if (ret) {
5501                 GEM_BUG_ON(ret == -EIO);
5502                 goto err_unlock;
5503         }
5504
5505         ret = i915_gem_contexts_init(dev_priv);
5506         if (ret) {
5507                 GEM_BUG_ON(ret == -EIO);
5508                 goto err_ggtt;
5509         }
5510
5511         ret = intel_engines_init(dev_priv);
5512         if (ret) {
5513                 GEM_BUG_ON(ret == -EIO);
5514                 goto err_context;
5515         }
5516
5517         intel_init_gt_powersave(dev_priv);
5518
5519         ret = intel_uc_init(dev_priv);
5520         if (ret)
5521                 goto err_pm;
5522
5523         ret = i915_gem_init_hw(dev_priv);
5524         if (ret)
5525                 goto err_uc_init;
5526
5527         /*
5528          * Despite its name, intel_init_clock_gating applies both display
5529          * clock gating workarounds and GT mmio workarounds, plus the occasional
5530          * GT power context workaround. Worse, sometimes it includes a context
5531          * register workaround which we need to apply before we record the
5532          * default HW state for all contexts.
5533          *
5534          * FIXME: break up the workarounds and apply them at the right time!
5535          */
5536         intel_init_clock_gating(dev_priv);
5537
5538         ret = __intel_engines_record_defaults(dev_priv);
5539         if (ret)
5540                 goto err_init_hw;
5541
5542         if (i915_inject_load_failure()) {
5543                 ret = -ENODEV;
5544                 goto err_init_hw;
5545         }
5546
5547         if (i915_inject_load_failure()) {
5548                 ret = -EIO;
5549                 goto err_init_hw;
5550         }
5551
5552         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5553         mutex_unlock(&dev_priv->drm.struct_mutex);
5554
5555         return 0;
5556
5557         /*
5558          * Unwinding is complicated by the fact that we want to handle -EIO to
5559          * mean disable GPU submission but keep KMS alive. We want to mark the
5560          * HW as irreversibly wedged, but keep enough state around that the
5561          * driver doesn't explode during runtime.
5562          */
5563 err_init_hw:
5564         mutex_unlock(&dev_priv->drm.struct_mutex);
5565
5566         WARN_ON(i915_gem_suspend(dev_priv));
5567         i915_gem_suspend_late(dev_priv);
5568
5569         i915_gem_drain_workqueue(dev_priv);
5570
5571         mutex_lock(&dev_priv->drm.struct_mutex);
5572         intel_uc_fini_hw(dev_priv);
5573 err_uc_init:
5574         intel_uc_fini(dev_priv);
5575 err_pm:
5576         if (ret != -EIO) {
5577                 intel_cleanup_gt_powersave(dev_priv);
5578                 i915_gem_cleanup_engines(dev_priv);
5579         }
5580 err_context:
5581         if (ret != -EIO)
5582                 i915_gem_contexts_fini(dev_priv);
5583 err_ggtt:
5584 err_unlock:
5585         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5586         mutex_unlock(&dev_priv->drm.struct_mutex);
5587
5588 err_uc_misc:
5589         intel_uc_fini_misc(dev_priv);
5590
5591         if (ret != -EIO)
5592                 i915_gem_cleanup_userptr(dev_priv);
5593
5594         if (ret == -EIO) {
5595                 /*
5596                  * Allow engine initialisation to fail by marking the GPU as
5597                  * wedged. But we only want to do this where the GPU is angry,
5598                  * for all other failures, such as an allocation failure, bail.
5599                  */
5600                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5601                         i915_load_error(dev_priv,
5602                                         "Failed to initialize GPU, declaring it wedged!\n");
5603                         i915_gem_set_wedged(dev_priv);
5604                 }
5605                 ret = 0;
5606         }
5607
5608         i915_gem_drain_freed_objects(dev_priv);
5609         return ret;
5610 }
5611
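/* Tear down the GEM state in roughly the reverse order of i915_gem_init(). */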
5612 void i915_gem_fini(struct drm_i915_private *dev_priv)
5613 {
5614         i915_gem_suspend_late(dev_priv);
5615
5616         /* Flush any outstanding unpin_work. */
5617         i915_gem_drain_workqueue(dev_priv);
5618
5619         mutex_lock(&dev_priv->drm.struct_mutex);
5620         intel_uc_fini_hw(dev_priv);
5621         intel_uc_fini(dev_priv);
5622         i915_gem_cleanup_engines(dev_priv);
5623         i915_gem_contexts_fini(dev_priv);
5624         mutex_unlock(&dev_priv->drm.struct_mutex);
5625
5626         intel_uc_fini_misc(dev_priv);
5627         i915_gem_cleanup_userptr(dev_priv);
5628
5629         i915_gem_drain_freed_objects(dev_priv);
5630
5631         WARN_ON(!list_empty(&dev_priv->contexts.list));
5632 }
5633
5634 void i915_gem_init_mmio(struct drm_i915_private *i915)
5635 {
5636         i915_gem_sanitize(i915);
5637 }
5638
5639 void
5640 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5641 {
5642         struct intel_engine_cs *engine;
5643         enum intel_engine_id id;
5644
5645         for_each_engine(engine, dev_priv, id)
5646                 dev_priv->gt.cleanup_engine(engine);
5647 }
5648
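/*
 * Work out how many fence registers the platform provides (8, 16 or 32, or
 * the number reported by the host when running as a vGPU guest), initialise
 * their bookkeeping and detect the bit-6 swizzling mode.
 */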
5649 void
5650 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5651 {
5652         int i;
5653
5654         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5655             !IS_CHERRYVIEW(dev_priv))
5656                 dev_priv->num_fence_regs = 32;
5657         else if (INTEL_GEN(dev_priv) >= 4 ||
5658                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5659                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5660                 dev_priv->num_fence_regs = 16;
5661         else
5662                 dev_priv->num_fence_regs = 8;
5663
5664         if (intel_vgpu_active(dev_priv))
5665                 dev_priv->num_fence_regs =
5666                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5667
5668         /* Initialize fence registers to zero */
5669         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5670                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5671
5672                 fence->i915 = dev_priv;
5673                 fence->id = i;
5674                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5675         }
5676         i915_gem_restore_fences(dev_priv);
5677
5678         i915_gem_detect_bit_6_swizzle(dev_priv);
5679 }
5680
5681 static void i915_gem_init__mm(struct drm_i915_private *i915)
5682 {
5683         spin_lock_init(&i915->mm.object_stat_lock);
5684         spin_lock_init(&i915->mm.obj_lock);
5685         spin_lock_init(&i915->mm.free_lock);
5686
5687         init_llist_head(&i915->mm.free_list);
5688
5689         INIT_LIST_HEAD(&i915->mm.unbound_list);
5690         INIT_LIST_HEAD(&i915->mm.bound_list);
5691         INIT_LIST_HEAD(&i915->mm.fence_list);
5692         INIT_LIST_HEAD(&i915->mm.userfault_list);
5693
5694         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5695 }
5696
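/*
 * Early, device-independent GEM setup: create the slab caches for objects,
 * vmas, requests and friends, initialise the memory management lists and
 * workers, and mount the private gemfs instance if possible.
 */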
5697 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5698 {
5699         int err = -ENOMEM;
5700
5701         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5702         if (!dev_priv->objects)
5703                 goto err_out;
5704
5705         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5706         if (!dev_priv->vmas)
5707                 goto err_objects;
5708
5709         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5710         if (!dev_priv->luts)
5711                 goto err_vmas;
5712
5713         dev_priv->requests = KMEM_CACHE(i915_request,
5714                                         SLAB_HWCACHE_ALIGN |
5715                                         SLAB_RECLAIM_ACCOUNT |
5716                                         SLAB_TYPESAFE_BY_RCU);
5717         if (!dev_priv->requests)
5718                 goto err_luts;
5719
5720         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5721                                             SLAB_HWCACHE_ALIGN |
5722                                             SLAB_RECLAIM_ACCOUNT);
5723         if (!dev_priv->dependencies)
5724                 goto err_requests;
5725
5726         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5727         if (!dev_priv->priorities)
5728                 goto err_dependencies;
5729
5730         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5731         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5732         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5733
5734         i915_gem_init__mm(dev_priv);
5735
5736         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5737                           i915_gem_retire_work_handler);
5738         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5739                           i915_gem_idle_work_handler);
5740         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5741         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5742
5743         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5744
5745         spin_lock_init(&dev_priv->fb_tracking.lock);
5746
5747         err = i915_gemfs_init(dev_priv);
5748         if (err)
5749                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled (%d).\n", err);
5750
5751         return 0;
5752
5753 err_dependencies:
5754         kmem_cache_destroy(dev_priv->dependencies);
5755 err_requests:
5756         kmem_cache_destroy(dev_priv->requests);
5757 err_luts:
5758         kmem_cache_destroy(dev_priv->luts);
5759 err_vmas:
5760         kmem_cache_destroy(dev_priv->vmas);
5761 err_objects:
5762         kmem_cache_destroy(dev_priv->objects);
5763 err_out:
5764         return err;
5765 }
5766
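/**
 * i915_gem_cleanup_early - undo i915_gem_init_early() at driver unload
 * @dev_priv: i915 device instance
 *
 * Drain the deferred-free list, warn about any objects or timelines
 * that leaked, destroy the slab caches and release the private tmpfs
 * mount created by i915_gemfs_init().
 */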
5767 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5768 {
5769         i915_gem_drain_freed_objects(dev_priv);
5770         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5771         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5772         WARN_ON(dev_priv->mm.object_count);
5773         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5774
5775         kmem_cache_destroy(dev_priv->priorities);
5776         kmem_cache_destroy(dev_priv->dependencies);
5777         kmem_cache_destroy(dev_priv->requests);
5778         kmem_cache_destroy(dev_priv->luts);
5779         kmem_cache_destroy(dev_priv->vmas);
5780         kmem_cache_destroy(dev_priv->objects);
5781
5782         /* And ensure that our SLAB_TYPESAFE_BY_RCU slabs are truly destroyed */
5783         rcu_barrier();
5784
5785         i915_gemfs_fini(dev_priv);
5786 }
5787
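/*
 * i915_gem_freeze - trim GEM memory before hibernation
 *
 * Runs early in the hibernation sequence, before the image is written
 * (the CPU-domain fixup happens later in i915_gem_freeze_late()):
 * shrink aggressively now so the hibernation image stays small.
 */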
5788 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5789 {
5790         /* Discard all purgeable objects; let userspace recover those as
5791          * required after resuming.
5792          */
5793         i915_gem_shrink_all(dev_priv);
5794
5795         return 0;
5796 }
5797
5798 int i915_gem_freeze_late(struct drm_i915_private *i915)
5799 {
5800         struct drm_i915_gem_object *obj;
5801         struct list_head *phases[] = {
5802                 &i915->mm.unbound_list,
5803                 &i915->mm.bound_list,
5804                 NULL
5805         }, **phase;
5806
5807         /*
5808          * Called just before we write the hibernation image.
5809          *
5810          * We need to update the domain tracking to reflect that the CPU
5811          * will be accessing all the pages to create the hibernation image
5812          * and to restore from it, so upon restoration those pages will be
5813          * in the CPU domain.
5814          *
5815          * To make sure the hibernation image contains the latest state,
5816          * we update that state just before writing out the image.
5817          *
5818          * To try to reduce the hibernation image, we manually shrink
5819          * the objects as well; see i915_gem_freeze().
5820          */
5821
5822         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5823         i915_gem_drain_freed_objects(i915);
5824
5825         mutex_lock(&i915->drm.struct_mutex);
5826         for (phase = phases; *phase; phase++) {
5827                 list_for_each_entry(obj, *phase, mm.link)
5828                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5829         }
5830         mutex_unlock(&i915->drm.struct_mutex);
5831
5832         return 0;
5833 }
5834
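/**
 * i915_gem_release - per-client GEM teardown on file close
 * @dev: drm device
 * @file: drm file being released
 *
 * Sever the request->file_priv back-pointers so that request retirement
 * after this point cannot dereference the about-to-be-freed file_priv.
 */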
5835 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5836 {
5837         struct drm_i915_file_private *file_priv = file->driver_priv;
5838         struct i915_request *request;
5839
5840         /* Clean up our request list when the client is going away, so that
5841          * later retire_requests won't dereference our soon-to-be-gone
5842          * file_priv.
5843          */
5844         spin_lock(&file_priv->mm.lock);
5845         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5846                 request->file_priv = NULL;
5847         spin_unlock(&file_priv->mm.lock);
5848 }
5849
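/**
 * i915_gem_open - allocate per-client GEM state
 * @i915: i915 device instance
 * @file: drm file of the new client
 *
 * Allocate the drm_i915_file_private, initialize its request list,
 * BSD engine selection and hang timestamp, and open the client's GEM
 * context state via i915_gem_context_open().
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error
 * returned by i915_gem_context_open().
 */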
5850 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5851 {
5852         struct drm_i915_file_private *file_priv;
5853         int ret;
5854
5855         DRM_DEBUG("\n");
5856
5857         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5858         if (!file_priv)
5859                 return -ENOMEM;
5860
5861         file->driver_priv = file_priv;
5862         file_priv->dev_priv = i915;
5863         file_priv->file = file;
5864
5865         spin_lock_init(&file_priv->mm.lock);
5866         INIT_LIST_HEAD(&file_priv->mm.request_list);
5867
5868         file_priv->bsd_engine = -1;
5869         file_priv->hang_timestamp = jiffies;
5870
5871         ret = i915_gem_context_open(i915, file);
5872         if (ret)
5873                 kfree(file_priv);
5874
5875         return ret;
5876 }
5877
5878 /**
5879  * i915_gem_track_fb - update frontbuffer tracking
5880  * @old: current GEM buffer for the frontbuffer slots
5881  * @new: new GEM buffer for the frontbuffer slots
5882  * @frontbuffer_bits: bitmask of frontbuffer slots
5883  *
5884  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5885  * from @old and setting them in @new. Both @old and @new can be NULL.
5886  */
5887 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5888                        struct drm_i915_gem_object *new,
5889                        unsigned frontbuffer_bits)
5890 {
5891         /* Control of individual bits within the mask is guarded by
5892          * the owning plane->mutex, i.e. we can never see concurrent
5893          * manipulation of individual bits. But since the bitfield as a whole
5894          * is updated using RMW, we need to use atomics in order to update
5895          * the bits.
5896          */
5897         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5898                      sizeof(atomic_t) * BITS_PER_BYTE);
5899
5900         if (old) {
5901                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5902                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5903         }
5904
5905         if (new) {
5906                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5907                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5908         }
5909 }
5910
5911 /* Allocate a new GEM object and fill it with the supplied data */
5912 struct drm_i915_gem_object *
5913 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5914                                  const void *data, size_t size)
5915 {
5916         struct drm_i915_gem_object *obj;
5917         struct file *file;
5918         size_t offset;
5919         int err;
5920
5921         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5922         if (IS_ERR(obj))
5923                 return obj;
5924
5925         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5926
5927         file = obj->base.filp;
5928         offset = 0;
5929         do {
5930                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5931                 struct page *page;
5932                 void *pgdata, *vaddr;
5933
5934                 err = pagecache_write_begin(file, file->f_mapping,
5935                                             offset, len, 0,
5936                                             &page, &pgdata);
5937                 if (err < 0)
5938                         goto fail;
5939
5940                 vaddr = kmap(page);
5941                 memcpy(vaddr, data, len);
5942                 kunmap(page);
5943
5944                 err = pagecache_write_end(file, file->f_mapping,
5945                                           offset, len, len,
5946                                           page, pgdata);
5947                 if (err < 0)
5948                         goto fail;
5949
5950                 size -= len;
5951                 data += len;
5952                 offset += len;
5953         } while (size);
5954
5955         return obj;
5956
5957 fail:
5958         i915_gem_object_put(obj);
5959         return ERR_PTR(err);
5960 }
5961
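/**
 * i915_gem_object_get_sg - find the scatterlist entry covering a page
 * @obj: the object; its pages must already be pinned by the caller
 * @n: page index within the object
 * @offset: returns the offset (in pages) of page @n within the entry
 *
 * Walks the object's sg_table to the entry that covers page @n, caching
 * the entries visited in a radixtree so that repeated and backwards
 * lookups do not have to rescan the list. May sleep.
 */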
5962 struct scatterlist *
5963 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5964                        unsigned int n,
5965                        unsigned int *offset)
5966 {
5967         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5968         struct scatterlist *sg;
5969         unsigned int idx, count;
5970
5971         might_sleep();
5972         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5973         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5974
5975         /* As we iterate forward through the sg, we record each entry in a
5976          * radixtree for quick repeated (backwards) lookups. If we have seen
5977          * this index previously, we will have an entry for it.
5978          *
5979          * Initial lookup is O(N), but this is amortized to O(1) for
5980          * sequential page access (where each new request is consecutive
5981          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5982          * i.e. bounded by a small constant in practice.
5983          */
5984         if (n < READ_ONCE(iter->sg_idx))
5985                 goto lookup;
5986
5987         mutex_lock(&iter->lock);
5988
5989         /* We prefer to reuse the last sg so that repeated lookups of this
5990          * (or the subsequent) sg are fast - comparing against the last
5991          * sg is faster than going through the radixtree.
5992          */
5993
5994         sg = iter->sg_pos;
5995         idx = iter->sg_idx;
5996         count = __sg_page_count(sg);
5997
5998         while (idx + count <= n) {
5999                 unsigned long exception, i;
6000                 int ret;
6001
6002                 /* If we cannot allocate and insert this entry, or the
6003                  * individual pages from this range, cancel updating the
6004                  * sg_idx so that on this lookup we are forced to linearly
6005                  * scan onwards, but on future lookups we will try the
6006                  * insertion again (in which case we need to be careful of
6007                  * the error return reporting that we have already inserted
6008                  * this index).
6009                  */
6010                 ret = radix_tree_insert(&iter->radix, idx, sg);
6011                 if (ret && ret != -EEXIST)
6012                         goto scan;
6013
6014                 exception =
6015                         RADIX_TREE_EXCEPTIONAL_ENTRY |
6016                         idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
6017                 for (i = 1; i < count; i++) {
6018                         ret = radix_tree_insert(&iter->radix, idx + i,
6019                                                 (void *)exception);
6020                         if (ret && ret != -EEXIST)
6021                                 goto scan;
6022                 }
6023
6024                 idx += count;
6025                 sg = ____sg_next(sg);
6026                 count = __sg_page_count(sg);
6027         }
6028
6029 scan:
6030         iter->sg_pos = sg;
6031         iter->sg_idx = idx;
6032
6033         mutex_unlock(&iter->lock);
6034
6035         if (unlikely(n < idx)) /* insertion completed by another thread */
6036                 goto lookup;
6037
6038         /* In case we failed to insert the entry into the radixtree, we need
6039          * to look beyond the current sg.
6040          */
6041         while (idx + count <= n) {
6042                 idx += count;
6043                 sg = ____sg_next(sg);
6044                 count = __sg_page_count(sg);
6045         }
6046
6047         *offset = n - idx;
6048         return sg;
6049
6050 lookup:
6051         rcu_read_lock();
6052
6053         sg = radix_tree_lookup(&iter->radix, n);
6054         GEM_BUG_ON(!sg);
6055
6056         /* If this index is in the middle of a multi-page sg entry,
6057          * the radixtree will contain an exceptional entry that points
6058          * to the start of that range. We will return the pointer to
6059          * the base page and the offset of this page within the
6060          * sg entry's range.
6061          */
6062         *offset = 0;
6063         if (unlikely(radix_tree_exception(sg))) {
6064                 unsigned long base =
6065                         (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
6066
6067                 sg = radix_tree_lookup(&iter->radix, base);
6068                 GEM_BUG_ON(!sg);
6069
6070                 *offset = n - base;
6071         }
6072
6073         rcu_read_unlock();
6074
6075         return sg;
6076 }
6077
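/*
 * Return the struct page backing page index @n of the object; only
 * valid for objects that have struct pages and only while the pages
 * are pinned, e.g. (a sketch only, error handling omitted):
 *
 *	if (i915_gem_object_pin_pages(obj) == 0) {
 *		struct page *page = i915_gem_object_get_page(obj, n);
 *		... use page ...
 *		i915_gem_object_unpin_pages(obj);
 *	}
 */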
6078 struct page *
6079 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
6080 {
6081         struct scatterlist *sg;
6082         unsigned int offset;
6083
6084         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
6085
6086         sg = i915_gem_object_get_sg(obj, n, &offset);
6087         return nth_page(sg_page(sg), offset);
6088 }
6089
6090 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
6091 struct page *
6092 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
6093                                unsigned int n)
6094 {
6095         struct page *page;
6096
6097         page = i915_gem_object_get_page(obj, n);
6098         if (!obj->mm.dirty)
6099                 set_page_dirty(page);
6100
6101         return page;
6102 }
6103
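/*
 * Return the DMA (bus) address of page index @n of the object, taken
 * from the object's dma-mapped sg_table; like the lookups above, this
 * requires the object's pages to be pinned.
 */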
6104 dma_addr_t
6105 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
6106                                 unsigned long n)
6107 {
6108         struct scatterlist *sg;
6109         unsigned int offset;
6110
6111         sg = i915_gem_object_get_sg(obj, n, &offset);
6112         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
6113 }
6114
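/*
 * i915_gem_object_attach_phys - switch an object over to i915_gem_phys_ops
 *
 * Only a default (i915_gem_object_ops) object that is unbound, not
 * mapped, not quirked and still marked WILLNEED can be converted. Its
 * current backing pages are released and a fresh set is acquired via
 * the phys ops, then pinned until the object is released.
 */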
6115 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
6116 {
6117         struct sg_table *pages;
6118         int err;
6119
6120         if (align > obj->base.size)
6121                 return -EINVAL;
6122
6123         if (obj->ops == &i915_gem_phys_ops)
6124                 return 0;
6125
6126         if (obj->ops != &i915_gem_object_ops)
6127                 return -EINVAL;
6128
6129         err = i915_gem_object_unbind(obj);
6130         if (err)
6131                 return err;
6132
6133         mutex_lock(&obj->mm.lock);
6134
6135         if (obj->mm.madv != I915_MADV_WILLNEED) {
6136                 err = -EFAULT;
6137                 goto err_unlock;
6138         }
6139
6140         if (obj->mm.quirked) {
6141                 err = -EFAULT;
6142                 goto err_unlock;
6143         }
6144
6145         if (obj->mm.mapping) {
6146                 err = -EBUSY;
6147                 goto err_unlock;
6148         }
6149
6150         pages = __i915_gem_object_unset_pages(obj);
6151
6152         obj->ops = &i915_gem_phys_ops;
6153
6154         err = ____i915_gem_object_get_pages(obj);
6155         if (err)
6156                 goto err_xfer;
6157
6158         /* Perma-pin (until release) the physical set of pages */
6159         __i915_gem_object_pin_pages(obj);
6160
6161         if (!IS_ERR_OR_NULL(pages))
6162                 i915_gem_object_ops.put_pages(obj, pages);
6163         mutex_unlock(&obj->mm.lock);
6164         return 0;
6165
6166 err_xfer:
6167         obj->ops = &i915_gem_object_ops;
6168         if (!IS_ERR_OR_NULL(pages)) {
6169                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
6170
6171                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
6172         }
6173 err_unlock:
6174         mutex_unlock(&obj->mm.lock);
6175         return err;
6176 }
6177
6178 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6179 #include "selftests/scatterlist.c"
6180 #include "selftests/mock_gem_device.c"
6181 #include "selftests/huge_gem_object.c"
6182 #include "selftests/huge_pages.c"
6183 #include "selftests/i915_gem_object.c"
6184 #include "selftests/i915_gem_coherency.c"
6185 #endif