drivers/gpu/drm/i915/i915_gem.c

   1 /*
   2  * Copyright © 2008-2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include <drm/drm_vma_manager.h>
  29 #include <drm/i915_drm.h>
  30 #include <linux/dma-fence-array.h>
  31 #include <linux/kthread.h>
  32 #include <linux/reservation.h>
  33 #include <linux/shmem_fs.h>
  34 #include <linux/slab.h>
  35 #include <linux/stop_machine.h>
  36 #include <linux/swap.h>
  37 #include <linux/pci.h>
  38 #include <linux/dma-buf.h>
  39
  40 #include "i915_drv.h"
  41 #include "i915_gem_clflush.h"
  42 #include "i915_gemfs.h"
  43 #include "i915_reset.h"
  44 #include "i915_trace.h"
  45 #include "i915_vgpu.h"
  46
  47 #include "intel_drv.h"
  48 #include "intel_frontbuffer.h"
  49 #include "intel_mocs.h"
  50 #include "intel_workarounds.h"
  51
  52 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
  53
  54 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
  55 {
  56         if (obj->cache_dirty)
  57                 return false;
  58
  59         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
  60                 return true;
  61
  62         return obj->pin_global; /* currently in use by HW, keep flushed */
  63 }
  64
  65 static int
  66 insert_mappable_node(struct i915_ggtt *ggtt,
  67                      struct drm_mm_node *node, u32 size)
  68 {
  69         memset(node, 0, sizeof(*node));
  70         return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
  71                                            size, 0, I915_COLOR_UNEVICTABLE,
  72                                            0, ggtt->mappable_end,
  73                                            DRM_MM_INSERT_LOW);
  74 }
  75
  76 static void
  77 remove_mappable_node(struct drm_mm_node *node)
  78 {
  79         drm_mm_remove_node(node);
  80 }
  81
  82 /* some bookkeeping */
  83 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
  84                                   u64 size)
  85 {
  86         spin_lock(&dev_priv->mm.object_stat_lock);
  87         dev_priv->mm.object_count++;
  88         dev_priv->mm.object_memory += size;
  89         spin_unlock(&dev_priv->mm.object_stat_lock);
  90 }
  91
  92 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
  93                                      u64 size)
  94 {
  95         spin_lock(&dev_priv->mm.object_stat_lock);
  96         dev_priv->mm.object_count--;
  97         dev_priv->mm.object_memory -= size;
  98         spin_unlock(&dev_priv->mm.object_stat_lock);
  99 }
 100
 101 static int
 102 i915_gem_wait_for_error(struct i915_gpu_error *error)
 103 {
 104         int ret;
 105
 106         might_sleep();
 107
 108         /*
 109          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 110          * userspace. If it takes that long something really bad is going on and
 111          * we should simply try to bail out and fail as gracefully as possible.
 112          */
 113         ret = wait_event_interruptible_timeout(error->reset_queue,
 114                                                !i915_reset_backoff(error),
 115                                                I915_RESET_TIMEOUT);
 116         if (ret == 0) {
 117                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 118                 return -EIO;
 119         } else if (ret < 0) {
 120                 return ret;
 121         } else {
 122                 return 0;
 123         }
 124 }
 125
 126 int i915_mutex_lock_interruptible(struct drm_device *dev)
 127 {
 128         struct drm_i915_private *dev_priv = to_i915(dev);
 129         int ret;
 130
 131         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
 132         if (ret)
 133                 return ret;
 134
 135         ret = mutex_lock_interruptible(&dev->struct_mutex);
 136         if (ret)
 137                 return ret;
 138
 139         return 0;
 140 }
 141
 142 static u32 __i915_gem_park(struct drm_i915_private *i915)
 143 {
 144         intel_wakeref_t wakeref;
 145
 146         GEM_TRACE("\n");
 147
 148         lockdep_assert_held(&i915->drm.struct_mutex);
 149         GEM_BUG_ON(i915->gt.active_requests);
 150         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
 151
 152         if (!i915->gt.awake)
 153                 return I915_EPOCH_INVALID;
 154
 155         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
 156
 157         /*
 158          * Be paranoid and flush a concurrent interrupt to make sure
 159          * we don't reactivate any irq tasklets after parking.
 160          *
 161          * FIXME: Note that even though we have waited for execlists to be idle,
 162          * there may still be an in-flight interrupt even though the CSB
 163          * is now empty. synchronize_irq() makes sure that a residual interrupt
 164          * is completed before we continue, but it doesn't prevent the HW from
 165          * raising a spurious interrupt later. To complete the shield we should
 166          * coordinate disabling the CS irq with flushing the interrupts.
 167          */
 168         synchronize_irq(i915->drm.irq);
 169
 170         intel_engines_park(i915);
 171         i915_timelines_park(i915);
 172
 173         i915_pmu_gt_parked(i915);
 174         i915_vma_parked(i915);
 175
 176         wakeref = fetch_and_zero(&i915->gt.awake);
 177         GEM_BUG_ON(!wakeref);
 178
 179         if (INTEL_GEN(i915) >= 6)
 180                 gen6_rps_idle(i915);
 181
 182         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ, wakeref);
 183
 184         return i915->gt.epoch;
 185 }
 186
 187 void i915_gem_park(struct drm_i915_private *i915)
 188 {
 189         GEM_TRACE("\n");
 190
 191         lockdep_assert_held(&i915->drm.struct_mutex);
 192         GEM_BUG_ON(i915->gt.active_requests);
 193
 194         if (!i915->gt.awake)
 195                 return;
 196
 197         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
 198         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
 199 }
 200
 201 void i915_gem_unpark(struct drm_i915_private *i915)
 202 {
 203         GEM_TRACE("\n");
 204
 205         lockdep_assert_held(&i915->drm.struct_mutex);
 206         GEM_BUG_ON(!i915->gt.active_requests);
 207         assert_rpm_wakelock_held(i915);
 208
 209         if (i915->gt.awake)
 210                 return;
 211
 212         /*
 213          * It seems that the DMC likes to transition between the DC states a lot
 214          * when there are no connected displays (no active power domains) during
 215          * command submission.
 216          *
 217          * This activity has negative impact on the performance of the chip with
 218          * huge latencies observed in the interrupt handler and elsewhere.
 219          *
 220          * Work around it by grabbing a GT IRQ power domain whilst there is any
 221          * GT activity, preventing any DC state transitions.
 222          */
 223         i915->gt.awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
 224         GEM_BUG_ON(!i915->gt.awake);
 225
 226         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
 227                 i915->gt.epoch = 1;
 228
 229         intel_enable_gt_powersave(i915);
 230         i915_update_gfx_val(i915);
 231         if (INTEL_GEN(i915) >= 6)
 232                 gen6_rps_busy(i915);
 233         i915_pmu_gt_unparked(i915);
 234
 235         intel_engines_unpark(i915);
 236
 237         i915_queue_hangcheck(i915);
 238
 239         queue_delayed_work(i915->wq,
 240                            &i915->gt.retire_work,
 241                            round_jiffies_up_relative(HZ));
 242 }
 243
 244 int
 245 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 246                             struct drm_file *file)
 247 {
 248         struct drm_i915_private *dev_priv = to_i915(dev);
 249         struct i915_ggtt *ggtt = &dev_priv->ggtt;
 250         struct drm_i915_gem_get_aperture *args = data;
 251         struct i915_vma *vma;
 252         u64 pinned;
 253
 254         pinned = ggtt->vm.reserved;
 255         mutex_lock(&dev->struct_mutex);
 256         list_for_each_entry(vma, &ggtt->vm.active_list, vm_link)
 257                 if (i915_vma_is_pinned(vma))
 258                         pinned += vma->node.size;
 259         list_for_each_entry(vma, &ggtt->vm.inactive_list, vm_link)
 260                 if (i915_vma_is_pinned(vma))
 261                         pinned += vma->node.size;
 262         mutex_unlock(&dev->struct_mutex);
 263
 264         args->aper_size = ggtt->vm.total;
 265         args->aper_available_size = args->aper_size - pinned;
 266
 267         return 0;
 268 }
 269
 270 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 271 {
 272         struct address_space *mapping = obj->base.filp->f_mapping;
 273         drm_dma_handle_t *phys;
 274         struct sg_table *st;
 275         struct scatterlist *sg;
 276         char *vaddr;
 277         int i;
 278         int err;
 279
 280         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
 281                 return -EINVAL;
 282
 283         /* Always aligning to the object size, allows a single allocation
 284          * to handle all possible callers, and given typical object sizes,
 285          * the alignment of the buddy allocation will naturally match.
 286          */
 287         phys = drm_pci_alloc(obj->base.dev,
 288                              roundup_pow_of_two(obj->base.size),
 289                              roundup_pow_of_two(obj->base.size));
 290         if (!phys)
 291                 return -ENOMEM;
 292
 293         vaddr = phys->vaddr;
 294         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 295                 struct page *page;
 296                 char *src;
 297
 298                 page = shmem_read_mapping_page(mapping, i);
 299                 if (IS_ERR(page)) {
 300                         err = PTR_ERR(page);
 301                         goto err_phys;
 302                 }
 303
 304                 src = kmap_atomic(page);
 305                 memcpy(vaddr, src, PAGE_SIZE);
 306                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
 307                 kunmap_atomic(src);
 308
 309                 put_page(page);
 310                 vaddr += PAGE_SIZE;
 311         }
 312
 313         i915_gem_chipset_flush(to_i915(obj->base.dev));
 314
 315         st = kmalloc(sizeof(*st), GFP_KERNEL);
 316         if (!st) {
 317                 err = -ENOMEM;
 318                 goto err_phys;
 319         }
 320
 321         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
 322                 kfree(st);
 323                 err = -ENOMEM;
 324                 goto err_phys;
 325         }
 326
 327         sg = st->sgl;
 328         sg->offset = 0;
 329         sg->length = obj->base.size;
 330
 331         sg_dma_address(sg) = phys->busaddr;
 332         sg_dma_len(sg) = obj->base.size;
 333
 334         obj->phys_handle = phys;
 335
 336         __i915_gem_object_set_pages(obj, st, sg->length);
 337
 338         return 0;
 339
 340 err_phys:
 341         drm_pci_free(obj->base.dev, phys);
 342
 343         return err;
 344 }
 345
 346 static void __start_cpu_write(struct drm_i915_gem_object *obj)
 347 {
 348         obj->read_domains = I915_GEM_DOMAIN_CPU;
 349         obj->write_domain = I915_GEM_DOMAIN_CPU;
 350         if (cpu_write_needs_clflush(obj))
 351                 obj->cache_dirty = true;
 352 }
 353
 354 static void
 355 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 356                                 struct sg_table *pages,
 357                                 bool needs_clflush)
 358 {
 359         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
 360
 361         if (obj->mm.madv == I915_MADV_DONTNEED)
 362                 obj->mm.dirty = false;
 363
 364         if (needs_clflush &&
 365             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
 366             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
 367                 drm_clflush_sg(pages);
 368
 369         __start_cpu_write(obj);
 370 }
 371
 372 static void
 373 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
 374                                struct sg_table *pages)
 375 {
 376         __i915_gem_object_release_shmem(obj, pages, false);
 377
 378         if (obj->mm.dirty) {
 379                 struct address_space *mapping = obj->base.filp->f_mapping;
 380                 char *vaddr = obj->phys_handle->vaddr;
 381                 int i;
 382
 383                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 384                         struct page *page;
 385                         char *dst;
 386
 387                         page = shmem_read_mapping_page(mapping, i);
 388                         if (IS_ERR(page))
 389                                 continue;
 390
 391                         dst = kmap_atomic(page);
 392                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
 393                         memcpy(dst, vaddr, PAGE_SIZE);
 394                         kunmap_atomic(dst);
 395
 396                         set_page_dirty(page);
 397                         if (obj->mm.madv == I915_MADV_WILLNEED)
 398                                 mark_page_accessed(page);
 399                         put_page(page);
 400                         vaddr += PAGE_SIZE;
 401                 }
 402                 obj->mm.dirty = false;
 403         }
 404
 405         sg_free_table(pages);
 406         kfree(pages);
 407
 408         drm_pci_free(obj->base.dev, obj->phys_handle);
 409 }
 410
 411 static void
 412 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
 413 {
 414         i915_gem_object_unpin_pages(obj);
 415 }
 416
 417 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
 418         .get_pages = i915_gem_object_get_pages_phys,
 419         .put_pages = i915_gem_object_put_pages_phys,
 420         .release = i915_gem_object_release_phys,
 421 };
 422
 423 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
 424
 425 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 426 {
 427         struct i915_vma *vma;
 428         LIST_HEAD(still_in_list);
 429         int ret;
 430
 431         lockdep_assert_held(&obj->base.dev->struct_mutex);
 432
 433         /* Closed vma are removed from the obj->vma_list - but they may
 434          * still have an active binding on the object. To remove those we
 435          * must wait for all rendering to complete to the object (as unbinding
 436          * must anyway), and retire the requests.
 437          */
 438         ret = i915_gem_object_set_to_cpu_domain(obj, false);
 439         if (ret)
 440                 return ret;
 441
 442         while ((vma = list_first_entry_or_null(&obj->vma_list,
 443                                                struct i915_vma,
 444                                                obj_link))) {
 445                 list_move_tail(&vma->obj_link, &still_in_list);
 446                 ret = i915_vma_unbind(vma);
 447                 if (ret)
 448                         break;
 449         }
 450         list_splice(&still_in_list, &obj->vma_list);
 451
 452         return ret;
 453 }
 454
 455 static long
 456 i915_gem_object_wait_fence(struct dma_fence *fence,
 457                            unsigned int flags,
 458                            long timeout,
 459                            struct intel_rps_client *rps_client)
 460 {
 461         struct i915_request *rq;
 462
 463         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
 464
 465         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 466                 return timeout;
 467
 468         if (!dma_fence_is_i915(fence))
 469                 return dma_fence_wait_timeout(fence,
 470                                               flags & I915_WAIT_INTERRUPTIBLE,
 471                                               timeout);
 472
 473         rq = to_request(fence);
 474         if (i915_request_completed(rq))
 475                 goto out;
 476
 477         /*
 478          * This client is about to stall waiting for the GPU. In many cases
 479          * this is undesirable and limits the throughput of the system, as
 480          * many clients cannot continue processing user input/output whilst
 481          * blocked. RPS autotuning may take tens of milliseconds to respond
 482          * to the GPU load and thus incurs additional latency for the client.
 483          * We can circumvent that by promoting the GPU frequency to maximum
 484          * before we wait. This makes the GPU throttle up much more quickly
 485          * (good for benchmarks and user experience, e.g. window animations),
 486          * but at a cost of spending more power processing the workload
 487          * (bad for battery). Not all clients even want their results
 488          * immediately and for them we should just let the GPU select its own
 489          * frequency to maximise efficiency. To prevent a single client from
 490          * forcing the clocks too high for the whole system, we only allow
 491          * each client to waitboost once in a busy period.
 492          */
 493         if (rps_client && !i915_request_started(rq)) {
 494                 if (INTEL_GEN(rq->i915) >= 6)
 495                         gen6_rps_boost(rq, rps_client);
 496         }
 497
 498         timeout = i915_request_wait(rq, flags, timeout);
 499
 500 out:
 501         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
 502                 i915_request_retire_upto(rq);
 503
 504         return timeout;
 505 }
 506
 507 static long
 508 i915_gem_object_wait_reservation(struct reservation_object *resv,
 509                                  unsigned int flags,
 510                                  long timeout,
 511                                  struct intel_rps_client *rps_client)
 512 {
 513         unsigned int seq = __read_seqcount_begin(&resv->seq);
 514         struct dma_fence *excl;
 515         bool prune_fences = false;
 516
 517         if (flags & I915_WAIT_ALL) {
 518                 struct dma_fence **shared;
 519                 unsigned int count, i;
 520                 int ret;
 521
 522                 ret = reservation_object_get_fences_rcu(resv,
 523                                                         &excl, &count, &shared);
 524                 if (ret)
 525                         return ret;
 526
 527                 for (i = 0; i < count; i++) {
 528                         timeout = i915_gem_object_wait_fence(shared[i],
 529                                                              flags, timeout,
 530                                                              rps_client);
 531                         if (timeout < 0)
 532                                 break;
 533
 534                         dma_fence_put(shared[i]);
 535                 }
 536
 537                 for (; i < count; i++)
 538                         dma_fence_put(shared[i]);
 539                 kfree(shared);
 540
 541                 /*
 542                  * If both shared fences and an exclusive fence exist,
 543                  * then by construction the shared fences must be later
 544                  * than the exclusive fence. If we successfully wait for
 545                  * all the shared fences, we know that the exclusive fence
 546                  * must all be signaled. If all the shared fences are
 547                  * signaled, we can prune the array and recover the
 548                  * floating references on the fences/requests.
 549                  */
 550                 prune_fences = count && timeout >= 0;
 551         } else {
 552                 excl = reservation_object_get_excl_rcu(resv);
 553         }
 554
 555         if (excl && timeout >= 0)
 556                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
 557                                                      rps_client);
 558
 559         dma_fence_put(excl);
 560
 561         /*
 562          * Opportunistically prune the fences iff we know they have *all* been
 563          * signaled and that the reservation object has not been changed (i.e.
 564          * no new fences have been added).
 565          */
 566         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
 567                 if (reservation_object_trylock(resv)) {
 568                         if (!__read_seqcount_retry(&resv->seq, seq))
 569                                 reservation_object_add_excl_fence(resv, NULL);
 570                         reservation_object_unlock(resv);
 571                 }
 572         }
 573
 574         return timeout;
 575 }
 576
 577 static void __fence_set_priority(struct dma_fence *fence,
 578                                  const struct i915_sched_attr *attr)
 579 {
 580         struct i915_request *rq;
 581         struct intel_engine_cs *engine;
 582
 583         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
 584                 return;
 585
 586         rq = to_request(fence);
 587         engine = rq->engine;
 588
 589         local_bh_disable();
 590         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
 591         if (engine->schedule)
 592                 engine->schedule(rq, attr);
 593         rcu_read_unlock();
 594         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
 595 }
 596
 597 static void fence_set_priority(struct dma_fence *fence,
 598                                const struct i915_sched_attr *attr)
 599 {
 600         /* Recurse once into a fence-array */
 601         if (dma_fence_is_array(fence)) {
 602                 struct dma_fence_array *array = to_dma_fence_array(fence);
 603                 int i;
 604
 605                 for (i = 0; i < array->num_fences; i++)
 606                         __fence_set_priority(array->fences[i], attr);
 607         } else {
 608                 __fence_set_priority(fence, attr);
 609         }
 610 }
 611
 612 int
 613 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
 614                               unsigned int flags,
 615                               const struct i915_sched_attr *attr)
 616 {
 617         struct dma_fence *excl;
 618
 619         if (flags & I915_WAIT_ALL) {
 620                 struct dma_fence **shared;
 621                 unsigned int count, i;
 622                 int ret;
 623
 624                 ret = reservation_object_get_fences_rcu(obj->resv,
 625                                                         &excl, &count, &shared);
 626                 if (ret)
 627                         return ret;
 628
 629                 for (i = 0; i < count; i++) {
 630                         fence_set_priority(shared[i], attr);
 631                         dma_fence_put(shared[i]);
 632                 }
 633
 634                 kfree(shared);
 635         } else {
 636                 excl = reservation_object_get_excl_rcu(obj->resv);
 637         }
 638
 639         if (excl) {
 640                 fence_set_priority(excl, attr);
 641                 dma_fence_put(excl);
 642         }
 643         return 0;
 644 }
 645
 646 /**
 647  * Waits for rendering to the object to be completed
 648  * @obj: i915 gem object
 649  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
 650  * @timeout: how long to wait
 651  * @rps_client: client (user process) to charge for any waitboosting
 652  */
 653 int
 654 i915_gem_object_wait(struct drm_i915_gem_object *obj,
 655                      unsigned int flags,
 656                      long timeout,
 657                      struct intel_rps_client *rps_client)
 658 {
 659         might_sleep();
 660 #if IS_ENABLED(CONFIG_LOCKDEP)
 661         GEM_BUG_ON(debug_locks &&
 662                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
 663                    !!(flags & I915_WAIT_LOCKED));
 664 #endif
 665         GEM_BUG_ON(timeout < 0);
 666
 667         timeout = i915_gem_object_wait_reservation(obj->resv,
 668                                                    flags, timeout,
 669                                                    rps_client);
 670         return timeout < 0 ? timeout : 0;
 671 }
 672
 673 static struct intel_rps_client *to_rps_client(struct drm_file *file)
 674 {
 675         struct drm_i915_file_private *fpriv = file->driver_priv;
 676
 677         return &fpriv->rps_client;
 678 }
 679
 680 static int
 681 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
 682                      struct drm_i915_gem_pwrite *args,
 683                      struct drm_file *file)
 684 {
 685         void *vaddr = obj->phys_handle->vaddr + args->offset;
 686         char __user *user_data = u64_to_user_ptr(args->data_ptr);
 687
 688         /* We manually control the domain here and pretend that it
 689          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
 690          */
 691         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 692         if (copy_from_user(vaddr, user_data, args->size))
 693                 return -EFAULT;
 694
 695         drm_clflush_virt_range(vaddr, args->size);
 696         i915_gem_chipset_flush(to_i915(obj->base.dev));
 697
 698         intel_fb_obj_flush(obj, ORIGIN_CPU);
 699         return 0;
 700 }
 701
 702 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
 703 {
 704         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
 705 }
 706
 707 void i915_gem_object_free(struct drm_i915_gem_object *obj)
 708 {
 709         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 710         kmem_cache_free(dev_priv->objects, obj);
 711 }
 712
 713 static int
 714 i915_gem_create(struct drm_file *file,
 715                 struct drm_i915_private *dev_priv,
 716                 uint64_t size,
 717                 uint32_t *handle_p)
 718 {
 719         struct drm_i915_gem_object *obj;
 720         int ret;
 721         u32 handle;
 722
 723         size = roundup(size, PAGE_SIZE);
 724         if (size == 0)
 725                 return -EINVAL;
 726
 727         /* Allocate the new object */
 728         obj = i915_gem_object_create(dev_priv, size);
 729         if (IS_ERR(obj))
 730                 return PTR_ERR(obj);
 731
 732         ret = drm_gem_handle_create(file, &obj->base, &handle);
 733         /* drop reference from allocate - handle holds it now */
 734         i915_gem_object_put(obj);
 735         if (ret)
 736                 return ret;
 737
 738         *handle_p = handle;
 739         return 0;
 740 }
 741
 742 int
 743 i915_gem_dumb_create(struct drm_file *file,
 744                      struct drm_device *dev,
 745                      struct drm_mode_create_dumb *args)
 746 {
 747         /* have to work out size/pitch and return them */
 748         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
 749         args->size = args->pitch * args->height;
 750         return i915_gem_create(file, to_i915(dev),
 751                                args->size, &args->handle);
 752 }
 753
 754 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 755 {
 756         return !(obj->cache_level == I915_CACHE_NONE ||
 757                  obj->cache_level == I915_CACHE_WT);
 758 }
 759
 760 /**
 761  * Creates a new mm object and returns a handle to it.
 762  * @dev: drm device pointer
 763  * @data: ioctl data blob
 764  * @file: drm file pointer
 765  */
 766 int
 767 i915_gem_create_ioctl(struct drm_device *dev, void *data,
 768                       struct drm_file *file)
 769 {
 770         struct drm_i915_private *dev_priv = to_i915(dev);
 771         struct drm_i915_gem_create *args = data;
 772
 773         i915_gem_flush_free_objects(dev_priv);
 774
 775         return i915_gem_create(file, dev_priv,
 776                                args->size, &args->handle);
 777 }
 778
 779 static inline enum fb_op_origin
 780 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
 781 {
 782         return (domain == I915_GEM_DOMAIN_GTT ?
 783                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
 784 }
 785
 786 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
 787 {
 788         intel_wakeref_t wakeref;
 789
 790         /*
 791          * No actual flushing is required for the GTT write domain for reads
 792          * from the GTT domain. Writes to it "immediately" go to main memory
 793          * as far as we know, so there's no chipset flush. It also doesn't
 794          * land in the GPU render cache.
 795          *
 796          * However, we do have to enforce the order so that all writes through
 797          * the GTT land before any writes to the device, such as updates to
 798          * the GATT itself.
 799          *
 800          * We also have to wait a bit for the writes to land from the GTT.
 801          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
 802          * timing. This issue has only been observed when switching quickly
 803          * between GTT writes and CPU reads from inside the kernel on recent hw,
 804          * and it appears to only affect discrete GTT blocks (i.e. on LLC
 805          * system agents we cannot reproduce this behaviour, until Cannonlake
 806          * that was!).
 807          */
 808
 809         wmb();
 810
 811         if (INTEL_INFO(dev_priv)->has_coherent_ggtt)
 812                 return;
 813
 814         i915_gem_chipset_flush(dev_priv);
 815
 816         with_intel_runtime_pm(dev_priv, wakeref) {
 817                 spin_lock_irq(&dev_priv->uncore.lock);
 818
 819                 POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
 820
 821                 spin_unlock_irq(&dev_priv->uncore.lock);
 822         }
 823 }
 824
 825 static void
 826 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 827 {
 828         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 829         struct i915_vma *vma;
 830
 831         if (!(obj->write_domain & flush_domains))
 832                 return;
 833
 834         switch (obj->write_domain) {
 835         case I915_GEM_DOMAIN_GTT:
 836                 i915_gem_flush_ggtt_writes(dev_priv);
 837
 838                 intel_fb_obj_flush(obj,
 839                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
 840
 841                 for_each_ggtt_vma(vma, obj) {
 842                         if (vma->iomap)
 843                                 continue;
 844
 845                         i915_vma_unset_ggtt_write(vma);
 846                 }
 847                 break;
 848
 849         case I915_GEM_DOMAIN_WC:
 850                 wmb();
 851                 break;
 852
 853         case I915_GEM_DOMAIN_CPU:
 854                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 855                 break;
 856
 857         case I915_GEM_DOMAIN_RENDER:
 858                 if (gpu_write_needs_clflush(obj))
 859                         obj->cache_dirty = true;
 860                 break;
 861         }
 862
 863         obj->write_domain = 0;
 864 }
 865
 866 /*
 867  * Pins the specified object's pages and synchronizes the object with
 868  * GPU accesses. Sets needs_clflush to non-zero if the caller should
 869  * flush the object from the CPU cache.
 870  */
 871 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 872                                     unsigned int *needs_clflush)
 873 {
 874         int ret;
 875
 876         lockdep_assert_held(&obj->base.dev->struct_mutex);
 877
 878         *needs_clflush = 0;
 879         if (!i915_gem_object_has_struct_page(obj))
 880                 return -ENODEV;
 881
 882         ret = i915_gem_object_wait(obj,
 883                                    I915_WAIT_INTERRUPTIBLE |
 884                                    I915_WAIT_LOCKED,
 885                                    MAX_SCHEDULE_TIMEOUT,
 886                                    NULL);
 887         if (ret)
 888                 return ret;
 889
 890         ret = i915_gem_object_pin_pages(obj);
 891         if (ret)
 892                 return ret;
 893
 894         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
 895             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 896                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
 897                 if (ret)
 898                         goto err_unpin;
 899                 else
 900                         goto out;
 901         }
 902
 903         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 904
 905         /* If we're not in the cpu read domain, set ourself into the gtt
 906          * read domain and manually flush cachelines (if required). This
 907          * optimizes for the case when the gpu will dirty the data
 908          * anyway again before the next pread happens.
 909          */
 910         if (!obj->cache_dirty &&
 911             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
 912                 *needs_clflush = CLFLUSH_BEFORE;
 913
 914 out:
 915         /* return with the pages pinned */
 916         return 0;
 917
 918 err_unpin:
 919         i915_gem_object_unpin_pages(obj);
 920         return ret;
 921 }
 922
 923 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 924                                      unsigned int *needs_clflush)
 925 {
 926         int ret;
 927
 928         lockdep_assert_held(&obj->base.dev->struct_mutex);
 929
 930         *needs_clflush = 0;
 931         if (!i915_gem_object_has_struct_page(obj))
 932                 return -ENODEV;
 933
 934         ret = i915_gem_object_wait(obj,
 935                                    I915_WAIT_INTERRUPTIBLE |
 936                                    I915_WAIT_LOCKED |
 937                                    I915_WAIT_ALL,
 938                                    MAX_SCHEDULE_TIMEOUT,
 939                                    NULL);
 940         if (ret)
 941                 return ret;
 942
 943         ret = i915_gem_object_pin_pages(obj);
 944         if (ret)
 945                 return ret;
 946
 947         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
 948             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 949                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
 950                 if (ret)
 951                         goto err_unpin;
 952                 else
 953                         goto out;
 954         }
 955
 956         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 957
 958         /* If we're not in the cpu write domain, set ourself into the
 959          * gtt write domain and manually flush cachelines (as required).
 960          * This optimizes for the case when the gpu will use the data
 961          * right away and we therefore have to clflush anyway.
 962          */
 963         if (!obj->cache_dirty) {
 964                 *needs_clflush |= CLFLUSH_AFTER;
 965
 966                 /*
 967                  * Same trick applies to invalidate partially written
 968                  * cachelines read before writing.
 969                  */
 970                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
 971                         *needs_clflush |= CLFLUSH_BEFORE;
 972         }
 973
 974 out:
 975         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 976         obj->mm.dirty = true;
 977         /* return with the pages pinned */
 978         return 0;
 979
 980 err_unpin:
 981         i915_gem_object_unpin_pages(obj);
 982         return ret;
 983 }
 984
 985 static int
 986 shmem_pread(struct page *page, int offset, int len, char __user *user_data,
 987             bool needs_clflush)
 988 {
 989         char *vaddr;
 990         int ret;
 991
 992         vaddr = kmap(page);
 993
 994         if (needs_clflush)
 995                 drm_clflush_virt_range(vaddr + offset, len);
 996
 997         ret = __copy_to_user(user_data, vaddr + offset, len);
 998
 999         kunmap(page);
1000
1001         return ret ? -EFAULT : 0;
1002 }
1003
1004 static int
1005 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1006                      struct drm_i915_gem_pread *args)
1007 {
1008         char __user *user_data;
1009         u64 remain;
1010         unsigned int needs_clflush;
1011         unsigned int idx, offset;
1012         int ret;
1013
1014         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1015         if (ret)
1016                 return ret;
1017
1018         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1019         mutex_unlock(&obj->base.dev->struct_mutex);
1020         if (ret)
1021                 return ret;
1022
1023         remain = args->size;
1024         user_data = u64_to_user_ptr(args->data_ptr);
1025         offset = offset_in_page(args->offset);
1026         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1027                 struct page *page = i915_gem_object_get_page(obj, idx);
1028                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1029
1030                 ret = shmem_pread(page, offset, length, user_data,
1031                                   needs_clflush);
1032                 if (ret)
1033                         break;
1034
1035                 remain -= length;
1036                 user_data += length;
1037                 offset = 0;
1038         }
1039
1040         i915_gem_obj_finish_shmem_access(obj);
1041         return ret;
1042 }
1043
1044 static inline bool
1045 gtt_user_read(struct io_mapping *mapping,
1046               loff_t base, int offset,
1047               char __user *user_data, int length)
1048 {
1049         void __iomem *vaddr;
1050         unsigned long unwritten;
1051
1052         /* We can use the cpu mem copy function because this is X86. */
1053         vaddr = io_mapping_map_atomic_wc(mapping, base);
1054         unwritten = __copy_to_user_inatomic(user_data,
1055                                             (void __force *)vaddr + offset,
1056                                             length);
1057         io_mapping_unmap_atomic(vaddr);
1058         if (unwritten) {
1059                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1060                 unwritten = copy_to_user(user_data,
1061                                          (void __force *)vaddr + offset,
1062                                          length);
1063                 io_mapping_unmap(vaddr);
1064         }
1065         return unwritten;
1066 }
1067
1068 static int
1069 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1070                    const struct drm_i915_gem_pread *args)
1071 {
1072         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1073         struct i915_ggtt *ggtt = &i915->ggtt;
1074         intel_wakeref_t wakeref;
1075         struct drm_mm_node node;
1076         struct i915_vma *vma;
1077         void __user *user_data;
1078         u64 remain, offset;
1079         int ret;
1080
1081         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1082         if (ret)
1083                 return ret;
1084
1085         wakeref = intel_runtime_pm_get(i915);
1086         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1087                                        PIN_MAPPABLE |
1088                                        PIN_NONFAULT |
1089                                        PIN_NONBLOCK);
1090         if (!IS_ERR(vma)) {
1091                 node.start = i915_ggtt_offset(vma);
1092                 node.allocated = false;
1093                 ret = i915_vma_put_fence(vma);
1094                 if (ret) {
1095                         i915_vma_unpin(vma);
1096                         vma = ERR_PTR(ret);
1097                 }
1098         }
1099         if (IS_ERR(vma)) {
1100                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1101                 if (ret)
1102                         goto out_unlock;
1103                 GEM_BUG_ON(!node.allocated);
1104         }
1105
1106         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1107         if (ret)
1108                 goto out_unpin;
1109
1110         mutex_unlock(&i915->drm.struct_mutex);
1111
1112         user_data = u64_to_user_ptr(args->data_ptr);
1113         remain = args->size;
1114         offset = args->offset;
1115
1116         while (remain > 0) {
1117                 /* Operation in this page
1118                  *
1119                  * page_base = page offset within aperture
1120                  * page_offset = offset within page
1121                  * page_length = bytes to copy for this page
1122                  */
1123                 u32 page_base = node.start;
1124                 unsigned page_offset = offset_in_page(offset);
1125                 unsigned page_length = PAGE_SIZE - page_offset;
1126                 page_length = remain < page_length ? remain : page_length;
1127                 if (node.allocated) {
1128                         wmb();
1129                         ggtt->vm.insert_page(&ggtt->vm,
1130                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1131                                              node.start, I915_CACHE_NONE, 0);
1132                         wmb();
1133                 } else {
1134                         page_base += offset & PAGE_MASK;
1135                 }
1136
1137                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1138                                   user_data, page_length)) {
1139                         ret = -EFAULT;
1140                         break;
1141                 }
1142
1143                 remain -= page_length;
1144                 user_data += page_length;
1145                 offset += page_length;
1146         }
1147
1148         mutex_lock(&i915->drm.struct_mutex);
1149 out_unpin:
1150         if (node.allocated) {
1151                 wmb();
1152                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1153                 remove_mappable_node(&node);
1154         } else {
1155                 i915_vma_unpin(vma);
1156         }
1157 out_unlock:
1158         intel_runtime_pm_put(i915, wakeref);
1159         mutex_unlock(&i915->drm.struct_mutex);
1160
1161         return ret;
1162 }
1163
1164 /**
1165  * Reads data from the object referenced by handle.
1166  * @dev: drm device pointer
1167  * @data: ioctl data blob
1168  * @file: drm file pointer
1169  *
1170  * On error, the contents of *data are undefined.
1171  */
1172 int
1173 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1174                      struct drm_file *file)
1175 {
1176         struct drm_i915_gem_pread *args = data;
1177         struct drm_i915_gem_object *obj;
1178         int ret;
1179
1180         if (args->size == 0)
1181                 return 0;
1182
1183         if (!access_ok(u64_to_user_ptr(args->data_ptr),
1184                        args->size))
1185                 return -EFAULT;
1186
1187         obj = i915_gem_object_lookup(file, args->handle);
1188         if (!obj)
1189                 return -ENOENT;
1190
1191         /* Bounds check source.  */
1192         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1193                 ret = -EINVAL;
1194                 goto out;
1195         }
1196
1197         trace_i915_gem_object_pread(obj, args->offset, args->size);
1198
1199         ret = i915_gem_object_wait(obj,
1200                                    I915_WAIT_INTERRUPTIBLE,
1201                                    MAX_SCHEDULE_TIMEOUT,
1202                                    to_rps_client(file));
1203         if (ret)
1204                 goto out;
1205
1206         ret = i915_gem_object_pin_pages(obj);
1207         if (ret)
1208                 goto out;
1209
1210         ret = i915_gem_shmem_pread(obj, args);
1211         if (ret == -EFAULT || ret == -ENODEV)
1212                 ret = i915_gem_gtt_pread(obj, args);
1213
1214         i915_gem_object_unpin_pages(obj);
1215 out:
1216         i915_gem_object_put(obj);
1217         return ret;
1218 }
1219
1220 /* This is the fast write path which cannot handle
1221  * page faults in the source data
1222  */
1223
1224 static inline bool
1225 ggtt_write(struct io_mapping *mapping,
1226            loff_t base, int offset,
1227            char __user *user_data, int length)
1228 {
1229         void __iomem *vaddr;
1230         unsigned long unwritten;
1231
1232         /* We can use the cpu mem copy function because this is X86. */
1233         vaddr = io_mapping_map_atomic_wc(mapping, base);
1234         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1235                                                       user_data, length);
1236         io_mapping_unmap_atomic(vaddr);
1237         if (unwritten) {
1238                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1239                 unwritten = copy_from_user((void __force *)vaddr + offset,
1240                                            user_data, length);
1241                 io_mapping_unmap(vaddr);
1242         }
1243
1244         return unwritten;
1245 }
1246
1247 /**
1248  * This is the fast pwrite path, where we copy the data directly from the
1249  * user into the GTT, uncached.
1250  * @obj: i915 GEM object
1251  * @args: pwrite arguments structure
1252  */
1253 static int
1254 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1255                          const struct drm_i915_gem_pwrite *args)
1256 {
1257         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1258         struct i915_ggtt *ggtt = &i915->ggtt;
1259         intel_wakeref_t wakeref;
1260         struct drm_mm_node node;
1261         struct i915_vma *vma;
1262         u64 remain, offset;
1263         void __user *user_data;
1264         int ret;
1265
1266         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1267         if (ret)
1268                 return ret;
1269
1270         if (i915_gem_object_has_struct_page(obj)) {
1271                 /*
1272                  * Avoid waking the device up if we can fallback, as
1273                  * waking/resuming is very slow (worst-case 10-100 ms
1274                  * depending on PCI sleeps and our own resume time).
1275                  * This easily dwarfs any performance advantage from
1276                  * using the cache bypass of indirect GGTT access.
1277                  */
1278                 wakeref = intel_runtime_pm_get_if_in_use(i915);
1279                 if (!wakeref) {
1280                         ret = -EFAULT;
1281                         goto out_unlock;
1282                 }
1283         } else {
1284                 /* No backing pages, no fallback, we must force GGTT access */
1285                 wakeref = intel_runtime_pm_get(i915);
1286         }
1287
1288         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1289                                        PIN_MAPPABLE |
1290                                        PIN_NONFAULT |
1291                                        PIN_NONBLOCK);
1292         if (!IS_ERR(vma)) {
1293                 node.start = i915_ggtt_offset(vma);
1294                 node.allocated = false;
1295                 ret = i915_vma_put_fence(vma);
1296                 if (ret) {
1297                         i915_vma_unpin(vma);
1298                         vma = ERR_PTR(ret);
1299                 }
1300         }
1301         if (IS_ERR(vma)) {
1302                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1303                 if (ret)
1304                         goto out_rpm;
1305                 GEM_BUG_ON(!node.allocated);
1306         }
1307
1308         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1309         if (ret)
1310                 goto out_unpin;
1311
1312         mutex_unlock(&i915->drm.struct_mutex);
1313
1314         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1315
1316         user_data = u64_to_user_ptr(args->data_ptr);
1317         offset = args->offset;
1318         remain = args->size;
1319         while (remain) {
1320                 /* Operation in this page
1321                  *
1322                  * page_base = page offset within aperture
1323                  * page_offset = offset within page
1324                  * page_length = bytes to copy for this page
1325                  */
1326                 u32 page_base = node.start;
1327                 unsigned int page_offset = offset_in_page(offset);
1328                 unsigned int page_length = PAGE_SIZE - page_offset;
1329                 page_length = remain < page_length ? remain : page_length;
1330                 if (node.allocated) {
1331                         wmb(); /* flush the write before we modify the GGTT */
1332                         ggtt->vm.insert_page(&ggtt->vm,
1333                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1334                                              node.start, I915_CACHE_NONE, 0);
1335                         wmb(); /* flush modifications to the GGTT (insert_page) */
1336                 } else {
1337                         page_base += offset & PAGE_MASK;
1338                 }
1339                 /* If we get a fault while copying data, then (presumably) our
1340                  * source page isn't available.  Return the error and we'll
1341                  * retry in the slow path.
1342                  * If the object is non-shmem backed, we retry again with the
1343                  * path that handles page fault.
1344                  */
1345                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1346                                user_data, page_length)) {
1347                         ret = -EFAULT;
1348                         break;
1349                 }
1350
1351                 remain -= page_length;
1352                 user_data += page_length;
1353                 offset += page_length;
1354         }
1355         intel_fb_obj_flush(obj, ORIGIN_CPU);
1356
1357         mutex_lock(&i915->drm.struct_mutex);
1358 out_unpin:
1359         if (node.allocated) {
1360                 wmb();
1361                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1362                 remove_mappable_node(&node);
1363         } else {
1364                 i915_vma_unpin(vma);
1365         }
1366 out_rpm:
1367         intel_runtime_pm_put(i915, wakeref);
1368 out_unlock:
1369         mutex_unlock(&i915->drm.struct_mutex);
1370         return ret;
1371 }
1372
1373 /* Per-page copy function for the shmem pwrite fastpath.
1374  * Flushes invalid cachelines before writing to the target if
1375  * needs_clflush_before is set and flushes out any written cachelines after
1376  * writing if needs_clflush is set.
1377  */
1378 static int
1379 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1380              bool needs_clflush_before,
1381              bool needs_clflush_after)
1382 {
1383         char *vaddr;
1384         int ret;
1385
1386         vaddr = kmap(page);
1387
1388         if (needs_clflush_before)
1389                 drm_clflush_virt_range(vaddr + offset, len);
1390
1391         ret = __copy_from_user(vaddr + offset, user_data, len);
1392         if (!ret && needs_clflush_after)
1393                 drm_clflush_virt_range(vaddr + offset, len);
1394
1395         kunmap(page);
1396
1397         return ret ? -EFAULT : 0;
1398 }
1399
1400 static int
1401 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1402                       const struct drm_i915_gem_pwrite *args)
1403 {
1404         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1405         void __user *user_data;
1406         u64 remain;
1407         unsigned int partial_cacheline_write;
1408         unsigned int needs_clflush;
1409         unsigned int offset, idx;
1410         int ret;
1411
1412         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1413         if (ret)
1414                 return ret;
1415
1416         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1417         mutex_unlock(&i915->drm.struct_mutex);
1418         if (ret)
1419                 return ret;
1420
1421         /* If we don't overwrite a cacheline completely we need to be
1422          * careful to have up-to-date data by first clflushing. Don't
1423          * overcomplicate things and flush the entire patch.
1424          */
1425         partial_cacheline_write = 0;
1426         if (needs_clflush & CLFLUSH_BEFORE)
1427                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1428
1429         user_data = u64_to_user_ptr(args->data_ptr);
1430         remain = args->size;
1431         offset = offset_in_page(args->offset);
1432         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1433                 struct page *page = i915_gem_object_get_page(obj, idx);
1434                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1435
1436                 ret = shmem_pwrite(page, offset, length, user_data,
1437                                    (offset | length) & partial_cacheline_write,
1438                                    needs_clflush & CLFLUSH_AFTER);
1439                 if (ret)
1440                         break;
1441
1442                 remain -= length;
1443                 user_data += length;
1444                 offset = 0;
1445         }
1446
1447         intel_fb_obj_flush(obj, ORIGIN_CPU);
1448         i915_gem_obj_finish_shmem_access(obj);
1449         return ret;
1450 }
1451
1452 /**
1453  * Writes data to the object referenced by handle.
1454  * @dev: drm device
1455  * @data: ioctl data blob
1456  * @file: drm file
1457  *
1458  * On error, the contents of the buffer that were to be modified are undefined.
1459  */
1460 int
1461 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1462                       struct drm_file *file)
1463 {
1464         struct drm_i915_gem_pwrite *args = data;
1465         struct drm_i915_gem_object *obj;
1466         int ret;
1467
1468         if (args->size == 0)
1469                 return 0;
1470
1471         if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size))
1472                 return -EFAULT;
1473
1474         obj = i915_gem_object_lookup(file, args->handle);
1475         if (!obj)
1476                 return -ENOENT;
1477
1478         /* Bounds check destination. */
1479         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1480                 ret = -EINVAL;
1481                 goto err;
1482         }
1483
1484         /* Writes not allowed into this read-only object */
1485         if (i915_gem_object_is_readonly(obj)) {
1486                 ret = -EINVAL;
1487                 goto err;
1488         }
1489
1490         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1491
1492         ret = -ENODEV;
1493         if (obj->ops->pwrite)
1494                 ret = obj->ops->pwrite(obj, args);
1495         if (ret != -ENODEV)
1496                 goto err;
1497
1498         ret = i915_gem_object_wait(obj,
1499                                    I915_WAIT_INTERRUPTIBLE |
1500                                    I915_WAIT_ALL,
1501                                    MAX_SCHEDULE_TIMEOUT,
1502                                    to_rps_client(file));
1503         if (ret)
1504                 goto err;
1505
1506         ret = i915_gem_object_pin_pages(obj);
1507         if (ret)
1508                 goto err;
1509
1510         ret = -EFAULT;
1511         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1512          * it would end up going through the fenced access, and we'll get
1513          * different detiling behavior between reading and writing.
1514          * pread/pwrite currently are reading and writing from the CPU
1515          * perspective, requiring manual detiling by the client.
1516          */
1517         if (!i915_gem_object_has_struct_page(obj) ||
1518             cpu_write_needs_clflush(obj))
1519                 /* Note that the gtt paths might fail with non-page-backed user
1520                  * pointers (e.g. gtt mappings when moving data between
1521                  * textures). Fallback to the shmem path in that case.
1522                  */
1523                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1524
1525         if (ret == -EFAULT || ret == -ENOSPC) {
1526                 if (obj->phys_handle)
1527                         ret = i915_gem_phys_pwrite(obj, args, file);
1528                 else
1529                         ret = i915_gem_shmem_pwrite(obj, args);
1530         }
1531
1532         i915_gem_object_unpin_pages(obj);
1533 err:
1534         i915_gem_object_put(obj);
1535         return ret;
1536 }
1537
1538 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1539 {
1540         struct drm_i915_private *i915;
1541         struct list_head *list;
1542         struct i915_vma *vma;
1543
1544         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1545
1546         for_each_ggtt_vma(vma, obj) {
1547                 if (i915_vma_is_active(vma))
1548                         continue;
1549
1550                 if (!drm_mm_node_allocated(&vma->node))
1551                         continue;
1552
1553                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1554         }
1555
1556         i915 = to_i915(obj->base.dev);
1557         spin_lock(&i915->mm.obj_lock);
1558         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1559         list_move_tail(&obj->mm.link, list);
1560         spin_unlock(&i915->mm.obj_lock);
1561 }
1562
1563 /**
1564  * Called when user space prepares to use an object with the CPU, either
1565  * through the mmap ioctl's mapping or a GTT mapping.
1566  * @dev: drm device
1567  * @data: ioctl data blob
1568  * @file: drm file
1569  */
1570 int
1571 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1572                           struct drm_file *file)
1573 {
1574         struct drm_i915_gem_set_domain *args = data;
1575         struct drm_i915_gem_object *obj;
1576         uint32_t read_domains = args->read_domains;
1577         uint32_t write_domain = args->write_domain;
1578         int err;
1579
1580         /* Only handle setting domains to types used by the CPU. */
1581         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1582                 return -EINVAL;
1583
1584         /* Having something in the write domain implies it's in the read
1585          * domain, and only that read domain.  Enforce that in the request.
1586          */
1587         if (write_domain != 0 && read_domains != write_domain)
1588                 return -EINVAL;
1589
1590         obj = i915_gem_object_lookup(file, args->handle);
1591         if (!obj)
1592                 return -ENOENT;
1593
1594         /* Try to flush the object off the GPU without holding the lock.
1595          * We will repeat the flush holding the lock in the normal manner
1596          * to catch cases where we are gazumped.
1597          */
1598         err = i915_gem_object_wait(obj,
1599                                    I915_WAIT_INTERRUPTIBLE |
1600                                    I915_WAIT_PRIORITY |
1601                                    (write_domain ? I915_WAIT_ALL : 0),
1602                                    MAX_SCHEDULE_TIMEOUT,
1603                                    to_rps_client(file));
1604         if (err)
1605                 goto out;
1606
1607         /*
1608          * Proxy objects do not control access to the backing storage, ergo
1609          * they cannot be used as a means to manipulate the cache domain
1610          * tracking for that backing storage. The proxy object is always
1611          * considered to be outside of any cache domain.
1612          */
1613         if (i915_gem_object_is_proxy(obj)) {
1614                 err = -ENXIO;
1615                 goto out;
1616         }
1617
1618         /*
1619          * Flush and acquire obj->pages so that we are coherent through
1620          * direct access in memory with previous cached writes through
1621          * shmemfs and that our cache domain tracking remains valid.
1622          * For example, if the obj->filp was moved to swap without us
1623          * being notified and releasing the pages, we would mistakenly
1624          * continue to assume that the obj remained out of the CPU cached
1625          * domain.
1626          */
1627         err = i915_gem_object_pin_pages(obj);
1628         if (err)
1629                 goto out;
1630
1631         err = i915_mutex_lock_interruptible(dev);
1632         if (err)
1633                 goto out_unpin;
1634
1635         if (read_domains & I915_GEM_DOMAIN_WC)
1636                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1637         else if (read_domains & I915_GEM_DOMAIN_GTT)
1638                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1639         else
1640                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1641
1642         /* And bump the LRU for this access */
1643         i915_gem_object_bump_inactive_ggtt(obj);
1644
1645         mutex_unlock(&dev->struct_mutex);
1646
1647         if (write_domain != 0)
1648                 intel_fb_obj_invalidate(obj,
1649                                         fb_write_origin(obj, write_domain));
1650
1651 out_unpin:
1652         i915_gem_object_unpin_pages(obj);
1653 out:
1654         i915_gem_object_put(obj);
1655         return err;
1656 }
1657
1658 /**
1659  * Called when user space has done writes to this buffer
1660  * @dev: drm device
1661  * @data: ioctl data blob
1662  * @file: drm file
1663  */
1664 int
1665 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1666                          struct drm_file *file)
1667 {
1668         struct drm_i915_gem_sw_finish *args = data;
1669         struct drm_i915_gem_object *obj;
1670
1671         obj = i915_gem_object_lookup(file, args->handle);
1672         if (!obj)
1673                 return -ENOENT;
1674
1675         /*
1676          * Proxy objects are barred from CPU access, so there is no
1677          * need to ban sw_finish as it is a nop.
1678          */
1679
1680         /* Pinned buffers may be scanout, so flush the cache */
1681         i915_gem_object_flush_if_display(obj);
1682         i915_gem_object_put(obj);
1683
1684         return 0;
1685 }
1686
1687 /**
1688  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1689  *                       it is mapped to.
1690  * @dev: drm device
1691  * @data: ioctl data blob
1692  * @file: drm file
1693  *
1694  * While the mapping holds a reference on the contents of the object, it doesn't
1695  * imply a ref on the object itself.
1696  *
1697  * IMPORTANT:
1698  *
1699  * DRM driver writers who look a this function as an example for how to do GEM
1700  * mmap support, please don't implement mmap support like here. The modern way
1701  * to implement DRM mmap support is with an mmap offset ioctl (like
1702  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1703  * That way debug tooling like valgrind will understand what's going on, hiding
1704  * the mmap call in a driver private ioctl will break that. The i915 driver only
1705  * does cpu mmaps this way because we didn't know better.
1706  */
1707 int
1708 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1709                     struct drm_file *file)
1710 {
1711         struct drm_i915_gem_mmap *args = data;
1712         struct drm_i915_gem_object *obj;
1713         unsigned long addr;
1714
1715         if (args->flags & ~(I915_MMAP_WC))
1716                 return -EINVAL;
1717
1718         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1719                 return -ENODEV;
1720
1721         obj = i915_gem_object_lookup(file, args->handle);
1722         if (!obj)
1723                 return -ENOENT;
1724
1725         /* prime objects have no backing filp to GEM mmap
1726          * pages from.
1727          */
1728         if (!obj->base.filp) {
1729                 i915_gem_object_put(obj);
1730                 return -ENXIO;
1731         }
1732
1733         addr = vm_mmap(obj->base.filp, 0, args->size,
1734                        PROT_READ | PROT_WRITE, MAP_SHARED,
1735                        args->offset);
1736         if (args->flags & I915_MMAP_WC) {
1737                 struct mm_struct *mm = current->mm;
1738                 struct vm_area_struct *vma;
1739
1740                 if (down_write_killable(&mm->mmap_sem)) {
1741                         i915_gem_object_put(obj);
1742                         return -EINTR;
1743                 }
1744                 vma = find_vma(mm, addr);
1745                 if (vma)
1746                         vma->vm_page_prot =
1747                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1748                 else
1749                         addr = -ENOMEM;
1750                 up_write(&mm->mmap_sem);
1751
1752                 /* This may race, but that's ok, it only gets set */
1753                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1754         }
1755         i915_gem_object_put(obj);
1756         if (IS_ERR((void *)addr))
1757                 return addr;
1758
1759         args->addr_ptr = (uint64_t) addr;
1760
1761         return 0;
1762 }
1763
1764 static unsigned int tile_row_pages(const struct drm_i915_gem_object *obj)
1765 {
1766         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1767 }
1768
1769 /**
1770  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1771  *
1772  * A history of the GTT mmap interface:
1773  *
1774  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1775  *     aligned and suitable for fencing, and still fit into the available
1776  *     mappable space left by the pinned display objects. A classic problem
1777  *     we called the page-fault-of-doom where we would ping-pong between
1778  *     two objects that could not fit inside the GTT and so the memcpy
1779  *     would page one object in at the expense of the other between every
1780  *     single byte.
1781  *
1782  * 1 - Objects can be any size, and have any compatible fencing (X Y, or none
1783  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1784  *     object is too large for the available space (or simply too large
1785  *     for the mappable aperture!), a view is created instead and faulted
1786  *     into userspace. (This view is aligned and sized appropriately for
1787  *     fenced access.)
1788  *
1789  * 2 - Recognise WC as a separate cache domain so that we can flush the
1790  *     delayed writes via GTT before performing direct access via WC.
1791  *
1792  * Restrictions:
1793  *
1794  *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1795  *    hangs on some architectures, corruption on others. An attempt to service
1796  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1797  *
1798  *  * the object must be able to fit into RAM (physical memory, though no
1799  *    limited to the mappable aperture).
1800  *
1801  *
1802  * Caveats:
1803  *
1804  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1805  *    all data to system memory. Subsequent access will not be synchronized.
1806  *
1807  *  * all mappings are revoked on runtime device suspend.
1808  *
1809  *  * there are only 8, 16 or 32 fence registers to share between all users
1810  *    (older machines require fence register for display and blitter access
1811  *    as well). Contention of the fence registers will cause the previous users
1812  *    to be unmapped and any new access will generate new page faults.
1813  *
1814  *  * running out of memory while servicing a fault may generate a SIGBUS,
1815  *    rather than the expected SIGSEGV.
1816  */
1817 int i915_gem_mmap_gtt_version(void)
1818 {
1819         return 2;
1820 }
1821
1822 static inline struct i915_ggtt_view
1823 compute_partial_view(const struct drm_i915_gem_object *obj,
1824                      pgoff_t page_offset,
1825                      unsigned int chunk)
1826 {
1827         struct i915_ggtt_view view;
1828
1829         if (i915_gem_object_is_tiled(obj))
1830                 chunk = roundup(chunk, tile_row_pages(obj));
1831
1832         view.type = I915_GGTT_VIEW_PARTIAL;
1833         view.partial.offset = rounddown(page_offset, chunk);
1834         view.partial.size =
1835                 min_t(unsigned int, chunk,
1836                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1837
1838         /* If the partial covers the entire object, just create a normal VMA. */
1839         if (chunk >= obj->base.size >> PAGE_SHIFT)
1840                 view.type = I915_GGTT_VIEW_NORMAL;
1841
1842         return view;
1843 }
1844
1845 /**
1846  * i915_gem_fault - fault a page into the GTT
1847  * @vmf: fault info
1848  *
1849  * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
1850  * from userspace.  The fault handler takes care of binding the object to
1851  * the GTT (if needed), allocating and programming a fence register (again,
1852  * only if needed based on whether the old reg is still valid or the object
1853  * is tiled) and inserting a new PTE into the faulting process.
1854  *
1855  * Note that the faulting process may involve evicting existing objects
1856  * from the GTT and/or fence registers to make room.  So performance may
1857  * suffer if the GTT working set is large or there are few fence registers
1858  * left.
1859  *
1860  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1861  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1862  */
1863 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
1864 {
1865 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
1866         struct vm_area_struct *area = vmf->vma;
1867         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1868         struct drm_device *dev = obj->base.dev;
1869         struct drm_i915_private *dev_priv = to_i915(dev);
1870         struct i915_ggtt *ggtt = &dev_priv->ggtt;
1871         bool write = area->vm_flags & VM_WRITE;
1872         intel_wakeref_t wakeref;
1873         struct i915_vma *vma;
1874         pgoff_t page_offset;
1875         int ret;
1876
1877         /* Sanity check that we allow writing into this object */
1878         if (i915_gem_object_is_readonly(obj) && write)
1879                 return VM_FAULT_SIGBUS;
1880
1881         /* We don't use vmf->pgoff since that has the fake offset */
1882         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
1883
1884         trace_i915_gem_object_fault(obj, page_offset, true, write);
1885
1886         /* Try to flush the object off the GPU first without holding the lock.
1887          * Upon acquiring the lock, we will perform our sanity checks and then
1888          * repeat the flush holding the lock in the normal manner to catch cases
1889          * where we are gazumped.
1890          */
1891         ret = i915_gem_object_wait(obj,
1892                                    I915_WAIT_INTERRUPTIBLE,
1893                                    MAX_SCHEDULE_TIMEOUT,
1894                                    NULL);
1895         if (ret)
1896                 goto err;
1897
1898         ret = i915_gem_object_pin_pages(obj);
1899         if (ret)
1900                 goto err;
1901
1902         wakeref = intel_runtime_pm_get(dev_priv);
1903
1904         ret = i915_mutex_lock_interruptible(dev);
1905         if (ret)
1906                 goto err_rpm;
1907
1908         /* Access to snoopable pages through the GTT is incoherent. */
1909         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
1910                 ret = -EFAULT;
1911                 goto err_unlock;
1912         }
1913
1914
1915         /* Now pin it into the GTT as needed */
1916         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1917                                        PIN_MAPPABLE |
1918                                        PIN_NONBLOCK |
1919                                        PIN_NONFAULT);
1920         if (IS_ERR(vma)) {
1921                 /* Use a partial view if it is bigger than available space */
1922                 struct i915_ggtt_view view =
1923                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
1924                 unsigned int flags;
1925
1926                 flags = PIN_MAPPABLE;
1927                 if (view.type == I915_GGTT_VIEW_NORMAL)
1928                         flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
1929
1930                 /*
1931                  * Userspace is now writing through an untracked VMA, abandon
1932                  * all hope that the hardware is able to track future writes.
1933                  */
1934                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
1935
1936                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1937                 if (IS_ERR(vma) && !view.type) {
1938                         flags = PIN_MAPPABLE;
1939                         view.type = I915_GGTT_VIEW_PARTIAL;
1940                         vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1941                 }
1942         }
1943         if (IS_ERR(vma)) {
1944                 ret = PTR_ERR(vma);
1945                 goto err_unlock;
1946         }
1947
1948         ret = i915_gem_object_set_to_gtt_domain(obj, write);
1949         if (ret)
1950                 goto err_unpin;
1951
1952         ret = i915_vma_pin_fence(vma);
1953         if (ret)
1954                 goto err_unpin;
1955
1956         /* Finally, remap it using the new GTT offset */
1957         ret = remap_io_mapping(area,
1958                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
1959                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
1960                                min_t(u64, vma->size, area->vm_end - area->vm_start),
1961                                &ggtt->iomap);
1962         if (ret)
1963                 goto err_fence;
1964
1965         /* Mark as being mmapped into userspace for later revocation */
1966         assert_rpm_wakelock_held(dev_priv);
1967         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
1968                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
1969         GEM_BUG_ON(!obj->userfault_count);
1970
1971         i915_vma_set_ggtt_write(vma);
1972
1973 err_fence:
1974         i915_vma_unpin_fence(vma);
1975 err_unpin:
1976         __i915_vma_unpin(vma);
1977 err_unlock:
1978         mutex_unlock(&dev->struct_mutex);
1979 err_rpm:
1980         intel_runtime_pm_put(dev_priv, wakeref);
1981         i915_gem_object_unpin_pages(obj);
1982 err:
1983         switch (ret) {
1984         case -EIO:
1985                 /*
1986                  * We eat errors when the gpu is terminally wedged to avoid
1987                  * userspace unduly crashing (gl has no provisions for mmaps to
1988                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
1989                  * and so needs to be reported.
1990                  */
1991                 if (!i915_terminally_wedged(&dev_priv->gpu_error))
1992                         return VM_FAULT_SIGBUS;
1993                 /* else: fall through */
1994         case -EAGAIN:
1995                 /*
1996                  * EAGAIN means the gpu is hung and we'll wait for the error
1997                  * handler to reset everything when re-faulting in
1998                  * i915_mutex_lock_interruptible.
1999                  */
2000         case 0:
2001         case -ERESTARTSYS:
2002         case -EINTR:
2003         case -EBUSY:
2004                 /*
2005                  * EBUSY is ok: this just means that another thread
2006                  * already did the job.
2007                  */
2008                 return VM_FAULT_NOPAGE;
2009         case -ENOMEM:
2010                 return VM_FAULT_OOM;
2011         case -ENOSPC:
2012         case -EFAULT:
2013                 return VM_FAULT_SIGBUS;
2014         default:
2015                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2016                 return VM_FAULT_SIGBUS;
2017         }
2018 }
2019
2020 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2021 {
2022         struct i915_vma *vma;
2023
2024         GEM_BUG_ON(!obj->userfault_count);
2025
2026         obj->userfault_count = 0;
2027         list_del(&obj->userfault_link);
2028         drm_vma_node_unmap(&obj->base.vma_node,
2029                            obj->base.dev->anon_inode->i_mapping);
2030
2031         for_each_ggtt_vma(vma, obj)
2032                 i915_vma_unset_userfault(vma);
2033 }
2034
2035 /**
2036  * i915_gem_release_mmap - remove physical page mappings
2037  * @obj: obj in question
2038  *
2039  * Preserve the reservation of the mmapping with the DRM core code, but
2040  * relinquish ownership of the pages back to the system.
2041  *
2042  * It is vital that we remove the page mapping if we have mapped a tiled
2043  * object through the GTT and then lose the fence register due to
2044  * resource pressure. Similarly if the object has been moved out of the
2045  * aperture, than pages mapped into userspace must be revoked. Removing the
2046  * mapping will then trigger a page fault on the next user access, allowing
2047  * fixup by i915_gem_fault().
2048  */
2049 void
2050 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2051 {
2052         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2053         intel_wakeref_t wakeref;
2054
2055         /* Serialisation between user GTT access and our code depends upon
2056          * revoking the CPU's PTE whilst the mutex is held. The next user
2057          * pagefault then has to wait until we release the mutex.
2058          *
2059          * Note that RPM complicates somewhat by adding an additional
2060          * requirement that operations to the GGTT be made holding the RPM
2061          * wakeref.
2062          */
2063         lockdep_assert_held(&i915->drm.struct_mutex);
2064         wakeref = intel_runtime_pm_get(i915);
2065
2066         if (!obj->userfault_count)
2067                 goto out;
2068
2069         __i915_gem_object_release_mmap(obj);
2070
2071         /* Ensure that the CPU's PTE are revoked and there are not outstanding
2072          * memory transactions from userspace before we return. The TLB
2073          * flushing implied above by changing the PTE above *should* be
2074          * sufficient, an extra barrier here just provides us with a bit
2075          * of paranoid documentation about our requirement to serialise
2076          * memory writes before touching registers / GSM.
2077          */
2078         wmb();
2079
2080 out:
2081         intel_runtime_pm_put(i915, wakeref);
2082 }
2083
2084 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2085 {
2086         struct drm_i915_gem_object *obj, *on;
2087         int i;
2088
2089         /*
2090          * Only called during RPM suspend. All users of the userfault_list
2091          * must be holding an RPM wakeref to ensure that this can not
2092          * run concurrently with themselves (and use the struct_mutex for
2093          * protection between themselves).
2094          */
2095
2096         list_for_each_entry_safe(obj, on,
2097                                  &dev_priv->mm.userfault_list, userfault_link)
2098                 __i915_gem_object_release_mmap(obj);
2099
2100         /* The fence will be lost when the device powers down. If any were
2101          * in use by hardware (i.e. they are pinned), we should not be powering
2102          * down! All other fences will be reacquired by the user upon waking.
2103          */
2104         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2105                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2106
2107                 /* Ideally we want to assert that the fence register is not
2108                  * live at this point (i.e. that no piece of code will be
2109                  * trying to write through fence + GTT, as that both violates
2110                  * our tracking of activity and associated locking/barriers,
2111                  * but also is illegal given that the hw is powered down).
2112                  *
2113                  * Previously we used reg->pin_count as a "liveness" indicator.
2114                  * That is not sufficient, and we need a more fine-grained
2115                  * tool if we want to have a sanity check here.
2116                  */
2117
2118                 if (!reg->vma)
2119                         continue;
2120
2121                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2122                 reg->dirty = true;
2123         }
2124 }
2125
2126 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2127 {
2128         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2129         int err;
2130
2131         err = drm_gem_create_mmap_offset(&obj->base);
2132         if (likely(!err))
2133                 return 0;
2134
2135         /* Attempt to reap some mmap space from dead objects */
2136         do {
2137                 err = i915_gem_wait_for_idle(dev_priv,
2138                                              I915_WAIT_INTERRUPTIBLE,
2139                                              MAX_SCHEDULE_TIMEOUT);
2140                 if (err)
2141                         break;
2142
2143                 i915_gem_drain_freed_objects(dev_priv);
2144                 err = drm_gem_create_mmap_offset(&obj->base);
2145                 if (!err)
2146                         break;
2147
2148         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2149
2150         return err;
2151 }
2152
2153 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2154 {
2155         drm_gem_free_mmap_offset(&obj->base);
2156 }
2157
2158 int
2159 i915_gem_mmap_gtt(struct drm_file *file,
2160                   struct drm_device *dev,
2161                   uint32_t handle,
2162                   uint64_t *offset)
2163 {
2164         struct drm_i915_gem_object *obj;
2165         int ret;
2166
2167         obj = i915_gem_object_lookup(file, handle);
2168         if (!obj)
2169                 return -ENOENT;
2170
2171         ret = i915_gem_object_create_mmap_offset(obj);
2172         if (ret == 0)
2173                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2174
2175         i915_gem_object_put(obj);
2176         return ret;
2177 }
2178
2179 /**
2180  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2181  * @dev: DRM device
2182  * @data: GTT mapping ioctl data
2183  * @file: GEM object info
2184  *
2185  * Simply returns the fake offset to userspace so it can mmap it.
2186  * The mmap call will end up in drm_gem_mmap(), which will set things
2187  * up so we can get faults in the handler above.
2188  *
2189  * The fault handler will take care of binding the object into the GTT
2190  * (since it may have been evicted to make room for something), allocating
2191  * a fence register, and mapping the appropriate aperture address into
2192  * userspace.
2193  */
2194 int
2195 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2196                         struct drm_file *file)
2197 {
2198         struct drm_i915_gem_mmap_gtt *args = data;
2199
2200         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2201 }
2202
2203 /* Immediately discard the backing storage */
2204 static void
2205 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2206 {
2207         i915_gem_object_free_mmap_offset(obj);
2208
2209         if (obj->base.filp == NULL)
2210                 return;
2211
2212         /* Our goal here is to return as much of the memory as
2213          * is possible back to the system as we are called from OOM.
2214          * To do this we must instruct the shmfs to drop all of its
2215          * backing pages, *now*.
2216          */
2217         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2218         obj->mm.madv = __I915_MADV_PURGED;
2219         obj->mm.pages = ERR_PTR(-EFAULT);
2220 }
2221
2222 /* Try to discard unwanted pages */
2223 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2224 {
2225         struct address_space *mapping;
2226
2227         lockdep_assert_held(&obj->mm.lock);
2228         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2229
2230         switch (obj->mm.madv) {
2231         case I915_MADV_DONTNEED:
2232                 i915_gem_object_truncate(obj);
2233         case __I915_MADV_PURGED:
2234                 return;
2235         }
2236
2237         if (obj->base.filp == NULL)
2238                 return;
2239
2240         mapping = obj->base.filp->f_mapping,
2241         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2242 }
2243
2244 /*
2245  * Move pages to appropriate lru and release the pagevec, decrementing the
2246  * ref count of those pages.
2247  */
2248 static void check_release_pagevec(struct pagevec *pvec)
2249 {
2250         check_move_unevictable_pages(pvec);
2251         __pagevec_release(pvec);
2252         cond_resched();
2253 }
2254
2255 static void
2256 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2257                               struct sg_table *pages)
2258 {
2259         struct sgt_iter sgt_iter;
2260         struct pagevec pvec;
2261         struct page *page;
2262
2263         __i915_gem_object_release_shmem(obj, pages, true);
2264
2265         i915_gem_gtt_finish_pages(obj, pages);
2266
2267         if (i915_gem_object_needs_bit17_swizzle(obj))
2268                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2269
2270         mapping_clear_unevictable(file_inode(obj->base.filp)->i_mapping);
2271
2272         pagevec_init(&pvec);
2273         for_each_sgt_page(page, sgt_iter, pages) {
2274                 if (obj->mm.dirty)
2275                         set_page_dirty(page);
2276
2277                 if (obj->mm.madv == I915_MADV_WILLNEED)
2278                         mark_page_accessed(page);
2279
2280                 if (!pagevec_add(&pvec, page))
2281                         check_release_pagevec(&pvec);
2282         }
2283         if (pagevec_count(&pvec))
2284                 check_release_pagevec(&pvec);
2285         obj->mm.dirty = false;
2286
2287         sg_free_table(pages);
2288         kfree(pages);
2289 }
2290
2291 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2292 {
2293         struct radix_tree_iter iter;
2294         void __rcu **slot;
2295
2296         rcu_read_lock();
2297         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2298                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2299         rcu_read_unlock();
2300 }
2301
2302 static struct sg_table *
2303 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2304 {
2305         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2306         struct sg_table *pages;
2307
2308         pages = fetch_and_zero(&obj->mm.pages);
2309         if (IS_ERR_OR_NULL(pages))
2310                 return pages;
2311
2312         spin_lock(&i915->mm.obj_lock);
2313         list_del(&obj->mm.link);
2314         spin_unlock(&i915->mm.obj_lock);
2315
2316         if (obj->mm.mapping) {
2317                 void *ptr;
2318
2319                 ptr = page_mask_bits(obj->mm.mapping);
2320                 if (is_vmalloc_addr(ptr))
2321                         vunmap(ptr);
2322                 else
2323                         kunmap(kmap_to_page(ptr));
2324
2325                 obj->mm.mapping = NULL;
2326         }
2327
2328         __i915_gem_object_reset_page_iter(obj);
2329         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2330
2331         return pages;
2332 }
2333
2334 int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2335                                 enum i915_mm_subclass subclass)
2336 {
2337         struct sg_table *pages;
2338         int ret;
2339
2340         if (i915_gem_object_has_pinned_pages(obj))
2341                 return -EBUSY;
2342
2343         GEM_BUG_ON(obj->bind_count);
2344
2345         /* May be called by shrinker from within get_pages() (on another bo) */
2346         mutex_lock_nested(&obj->mm.lock, subclass);
2347         if (unlikely(atomic_read(&obj->mm.pages_pin_count))) {
2348                 ret = -EBUSY;
2349                 goto unlock;
2350         }
2351
2352         /*
2353          * ->put_pages might need to allocate memory for the bit17 swizzle
2354          * array, hence protect them from being reaped by removing them from gtt
2355          * lists early.
2356          */
2357         pages = __i915_gem_object_unset_pages(obj);
2358
2359         /*
2360          * XXX Temporary hijinx to avoid updating all backends to handle
2361          * NULL pages. In the future, when we have more asynchronous
2362          * get_pages backends we should be better able to handle the
2363          * cancellation of the async task in a more uniform manner.
2364          */
2365         if (!pages && !i915_gem_object_needs_async_cancel(obj))
2366                 pages = ERR_PTR(-EINVAL);
2367
2368         if (!IS_ERR(pages))
2369                 obj->ops->put_pages(obj, pages);
2370
2371         ret = 0;
2372 unlock:
2373         mutex_unlock(&obj->mm.lock);
2374
2375         return ret;
2376 }
2377
2378 bool i915_sg_trim(struct sg_table *orig_st)
2379 {
2380         struct sg_table new_st;
2381         struct scatterlist *sg, *new_sg;
2382         unsigned int i;
2383
2384         if (orig_st->nents == orig_st->orig_nents)
2385                 return false;
2386
2387         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2388                 return false;
2389
2390         new_sg = new_st.sgl;
2391         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2392                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2393                 sg_dma_address(new_sg) = sg_dma_address(sg);
2394                 sg_dma_len(new_sg) = sg_dma_len(sg);
2395
2396                 new_sg = sg_next(new_sg);
2397         }
2398         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2399
2400         sg_free_table(orig_st);
2401
2402         *orig_st = new_st;
2403         return true;
2404 }
2405
2406 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2407 {
2408         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2409         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2410         unsigned long i;
2411         struct address_space *mapping;
2412         struct sg_table *st;
2413         struct scatterlist *sg;
2414         struct sgt_iter sgt_iter;
2415         struct page *page;
2416         unsigned long last_pfn = 0;     /* suppress gcc warning */
2417         unsigned int max_segment = i915_sg_segment_size();
2418         unsigned int sg_page_sizes;
2419         struct pagevec pvec;
2420         gfp_t noreclaim;
2421         int ret;
2422
2423         /*
2424          * Assert that the object is not currently in any GPU domain. As it
2425          * wasn't in the GTT, there shouldn't be any way it could have been in
2426          * a GPU cache
2427          */
2428         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2429         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2430
2431         /*
2432          * If there's no chance of allocating enough pages for the whole
2433          * object, bail early.
2434          */
2435         if (page_count > totalram_pages())
2436                 return -ENOMEM;
2437
2438         st = kmalloc(sizeof(*st), GFP_KERNEL);
2439         if (st == NULL)
2440                 return -ENOMEM;
2441
2442 rebuild_st:
2443         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2444                 kfree(st);
2445                 return -ENOMEM;
2446         }
2447
2448         /*
2449          * Get the list of pages out of our struct file.  They'll be pinned
2450          * at this point until we release them.
2451          *
2452          * Fail silently without starting the shrinker
2453          */
2454         mapping = obj->base.filp->f_mapping;
2455         mapping_set_unevictable(mapping);
2456         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2457         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2458
2459         sg = st->sgl;
2460         st->nents = 0;
2461         sg_page_sizes = 0;
2462         for (i = 0; i < page_count; i++) {
2463                 const unsigned int shrink[] = {
2464                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2465                         0,
2466                 }, *s = shrink;
2467                 gfp_t gfp = noreclaim;
2468
2469                 do {
2470                         cond_resched();
2471                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2472                         if (likely(!IS_ERR(page)))
2473                                 break;
2474
2475                         if (!*s) {
2476                                 ret = PTR_ERR(page);
2477                                 goto err_sg;
2478                         }
2479
2480                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2481
2482                         /*
2483                          * We've tried hard to allocate the memory by reaping
2484                          * our own buffer, now let the real VM do its job and
2485                          * go down in flames if truly OOM.
2486                          *
2487                          * However, since graphics tend to be disposable,
2488                          * defer the oom here by reporting the ENOMEM back
2489                          * to userspace.
2490                          */
2491                         if (!*s) {
2492                                 /* reclaim and warn, but no oom */
2493                                 gfp = mapping_gfp_mask(mapping);
2494
2495                                 /*
2496                                  * Our bo are always dirty and so we require
2497                                  * kswapd to reclaim our pages (direct reclaim
2498                                  * does not effectively begin pageout of our
2499                                  * buffers on its own). However, direct reclaim
2500                                  * only waits for kswapd when under allocation
2501                                  * congestion. So as a result __GFP_RECLAIM is
2502                                  * unreliable and fails to actually reclaim our
2503                                  * dirty pages -- unless you try over and over
2504                                  * again with !__GFP_NORETRY. However, we still
2505                                  * want to fail this allocation rather than
2506                                  * trigger the out-of-memory killer and for
2507                                  * this we want __GFP_RETRY_MAYFAIL.
2508                                  */
2509                                 gfp |= __GFP_RETRY_MAYFAIL;
2510                         }
2511                 } while (1);
2512
2513                 if (!i ||
2514                     sg->length >= max_segment ||
2515                     page_to_pfn(page) != last_pfn + 1) {
2516                         if (i) {
2517                                 sg_page_sizes |= sg->length;
2518                                 sg = sg_next(sg);
2519                         }
2520                         st->nents++;
2521                         sg_set_page(sg, page, PAGE_SIZE, 0);
2522                 } else {
2523                         sg->length += PAGE_SIZE;
2524                 }
2525                 last_pfn = page_to_pfn(page);
2526
2527                 /* Check that the i965g/gm workaround works. */
2528                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2529         }
2530         if (sg) { /* loop terminated early; short sg table */
2531                 sg_page_sizes |= sg->length;
2532                 sg_mark_end(sg);
2533         }
2534
2535         /* Trim unused sg entries to avoid wasting memory. */
2536         i915_sg_trim(st);
2537
2538         ret = i915_gem_gtt_prepare_pages(obj, st);
2539         if (ret) {
2540                 /*
2541                  * DMA remapping failed? One possible cause is that
2542                  * it could not reserve enough large entries, asking
2543                  * for PAGE_SIZE chunks instead may be helpful.
2544                  */
2545                 if (max_segment > PAGE_SIZE) {
2546                         for_each_sgt_page(page, sgt_iter, st)
2547                                 put_page(page);
2548                         sg_free_table(st);
2549
2550                         max_segment = PAGE_SIZE;
2551                         goto rebuild_st;
2552                 } else {
2553                         dev_warn(&dev_priv->drm.pdev->dev,
2554                                  "Failed to DMA remap %lu pages\n",
2555                                  page_count);
2556                         goto err_pages;
2557                 }
2558         }
2559
2560         if (i915_gem_object_needs_bit17_swizzle(obj))
2561                 i915_gem_object_do_bit_17_swizzle(obj, st);
2562
2563         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2564
2565         return 0;
2566
2567 err_sg:
2568         sg_mark_end(sg);
2569 err_pages:
2570         mapping_clear_unevictable(mapping);
2571         pagevec_init(&pvec);
2572         for_each_sgt_page(page, sgt_iter, st) {
2573                 if (!pagevec_add(&pvec, page))
2574                         check_release_pagevec(&pvec);
2575         }
2576         if (pagevec_count(&pvec))
2577                 check_release_pagevec(&pvec);
2578         sg_free_table(st);
2579         kfree(st);
2580
2581         /*
2582          * shmemfs first checks if there is enough memory to allocate the page
2583          * and reports ENOSPC should there be insufficient, along with the usual
2584          * ENOMEM for a genuine allocation failure.
2585          *
2586          * We use ENOSPC in our driver to mean that we have run out of aperture
2587          * space and so want to translate the error from shmemfs back to our
2588          * usual understanding of ENOMEM.
2589          */
2590         if (ret == -ENOSPC)
2591                 ret = -ENOMEM;
2592
2593         return ret;
2594 }
2595
2596 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2597                                  struct sg_table *pages,
2598                                  unsigned int sg_page_sizes)
2599 {
2600         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2601         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2602         int i;
2603
2604         lockdep_assert_held(&obj->mm.lock);
2605
2606         obj->mm.get_page.sg_pos = pages->sgl;
2607         obj->mm.get_page.sg_idx = 0;
2608
2609         obj->mm.pages = pages;
2610
2611         if (i915_gem_object_is_tiled(obj) &&
2612             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2613                 GEM_BUG_ON(obj->mm.quirked);
2614                 __i915_gem_object_pin_pages(obj);
2615                 obj->mm.quirked = true;
2616         }
2617
2618         GEM_BUG_ON(!sg_page_sizes);
2619         obj->mm.page_sizes.phys = sg_page_sizes;
2620
2621         /*
2622          * Calculate the supported page-sizes which fit into the given
2623          * sg_page_sizes. This will give us the page-sizes which we may be able
2624          * to use opportunistically when later inserting into the GTT. For
2625          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2626          * 64K or 4K pages, although in practice this will depend on a number of
2627          * other factors.
2628          */
2629         obj->mm.page_sizes.sg = 0;
2630         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2631                 if (obj->mm.page_sizes.phys & ~0u << i)
2632                         obj->mm.page_sizes.sg |= BIT(i);
2633         }
2634         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2635
2636         spin_lock(&i915->mm.obj_lock);
2637         list_add(&obj->mm.link, &i915->mm.unbound_list);
2638         spin_unlock(&i915->mm.obj_lock);
2639 }
2640
2641 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2642 {
2643         int err;
2644
2645         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2646                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2647                 return -EFAULT;
2648         }
2649
2650         err = obj->ops->get_pages(obj);
2651         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2652
2653         return err;
2654 }
2655
2656 /* Ensure that the associated pages are gathered from the backing storage
2657  * and pinned into our object. i915_gem_object_pin_pages() may be called
2658  * multiple times before they are released by a single call to
2659  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2660  * either as a result of memory pressure (reaping pages under the shrinker)
2661  * or as the object is itself released.
2662  */
2663 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2664 {
2665         int err;
2666
2667         err = mutex_lock_interruptible(&obj->mm.lock);
2668         if (err)
2669                 return err;
2670
2671         if (unlikely(!i915_gem_object_has_pages(obj))) {
2672                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2673
2674                 err = ____i915_gem_object_get_pages(obj);
2675                 if (err)
2676                         goto unlock;
2677
2678                 smp_mb__before_atomic();
2679         }
2680         atomic_inc(&obj->mm.pages_pin_count);
2681
2682 unlock:
2683         mutex_unlock(&obj->mm.lock);
2684         return err;
2685 }
2686
2687 /* The 'mapping' part of i915_gem_object_pin_map() below */
2688 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2689                                  enum i915_map_type type)
2690 {
2691         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2692         struct sg_table *sgt = obj->mm.pages;
2693         struct sgt_iter sgt_iter;
2694         struct page *page;
2695         struct page *stack_pages[32];
2696         struct page **pages = stack_pages;
2697         unsigned long i = 0;
2698         pgprot_t pgprot;
2699         void *addr;
2700
2701         /* A single page can always be kmapped */
2702         if (n_pages == 1 && type == I915_MAP_WB)
2703                 return kmap(sg_page(sgt->sgl));
2704
2705         if (n_pages > ARRAY_SIZE(stack_pages)) {
2706                 /* Too big for stack -- allocate temporary array instead */
2707                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2708                 if (!pages)
2709                         return NULL;
2710         }
2711
2712         for_each_sgt_page(page, sgt_iter, sgt)
2713                 pages[i++] = page;
2714
2715         /* Check that we have the expected number of pages */
2716         GEM_BUG_ON(i != n_pages);
2717
2718         switch (type) {
2719         default:
2720                 MISSING_CASE(type);
2721                 /* fallthrough to use PAGE_KERNEL anyway */
2722         case I915_MAP_WB:
2723                 pgprot = PAGE_KERNEL;
2724                 break;
2725         case I915_MAP_WC:
2726                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2727                 break;
2728         }
2729         addr = vmap(pages, n_pages, 0, pgprot);
2730
2731         if (pages != stack_pages)
2732                 kvfree(pages);
2733
2734         return addr;
2735 }
2736
2737 /* get, pin, and map the pages of the object into kernel space */
2738 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2739                               enum i915_map_type type)
2740 {
2741         enum i915_map_type has_type;
2742         bool pinned;
2743         void *ptr;
2744         int ret;
2745
2746         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2747                 return ERR_PTR(-ENXIO);
2748
2749         ret = mutex_lock_interruptible(&obj->mm.lock);
2750         if (ret)
2751                 return ERR_PTR(ret);
2752
2753         pinned = !(type & I915_MAP_OVERRIDE);
2754         type &= ~I915_MAP_OVERRIDE;
2755
2756         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2757                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2758                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2759
2760                         ret = ____i915_gem_object_get_pages(obj);
2761                         if (ret)
2762                                 goto err_unlock;
2763
2764                         smp_mb__before_atomic();
2765                 }
2766                 atomic_inc(&obj->mm.pages_pin_count);
2767                 pinned = false;
2768         }
2769         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2770
2771         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2772         if (ptr && has_type != type) {
2773                 if (pinned) {
2774                         ret = -EBUSY;
2775                         goto err_unpin;
2776                 }
2777
2778                 if (is_vmalloc_addr(ptr))
2779                         vunmap(ptr);
2780                 else
2781                         kunmap(kmap_to_page(ptr));
2782
2783                 ptr = obj->mm.mapping = NULL;
2784         }
2785
2786         if (!ptr) {
2787                 ptr = i915_gem_object_map(obj, type);
2788                 if (!ptr) {
2789                         ret = -ENOMEM;
2790                         goto err_unpin;
2791                 }
2792
2793                 obj->mm.mapping = page_pack_bits(ptr, type);
2794         }
2795
2796 out_unlock:
2797         mutex_unlock(&obj->mm.lock);
2798         return ptr;
2799
2800 err_unpin:
2801         atomic_dec(&obj->mm.pages_pin_count);
2802 err_unlock:
2803         ptr = ERR_PTR(ret);
2804         goto out_unlock;
2805 }
2806
2807 static int
2808 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2809                            const struct drm_i915_gem_pwrite *arg)
2810 {
2811         struct address_space *mapping = obj->base.filp->f_mapping;
2812         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2813         u64 remain, offset;
2814         unsigned int pg;
2815
2816         /* Before we instantiate/pin the backing store for our use, we
2817          * can prepopulate the shmemfs filp efficiently using a write into
2818          * the pagecache. We avoid the penalty of instantiating all the
2819          * pages, important if the user is just writing to a few and never
2820          * uses the object on the GPU, and using a direct write into shmemfs
2821          * allows it to avoid the cost of retrieving a page (either swapin
2822          * or clearing-before-use) before it is overwritten.
2823          */
2824         if (i915_gem_object_has_pages(obj))
2825                 return -ENODEV;
2826
2827         if (obj->mm.madv != I915_MADV_WILLNEED)
2828                 return -EFAULT;
2829
2830         /* Before the pages are instantiated the object is treated as being
2831          * in the CPU domain. The pages will be clflushed as required before
2832          * use, and we can freely write into the pages directly. If userspace
2833          * races pwrite with any other operation; corruption will ensue -
2834          * that is userspace's prerogative!
2835          */
2836
2837         remain = arg->size;
2838         offset = arg->offset;
2839         pg = offset_in_page(offset);
2840
2841         do {
2842                 unsigned int len, unwritten;
2843                 struct page *page;
2844                 void *data, *vaddr;
2845                 int err;
2846
2847                 len = PAGE_SIZE - pg;
2848                 if (len > remain)
2849                         len = remain;
2850
2851                 err = pagecache_write_begin(obj->base.filp, mapping,
2852                                             offset, len, 0,
2853                                             &page, &data);
2854                 if (err < 0)
2855                         return err;
2856
2857                 vaddr = kmap(page);
2858                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2859                 kunmap(page);
2860
2861                 err = pagecache_write_end(obj->base.filp, mapping,
2862                                           offset, len, len - unwritten,
2863                                           page, data);
2864                 if (err < 0)
2865                         return err;
2866
2867                 if (unwritten)
2868                         return -EFAULT;
2869
2870                 remain -= len;
2871                 user_data += len;
2872                 offset += len;
2873                 pg = 0;
2874         } while (remain);
2875
2876         return 0;
2877 }
2878
2879 struct i915_request *
2880 i915_gem_find_active_request(struct intel_engine_cs *engine)
2881 {
2882         struct i915_request *request, *active = NULL;
2883         unsigned long flags;
2884
2885         /*
2886          * We are called by the error capture, reset and to dump engine
2887          * state at random points in time. In particular, note that neither is
2888          * crucially ordered with an interrupt. After a hang, the GPU is dead
2889          * and we assume that no more writes can happen (we waited long enough
2890          * for all writes that were in transaction to be flushed) - adding an
2891          * extra delay for a recent interrupt is pointless. Hence, we do
2892          * not need an engine->irq_seqno_barrier() before the seqno reads.
2893          * At all other times, we must assume the GPU is still running, but
2894          * we only care about the snapshot of this moment.
2895          */
2896         spin_lock_irqsave(&engine->timeline.lock, flags);
2897         list_for_each_entry(request, &engine->timeline.requests, link) {
2898                 if (__i915_request_completed(request, request->global_seqno))
2899                         continue;
2900
2901                 active = request;
2902                 break;
2903         }
2904         spin_unlock_irqrestore(&engine->timeline.lock, flags);
2905
2906         return active;
2907 }
2908
2909 static void
2910 i915_gem_retire_work_handler(struct work_struct *work)
2911 {
2912         struct drm_i915_private *dev_priv =
2913                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
2914         struct drm_device *dev = &dev_priv->drm;
2915
2916         /* Come back later if the device is busy... */
2917         if (mutex_trylock(&dev->struct_mutex)) {
2918                 i915_retire_requests(dev_priv);
2919                 mutex_unlock(&dev->struct_mutex);
2920         }
2921
2922         /*
2923          * Keep the retire handler running until we are finally idle.
2924          * We do not need to do this test under locking as in the worst-case
2925          * we queue the retire worker once too often.
2926          */
2927         if (READ_ONCE(dev_priv->gt.awake))
2928                 queue_delayed_work(dev_priv->wq,
2929                                    &dev_priv->gt.retire_work,
2930                                    round_jiffies_up_relative(HZ));
2931 }
2932
2933 static void shrink_caches(struct drm_i915_private *i915)
2934 {
2935         /*
2936          * kmem_cache_shrink() discards empty slabs and reorders partially
2937          * filled slabs to prioritise allocating from the mostly full slabs,
2938          * with the aim of reducing fragmentation.
2939          */
2940         kmem_cache_shrink(i915->priorities);
2941         kmem_cache_shrink(i915->dependencies);
2942         kmem_cache_shrink(i915->requests);
2943         kmem_cache_shrink(i915->luts);
2944         kmem_cache_shrink(i915->vmas);
2945         kmem_cache_shrink(i915->objects);
2946 }
2947
2948 struct sleep_rcu_work {
2949         union {
2950                 struct rcu_head rcu;
2951                 struct work_struct work;
2952         };
2953         struct drm_i915_private *i915;
2954         unsigned int epoch;
2955 };
2956
2957 static inline bool
2958 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
2959 {
2960         /*
2961          * There is a small chance that the epoch wrapped since we started
2962          * sleeping. If we assume that epoch is at least a u32, then it will
2963          * take at least 2^32 * 100ms for it to wrap, or about 326 years.
2964          */
2965         return epoch == READ_ONCE(i915->gt.epoch);
2966 }
2967
2968 static void __sleep_work(struct work_struct *work)
2969 {
2970         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
2971         struct drm_i915_private *i915 = s->i915;
2972         unsigned int epoch = s->epoch;
2973
2974         kfree(s);
2975         if (same_epoch(i915, epoch))
2976                 shrink_caches(i915);
2977 }
2978
2979 static void __sleep_rcu(struct rcu_head *rcu)
2980 {
2981         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
2982         struct drm_i915_private *i915 = s->i915;
2983
2984         destroy_rcu_head(&s->rcu);
2985
2986         if (same_epoch(i915, s->epoch)) {
2987                 INIT_WORK(&s->work, __sleep_work);
2988                 queue_work(i915->wq, &s->work);
2989         } else {
2990                 kfree(s);
2991         }
2992 }
2993
2994 static inline bool
2995 new_requests_since_last_retire(const struct drm_i915_private *i915)
2996 {
2997         return (READ_ONCE(i915->gt.active_requests) ||
2998                 work_pending(&i915->gt.idle_work.work));
2999 }
3000
3001 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3002 {
3003         struct intel_engine_cs *engine;
3004         enum intel_engine_id id;
3005
3006         if (i915_terminally_wedged(&i915->gpu_error))
3007                 return;
3008
3009         GEM_BUG_ON(i915->gt.active_requests);
3010         for_each_engine(engine, i915, id) {
3011                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
3012                 GEM_BUG_ON(engine->last_retired_context !=
3013                            to_intel_context(i915->kernel_context, engine));
3014         }
3015 }
3016
3017 static void
3018 i915_gem_idle_work_handler(struct work_struct *work)
3019 {
3020         struct drm_i915_private *dev_priv =
3021                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3022         unsigned int epoch = I915_EPOCH_INVALID;
3023         bool rearm_hangcheck;
3024
3025         if (!READ_ONCE(dev_priv->gt.awake))
3026                 return;
3027
3028         if (READ_ONCE(dev_priv->gt.active_requests))
3029                 return;
3030
3031         /*
3032          * Flush out the last user context, leaving only the pinned
3033          * kernel context resident. When we are idling on the kernel_context,
3034          * no more new requests (with a context switch) are emitted and we
3035          * can finally rest. A consequence is that the idle work handler is
3036          * always called at least twice before idling (and if the system is
3037          * idle that implies a round trip through the retire worker).
3038          */
3039         mutex_lock(&dev_priv->drm.struct_mutex);
3040         i915_gem_switch_to_kernel_context(dev_priv);
3041         mutex_unlock(&dev_priv->drm.struct_mutex);
3042
3043         GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
3044                   READ_ONCE(dev_priv->gt.active_requests));
3045
3046         /*
3047          * Wait for last execlists context complete, but bail out in case a
3048          * new request is submitted. As we don't trust the hardware, we
3049          * continue on if the wait times out. This is necessary to allow
3050          * the machine to suspend even if the hardware dies, and we will
3051          * try to recover in resume (after depriving the hardware of power,
3052          * it may be in a better mmod).
3053          */
3054         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3055                    intel_engines_are_idle(dev_priv),
3056                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3057                    10, 500);
3058
3059         rearm_hangcheck =
3060                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3061
3062         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3063                 /* Currently busy, come back later */
3064                 mod_delayed_work(dev_priv->wq,
3065                                  &dev_priv->gt.idle_work,
3066                                  msecs_to_jiffies(50));
3067                 goto out_rearm;
3068         }
3069
3070         /*
3071          * New request retired after this work handler started, extend active
3072          * period until next instance of the work.
3073          */
3074         if (new_requests_since_last_retire(dev_priv))
3075                 goto out_unlock;
3076
3077         epoch = __i915_gem_park(dev_priv);
3078
3079         assert_kernel_context_is_current(dev_priv);
3080
3081         rearm_hangcheck = false;
3082 out_unlock:
3083         mutex_unlock(&dev_priv->drm.struct_mutex);
3084
3085 out_rearm:
3086         if (rearm_hangcheck) {
3087                 GEM_BUG_ON(!dev_priv->gt.awake);
3088                 i915_queue_hangcheck(dev_priv);
3089         }
3090
3091         /*
3092          * When we are idle, it is an opportune time to reap our caches.
3093          * However, we have many objects that utilise RCU and the ordered
3094          * i915->wq that this work is executing on. To try and flush any
3095          * pending frees now we are idle, we first wait for an RCU grace
3096          * period, and then queue a task (that will run last on the wq) to
3097          * shrink and re-optimize the caches.
3098          */
3099         if (same_epoch(dev_priv, epoch)) {
3100                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3101                 if (s) {
3102                         init_rcu_head(&s->rcu);
3103                         s->i915 = dev_priv;
3104                         s->epoch = epoch;
3105                         call_rcu(&s->rcu, __sleep_rcu);
3106                 }
3107         }
3108 }
3109
3110 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3111 {
3112         struct drm_i915_private *i915 = to_i915(gem->dev);
3113         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3114         struct drm_i915_file_private *fpriv = file->driver_priv;
3115         struct i915_lut_handle *lut, *ln;
3116
3117         mutex_lock(&i915->drm.struct_mutex);
3118
3119         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3120                 struct i915_gem_context *ctx = lut->ctx;
3121                 struct i915_vma *vma;
3122
3123                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3124                 if (ctx->file_priv != fpriv)
3125                         continue;
3126
3127                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3128                 GEM_BUG_ON(vma->obj != obj);
3129
3130                 /* We allow the process to have multiple handles to the same
3131                  * vma, in the same fd namespace, by virtue of flink/open.
3132                  */
3133                 GEM_BUG_ON(!vma->open_count);
3134                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3135                         i915_vma_close(vma);
3136
3137                 list_del(&lut->obj_link);
3138                 list_del(&lut->ctx_link);
3139
3140                 kmem_cache_free(i915->luts, lut);
3141                 __i915_gem_object_release_unless_active(obj);
3142         }
3143
3144         mutex_unlock(&i915->drm.struct_mutex);
3145 }
3146
3147 static unsigned long to_wait_timeout(s64 timeout_ns)
3148 {
3149         if (timeout_ns < 0)
3150                 return MAX_SCHEDULE_TIMEOUT;
3151
3152         if (timeout_ns == 0)
3153                 return 0;
3154
3155         return nsecs_to_jiffies_timeout(timeout_ns);
3156 }
3157
3158 /**
3159  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3160  * @dev: drm device pointer
3161  * @data: ioctl data blob
3162  * @file: drm file pointer
3163  *
3164  * Returns 0 if successful, else an error is returned with the remaining time in
3165  * the timeout parameter.
3166  *  -ETIME: object is still busy after timeout
3167  *  -ERESTARTSYS: signal interrupted the wait
3168  *  -ENONENT: object doesn't exist
3169  * Also possible, but rare:
3170  *  -EAGAIN: incomplete, restart syscall
3171  *  -ENOMEM: damn
3172  *  -ENODEV: Internal IRQ fail
3173  *  -E?: The add request failed
3174  *
3175  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3176  * non-zero timeout parameter the wait ioctl will wait for the given number of
3177  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3178  * without holding struct_mutex the object may become re-busied before this
3179  * function completes. A similar but shorter * race condition exists in the busy
3180  * ioctl
3181  */
3182 int
3183 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3184 {
3185         struct drm_i915_gem_wait *args = data;
3186         struct drm_i915_gem_object *obj;
3187         ktime_t start;
3188         long ret;
3189
3190         if (args->flags != 0)
3191                 return -EINVAL;
3192
3193         obj = i915_gem_object_lookup(file, args->bo_handle);
3194         if (!obj)
3195                 return -ENOENT;
3196
3197         start = ktime_get();
3198
3199         ret = i915_gem_object_wait(obj,
3200                                    I915_WAIT_INTERRUPTIBLE |
3201                                    I915_WAIT_PRIORITY |
3202                                    I915_WAIT_ALL,
3203                                    to_wait_timeout(args->timeout_ns),
3204                                    to_rps_client(file));
3205
3206         if (args->timeout_ns > 0) {
3207                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3208                 if (args->timeout_ns < 0)
3209                         args->timeout_ns = 0;
3210
3211                 /*
3212                  * Apparently ktime isn't accurate enough and occasionally has a
3213                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3214                  * things up to make the test happy. We allow up to 1 jiffy.
3215                  *
3216                  * This is a regression from the timespec->ktime conversion.
3217                  */
3218                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3219                         args->timeout_ns = 0;
3220
3221                 /* Asked to wait beyond the jiffie/scheduler precision? */
3222                 if (ret == -ETIME && args->timeout_ns)
3223                         ret = -EAGAIN;
3224         }
3225
3226         i915_gem_object_put(obj);
3227         return ret;
3228 }
3229
3230 static long wait_for_timeline(struct i915_timeline *tl,
3231                               unsigned int flags, long timeout)
3232 {
3233         struct i915_request *rq;
3234
3235         rq = i915_gem_active_get_unlocked(&tl->last_request);
3236         if (!rq)
3237                 return timeout;
3238
3239         /*
3240          * "Race-to-idle".
3241          *
3242          * Switching to the kernel context is often used a synchronous
3243          * step prior to idling, e.g. in suspend for flushing all
3244          * current operations to memory before sleeping. These we
3245          * want to complete as quickly as possible to avoid prolonged
3246          * stalls, so allow the gpu to boost to maximum clocks.
3247          */
3248         if (flags & I915_WAIT_FOR_IDLE_BOOST)
3249                 gen6_rps_boost(rq, NULL);
3250
3251         timeout = i915_request_wait(rq, flags, timeout);
3252         i915_request_put(rq);
3253
3254         return timeout;
3255 }
3256
3257 static int wait_for_engines(struct drm_i915_private *i915)
3258 {
3259         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3260                 dev_err(i915->drm.dev,
3261                         "Failed to idle engines, declaring wedged!\n");
3262                 GEM_TRACE_DUMP();
3263                 i915_gem_set_wedged(i915);
3264                 return -EIO;
3265         }
3266
3267         return 0;
3268 }
3269
3270 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3271                            unsigned int flags, long timeout)
3272 {
3273         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3274                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3275                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3276
3277         /* If the device is asleep, we have no requests outstanding */
3278         if (!READ_ONCE(i915->gt.awake))
3279                 return 0;
3280
3281         if (flags & I915_WAIT_LOCKED) {
3282                 struct i915_timeline *tl;
3283                 int err;
3284
3285                 lockdep_assert_held(&i915->drm.struct_mutex);
3286
3287                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3288                         timeout = wait_for_timeline(tl, flags, timeout);
3289                         if (timeout < 0)
3290                                 return timeout;
3291                 }
3292                 if (GEM_SHOW_DEBUG() && !timeout) {
3293                         /* Presume that timeout was non-zero to begin with! */
3294                         dev_warn(&i915->drm.pdev->dev,
3295                                  "Missed idle-completion interrupt!\n");
3296                         GEM_TRACE_DUMP();
3297                 }
3298
3299                 err = wait_for_engines(i915);
3300                 if (err)
3301                         return err;
3302
3303                 i915_retire_requests(i915);
3304                 GEM_BUG_ON(i915->gt.active_requests);
3305         } else {
3306                 struct intel_engine_cs *engine;
3307                 enum intel_engine_id id;
3308
3309                 for_each_engine(engine, i915, id) {
3310                         struct i915_timeline *tl = &engine->timeline;
3311
3312                         timeout = wait_for_timeline(tl, flags, timeout);
3313                         if (timeout < 0)
3314                                 return timeout;
3315                 }
3316         }
3317
3318         return 0;
3319 }
3320
3321 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3322 {
3323         /*
3324          * We manually flush the CPU domain so that we can override and
3325          * force the flush for the display, and perform it asyncrhonously.
3326          */
3327         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3328         if (obj->cache_dirty)
3329                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3330         obj->write_domain = 0;
3331 }
3332
3333 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3334 {
3335         if (!READ_ONCE(obj->pin_global))
3336                 return;
3337
3338         mutex_lock(&obj->base.dev->struct_mutex);
3339         __i915_gem_object_flush_for_display(obj);
3340         mutex_unlock(&obj->base.dev->struct_mutex);
3341 }
3342
3343 /**
3344  * Moves a single object to the WC read, and possibly write domain.
3345  * @obj: object to act on
3346  * @write: ask for write access or read only
3347  *
3348  * This function returns when the move is complete, including waiting on
3349  * flushes to occur.
3350  */
3351 int
3352 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3353 {
3354         int ret;
3355
3356         lockdep_assert_held(&obj->base.dev->struct_mutex);
3357
3358         ret = i915_gem_object_wait(obj,
3359                                    I915_WAIT_INTERRUPTIBLE |
3360                                    I915_WAIT_LOCKED |
3361                                    (write ? I915_WAIT_ALL : 0),
3362                                    MAX_SCHEDULE_TIMEOUT,
3363                                    NULL);
3364         if (ret)
3365                 return ret;
3366
3367         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3368                 return 0;
3369
3370         /* Flush and acquire obj->pages so that we are coherent through
3371          * direct access in memory with previous cached writes through
3372          * shmemfs and that our cache domain tracking remains valid.
3373          * For example, if the obj->filp was moved to swap without us
3374          * being notified and releasing the pages, we would mistakenly
3375          * continue to assume that the obj remained out of the CPU cached
3376          * domain.
3377          */
3378         ret = i915_gem_object_pin_pages(obj);
3379         if (ret)
3380                 return ret;
3381
3382         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3383
3384         /* Serialise direct access to this object with the barriers for
3385          * coherent writes from the GPU, by effectively invalidating the
3386          * WC domain upon first access.
3387          */
3388         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3389                 mb();
3390
3391         /* It should now be out of any other write domains, and we can update
3392          * the domain values for our changes.
3393          */
3394         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3395         obj->read_domains |= I915_GEM_DOMAIN_WC;
3396         if (write) {
3397                 obj->read_domains = I915_GEM_DOMAIN_WC;
3398                 obj->write_domain = I915_GEM_DOMAIN_WC;
3399                 obj->mm.dirty = true;
3400         }
3401
3402         i915_gem_object_unpin_pages(obj);
3403         return 0;
3404 }
3405
3406 /**
3407  * Moves a single object to the GTT read, and possibly write domain.
3408  * @obj: object to act on
3409  * @write: ask for write access or read only
3410  *
3411  * This function returns when the move is complete, including waiting on
3412  * flushes to occur.
3413  */
3414 int
3415 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3416 {
3417         int ret;
3418
3419         lockdep_assert_held(&obj->base.dev->struct_mutex);
3420
3421         ret = i915_gem_object_wait(obj,
3422                                    I915_WAIT_INTERRUPTIBLE |
3423                                    I915_WAIT_LOCKED |
3424                                    (write ? I915_WAIT_ALL : 0),
3425                                    MAX_SCHEDULE_TIMEOUT,
3426                                    NULL);
3427         if (ret)
3428                 return ret;
3429
3430         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3431                 return 0;
3432
3433         /* Flush and acquire obj->pages so that we are coherent through
3434          * direct access in memory with previous cached writes through
3435          * shmemfs and that our cache domain tracking remains valid.
3436          * For example, if the obj->filp was moved to swap without us
3437          * being notified and releasing the pages, we would mistakenly
3438          * continue to assume that the obj remained out of the CPU cached
3439          * domain.
3440          */
3441         ret = i915_gem_object_pin_pages(obj);
3442         if (ret)
3443                 return ret;
3444
3445         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3446
3447         /* Serialise direct access to this object with the barriers for
3448          * coherent writes from the GPU, by effectively invalidating the
3449          * GTT domain upon first access.
3450          */
3451         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3452                 mb();
3453
3454         /* It should now be out of any other write domains, and we can update
3455          * the domain values for our changes.
3456          */
3457         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3458         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3459         if (write) {
3460                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3461                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3462                 obj->mm.dirty = true;
3463         }
3464
3465         i915_gem_object_unpin_pages(obj);
3466         return 0;
3467 }
3468
3469 /**
3470  * Changes the cache-level of an object across all VMA.
3471  * @obj: object to act on
3472  * @cache_level: new cache level to set for the object
3473  *
3474  * After this function returns, the object will be in the new cache-level
3475  * across all GTT and the contents of the backing storage will be coherent,
3476  * with respect to the new cache-level. In order to keep the backing storage
3477  * coherent for all users, we only allow a single cache level to be set
3478  * globally on the object and prevent it from being changed whilst the
3479  * hardware is reading from the object. That is if the object is currently
3480  * on the scanout it will be set to uncached (or equivalent display
3481  * cache coherency) and all non-MOCS GPU access will also be uncached so
3482  * that all direct access to the scanout remains coherent.
3483  */
3484 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3485                                     enum i915_cache_level cache_level)
3486 {
3487         struct i915_vma *vma;
3488         int ret;
3489
3490         lockdep_assert_held(&obj->base.dev->struct_mutex);
3491
3492         if (obj->cache_level == cache_level)
3493                 return 0;
3494
3495         /* Inspect the list of currently bound VMA and unbind any that would
3496          * be invalid given the new cache-level. This is principally to
3497          * catch the issue of the CS prefetch crossing page boundaries and
3498          * reading an invalid PTE on older architectures.
3499          */
3500 restart:
3501         list_for_each_entry(vma, &obj->vma_list, obj_link) {
3502                 if (!drm_mm_node_allocated(&vma->node))
3503                         continue;
3504
3505                 if (i915_vma_is_pinned(vma)) {
3506                         DRM_DEBUG("can not change the cache level of pinned objects\n");
3507                         return -EBUSY;
3508                 }
3509
3510                 if (!i915_vma_is_closed(vma) &&
3511                     i915_gem_valid_gtt_space(vma, cache_level))
3512                         continue;
3513
3514                 ret = i915_vma_unbind(vma);
3515                 if (ret)
3516                         return ret;
3517
3518                 /* As unbinding may affect other elements in the
3519                  * obj->vma_list (due to side-effects from retiring
3520                  * an active vma), play safe and restart the iterator.
3521                  */
3522                 goto restart;
3523         }
3524
3525         /* We can reuse the existing drm_mm nodes but need to change the
3526          * cache-level on the PTE. We could simply unbind them all and
3527          * rebind with the correct cache-level on next use. However since
3528          * we already have a valid slot, dma mapping, pages etc, we may as
3529          * rewrite the PTE in the belief that doing so tramples upon less
3530          * state and so involves less work.
3531          */
3532         if (obj->bind_count) {
3533                 /* Before we change the PTE, the GPU must not be accessing it.
3534                  * If we wait upon the object, we know that all the bound
3535                  * VMA are no longer active.
3536                  */
3537                 ret = i915_gem_object_wait(obj,
3538                                            I915_WAIT_INTERRUPTIBLE |
3539                                            I915_WAIT_LOCKED |
3540                                            I915_WAIT_ALL,
3541                                            MAX_SCHEDULE_TIMEOUT,
3542                                            NULL);
3543                 if (ret)
3544                         return ret;
3545
3546                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
3547                     cache_level != I915_CACHE_NONE) {
3548                         /* Access to snoopable pages through the GTT is
3549                          * incoherent and on some machines causes a hard
3550                          * lockup. Relinquish the CPU mmaping to force
3551                          * userspace to refault in the pages and we can
3552                          * then double check if the GTT mapping is still
3553                          * valid for that pointer access.
3554                          */
3555                         i915_gem_release_mmap(obj);
3556
3557                         /* As we no longer need a fence for GTT access,
3558                          * we can relinquish it now (and so prevent having
3559                          * to steal a fence from someone else on the next
3560                          * fence request). Note GPU activity would have
3561                          * dropped the fence as all snoopable access is
3562                          * supposed to be linear.
3563                          */
3564                         for_each_ggtt_vma(vma, obj) {
3565                                 ret = i915_vma_put_fence(vma);
3566                                 if (ret)
3567                                         return ret;
3568                         }
3569                 } else {
3570                         /* We either have incoherent backing store and
3571                          * so no GTT access or the architecture is fully
3572                          * coherent. In such cases, existing GTT mmaps
3573                          * ignore the cache bit in the PTE and we can
3574                          * rewrite it without confusing the GPU or having
3575                          * to force userspace to fault back in its mmaps.
3576                          */
3577                 }
3578
3579                 list_for_each_entry(vma, &obj->vma_list, obj_link) {
3580                         if (!drm_mm_node_allocated(&vma->node))
3581                                 continue;
3582
3583                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
3584                         if (ret)
3585                                 return ret;
3586                 }
3587         }
3588
3589         list_for_each_entry(vma, &obj->vma_list, obj_link)
3590                 vma->node.color = cache_level;
3591         i915_gem_object_set_cache_coherency(obj, cache_level);
3592         obj->cache_dirty = true; /* Always invalidate stale cachelines */
3593
3594         return 0;
3595 }
3596
3597 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
3598                                struct drm_file *file)
3599 {
3600         struct drm_i915_gem_caching *args = data;
3601         struct drm_i915_gem_object *obj;
3602         int err = 0;
3603
3604         rcu_read_lock();
3605         obj = i915_gem_object_lookup_rcu(file, args->handle);
3606         if (!obj) {
3607                 err = -ENOENT;
3608                 goto out;
3609         }
3610
3611         switch (obj->cache_level) {
3612         case I915_CACHE_LLC:
3613         case I915_CACHE_L3_LLC:
3614                 args->caching = I915_CACHING_CACHED;
3615                 break;
3616
3617         case I915_CACHE_WT:
3618                 args->caching = I915_CACHING_DISPLAY;
3619                 break;
3620
3621         default:
3622                 args->caching = I915_CACHING_NONE;
3623                 break;
3624         }
3625 out:
3626         rcu_read_unlock();
3627         return err;
3628 }
3629
3630 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
3631                                struct drm_file *file)
3632 {
3633         struct drm_i915_private *i915 = to_i915(dev);
3634         struct drm_i915_gem_caching *args = data;
3635         struct drm_i915_gem_object *obj;
3636         enum i915_cache_level level;
3637         int ret = 0;
3638
3639         switch (args->caching) {
3640         case I915_CACHING_NONE:
3641                 level = I915_CACHE_NONE;
3642                 break;
3643         case I915_CACHING_CACHED:
3644                 /*
3645                  * Due to a HW issue on BXT A stepping, GPU stores via a
3646                  * snooped mapping may leave stale data in a corresponding CPU
3647                  * cacheline, whereas normally such cachelines would get
3648                  * invalidated.
3649                  */
3650                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
3651                         return -ENODEV;
3652
3653                 level = I915_CACHE_LLC;
3654                 break;
3655         case I915_CACHING_DISPLAY:
3656                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
3657                 break;
3658         default:
3659                 return -EINVAL;
3660         }
3661
3662         obj = i915_gem_object_lookup(file, args->handle);
3663         if (!obj)
3664                 return -ENOENT;
3665
3666         /*
3667          * The caching mode of proxy object is handled by its generator, and
3668          * not allowed to be changed by userspace.
3669          */
3670         if (i915_gem_object_is_proxy(obj)) {
3671                 ret = -ENXIO;
3672                 goto out;
3673         }
3674
3675         if (obj->cache_level == level)
3676                 goto out;
3677
3678         ret = i915_gem_object_wait(obj,
3679                                    I915_WAIT_INTERRUPTIBLE,
3680                                    MAX_SCHEDULE_TIMEOUT,
3681                                    to_rps_client(file));
3682         if (ret)
3683                 goto out;
3684
3685         ret = i915_mutex_lock_interruptible(dev);
3686         if (ret)
3687                 goto out;
3688
3689         ret = i915_gem_object_set_cache_level(obj, level);
3690         mutex_unlock(&dev->struct_mutex);
3691
3692 out:
3693         i915_gem_object_put(obj);
3694         return ret;
3695 }
3696
3697 /*
3698  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
3699  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
3700  * (for pageflips). We only flush the caches while preparing the buffer for
3701  * display, the callers are responsible for frontbuffer flush.
3702  */
3703 struct i915_vma *
3704 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
3705                                      u32 alignment,
3706                                      const struct i915_ggtt_view *view,
3707                                      unsigned int flags)
3708 {
3709         struct i915_vma *vma;
3710         int ret;
3711
3712         lockdep_assert_held(&obj->base.dev->struct_mutex);
3713
3714         /* Mark the global pin early so that we account for the
3715          * display coherency whilst setting up the cache domains.
3716          */
3717         obj->pin_global++;
3718
3719         /* The display engine is not coherent with the LLC cache on gen6.  As
3720          * a result, we make sure that the pinning that is about to occur is
3721          * done with uncached PTEs. This is lowest common denominator for all
3722          * chipsets.
3723          *
3724          * However for gen6+, we could do better by using the GFDT bit instead
3725          * of uncaching, which would allow us to flush all the LLC-cached data
3726          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
3727          */
3728         ret = i915_gem_object_set_cache_level(obj,
3729                                               HAS_WT(to_i915(obj->base.dev)) ?
3730                                               I915_CACHE_WT : I915_CACHE_NONE);
3731         if (ret) {
3732                 vma = ERR_PTR(ret);
3733                 goto err_unpin_global;
3734         }
3735
3736         /* As the user may map the buffer once pinned in the display plane
3737          * (e.g. libkms for the bootup splash), we have to ensure that we
3738          * always use map_and_fenceable for all scanout buffers. However,
3739          * it may simply be too big to fit into mappable, in which case
3740          * put it anyway and hope that userspace can cope (but always first
3741          * try to preserve the existing ABI).
3742          */
3743         vma = ERR_PTR(-ENOSPC);
3744         if ((flags & PIN_MAPPABLE) == 0 &&
3745             (!view || view->type == I915_GGTT_VIEW_NORMAL))
3746                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
3747                                                flags |
3748                                                PIN_MAPPABLE |
3749                                                PIN_NONBLOCK);
3750         if (IS_ERR(vma))
3751                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
3752         if (IS_ERR(vma))
3753                 goto err_unpin_global;
3754
3755         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
3756
3757         __i915_gem_object_flush_for_display(obj);
3758
3759         /* It should now be out of any other write domains, and we can update
3760          * the domain values for our changes.
3761          */
3762         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3763
3764         return vma;
3765
3766 err_unpin_global:
3767         obj->pin_global--;
3768         return vma;
3769 }
3770
3771 void
3772 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
3773 {
3774         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
3775
3776         if (WARN_ON(vma->obj->pin_global == 0))
3777                 return;
3778
3779         if (--vma->obj->pin_global == 0)
3780                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
3781
3782         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
3783         i915_gem_object_bump_inactive_ggtt(vma->obj);
3784
3785         i915_vma_unpin(vma);
3786 }
3787
3788 /**
3789  * Moves a single object to the CPU read, and possibly write domain.
3790  * @obj: object to act on
3791  * @write: requesting write or read-only access
3792  *
3793  * This function returns when the move is complete, including waiting on
3794  * flushes to occur.
3795  */
3796 int
3797 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
3798 {
3799         int ret;
3800
3801         lockdep_assert_held(&obj->base.dev->struct_mutex);
3802
3803         ret = i915_gem_object_wait(obj,
3804                                    I915_WAIT_INTERRUPTIBLE |
3805                                    I915_WAIT_LOCKED |
3806                                    (write ? I915_WAIT_ALL : 0),
3807                                    MAX_SCHEDULE_TIMEOUT,
3808                                    NULL);
3809         if (ret)
3810                 return ret;
3811
3812         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3813
3814         /* Flush the CPU cache if it's still invalid. */
3815         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
3816                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
3817                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
3818         }
3819
3820         /* It should now be out of any other write domains, and we can update
3821          * the domain values for our changes.
3822          */
3823         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
3824
3825         /* If we're writing through the CPU, then the GPU read domains will
3826          * need to be invalidated at next use.
3827          */
3828         if (write)
3829                 __start_cpu_write(obj);
3830
3831         return 0;
3832 }
3833
3834 /* Throttle our rendering by waiting until the ring has completed our requests
3835  * emitted over 20 msec ago.
3836  *
3837  * Note that if we were to use the current jiffies each time around the loop,
3838  * we wouldn't escape the function with any frames outstanding if the time to
3839  * render a frame was over 20ms.
3840  *
3841  * This should get us reasonable parallelism between CPU and GPU but also
3842  * relatively low latency when blocking on a particular request to finish.
3843  */
3844 static int
3845 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
3846 {
3847         struct drm_i915_private *dev_priv = to_i915(dev);
3848         struct drm_i915_file_private *file_priv = file->driver_priv;
3849         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
3850         struct i915_request *request, *target = NULL;
3851         long ret;
3852
3853         /* ABI: return -EIO if already wedged */
3854         if (i915_terminally_wedged(&dev_priv->gpu_error))
3855                 return -EIO;
3856
3857         spin_lock(&file_priv->mm.lock);
3858         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
3859                 if (time_after_eq(request->emitted_jiffies, recent_enough))
3860                         break;
3861
3862                 if (target) {
3863                         list_del(&target->client_link);
3864                         target->file_priv = NULL;
3865                 }
3866
3867                 target = request;
3868         }
3869         if (target)
3870                 i915_request_get(target);
3871         spin_unlock(&file_priv->mm.lock);
3872
3873         if (target == NULL)
3874                 return 0;
3875
3876         ret = i915_request_wait(target,
3877                                 I915_WAIT_INTERRUPTIBLE,
3878                                 MAX_SCHEDULE_TIMEOUT);
3879         i915_request_put(target);
3880
3881         return ret < 0 ? ret : 0;
3882 }
3883
3884 struct i915_vma *
3885 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
3886                          const struct i915_ggtt_view *view,
3887                          u64 size,
3888                          u64 alignment,
3889                          u64 flags)
3890 {
3891         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
3892         struct i915_address_space *vm = &dev_priv->ggtt.vm;
3893         struct i915_vma *vma;
3894         int ret;
3895
3896         lockdep_assert_held(&obj->base.dev->struct_mutex);
3897
3898         if (flags & PIN_MAPPABLE &&
3899             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
3900                 /* If the required space is larger than the available
3901                  * aperture, we will not able to find a slot for the
3902                  * object and unbinding the object now will be in
3903                  * vain. Worse, doing so may cause us to ping-pong
3904                  * the object in and out of the Global GTT and
3905                  * waste a lot of cycles under the mutex.
3906                  */
3907                 if (obj->base.size > dev_priv->ggtt.mappable_end)
3908                         return ERR_PTR(-E2BIG);
3909
3910                 /* If NONBLOCK is set the caller is optimistically
3911                  * trying to cache the full object within the mappable
3912                  * aperture, and *must* have a fallback in place for
3913                  * situations where we cannot bind the object. We
3914                  * can be a little more lax here and use the fallback
3915                  * more often to avoid costly migrations of ourselves
3916                  * and other objects within the aperture.
3917                  *
3918                  * Half-the-aperture is used as a simple heuristic.
3919                  * More interesting would to do search for a free
3920                  * block prior to making the commitment to unbind.
3921                  * That caters for the self-harm case, and with a
3922                  * little more heuristics (e.g. NOFAULT, NOEVICT)
3923                  * we could try to minimise harm to others.
3924                  */
3925                 if (flags & PIN_NONBLOCK &&
3926                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
3927                         return ERR_PTR(-ENOSPC);
3928         }
3929
3930         vma = i915_vma_instance(obj, vm, view);
3931         if (unlikely(IS_ERR(vma)))
3932                 return vma;
3933
3934         if (i915_vma_misplaced(vma, size, alignment, flags)) {
3935                 if (flags & PIN_NONBLOCK) {
3936                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
3937                                 return ERR_PTR(-ENOSPC);
3938
3939                         if (flags & PIN_MAPPABLE &&
3940                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
3941                                 return ERR_PTR(-ENOSPC);
3942                 }
3943
3944                 WARN(i915_vma_is_pinned(vma),
3945                      "bo is already pinned in ggtt with incorrect alignment:"
3946                      " offset=%08x, req.alignment=%llx,"
3947                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
3948                      i915_ggtt_offset(vma), alignment,
3949                      !!(flags & PIN_MAPPABLE),
3950                      i915_vma_is_map_and_fenceable(vma));
3951                 ret = i915_vma_unbind(vma);
3952                 if (ret)
3953                         return ERR_PTR(ret);
3954         }
3955
3956         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
3957         if (ret)
3958                 return ERR_PTR(ret);
3959
3960         return vma;
3961 }
3962
3963 static __always_inline unsigned int __busy_read_flag(unsigned int id)
3964 {
3965         /* Note that we could alias engines in the execbuf API, but
3966          * that would be very unwise as it prevents userspace from
3967          * fine control over engine selection. Ahem.
3968          *
3969          * This should be something like EXEC_MAX_ENGINE instead of
3970          * I915_NUM_ENGINES.
3971          */
3972         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
3973         return 0x10000 << id;
3974 }
3975
3976 static __always_inline unsigned int __busy_write_id(unsigned int id)
3977 {
3978         /* The uABI guarantees an active writer is also amongst the read
3979          * engines. This would be true if we accessed the activity tracking
3980          * under the lock, but as we perform the lookup of the object and
3981          * its activity locklessly we can not guarantee that the last_write
3982          * being active implies that we have set the same engine flag from
3983          * last_read - hence we always set both read and write busy for
3984          * last_write.
3985          */
3986         return id | __busy_read_flag(id);
3987 }
3988
3989 static __always_inline unsigned int
3990 __busy_set_if_active(const struct dma_fence *fence,
3991                      unsigned int (*flag)(unsigned int id))
3992 {
3993         struct i915_request *rq;
3994
3995         /* We have to check the current hw status of the fence as the uABI
3996          * guarantees forward progress. We could rely on the idle worker
3997          * to eventually flush us, but to minimise latency just ask the
3998          * hardware.
3999          *
4000          * Note we only report on the status of native fences.
4001          */
4002         if (!dma_fence_is_i915(fence))
4003                 return 0;
4004
4005         /* opencode to_request() in order to avoid const warnings */
4006         rq = container_of(fence, struct i915_request, fence);
4007         if (i915_request_completed(rq))
4008                 return 0;
4009
4010         return flag(rq->engine->uabi_id);
4011 }
4012
4013 static __always_inline unsigned int
4014 busy_check_reader(const struct dma_fence *fence)
4015 {
4016         return __busy_set_if_active(fence, __busy_read_flag);
4017 }
4018
4019 static __always_inline unsigned int
4020 busy_check_writer(const struct dma_fence *fence)
4021 {
4022         if (!fence)
4023                 return 0;
4024
4025         return __busy_set_if_active(fence, __busy_write_id);
4026 }
4027
4028 int
4029 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4030                     struct drm_file *file)
4031 {
4032         struct drm_i915_gem_busy *args = data;
4033         struct drm_i915_gem_object *obj;
4034         struct reservation_object_list *list;
4035         unsigned int seq;
4036         int err;
4037
4038         err = -ENOENT;
4039         rcu_read_lock();
4040         obj = i915_gem_object_lookup_rcu(file, args->handle);
4041         if (!obj)
4042                 goto out;
4043
4044         /* A discrepancy here is that we do not report the status of
4045          * non-i915 fences, i.e. even though we may report the object as idle,
4046          * a call to set-domain may still stall waiting for foreign rendering.
4047          * This also means that wait-ioctl may report an object as busy,
4048          * where busy-ioctl considers it idle.
4049          *
4050          * We trade the ability to warn of foreign fences to report on which
4051          * i915 engines are active for the object.
4052          *
4053          * Alternatively, we can trade that extra information on read/write
4054          * activity with
4055          *      args->busy =
4056          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4057          * to report the overall busyness. This is what the wait-ioctl does.
4058          *
4059          */
4060 retry:
4061         seq = raw_read_seqcount(&obj->resv->seq);
4062
4063         /* Translate the exclusive fence to the READ *and* WRITE engine */
4064         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4065
4066         /* Translate shared fences to READ set of engines */
4067         list = rcu_dereference(obj->resv->fence);
4068         if (list) {
4069                 unsigned int shared_count = list->shared_count, i;
4070
4071                 for (i = 0; i < shared_count; ++i) {
4072                         struct dma_fence *fence =
4073                                 rcu_dereference(list->shared[i]);
4074
4075                         args->busy |= busy_check_reader(fence);
4076                 }
4077         }
4078
4079         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4080                 goto retry;
4081
4082         err = 0;
4083 out:
4084         rcu_read_unlock();
4085         return err;
4086 }
4087
4088 int
4089 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4090                         struct drm_file *file_priv)
4091 {
4092         return i915_gem_ring_throttle(dev, file_priv);
4093 }
4094
4095 int
4096 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4097                        struct drm_file *file_priv)
4098 {
4099         struct drm_i915_private *dev_priv = to_i915(dev);
4100         struct drm_i915_gem_madvise *args = data;
4101         struct drm_i915_gem_object *obj;
4102         int err;
4103
4104         switch (args->madv) {
4105         case I915_MADV_DONTNEED:
4106         case I915_MADV_WILLNEED:
4107             break;
4108         default:
4109             return -EINVAL;
4110         }
4111
4112         obj = i915_gem_object_lookup(file_priv, args->handle);
4113         if (!obj)
4114                 return -ENOENT;
4115
4116         err = mutex_lock_interruptible(&obj->mm.lock);
4117         if (err)
4118                 goto out;
4119
4120         if (i915_gem_object_has_pages(obj) &&
4121             i915_gem_object_is_tiled(obj) &&
4122             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4123                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4124                         GEM_BUG_ON(!obj->mm.quirked);
4125                         __i915_gem_object_unpin_pages(obj);
4126                         obj->mm.quirked = false;
4127                 }
4128                 if (args->madv == I915_MADV_WILLNEED) {
4129                         GEM_BUG_ON(obj->mm.quirked);
4130                         __i915_gem_object_pin_pages(obj);
4131                         obj->mm.quirked = true;
4132                 }
4133         }
4134
4135         if (obj->mm.madv != __I915_MADV_PURGED)
4136                 obj->mm.madv = args->madv;
4137
4138         /* if the object is no longer attached, discard its backing storage */
4139         if (obj->mm.madv == I915_MADV_DONTNEED &&
4140             !i915_gem_object_has_pages(obj))
4141                 i915_gem_object_truncate(obj);
4142
4143         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4144         mutex_unlock(&obj->mm.lock);
4145
4146 out:
4147         i915_gem_object_put(obj);
4148         return err;
4149 }
4150
4151 static void
4152 frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4153 {
4154         struct drm_i915_gem_object *obj =
4155                 container_of(active, typeof(*obj), frontbuffer_write);
4156
4157         intel_fb_obj_flush(obj, ORIGIN_CS);
4158 }
4159
4160 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4161                           const struct drm_i915_gem_object_ops *ops)
4162 {
4163         mutex_init(&obj->mm.lock);
4164
4165         INIT_LIST_HEAD(&obj->vma_list);
4166         INIT_LIST_HEAD(&obj->lut_list);
4167         INIT_LIST_HEAD(&obj->batch_pool_link);
4168
4169         init_rcu_head(&obj->rcu);
4170
4171         obj->ops = ops;
4172
4173         reservation_object_init(&obj->__builtin_resv);
4174         obj->resv = &obj->__builtin_resv;
4175
4176         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4177         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4178
4179         obj->mm.madv = I915_MADV_WILLNEED;
4180         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4181         mutex_init(&obj->mm.get_page.lock);
4182
4183         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4184 }
4185
4186 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4187         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4188                  I915_GEM_OBJECT_IS_SHRINKABLE,
4189
4190         .get_pages = i915_gem_object_get_pages_gtt,
4191         .put_pages = i915_gem_object_put_pages_gtt,
4192
4193         .pwrite = i915_gem_object_pwrite_gtt,
4194 };
4195
4196 static int i915_gem_object_create_shmem(struct drm_device *dev,
4197                                         struct drm_gem_object *obj,
4198                                         size_t size)
4199 {
4200         struct drm_i915_private *i915 = to_i915(dev);
4201         unsigned long flags = VM_NORESERVE;
4202         struct file *filp;
4203
4204         drm_gem_private_object_init(dev, obj, size);
4205
4206         if (i915->mm.gemfs)
4207                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4208                                                  flags);
4209         else
4210                 filp = shmem_file_setup("i915", size, flags);
4211
4212         if (IS_ERR(filp))
4213                 return PTR_ERR(filp);
4214
4215         obj->filp = filp;
4216
4217         return 0;
4218 }
4219
4220 struct drm_i915_gem_object *
4221 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4222 {
4223         struct drm_i915_gem_object *obj;
4224         struct address_space *mapping;
4225         unsigned int cache_level;
4226         gfp_t mask;
4227         int ret;
4228
4229         /* There is a prevalence of the assumption that we fit the object's
4230          * page count inside a 32bit _signed_ variable. Let's document this and
4231          * catch if we ever need to fix it. In the meantime, if you do spot
4232          * such a local variable, please consider fixing!
4233          */
4234         if (size >> PAGE_SHIFT > INT_MAX)
4235                 return ERR_PTR(-E2BIG);
4236
4237         if (overflows_type(size, obj->base.size))
4238                 return ERR_PTR(-E2BIG);
4239
4240         obj = i915_gem_object_alloc(dev_priv);
4241         if (obj == NULL)
4242                 return ERR_PTR(-ENOMEM);
4243
4244         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4245         if (ret)
4246                 goto fail;
4247
4248         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4249         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4250                 /* 965gm cannot relocate objects above 4GiB. */
4251                 mask &= ~__GFP_HIGHMEM;
4252                 mask |= __GFP_DMA32;
4253         }
4254
4255         mapping = obj->base.filp->f_mapping;
4256         mapping_set_gfp_mask(mapping, mask);
4257         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4258
4259         i915_gem_object_init(obj, &i915_gem_object_ops);
4260
4261         obj->write_domain = I915_GEM_DOMAIN_CPU;
4262         obj->read_domains = I915_GEM_DOMAIN_CPU;
4263
4264         if (HAS_LLC(dev_priv))
4265                 /* On some devices, we can have the GPU use the LLC (the CPU
4266                  * cache) for about a 10% performance improvement
4267                  * compared to uncached.  Graphics requests other than
4268                  * display scanout are coherent with the CPU in
4269                  * accessing this cache.  This means in this mode we
4270                  * don't need to clflush on the CPU side, and on the
4271                  * GPU side we only need to flush internal caches to
4272                  * get data visible to the CPU.
4273                  *
4274                  * However, we maintain the display planes as UC, and so
4275                  * need to rebind when first used as such.
4276                  */
4277                 cache_level = I915_CACHE_LLC;
4278         else
4279                 cache_level = I915_CACHE_NONE;
4280
4281         i915_gem_object_set_cache_coherency(obj, cache_level);
4282
4283         trace_i915_gem_object_create(obj);
4284
4285         return obj;
4286
4287 fail:
4288         i915_gem_object_free(obj);
4289         return ERR_PTR(ret);
4290 }
4291
4292 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4293 {
4294         /* If we are the last user of the backing storage (be it shmemfs
4295          * pages or stolen etc), we know that the pages are going to be
4296          * immediately released. In this case, we can then skip copying
4297          * back the contents from the GPU.
4298          */
4299
4300         if (obj->mm.madv != I915_MADV_WILLNEED)
4301                 return false;
4302
4303         if (obj->base.filp == NULL)
4304                 return true;
4305
4306         /* At first glance, this looks racy, but then again so would be
4307          * userspace racing mmap against close. However, the first external
4308          * reference to the filp can only be obtained through the
4309          * i915_gem_mmap_ioctl() which safeguards us against the user
4310          * acquiring such a reference whilst we are in the middle of
4311          * freeing the object.
4312          */
4313         return atomic_long_read(&obj->base.filp->f_count) == 1;
4314 }
4315
4316 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4317                                     struct llist_node *freed)
4318 {
4319         struct drm_i915_gem_object *obj, *on;
4320         intel_wakeref_t wakeref;
4321
4322         wakeref = intel_runtime_pm_get(i915);
4323         llist_for_each_entry_safe(obj, on, freed, freed) {
4324                 struct i915_vma *vma, *vn;
4325
4326                 trace_i915_gem_object_destroy(obj);
4327
4328                 mutex_lock(&i915->drm.struct_mutex);
4329
4330                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4331                 list_for_each_entry_safe(vma, vn,
4332                                          &obj->vma_list, obj_link) {
4333                         GEM_BUG_ON(i915_vma_is_active(vma));
4334                         vma->flags &= ~I915_VMA_PIN_MASK;
4335                         i915_vma_destroy(vma);
4336                 }
4337                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4338                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4339
4340                 /* This serializes freeing with the shrinker. Since the free
4341                  * is delayed, first by RCU then by the workqueue, we want the
4342                  * shrinker to be able to free pages of unreferenced objects,
4343                  * or else we may oom whilst there are plenty of deferred
4344                  * freed objects.
4345                  */
4346                 if (i915_gem_object_has_pages(obj)) {
4347                         spin_lock(&i915->mm.obj_lock);
4348                         list_del_init(&obj->mm.link);
4349                         spin_unlock(&i915->mm.obj_lock);
4350                 }
4351
4352                 mutex_unlock(&i915->drm.struct_mutex);
4353
4354                 GEM_BUG_ON(obj->bind_count);
4355                 GEM_BUG_ON(obj->userfault_count);
4356                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4357                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4358
4359                 if (obj->ops->release)
4360                         obj->ops->release(obj);
4361
4362                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4363                         atomic_set(&obj->mm.pages_pin_count, 0);
4364                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4365                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4366
4367                 if (obj->base.import_attach)
4368                         drm_prime_gem_destroy(&obj->base, NULL);
4369
4370                 reservation_object_fini(&obj->__builtin_resv);
4371                 drm_gem_object_release(&obj->base);
4372                 i915_gem_info_remove_obj(i915, obj->base.size);
4373
4374                 kfree(obj->bit_17);
4375                 i915_gem_object_free(obj);
4376
4377                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4378                 atomic_dec(&i915->mm.free_count);
4379
4380                 if (on)
4381                         cond_resched();
4382         }
4383         intel_runtime_pm_put(i915, wakeref);
4384 }
4385
4386 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4387 {
4388         struct llist_node *freed;
4389
4390         /* Free the oldest, most stale object to keep the free_list short */
4391         freed = NULL;
4392         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4393                 /* Only one consumer of llist_del_first() allowed */
4394                 spin_lock(&i915->mm.free_lock);
4395                 freed = llist_del_first(&i915->mm.free_list);
4396                 spin_unlock(&i915->mm.free_lock);
4397         }
4398         if (unlikely(freed)) {
4399                 freed->next = NULL;
4400                 __i915_gem_free_objects(i915, freed);
4401         }
4402 }
4403
4404 static void __i915_gem_free_work(struct work_struct *work)
4405 {
4406         struct drm_i915_private *i915 =
4407                 container_of(work, struct drm_i915_private, mm.free_work);
4408         struct llist_node *freed;
4409
4410         /*
4411          * All file-owned VMA should have been released by this point through
4412          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4413          * However, the object may also be bound into the global GTT (e.g.
4414          * older GPUs without per-process support, or for direct access through
4415          * the GTT either for the user or for scanout). Those VMA still need to
4416          * unbound now.
4417          */
4418
4419         spin_lock(&i915->mm.free_lock);
4420         while ((freed = llist_del_all(&i915->mm.free_list))) {
4421                 spin_unlock(&i915->mm.free_lock);
4422
4423                 __i915_gem_free_objects(i915, freed);
4424                 if (need_resched())
4425                         return;
4426
4427                 spin_lock(&i915->mm.free_lock);
4428         }
4429         spin_unlock(&i915->mm.free_lock);
4430 }
4431
4432 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4433 {
4434         struct drm_i915_gem_object *obj =
4435                 container_of(head, typeof(*obj), rcu);
4436         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4437
4438         /*
4439          * We reuse obj->rcu for the freed list, so we had better not treat
4440          * it like a rcu_head from this point forwards. And we expect all
4441          * objects to be freed via this path.
4442          */
4443         destroy_rcu_head(&obj->rcu);
4444
4445         /*
4446          * Since we require blocking on struct_mutex to unbind the freed
4447          * object from the GPU before releasing resources back to the
4448          * system, we can not do that directly from the RCU callback (which may
4449          * be a softirq context), but must instead then defer that work onto a
4450          * kthread. We use the RCU callback rather than move the freed object
4451          * directly onto the work queue so that we can mix between using the
4452          * worker and performing frees directly from subsequent allocations for
4453          * crude but effective memory throttling.
4454          */
4455         if (llist_add(&obj->freed, &i915->mm.free_list))
4456                 queue_work(i915->wq, &i915->mm.free_work);
4457 }
4458
4459 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4460 {
4461         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4462
4463         if (obj->mm.quirked)
4464                 __i915_gem_object_unpin_pages(obj);
4465
4466         if (discard_backing_storage(obj))
4467                 obj->mm.madv = I915_MADV_DONTNEED;
4468
4469         /*
4470          * Before we free the object, make sure any pure RCU-only
4471          * read-side critical sections are complete, e.g.
4472          * i915_gem_busy_ioctl(). For the corresponding synchronized
4473          * lookup see i915_gem_object_lookup_rcu().
4474          */
4475         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4476         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4477 }
4478
4479 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4480 {
4481         lockdep_assert_held(&obj->base.dev->struct_mutex);
4482
4483         if (!i915_gem_object_has_active_reference(obj) &&
4484             i915_gem_object_is_active(obj))
4485                 i915_gem_object_set_active_reference(obj);
4486         else
4487                 i915_gem_object_put(obj);
4488 }
4489
4490 void i915_gem_sanitize(struct drm_i915_private *i915)
4491 {
4492         intel_wakeref_t wakeref;
4493
4494         GEM_TRACE("\n");
4495
4496         mutex_lock(&i915->drm.struct_mutex);
4497
4498         wakeref = intel_runtime_pm_get(i915);
4499         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
4500
4501         /*
4502          * As we have just resumed the machine and woken the device up from
4503          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
4504          * back to defaults, recovering from whatever wedged state we left it
4505          * in and so worth trying to use the device once more.
4506          */
4507         if (i915_terminally_wedged(&i915->gpu_error))
4508                 i915_gem_unset_wedged(i915);
4509
4510         /*
4511          * If we inherit context state from the BIOS or earlier occupants
4512          * of the GPU, the GPU may be in an inconsistent state when we
4513          * try to take over. The only way to remove the earlier state
4514          * is by resetting. However, resetting on earlier gen is tricky as
4515          * it may impact the display and we are uncertain about the stability
4516          * of the reset, so this could be applied to even earlier gen.
4517          */
4518         intel_engines_sanitize(i915, false);
4519
4520         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
4521         intel_runtime_pm_put(i915, wakeref);
4522
4523         i915_gem_contexts_lost(i915);
4524         mutex_unlock(&i915->drm.struct_mutex);
4525 }
4526
4527 int i915_gem_suspend(struct drm_i915_private *i915)
4528 {
4529         intel_wakeref_t wakeref;
4530         int ret;
4531
4532         GEM_TRACE("\n");
4533
4534         wakeref = intel_runtime_pm_get(i915);
4535         intel_suspend_gt_powersave(i915);
4536
4537         mutex_lock(&i915->drm.struct_mutex);
4538
4539         /*
4540          * We have to flush all the executing contexts to main memory so
4541          * that they can saved in the hibernation image. To ensure the last
4542          * context image is coherent, we have to switch away from it. That
4543          * leaves the i915->kernel_context still active when
4544          * we actually suspend, and its image in memory may not match the GPU
4545          * state. Fortunately, the kernel_context is disposable and we do
4546          * not rely on its state.
4547          */
4548         if (!i915_terminally_wedged(&i915->gpu_error)) {
4549                 ret = i915_gem_switch_to_kernel_context(i915);
4550                 if (ret)
4551                         goto err_unlock;
4552
4553                 ret = i915_gem_wait_for_idle(i915,
4554                                              I915_WAIT_INTERRUPTIBLE |
4555                                              I915_WAIT_LOCKED |
4556                                              I915_WAIT_FOR_IDLE_BOOST,
4557                                              MAX_SCHEDULE_TIMEOUT);
4558                 if (ret && ret != -EIO)
4559                         goto err_unlock;
4560
4561                 assert_kernel_context_is_current(i915);
4562         }
4563         i915_retire_requests(i915); /* ensure we flush after wedging */
4564
4565         mutex_unlock(&i915->drm.struct_mutex);
4566
4567         intel_uc_suspend(i915);
4568
4569         cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
4570         cancel_delayed_work_sync(&i915->gt.retire_work);
4571
4572         /*
4573          * As the idle_work is rearming if it detects a race, play safe and
4574          * repeat the flush until it is definitely idle.
4575          */
4576         drain_delayed_work(&i915->gt.idle_work);
4577
4578         /*
4579          * Assert that we successfully flushed all the work and
4580          * reset the GPU back to its idle, low power state.
4581          */
4582         WARN_ON(i915->gt.awake);
4583         if (WARN_ON(!intel_engines_are_idle(i915)))
4584                 i915_gem_set_wedged(i915); /* no hope, discard everything */
4585
4586         intel_runtime_pm_put(i915, wakeref);
4587         return 0;
4588
4589 err_unlock:
4590         mutex_unlock(&i915->drm.struct_mutex);
4591         intel_runtime_pm_put(i915, wakeref);
4592         return ret;
4593 }
4594
4595 void i915_gem_suspend_late(struct drm_i915_private *i915)
4596 {
4597         struct drm_i915_gem_object *obj;
4598         struct list_head *phases[] = {
4599                 &i915->mm.unbound_list,
4600                 &i915->mm.bound_list,
4601                 NULL
4602         }, **phase;
4603
4604         /*
4605          * Neither the BIOS, ourselves or any other kernel
4606          * expects the system to be in execlists mode on startup,
4607          * so we need to reset the GPU back to legacy mode. And the only
4608          * known way to disable logical contexts is through a GPU reset.
4609          *
4610          * So in order to leave the system in a known default configuration,
4611          * always reset the GPU upon unload and suspend. Afterwards we then
4612          * clean up the GEM state tracking, flushing off the requests and
4613          * leaving the system in a known idle state.
4614          *
4615          * Note that is of the upmost importance that the GPU is idle and
4616          * all stray writes are flushed *before* we dismantle the backing
4617          * storage for the pinned objects.
4618          *
4619          * However, since we are uncertain that resetting the GPU on older
4620          * machines is a good idea, we don't - just in case it leaves the
4621          * machine in an unusable condition.
4622          */
4623
4624         mutex_lock(&i915->drm.struct_mutex);
4625         for (phase = phases; *phase; phase++) {
4626                 list_for_each_entry(obj, *phase, mm.link)
4627                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
4628         }
4629         mutex_unlock(&i915->drm.struct_mutex);
4630
4631         intel_uc_sanitize(i915);
4632         i915_gem_sanitize(i915);
4633 }
4634
4635 void i915_gem_resume(struct drm_i915_private *i915)
4636 {
4637         GEM_TRACE("\n");
4638
4639         WARN_ON(i915->gt.awake);
4640
4641         mutex_lock(&i915->drm.struct_mutex);
4642         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
4643
4644         i915_gem_restore_gtt_mappings(i915);
4645         i915_gem_restore_fences(i915);
4646
4647         /*
4648          * As we didn't flush the kernel context before suspend, we cannot
4649          * guarantee that the context image is complete. So let's just reset
4650          * it and start again.
4651          */
4652         i915->gt.resume(i915);
4653
4654         if (i915_gem_init_hw(i915))
4655                 goto err_wedged;
4656
4657         intel_uc_resume(i915);
4658
4659         /* Always reload a context for powersaving. */
4660         if (i915_gem_switch_to_kernel_context(i915))
4661                 goto err_wedged;
4662
4663 out_unlock:
4664         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
4665         mutex_unlock(&i915->drm.struct_mutex);
4666         return;
4667
4668 err_wedged:
4669         if (!i915_terminally_wedged(&i915->gpu_error)) {
4670                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
4671                 i915_gem_set_wedged(i915);
4672         }
4673         goto out_unlock;
4674 }
4675
4676 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
4677 {
4678         if (INTEL_GEN(dev_priv) < 5 ||
4679             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
4680                 return;
4681
4682         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
4683                                  DISP_TILE_SURFACE_SWIZZLING);
4684
4685         if (IS_GEN(dev_priv, 5))
4686                 return;
4687
4688         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
4689         if (IS_GEN(dev_priv, 6))
4690                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
4691         else if (IS_GEN(dev_priv, 7))
4692                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
4693         else if (IS_GEN(dev_priv, 8))
4694                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
4695         else
4696                 BUG();
4697 }
4698
4699 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
4700 {
4701         I915_WRITE(RING_CTL(base), 0);
4702         I915_WRITE(RING_HEAD(base), 0);
4703         I915_WRITE(RING_TAIL(base), 0);
4704         I915_WRITE(RING_START(base), 0);
4705 }
4706
4707 static void init_unused_rings(struct drm_i915_private *dev_priv)
4708 {
4709         if (IS_I830(dev_priv)) {
4710                 init_unused_ring(dev_priv, PRB1_BASE);
4711                 init_unused_ring(dev_priv, SRB0_BASE);
4712                 init_unused_ring(dev_priv, SRB1_BASE);
4713                 init_unused_ring(dev_priv, SRB2_BASE);
4714                 init_unused_ring(dev_priv, SRB3_BASE);
4715         } else if (IS_GEN(dev_priv, 2)) {
4716                 init_unused_ring(dev_priv, SRB0_BASE);
4717                 init_unused_ring(dev_priv, SRB1_BASE);
4718         } else if (IS_GEN(dev_priv, 3)) {
4719                 init_unused_ring(dev_priv, PRB1_BASE);
4720                 init_unused_ring(dev_priv, PRB2_BASE);
4721         }
4722 }
4723
4724 static int __i915_gem_restart_engines(void *data)
4725 {
4726         struct drm_i915_private *i915 = data;
4727         struct intel_engine_cs *engine;
4728         enum intel_engine_id id;
4729         int err;
4730
4731         for_each_engine(engine, i915, id) {
4732                 err = engine->init_hw(engine);
4733                 if (err) {
4734                         DRM_ERROR("Failed to restart %s (%d)\n",
4735                                   engine->name, err);
4736                         return err;
4737                 }
4738         }
4739
4740         return 0;
4741 }
4742
4743 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
4744 {
4745         int ret;
4746
4747         dev_priv->gt.last_init_time = ktime_get();
4748
4749         /* Double layer security blanket, see i915_gem_init() */
4750         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
4751
4752         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
4753                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
4754
4755         if (IS_HASWELL(dev_priv))
4756                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
4757                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
4758
4759         /* Apply the GT workarounds... */
4760         intel_gt_apply_workarounds(dev_priv);
4761         /* ...and determine whether they are sticking. */
4762         intel_gt_verify_workarounds(dev_priv, "init");
4763
4764         i915_gem_init_swizzling(dev_priv);
4765
4766         /*
4767          * At least 830 can leave some of the unused rings
4768          * "active" (ie. head != tail) after resume which
4769          * will prevent c3 entry. Makes sure all unused rings
4770          * are totally idle.
4771          */
4772         init_unused_rings(dev_priv);
4773
4774         BUG_ON(!dev_priv->kernel_context);
4775         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
4776                 ret = -EIO;
4777                 goto out;
4778         }
4779
4780         ret = i915_ppgtt_init_hw(dev_priv);
4781         if (ret) {
4782                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
4783                 goto out;
4784         }
4785
4786         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
4787         if (ret) {
4788                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
4789                 goto out;
4790         }
4791
4792         /* We can't enable contexts until all firmware is loaded */
4793         ret = intel_uc_init_hw(dev_priv);
4794         if (ret) {
4795                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
4796                 goto out;
4797         }
4798
4799         intel_mocs_init_l3cc_table(dev_priv);
4800
4801         /* Only when the HW is re-initialised, can we replay the requests */
4802         ret = __i915_gem_restart_engines(dev_priv);
4803         if (ret)
4804                 goto cleanup_uc;
4805
4806         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
4807
4808         return 0;
4809
4810 cleanup_uc:
4811         intel_uc_fini_hw(dev_priv);
4812 out:
4813         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
4814
4815         return ret;
4816 }
4817
4818 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
4819 {
4820         struct i915_gem_context *ctx;
4821         struct intel_engine_cs *engine;
4822         enum intel_engine_id id;
4823         int err;
4824
4825         /*
4826          * As we reset the gpu during very early sanitisation, the current
4827          * register state on the GPU should reflect its defaults values.
4828          * We load a context onto the hw (with restore-inhibit), then switch
4829          * over to a second context to save that default register state. We
4830          * can then prime every new context with that state so they all start
4831          * from the same default HW values.
4832          */
4833
4834         ctx = i915_gem_context_create_kernel(i915, 0);
4835         if (IS_ERR(ctx))
4836                 return PTR_ERR(ctx);
4837
4838         for_each_engine(engine, i915, id) {
4839                 struct i915_request *rq;
4840
4841                 rq = i915_request_alloc(engine, ctx);
4842                 if (IS_ERR(rq)) {
4843                         err = PTR_ERR(rq);
4844                         goto out_ctx;
4845                 }
4846
4847                 err = 0;
4848                 if (engine->init_context)
4849                         err = engine->init_context(rq);
4850
4851                 i915_request_add(rq);
4852                 if (err)
4853                         goto err_active;
4854         }
4855
4856         err = i915_gem_switch_to_kernel_context(i915);
4857         if (err)
4858                 goto err_active;
4859
4860         if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
4861                 i915_gem_set_wedged(i915);
4862                 err = -EIO; /* Caller will declare us wedged */
4863                 goto err_active;
4864         }
4865
4866         assert_kernel_context_is_current(i915);
4867
4868         /*
4869          * Immediately park the GPU so that we enable powersaving and
4870          * treat it as idle. The next time we issue a request, we will
4871          * unpark and start using the engine->pinned_default_state, otherwise
4872          * it is in limbo and an early reset may fail.
4873          */
4874         __i915_gem_park(i915);
4875
4876         for_each_engine(engine, i915, id) {
4877                 struct i915_vma *state;
4878                 void *vaddr;
4879
4880                 GEM_BUG_ON(to_intel_context(ctx, engine)->pin_count);
4881
4882                 state = to_intel_context(ctx, engine)->state;
4883                 if (!state)
4884                         continue;
4885
4886                 /*
4887                  * As we will hold a reference to the logical state, it will
4888                  * not be torn down with the context, and importantly the
4889                  * object will hold onto its vma (making it possible for a
4890                  * stray GTT write to corrupt our defaults). Unmap the vma
4891                  * from the GTT to prevent such accidents and reclaim the
4892                  * space.
4893                  */
4894                 err = i915_vma_unbind(state);
4895                 if (err)
4896                         goto err_active;
4897
4898                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
4899                 if (err)
4900                         goto err_active;
4901
4902                 engine->default_state = i915_gem_object_get(state->obj);
4903
4904                 /* Check we can acquire the image of the context state */
4905                 vaddr = i915_gem_object_pin_map(engine->default_state,
4906                                                 I915_MAP_FORCE_WB);
4907                 if (IS_ERR(vaddr)) {
4908                         err = PTR_ERR(vaddr);
4909                         goto err_active;
4910                 }
4911
4912                 i915_gem_object_unpin_map(engine->default_state);
4913         }
4914
4915         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
4916                 unsigned int found = intel_engines_has_context_isolation(i915);
4917
4918                 /*
4919                  * Make sure that classes with multiple engine instances all
4920                  * share the same basic configuration.
4921                  */
4922                 for_each_engine(engine, i915, id) {
4923                         unsigned int bit = BIT(engine->uabi_class);
4924                         unsigned int expected = engine->default_state ? bit : 0;
4925
4926                         if ((found & bit) != expected) {
4927                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
4928                                           engine->uabi_class, engine->name);
4929                         }
4930                 }
4931         }
4932
4933 out_ctx:
4934         i915_gem_context_set_closed(ctx);
4935         i915_gem_context_put(ctx);
4936         return err;
4937
4938 err_active:
4939         /*
4940          * If we have to abandon now, we expect the engines to be idle
4941          * and ready to be torn-down. First try to flush any remaining
4942          * request, ensure we are pointing at the kernel context and
4943          * then remove it.
4944          */
4945         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
4946                 goto out_ctx;
4947
4948         if (WARN_ON(i915_gem_wait_for_idle(i915,
4949                                            I915_WAIT_LOCKED,
4950                                            MAX_SCHEDULE_TIMEOUT)))
4951                 goto out_ctx;
4952
4953         i915_gem_contexts_lost(i915);
4954         goto out_ctx;
4955 }
4956
4957 static int
4958 i915_gem_init_scratch(struct drm_i915_private *i915, unsigned int size)
4959 {
4960         struct drm_i915_gem_object *obj;
4961         struct i915_vma *vma;
4962         int ret;
4963
4964         obj = i915_gem_object_create_stolen(i915, size);
4965         if (!obj)
4966                 obj = i915_gem_object_create_internal(i915, size);
4967         if (IS_ERR(obj)) {
4968                 DRM_ERROR("Failed to allocate scratch page\n");
4969                 return PTR_ERR(obj);
4970         }
4971
4972         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
4973         if (IS_ERR(vma)) {
4974                 ret = PTR_ERR(vma);
4975                 goto err_unref;
4976         }
4977
4978         ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
4979         if (ret)
4980                 goto err_unref;
4981
4982         i915->gt.scratch = vma;
4983         return 0;
4984
4985 err_unref:
4986         i915_gem_object_put(obj);
4987         return ret;
4988 }
4989
4990 static void i915_gem_fini_scratch(struct drm_i915_private *i915)
4991 {
4992         i915_vma_unpin_and_release(&i915->gt.scratch, 0);
4993 }
4994
4995 int i915_gem_init(struct drm_i915_private *dev_priv)
4996 {
4997         int ret;
4998
4999         /* We need to fallback to 4K pages if host doesn't support huge gtt. */
5000         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5001                 mkwrite_device_info(dev_priv)->page_sizes =
5002                         I915_GTT_PAGE_SIZE_4K;
5003
5004         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5005
5006         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5007                 dev_priv->gt.resume = intel_lr_context_resume;
5008                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5009         } else {
5010                 dev_priv->gt.resume = intel_legacy_submission_resume;
5011                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5012         }
5013
5014         ret = i915_gem_init_userptr(dev_priv);
5015         if (ret)
5016                 return ret;
5017
5018         ret = intel_uc_init_misc(dev_priv);
5019         if (ret)
5020                 return ret;
5021
5022         ret = intel_wopcm_init(&dev_priv->wopcm);
5023         if (ret)
5024                 goto err_uc_misc;
5025
5026         /* This is just a security blanket to placate dragons.
5027          * On some systems, we very sporadically observe that the first TLBs
5028          * used by the CS may be stale, despite us poking the TLB reset. If
5029          * we hold the forcewake during initialisation these problems
5030          * just magically go away.
5031          */
5032         mutex_lock(&dev_priv->drm.struct_mutex);
5033         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5034
5035         ret = i915_gem_init_ggtt(dev_priv);
5036         if (ret) {
5037                 GEM_BUG_ON(ret == -EIO);
5038                 goto err_unlock;
5039         }
5040
5041         ret = i915_gem_init_scratch(dev_priv,
5042                                     IS_GEN(dev_priv, 2) ? SZ_256K : PAGE_SIZE);
5043         if (ret) {
5044                 GEM_BUG_ON(ret == -EIO);
5045                 goto err_ggtt;
5046         }
5047
5048         ret = i915_gem_contexts_init(dev_priv);
5049         if (ret) {
5050                 GEM_BUG_ON(ret == -EIO);
5051                 goto err_scratch;
5052         }
5053
5054         ret = intel_engines_init(dev_priv);
5055         if (ret) {
5056                 GEM_BUG_ON(ret == -EIO);
5057                 goto err_context;
5058         }
5059
5060         intel_init_gt_powersave(dev_priv);
5061
5062         ret = intel_uc_init(dev_priv);
5063         if (ret)
5064                 goto err_pm;
5065
5066         ret = i915_gem_init_hw(dev_priv);
5067         if (ret)
5068                 goto err_uc_init;
5069
5070         /*
5071          * Despite its name intel_init_clock_gating applies both display
5072          * clock gating workarounds; GT mmio workarounds and the occasional
5073          * GT power context workaround. Worse, sometimes it includes a context
5074          * register workaround which we need to apply before we record the
5075          * default HW state for all contexts.
5076          *
5077          * FIXME: break up the workarounds and apply them at the right time!
5078          */
5079         intel_init_clock_gating(dev_priv);
5080
5081         ret = __intel_engines_record_defaults(dev_priv);
5082         if (ret)
5083                 goto err_init_hw;
5084
5085         if (i915_inject_load_failure()) {
5086                 ret = -ENODEV;
5087                 goto err_init_hw;
5088         }
5089
5090         if (i915_inject_load_failure()) {
5091                 ret = -EIO;
5092                 goto err_init_hw;
5093         }
5094
5095         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5096         mutex_unlock(&dev_priv->drm.struct_mutex);
5097
5098         return 0;
5099
5100         /*
5101          * Unwinding is complicated by that we want to handle -EIO to mean
5102          * disable GPU submission but keep KMS alive. We want to mark the
5103          * HW as irrevisibly wedged, but keep enough state around that the
5104          * driver doesn't explode during runtime.
5105          */
5106 err_init_hw:
5107         mutex_unlock(&dev_priv->drm.struct_mutex);
5108
5109         WARN_ON(i915_gem_suspend(dev_priv));
5110         i915_gem_suspend_late(dev_priv);
5111
5112         i915_gem_drain_workqueue(dev_priv);
5113
5114         mutex_lock(&dev_priv->drm.struct_mutex);
5115         intel_uc_fini_hw(dev_priv);
5116 err_uc_init:
5117         intel_uc_fini(dev_priv);
5118 err_pm:
5119         if (ret != -EIO) {
5120                 intel_cleanup_gt_powersave(dev_priv);
5121                 i915_gem_cleanup_engines(dev_priv);
5122         }
5123 err_context:
5124         if (ret != -EIO)
5125                 i915_gem_contexts_fini(dev_priv);
5126 err_scratch:
5127         i915_gem_fini_scratch(dev_priv);
5128 err_ggtt:
5129 err_unlock:
5130         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5131         mutex_unlock(&dev_priv->drm.struct_mutex);
5132
5133 err_uc_misc:
5134         intel_uc_fini_misc(dev_priv);
5135
5136         if (ret != -EIO)
5137                 i915_gem_cleanup_userptr(dev_priv);
5138
5139         if (ret == -EIO) {
5140                 mutex_lock(&dev_priv->drm.struct_mutex);
5141
5142                 /*
5143                  * Allow engine initialisation to fail by marking the GPU as
5144                  * wedged. But we only want to do this where the GPU is angry,
5145                  * for all other failure, such as an allocation failure, bail.
5146                  */
5147                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5148                         i915_load_error(dev_priv,
5149                                         "Failed to initialize GPU, declaring it wedged!\n");
5150                         i915_gem_set_wedged(dev_priv);
5151                 }
5152
5153                 /* Minimal basic recovery for KMS */
5154                 ret = i915_ggtt_enable_hw(dev_priv);
5155                 i915_gem_restore_gtt_mappings(dev_priv);
5156                 i915_gem_restore_fences(dev_priv);
5157                 intel_init_clock_gating(dev_priv);
5158
5159                 mutex_unlock(&dev_priv->drm.struct_mutex);
5160         }
5161
5162         i915_gem_drain_freed_objects(dev_priv);
5163         return ret;
5164 }
5165
5166 void i915_gem_fini(struct drm_i915_private *dev_priv)
5167 {
5168         i915_gem_suspend_late(dev_priv);
5169         intel_disable_gt_powersave(dev_priv);
5170
5171         /* Flush any outstanding unpin_work. */
5172         i915_gem_drain_workqueue(dev_priv);
5173
5174         mutex_lock(&dev_priv->drm.struct_mutex);
5175         intel_uc_fini_hw(dev_priv);
5176         intel_uc_fini(dev_priv);
5177         i915_gem_cleanup_engines(dev_priv);
5178         i915_gem_contexts_fini(dev_priv);
5179         i915_gem_fini_scratch(dev_priv);
5180         mutex_unlock(&dev_priv->drm.struct_mutex);
5181
5182         intel_wa_list_free(&dev_priv->gt_wa_list);
5183
5184         intel_cleanup_gt_powersave(dev_priv);
5185
5186         intel_uc_fini_misc(dev_priv);
5187         i915_gem_cleanup_userptr(dev_priv);
5188
5189         i915_gem_drain_freed_objects(dev_priv);
5190
5191         WARN_ON(!list_empty(&dev_priv->contexts.list));
5192 }
5193
5194 void i915_gem_init_mmio(struct drm_i915_private *i915)
5195 {
5196         i915_gem_sanitize(i915);
5197 }
5198
5199 void
5200 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5201 {
5202         struct intel_engine_cs *engine;
5203         enum intel_engine_id id;
5204
5205         for_each_engine(engine, dev_priv, id)
5206                 dev_priv->gt.cleanup_engine(engine);
5207 }
5208
5209 void
5210 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5211 {
5212         int i;
5213
5214         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5215             !IS_CHERRYVIEW(dev_priv))
5216                 dev_priv->num_fence_regs = 32;
5217         else if (INTEL_GEN(dev_priv) >= 4 ||
5218                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5219                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5220                 dev_priv->num_fence_regs = 16;
5221         else
5222                 dev_priv->num_fence_regs = 8;
5223
5224         if (intel_vgpu_active(dev_priv))
5225                 dev_priv->num_fence_regs =
5226                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5227
5228         /* Initialize fence registers to zero */
5229         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5230                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5231
5232                 fence->i915 = dev_priv;
5233                 fence->id = i;
5234                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5235         }
5236         i915_gem_restore_fences(dev_priv);
5237
5238         i915_gem_detect_bit_6_swizzle(dev_priv);
5239 }
5240
5241 static void i915_gem_init__mm(struct drm_i915_private *i915)
5242 {
5243         spin_lock_init(&i915->mm.object_stat_lock);
5244         spin_lock_init(&i915->mm.obj_lock);
5245         spin_lock_init(&i915->mm.free_lock);
5246
5247         init_llist_head(&i915->mm.free_list);
5248
5249         INIT_LIST_HEAD(&i915->mm.unbound_list);
5250         INIT_LIST_HEAD(&i915->mm.bound_list);
5251         INIT_LIST_HEAD(&i915->mm.fence_list);
5252         INIT_LIST_HEAD(&i915->mm.userfault_list);
5253
5254         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5255 }
5256
5257 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5258 {
5259         int err = -ENOMEM;
5260
5261         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5262         if (!dev_priv->objects)
5263                 goto err_out;
5264
5265         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5266         if (!dev_priv->vmas)
5267                 goto err_objects;
5268
5269         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5270         if (!dev_priv->luts)
5271                 goto err_vmas;
5272
5273         dev_priv->requests = KMEM_CACHE(i915_request,
5274                                         SLAB_HWCACHE_ALIGN |
5275                                         SLAB_RECLAIM_ACCOUNT |
5276                                         SLAB_TYPESAFE_BY_RCU);
5277         if (!dev_priv->requests)
5278                 goto err_luts;
5279
5280         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5281                                             SLAB_HWCACHE_ALIGN |
5282                                             SLAB_RECLAIM_ACCOUNT);
5283         if (!dev_priv->dependencies)
5284                 goto err_requests;
5285
5286         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5287         if (!dev_priv->priorities)
5288                 goto err_dependencies;
5289
5290         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5291         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5292         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5293
5294         i915_gem_init__mm(dev_priv);
5295
5296         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5297                           i915_gem_retire_work_handler);
5298         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5299                           i915_gem_idle_work_handler);
5300         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5301         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5302         mutex_init(&dev_priv->gpu_error.wedge_mutex);
5303
5304         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5305
5306         spin_lock_init(&dev_priv->fb_tracking.lock);
5307
5308         err = i915_gemfs_init(dev_priv);
5309         if (err)
5310                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5311
5312         return 0;
5313
5314 err_dependencies:
5315         kmem_cache_destroy(dev_priv->dependencies);
5316 err_requests:
5317         kmem_cache_destroy(dev_priv->requests);
5318 err_luts:
5319         kmem_cache_destroy(dev_priv->luts);
5320 err_vmas:
5321         kmem_cache_destroy(dev_priv->vmas);
5322 err_objects:
5323         kmem_cache_destroy(dev_priv->objects);
5324 err_out:
5325         return err;
5326 }
5327
5328 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5329 {
5330         i915_gem_drain_freed_objects(dev_priv);
5331         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5332         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5333         WARN_ON(dev_priv->mm.object_count);
5334         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5335
5336         kmem_cache_destroy(dev_priv->priorities);
5337         kmem_cache_destroy(dev_priv->dependencies);
5338         kmem_cache_destroy(dev_priv->requests);
5339         kmem_cache_destroy(dev_priv->luts);
5340         kmem_cache_destroy(dev_priv->vmas);
5341         kmem_cache_destroy(dev_priv->objects);
5342
5343         /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
5344         rcu_barrier();
5345
5346         i915_gemfs_fini(dev_priv);
5347 }
5348
5349 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5350 {
5351         /* Discard all purgeable objects, let userspace recover those as
5352          * required after resuming.
5353          */
5354         i915_gem_shrink_all(dev_priv);
5355
5356         return 0;
5357 }
5358
5359 int i915_gem_freeze_late(struct drm_i915_private *i915)
5360 {
5361         struct drm_i915_gem_object *obj;
5362         struct list_head *phases[] = {
5363                 &i915->mm.unbound_list,
5364                 &i915->mm.bound_list,
5365                 NULL
5366         }, **phase;
5367
5368         /*
5369          * Called just before we write the hibernation image.
5370          *
5371          * We need to update the domain tracking to reflect that the CPU
5372          * will be accessing all the pages to create and restore from the
5373          * hibernation, and so upon restoration those pages will be in the
5374          * CPU domain.
5375          *
5376          * To make sure the hibernation image contains the latest state,
5377          * we update that state just before writing out the image.
5378          *
5379          * To try and reduce the hibernation image, we manually shrink
5380          * the objects as well, see i915_gem_freeze()
5381          */
5382
5383         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5384         i915_gem_drain_freed_objects(i915);
5385
5386         mutex_lock(&i915->drm.struct_mutex);
5387         for (phase = phases; *phase; phase++) {
5388                 list_for_each_entry(obj, *phase, mm.link)
5389                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5390         }
5391         mutex_unlock(&i915->drm.struct_mutex);
5392
5393         return 0;
5394 }
5395
5396 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5397 {
5398         struct drm_i915_file_private *file_priv = file->driver_priv;
5399         struct i915_request *request;
5400
5401         /* Clean up our request list when the client is going away, so that
5402          * later retire_requests won't dereference our soon-to-be-gone
5403          * file_priv.
5404          */
5405         spin_lock(&file_priv->mm.lock);
5406         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5407                 request->file_priv = NULL;
5408         spin_unlock(&file_priv->mm.lock);
5409 }
5410
5411 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5412 {
5413         struct drm_i915_file_private *file_priv;
5414         int ret;
5415
5416         DRM_DEBUG("\n");
5417
5418         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5419         if (!file_priv)
5420                 return -ENOMEM;
5421
5422         file->driver_priv = file_priv;
5423         file_priv->dev_priv = i915;
5424         file_priv->file = file;
5425
5426         spin_lock_init(&file_priv->mm.lock);
5427         INIT_LIST_HEAD(&file_priv->mm.request_list);
5428
5429         file_priv->bsd_engine = -1;
5430         file_priv->hang_timestamp = jiffies;
5431
5432         ret = i915_gem_context_open(i915, file);
5433         if (ret)
5434                 kfree(file_priv);
5435
5436         return ret;
5437 }
5438
5439 /**
5440  * i915_gem_track_fb - update frontbuffer tracking
5441  * @old: current GEM buffer for the frontbuffer slots
5442  * @new: new GEM buffer for the frontbuffer slots
5443  * @frontbuffer_bits: bitmask of frontbuffer slots
5444  *
5445  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5446  * from @old and setting them in @new. Both @old and @new can be NULL.
5447  */
5448 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5449                        struct drm_i915_gem_object *new,
5450                        unsigned frontbuffer_bits)
5451 {
5452         /* Control of individual bits within the mask are guarded by
5453          * the owning plane->mutex, i.e. we can never see concurrent
5454          * manipulation of individual bits. But since the bitfield as a whole
5455          * is updated using RMW, we need to use atomics in order to update
5456          * the bits.
5457          */
5458         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5459                      BITS_PER_TYPE(atomic_t));
5460
5461         if (old) {
5462                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5463                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5464         }
5465
5466         if (new) {
5467                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5468                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5469         }
5470 }
5471
5472 /* Allocate a new GEM object and fill it with the supplied data */
5473 struct drm_i915_gem_object *
5474 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5475                                  const void *data, size_t size)
5476 {
5477         struct drm_i915_gem_object *obj;
5478         struct file *file;
5479         size_t offset;
5480         int err;
5481
5482         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5483         if (IS_ERR(obj))
5484                 return obj;
5485
5486         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5487
5488         file = obj->base.filp;
5489         offset = 0;
5490         do {
5491                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5492                 struct page *page;
5493                 void *pgdata, *vaddr;
5494
5495                 err = pagecache_write_begin(file, file->f_mapping,
5496                                             offset, len, 0,
5497                                             &page, &pgdata);
5498                 if (err < 0)
5499                         goto fail;
5500
5501                 vaddr = kmap(page);
5502                 memcpy(vaddr, data, len);
5503                 kunmap(page);
5504
5505                 err = pagecache_write_end(file, file->f_mapping,
5506                                           offset, len, len,
5507                                           page, pgdata);
5508                 if (err < 0)
5509                         goto fail;
5510
5511                 size -= len;
5512                 data += len;
5513                 offset += len;
5514         } while (size);
5515
5516         return obj;
5517
5518 fail:
5519         i915_gem_object_put(obj);
5520         return ERR_PTR(err);
5521 }
5522
5523 struct scatterlist *
5524 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5525                        unsigned int n,
5526                        unsigned int *offset)
5527 {
5528         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5529         struct scatterlist *sg;
5530         unsigned int idx, count;
5531
5532         might_sleep();
5533         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5534         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5535
5536         /* As we iterate forward through the sg, we record each entry in a
5537          * radixtree for quick repeated (backwards) lookups. If we have seen
5538          * this index previously, we will have an entry for it.
5539          *
5540          * Initial lookup is O(N), but this is amortized to O(1) for
5541          * sequential page access (where each new request is consecutive
5542          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5543          * i.e. O(1) with a large constant!
5544          */
5545         if (n < READ_ONCE(iter->sg_idx))
5546                 goto lookup;
5547
5548         mutex_lock(&iter->lock);
5549
5550         /* We prefer to reuse the last sg so that repeated lookup of this
5551          * (or the subsequent) sg are fast - comparing against the last
5552          * sg is faster than going through the radixtree.
5553          */
5554
5555         sg = iter->sg_pos;
5556         idx = iter->sg_idx;
5557         count = __sg_page_count(sg);
5558
5559         while (idx + count <= n) {
5560                 void *entry;
5561                 unsigned long i;
5562                 int ret;
5563
5564                 /* If we cannot allocate and insert this entry, or the
5565                  * individual pages from this range, cancel updating the
5566                  * sg_idx so that on this lookup we are forced to linearly
5567                  * scan onwards, but on future lookups we will try the
5568                  * insertion again (in which case we need to be careful of
5569                  * the error return reporting that we have already inserted
5570                  * this index).
5571                  */
5572                 ret = radix_tree_insert(&iter->radix, idx, sg);
5573                 if (ret && ret != -EEXIST)
5574                         goto scan;
5575
5576                 entry = xa_mk_value(idx);
5577                 for (i = 1; i < count; i++) {
5578                         ret = radix_tree_insert(&iter->radix, idx + i, entry);
5579                         if (ret && ret != -EEXIST)
5580                                 goto scan;
5581                 }
5582
5583                 idx += count;
5584                 sg = ____sg_next(sg);
5585                 count = __sg_page_count(sg);
5586         }
5587
5588 scan:
5589         iter->sg_pos = sg;
5590         iter->sg_idx = idx;
5591
5592         mutex_unlock(&iter->lock);
5593
5594         if (unlikely(n < idx)) /* insertion completed by another thread */
5595                 goto lookup;
5596
5597         /* In case we failed to insert the entry into the radixtree, we need
5598          * to look beyond the current sg.
5599          */
5600         while (idx + count <= n) {
5601                 idx += count;
5602                 sg = ____sg_next(sg);
5603                 count = __sg_page_count(sg);
5604         }
5605
5606         *offset = n - idx;
5607         return sg;
5608
5609 lookup:
5610         rcu_read_lock();
5611
5612         sg = radix_tree_lookup(&iter->radix, n);
5613         GEM_BUG_ON(!sg);
5614
5615         /* If this index is in the middle of multi-page sg entry,
5616          * the radix tree will contain a value entry that points
5617          * to the start of that range. We will return the pointer to
5618          * the base page and the offset of this page within the
5619          * sg entry's range.
5620          */
5621         *offset = 0;
5622         if (unlikely(xa_is_value(sg))) {
5623                 unsigned long base = xa_to_value(sg);
5624
5625                 sg = radix_tree_lookup(&iter->radix, base);
5626                 GEM_BUG_ON(!sg);
5627
5628                 *offset = n - base;
5629         }
5630
5631         rcu_read_unlock();
5632
5633         return sg;
5634 }
5635
5636 struct page *
5637 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
5638 {
5639         struct scatterlist *sg;
5640         unsigned int offset;
5641
5642         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
5643
5644         sg = i915_gem_object_get_sg(obj, n, &offset);
5645         return nth_page(sg_page(sg), offset);
5646 }
5647
5648 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
5649 struct page *
5650 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
5651                                unsigned int n)
5652 {
5653         struct page *page;
5654
5655         page = i915_gem_object_get_page(obj, n);
5656         if (!obj->mm.dirty)
5657                 set_page_dirty(page);
5658
5659         return page;
5660 }
5661
5662 dma_addr_t
5663 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
5664                                 unsigned long n)
5665 {
5666         struct scatterlist *sg;
5667         unsigned int offset;
5668
5669         sg = i915_gem_object_get_sg(obj, n, &offset);
5670         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
5671 }
5672
5673 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
5674 {
5675         struct sg_table *pages;
5676         int err;
5677
5678         if (align > obj->base.size)
5679                 return -EINVAL;
5680
5681         if (obj->ops == &i915_gem_phys_ops)
5682                 return 0;
5683
5684         if (obj->ops != &i915_gem_object_ops)
5685                 return -EINVAL;
5686
5687         err = i915_gem_object_unbind(obj);
5688         if (err)
5689                 return err;
5690
5691         mutex_lock(&obj->mm.lock);
5692
5693         if (obj->mm.madv != I915_MADV_WILLNEED) {
5694                 err = -EFAULT;
5695                 goto err_unlock;
5696         }
5697
5698         if (obj->mm.quirked) {
5699                 err = -EFAULT;
5700                 goto err_unlock;
5701         }
5702
5703         if (obj->mm.mapping) {
5704                 err = -EBUSY;
5705                 goto err_unlock;
5706         }
5707
5708         pages = __i915_gem_object_unset_pages(obj);
5709
5710         obj->ops = &i915_gem_phys_ops;
5711
5712         err = ____i915_gem_object_get_pages(obj);
5713         if (err)
5714                 goto err_xfer;
5715
5716         /* Perma-pin (until release) the physical set of pages */
5717         __i915_gem_object_pin_pages(obj);
5718
5719         if (!IS_ERR_OR_NULL(pages))
5720                 i915_gem_object_ops.put_pages(obj, pages);
5721         mutex_unlock(&obj->mm.lock);
5722         return 0;
5723
5724 err_xfer:
5725         obj->ops = &i915_gem_object_ops;
5726         if (!IS_ERR_OR_NULL(pages)) {
5727                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
5728
5729                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
5730         }
5731 err_unlock:
5732         mutex_unlock(&obj->mm.lock);
5733         return err;
5734 }
5735
5736 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5737 #include "selftests/scatterlist.c"
5738 #include "selftests/mock_gem_device.c"
5739 #include "selftests/huge_gem_object.c"
5740 #include "selftests/huge_pages.c"
5741 #include "selftests/i915_gem_object.c"
5742 #include "selftests/i915_gem_coherency.c"
5743 #include "selftests/i915_gem.c"
5744 #endif