drm/i915: Fix context ban and hang accounting for client
[sfrench/cifs-2.6.git] / drivers / gpu / drm / i915 / i915_gem.c
1 /*
2  * Copyright © 2008-2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27
28 #include <drm/drmP.h>
29 #include <drm/drm_vma_manager.h>
30 #include <drm/i915_drm.h>
31 #include "i915_drv.h"
32 #include "i915_gem_clflush.h"
33 #include "i915_vgpu.h"
34 #include "i915_trace.h"
35 #include "intel_drv.h"
36 #include "intel_frontbuffer.h"
37 #include "intel_mocs.h"
38 #include "intel_workarounds.h"
39 #include "i915_gemfs.h"
40 #include <linux/dma-fence-array.h>
41 #include <linux/kthread.h>
42 #include <linux/reservation.h>
43 #include <linux/shmem_fs.h>
44 #include <linux/slab.h>
45 #include <linux/stop_machine.h>
46 #include <linux/swap.h>
47 #include <linux/pci.h>
48 #include <linux/dma-buf.h>
49
50 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
51
52 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
53 {
54         if (obj->cache_dirty)
55                 return false;
56
57         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
58                 return true;
59
60         return obj->pin_global; /* currently in use by HW, keep flushed */
61 }
62
63 static int
64 insert_mappable_node(struct i915_ggtt *ggtt,
65                      struct drm_mm_node *node, u32 size)
66 {
67         memset(node, 0, sizeof(*node));
68         return drm_mm_insert_node_in_range(&ggtt->base.mm, node,
69                                            size, 0, I915_COLOR_UNEVICTABLE,
70                                            0, ggtt->mappable_end,
71                                            DRM_MM_INSERT_LOW);
72 }
73
74 static void
75 remove_mappable_node(struct drm_mm_node *node)
76 {
77         drm_mm_remove_node(node);
78 }
79
80 /* some bookkeeping */
81 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
82                                   u64 size)
83 {
84         spin_lock(&dev_priv->mm.object_stat_lock);
85         dev_priv->mm.object_count++;
86         dev_priv->mm.object_memory += size;
87         spin_unlock(&dev_priv->mm.object_stat_lock);
88 }
89
90 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
91                                      u64 size)
92 {
93         spin_lock(&dev_priv->mm.object_stat_lock);
94         dev_priv->mm.object_count--;
95         dev_priv->mm.object_memory -= size;
96         spin_unlock(&dev_priv->mm.object_stat_lock);
97 }
98
99 static int
100 i915_gem_wait_for_error(struct i915_gpu_error *error)
101 {
102         int ret;
103
104         might_sleep();
105
106         /*
107          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
108          * userspace. If it takes that long something really bad is going on and
109          * we should simply try to bail out and fail as gracefully as possible.
110          */
111         ret = wait_event_interruptible_timeout(error->reset_queue,
112                                                !i915_reset_backoff(error),
113                                                I915_RESET_TIMEOUT);
114         if (ret == 0) {
115                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
116                 return -EIO;
117         } else if (ret < 0) {
118                 return ret;
119         } else {
120                 return 0;
121         }
122 }
123
124 int i915_mutex_lock_interruptible(struct drm_device *dev)
125 {
126         struct drm_i915_private *dev_priv = to_i915(dev);
127         int ret;
128
129         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
130         if (ret)
131                 return ret;
132
133         ret = mutex_lock_interruptible(&dev->struct_mutex);
134         if (ret)
135                 return ret;
136
137         return 0;
138 }
139
140 static u32 __i915_gem_park(struct drm_i915_private *i915)
141 {
142         lockdep_assert_held(&i915->drm.struct_mutex);
143         GEM_BUG_ON(i915->gt.active_requests);
144         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
145
146         if (!i915->gt.awake)
147                 return I915_EPOCH_INVALID;
148
149         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
150
151         /*
152          * Be paranoid and flush a concurrent interrupt to make sure
153          * we don't reactivate any irq tasklets after parking.
154          *
155          * FIXME: Note that even though we have waited for execlists to be idle,
156          * there may still be an in-flight interrupt even though the CSB
157          * is now empty. synchronize_irq() makes sure that a residual interrupt
158          * is completed before we continue, but it doesn't prevent the HW from
159          * raising a spurious interrupt later. To complete the shield we should
160          * coordinate disabling the CS irq with flushing the interrupts.
161          */
162         synchronize_irq(i915->drm.irq);
163
164         intel_engines_park(i915);
165         i915_timelines_park(i915);
166
167         i915_pmu_gt_parked(i915);
168         i915_vma_parked(i915);
169
170         i915->gt.awake = false;
171
172         if (INTEL_GEN(i915) >= 6)
173                 gen6_rps_idle(i915);
174
175         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);
176
177         intel_runtime_pm_put(i915);
178
179         return i915->gt.epoch;
180 }
181
182 void i915_gem_park(struct drm_i915_private *i915)
183 {
184         lockdep_assert_held(&i915->drm.struct_mutex);
185         GEM_BUG_ON(i915->gt.active_requests);
186
187         if (!i915->gt.awake)
188                 return;
189
190         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
191         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
192 }
193
194 void i915_gem_unpark(struct drm_i915_private *i915)
195 {
196         lockdep_assert_held(&i915->drm.struct_mutex);
197         GEM_BUG_ON(!i915->gt.active_requests);
198
199         if (i915->gt.awake)
200                 return;
201
202         intel_runtime_pm_get_noresume(i915);
203
204         /*
205          * It seems that the DMC likes to transition between the DC states a lot
206          * when there are no connected displays (no active power domains) during
207          * command submission.
208          *
209          * This activity has negative impact on the performance of the chip with
210          * huge latencies observed in the interrupt handler and elsewhere.
211          *
212          * Work around it by grabbing a GT IRQ power domain whilst there is any
213          * GT activity, preventing any DC state transitions.
214          */
215         intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
216
217         i915->gt.awake = true;
218         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
219                 i915->gt.epoch = 1;
220
221         intel_enable_gt_powersave(i915);
222         i915_update_gfx_val(i915);
223         if (INTEL_GEN(i915) >= 6)
224                 gen6_rps_busy(i915);
225         i915_pmu_gt_unparked(i915);
226
227         intel_engines_unpark(i915);
228
229         i915_queue_hangcheck(i915);
230
231         queue_delayed_work(i915->wq,
232                            &i915->gt.retire_work,
233                            round_jiffies_up_relative(HZ));
234 }
235
236 int
237 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
238                             struct drm_file *file)
239 {
240         struct drm_i915_private *dev_priv = to_i915(dev);
241         struct i915_ggtt *ggtt = &dev_priv->ggtt;
242         struct drm_i915_gem_get_aperture *args = data;
243         struct i915_vma *vma;
244         u64 pinned;
245
246         pinned = ggtt->base.reserved;
247         mutex_lock(&dev->struct_mutex);
248         list_for_each_entry(vma, &ggtt->base.active_list, vm_link)
249                 if (i915_vma_is_pinned(vma))
250                         pinned += vma->node.size;
251         list_for_each_entry(vma, &ggtt->base.inactive_list, vm_link)
252                 if (i915_vma_is_pinned(vma))
253                         pinned += vma->node.size;
254         mutex_unlock(&dev->struct_mutex);
255
256         args->aper_size = ggtt->base.total;
257         args->aper_available_size = args->aper_size - pinned;
258
259         return 0;
260 }
261
262 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
263 {
264         struct address_space *mapping = obj->base.filp->f_mapping;
265         drm_dma_handle_t *phys;
266         struct sg_table *st;
267         struct scatterlist *sg;
268         char *vaddr;
269         int i;
270         int err;
271
272         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
273                 return -EINVAL;
274
275         /* Always aligning to the object size, allows a single allocation
276          * to handle all possible callers, and given typical object sizes,
277          * the alignment of the buddy allocation will naturally match.
278          */
279         phys = drm_pci_alloc(obj->base.dev,
280                              roundup_pow_of_two(obj->base.size),
281                              roundup_pow_of_two(obj->base.size));
282         if (!phys)
283                 return -ENOMEM;
284
285         vaddr = phys->vaddr;
286         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
287                 struct page *page;
288                 char *src;
289
290                 page = shmem_read_mapping_page(mapping, i);
291                 if (IS_ERR(page)) {
292                         err = PTR_ERR(page);
293                         goto err_phys;
294                 }
295
296                 src = kmap_atomic(page);
297                 memcpy(vaddr, src, PAGE_SIZE);
298                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
299                 kunmap_atomic(src);
300
301                 put_page(page);
302                 vaddr += PAGE_SIZE;
303         }
304
305         i915_gem_chipset_flush(to_i915(obj->base.dev));
306
307         st = kmalloc(sizeof(*st), GFP_KERNEL);
308         if (!st) {
309                 err = -ENOMEM;
310                 goto err_phys;
311         }
312
313         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
314                 kfree(st);
315                 err = -ENOMEM;
316                 goto err_phys;
317         }
318
319         sg = st->sgl;
320         sg->offset = 0;
321         sg->length = obj->base.size;
322
323         sg_dma_address(sg) = phys->busaddr;
324         sg_dma_len(sg) = obj->base.size;
325
326         obj->phys_handle = phys;
327
328         __i915_gem_object_set_pages(obj, st, sg->length);
329
330         return 0;
331
332 err_phys:
333         drm_pci_free(obj->base.dev, phys);
334
335         return err;
336 }
337
338 static void __start_cpu_write(struct drm_i915_gem_object *obj)
339 {
340         obj->read_domains = I915_GEM_DOMAIN_CPU;
341         obj->write_domain = I915_GEM_DOMAIN_CPU;
342         if (cpu_write_needs_clflush(obj))
343                 obj->cache_dirty = true;
344 }
345
346 static void
347 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
348                                 struct sg_table *pages,
349                                 bool needs_clflush)
350 {
351         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
352
353         if (obj->mm.madv == I915_MADV_DONTNEED)
354                 obj->mm.dirty = false;
355
356         if (needs_clflush &&
357             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
358             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
359                 drm_clflush_sg(pages);
360
361         __start_cpu_write(obj);
362 }
363
364 static void
365 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
366                                struct sg_table *pages)
367 {
368         __i915_gem_object_release_shmem(obj, pages, false);
369
370         if (obj->mm.dirty) {
371                 struct address_space *mapping = obj->base.filp->f_mapping;
372                 char *vaddr = obj->phys_handle->vaddr;
373                 int i;
374
375                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
376                         struct page *page;
377                         char *dst;
378
379                         page = shmem_read_mapping_page(mapping, i);
380                         if (IS_ERR(page))
381                                 continue;
382
383                         dst = kmap_atomic(page);
384                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
385                         memcpy(dst, vaddr, PAGE_SIZE);
386                         kunmap_atomic(dst);
387
388                         set_page_dirty(page);
389                         if (obj->mm.madv == I915_MADV_WILLNEED)
390                                 mark_page_accessed(page);
391                         put_page(page);
392                         vaddr += PAGE_SIZE;
393                 }
394                 obj->mm.dirty = false;
395         }
396
397         sg_free_table(pages);
398         kfree(pages);
399
400         drm_pci_free(obj->base.dev, obj->phys_handle);
401 }
402
403 static void
404 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
405 {
406         i915_gem_object_unpin_pages(obj);
407 }
408
409 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
410         .get_pages = i915_gem_object_get_pages_phys,
411         .put_pages = i915_gem_object_put_pages_phys,
412         .release = i915_gem_object_release_phys,
413 };
414
415 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
416
417 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
418 {
419         struct i915_vma *vma;
420         LIST_HEAD(still_in_list);
421         int ret;
422
423         lockdep_assert_held(&obj->base.dev->struct_mutex);
424
425         /* Closed vma are removed from the obj->vma_list - but they may
426          * still have an active binding on the object. To remove those we
427          * must wait for all rendering to complete to the object (as unbinding
428          * must anyway), and retire the requests.
429          */
430         ret = i915_gem_object_set_to_cpu_domain(obj, false);
431         if (ret)
432                 return ret;
433
434         while ((vma = list_first_entry_or_null(&obj->vma_list,
435                                                struct i915_vma,
436                                                obj_link))) {
437                 list_move_tail(&vma->obj_link, &still_in_list);
438                 ret = i915_vma_unbind(vma);
439                 if (ret)
440                         break;
441         }
442         list_splice(&still_in_list, &obj->vma_list);
443
444         return ret;
445 }
446
447 static long
448 i915_gem_object_wait_fence(struct dma_fence *fence,
449                            unsigned int flags,
450                            long timeout,
451                            struct intel_rps_client *rps_client)
452 {
453         struct i915_request *rq;
454
455         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
456
457         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
458                 return timeout;
459
460         if (!dma_fence_is_i915(fence))
461                 return dma_fence_wait_timeout(fence,
462                                               flags & I915_WAIT_INTERRUPTIBLE,
463                                               timeout);
464
465         rq = to_request(fence);
466         if (i915_request_completed(rq))
467                 goto out;
468
469         /*
470          * This client is about to stall waiting for the GPU. In many cases
471          * this is undesirable and limits the throughput of the system, as
472          * many clients cannot continue processing user input/output whilst
473          * blocked. RPS autotuning may take tens of milliseconds to respond
474          * to the GPU load and thus incurs additional latency for the client.
475          * We can circumvent that by promoting the GPU frequency to maximum
476          * before we wait. This makes the GPU throttle up much more quickly
477          * (good for benchmarks and user experience, e.g. window animations),
478          * but at a cost of spending more power processing the workload
479          * (bad for battery). Not all clients even want their results
480          * immediately and for them we should just let the GPU select its own
481          * frequency to maximise efficiency. To prevent a single client from
482          * forcing the clocks too high for the whole system, we only allow
483          * each client to waitboost once in a busy period.
484          */
485         if (rps_client && !i915_request_started(rq)) {
486                 if (INTEL_GEN(rq->i915) >= 6)
487                         gen6_rps_boost(rq, rps_client);
488         }
489
490         timeout = i915_request_wait(rq, flags, timeout);
491
492 out:
493         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
494                 i915_request_retire_upto(rq);
495
496         return timeout;
497 }
498
499 static long
500 i915_gem_object_wait_reservation(struct reservation_object *resv,
501                                  unsigned int flags,
502                                  long timeout,
503                                  struct intel_rps_client *rps_client)
504 {
505         unsigned int seq = __read_seqcount_begin(&resv->seq);
506         struct dma_fence *excl;
507         bool prune_fences = false;
508
509         if (flags & I915_WAIT_ALL) {
510                 struct dma_fence **shared;
511                 unsigned int count, i;
512                 int ret;
513
514                 ret = reservation_object_get_fences_rcu(resv,
515                                                         &excl, &count, &shared);
516                 if (ret)
517                         return ret;
518
519                 for (i = 0; i < count; i++) {
520                         timeout = i915_gem_object_wait_fence(shared[i],
521                                                              flags, timeout,
522                                                              rps_client);
523                         if (timeout < 0)
524                                 break;
525
526                         dma_fence_put(shared[i]);
527                 }
528
529                 for (; i < count; i++)
530                         dma_fence_put(shared[i]);
531                 kfree(shared);
532
533                 /*
534                  * If both shared fences and an exclusive fence exist,
535                  * then by construction the shared fences must be later
536                  * than the exclusive fence. If we successfully wait for
537                  * all the shared fences, we know that the exclusive fence
538                  * must all be signaled. If all the shared fences are
539                  * signaled, we can prune the array and recover the
540                  * floating references on the fences/requests.
541                  */
542                 prune_fences = count && timeout >= 0;
543         } else {
544                 excl = reservation_object_get_excl_rcu(resv);
545         }
546
547         if (excl && timeout >= 0)
548                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
549                                                      rps_client);
550
551         dma_fence_put(excl);
552
553         /*
554          * Opportunistically prune the fences iff we know they have *all* been
555          * signaled and that the reservation object has not been changed (i.e.
556          * no new fences have been added).
557          */
558         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
559                 if (reservation_object_trylock(resv)) {
560                         if (!__read_seqcount_retry(&resv->seq, seq))
561                                 reservation_object_add_excl_fence(resv, NULL);
562                         reservation_object_unlock(resv);
563                 }
564         }
565
566         return timeout;
567 }
568
569 static void __fence_set_priority(struct dma_fence *fence,
570                                  const struct i915_sched_attr *attr)
571 {
572         struct i915_request *rq;
573         struct intel_engine_cs *engine;
574
575         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
576                 return;
577
578         rq = to_request(fence);
579         engine = rq->engine;
580
581         local_bh_disable();
582         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
583         if (engine->schedule)
584                 engine->schedule(rq, attr);
585         rcu_read_unlock();
586         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
587 }
588
589 static void fence_set_priority(struct dma_fence *fence,
590                                const struct i915_sched_attr *attr)
591 {
592         /* Recurse once into a fence-array */
593         if (dma_fence_is_array(fence)) {
594                 struct dma_fence_array *array = to_dma_fence_array(fence);
595                 int i;
596
597                 for (i = 0; i < array->num_fences; i++)
598                         __fence_set_priority(array->fences[i], attr);
599         } else {
600                 __fence_set_priority(fence, attr);
601         }
602 }
603
604 int
605 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
606                               unsigned int flags,
607                               const struct i915_sched_attr *attr)
608 {
609         struct dma_fence *excl;
610
611         if (flags & I915_WAIT_ALL) {
612                 struct dma_fence **shared;
613                 unsigned int count, i;
614                 int ret;
615
616                 ret = reservation_object_get_fences_rcu(obj->resv,
617                                                         &excl, &count, &shared);
618                 if (ret)
619                         return ret;
620
621                 for (i = 0; i < count; i++) {
622                         fence_set_priority(shared[i], attr);
623                         dma_fence_put(shared[i]);
624                 }
625
626                 kfree(shared);
627         } else {
628                 excl = reservation_object_get_excl_rcu(obj->resv);
629         }
630
631         if (excl) {
632                 fence_set_priority(excl, attr);
633                 dma_fence_put(excl);
634         }
635         return 0;
636 }
637
638 /**
639  * Waits for rendering to the object to be completed
640  * @obj: i915 gem object
641  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
642  * @timeout: how long to wait
643  * @rps_client: client (user process) to charge for any waitboosting
644  */
645 int
646 i915_gem_object_wait(struct drm_i915_gem_object *obj,
647                      unsigned int flags,
648                      long timeout,
649                      struct intel_rps_client *rps_client)
650 {
651         might_sleep();
652 #if IS_ENABLED(CONFIG_LOCKDEP)
653         GEM_BUG_ON(debug_locks &&
654                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
655                    !!(flags & I915_WAIT_LOCKED));
656 #endif
657         GEM_BUG_ON(timeout < 0);
658
659         timeout = i915_gem_object_wait_reservation(obj->resv,
660                                                    flags, timeout,
661                                                    rps_client);
662         return timeout < 0 ? timeout : 0;
663 }
664
665 static struct intel_rps_client *to_rps_client(struct drm_file *file)
666 {
667         struct drm_i915_file_private *fpriv = file->driver_priv;
668
669         return &fpriv->rps_client;
670 }
671
672 static int
673 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
674                      struct drm_i915_gem_pwrite *args,
675                      struct drm_file *file)
676 {
677         void *vaddr = obj->phys_handle->vaddr + args->offset;
678         char __user *user_data = u64_to_user_ptr(args->data_ptr);
679
680         /* We manually control the domain here and pretend that it
681          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
682          */
683         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
684         if (copy_from_user(vaddr, user_data, args->size))
685                 return -EFAULT;
686
687         drm_clflush_virt_range(vaddr, args->size);
688         i915_gem_chipset_flush(to_i915(obj->base.dev));
689
690         intel_fb_obj_flush(obj, ORIGIN_CPU);
691         return 0;
692 }
693
694 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
695 {
696         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
697 }
698
699 void i915_gem_object_free(struct drm_i915_gem_object *obj)
700 {
701         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
702         kmem_cache_free(dev_priv->objects, obj);
703 }
704
705 static int
706 i915_gem_create(struct drm_file *file,
707                 struct drm_i915_private *dev_priv,
708                 uint64_t size,
709                 uint32_t *handle_p)
710 {
711         struct drm_i915_gem_object *obj;
712         int ret;
713         u32 handle;
714
715         size = roundup(size, PAGE_SIZE);
716         if (size == 0)
717                 return -EINVAL;
718
719         /* Allocate the new object */
720         obj = i915_gem_object_create(dev_priv, size);
721         if (IS_ERR(obj))
722                 return PTR_ERR(obj);
723
724         ret = drm_gem_handle_create(file, &obj->base, &handle);
725         /* drop reference from allocate - handle holds it now */
726         i915_gem_object_put(obj);
727         if (ret)
728                 return ret;
729
730         *handle_p = handle;
731         return 0;
732 }
733
734 int
735 i915_gem_dumb_create(struct drm_file *file,
736                      struct drm_device *dev,
737                      struct drm_mode_create_dumb *args)
738 {
739         /* have to work out size/pitch and return them */
740         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
741         args->size = args->pitch * args->height;
742         return i915_gem_create(file, to_i915(dev),
743                                args->size, &args->handle);
744 }
745
746 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
747 {
748         return !(obj->cache_level == I915_CACHE_NONE ||
749                  obj->cache_level == I915_CACHE_WT);
750 }
751
752 /**
753  * Creates a new mm object and returns a handle to it.
754  * @dev: drm device pointer
755  * @data: ioctl data blob
756  * @file: drm file pointer
757  */
758 int
759 i915_gem_create_ioctl(struct drm_device *dev, void *data,
760                       struct drm_file *file)
761 {
762         struct drm_i915_private *dev_priv = to_i915(dev);
763         struct drm_i915_gem_create *args = data;
764
765         i915_gem_flush_free_objects(dev_priv);
766
767         return i915_gem_create(file, dev_priv,
768                                args->size, &args->handle);
769 }
770
771 static inline enum fb_op_origin
772 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
773 {
774         return (domain == I915_GEM_DOMAIN_GTT ?
775                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
776 }
777
778 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
779 {
780         /*
781          * No actual flushing is required for the GTT write domain for reads
782          * from the GTT domain. Writes to it "immediately" go to main memory
783          * as far as we know, so there's no chipset flush. It also doesn't
784          * land in the GPU render cache.
785          *
786          * However, we do have to enforce the order so that all writes through
787          * the GTT land before any writes to the device, such as updates to
788          * the GATT itself.
789          *
790          * We also have to wait a bit for the writes to land from the GTT.
791          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
792          * timing. This issue has only been observed when switching quickly
793          * between GTT writes and CPU reads from inside the kernel on recent hw,
794          * and it appears to only affect discrete GTT blocks (i.e. on LLC
795          * system agents we cannot reproduce this behaviour, until Cannonlake
796          * that was!).
797          */
798
799         wmb();
800
801         intel_runtime_pm_get(dev_priv);
802         spin_lock_irq(&dev_priv->uncore.lock);
803
804         POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
805
806         spin_unlock_irq(&dev_priv->uncore.lock);
807         intel_runtime_pm_put(dev_priv);
808 }
809
810 static void
811 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
812 {
813         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
814         struct i915_vma *vma;
815
816         if (!(obj->write_domain & flush_domains))
817                 return;
818
819         switch (obj->write_domain) {
820         case I915_GEM_DOMAIN_GTT:
821                 i915_gem_flush_ggtt_writes(dev_priv);
822
823                 intel_fb_obj_flush(obj,
824                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
825
826                 for_each_ggtt_vma(vma, obj) {
827                         if (vma->iomap)
828                                 continue;
829
830                         i915_vma_unset_ggtt_write(vma);
831                 }
832                 break;
833
834         case I915_GEM_DOMAIN_CPU:
835                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
836                 break;
837
838         case I915_GEM_DOMAIN_RENDER:
839                 if (gpu_write_needs_clflush(obj))
840                         obj->cache_dirty = true;
841                 break;
842         }
843
844         obj->write_domain = 0;
845 }
846
847 static inline int
848 __copy_to_user_swizzled(char __user *cpu_vaddr,
849                         const char *gpu_vaddr, int gpu_offset,
850                         int length)
851 {
852         int ret, cpu_offset = 0;
853
854         while (length > 0) {
855                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
856                 int this_length = min(cacheline_end - gpu_offset, length);
857                 int swizzled_gpu_offset = gpu_offset ^ 64;
858
859                 ret = __copy_to_user(cpu_vaddr + cpu_offset,
860                                      gpu_vaddr + swizzled_gpu_offset,
861                                      this_length);
862                 if (ret)
863                         return ret + length;
864
865                 cpu_offset += this_length;
866                 gpu_offset += this_length;
867                 length -= this_length;
868         }
869
870         return 0;
871 }
872
873 static inline int
874 __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
875                           const char __user *cpu_vaddr,
876                           int length)
877 {
878         int ret, cpu_offset = 0;
879
880         while (length > 0) {
881                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
882                 int this_length = min(cacheline_end - gpu_offset, length);
883                 int swizzled_gpu_offset = gpu_offset ^ 64;
884
885                 ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
886                                        cpu_vaddr + cpu_offset,
887                                        this_length);
888                 if (ret)
889                         return ret + length;
890
891                 cpu_offset += this_length;
892                 gpu_offset += this_length;
893                 length -= this_length;
894         }
895
896         return 0;
897 }
898
899 /*
900  * Pins the specified object's pages and synchronizes the object with
901  * GPU accesses. Sets needs_clflush to non-zero if the caller should
902  * flush the object from the CPU cache.
903  */
904 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
905                                     unsigned int *needs_clflush)
906 {
907         int ret;
908
909         lockdep_assert_held(&obj->base.dev->struct_mutex);
910
911         *needs_clflush = 0;
912         if (!i915_gem_object_has_struct_page(obj))
913                 return -ENODEV;
914
915         ret = i915_gem_object_wait(obj,
916                                    I915_WAIT_INTERRUPTIBLE |
917                                    I915_WAIT_LOCKED,
918                                    MAX_SCHEDULE_TIMEOUT,
919                                    NULL);
920         if (ret)
921                 return ret;
922
923         ret = i915_gem_object_pin_pages(obj);
924         if (ret)
925                 return ret;
926
927         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
928             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
929                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
930                 if (ret)
931                         goto err_unpin;
932                 else
933                         goto out;
934         }
935
936         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
937
938         /* If we're not in the cpu read domain, set ourself into the gtt
939          * read domain and manually flush cachelines (if required). This
940          * optimizes for the case when the gpu will dirty the data
941          * anyway again before the next pread happens.
942          */
943         if (!obj->cache_dirty &&
944             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
945                 *needs_clflush = CLFLUSH_BEFORE;
946
947 out:
948         /* return with the pages pinned */
949         return 0;
950
951 err_unpin:
952         i915_gem_object_unpin_pages(obj);
953         return ret;
954 }
955
956 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
957                                      unsigned int *needs_clflush)
958 {
959         int ret;
960
961         lockdep_assert_held(&obj->base.dev->struct_mutex);
962
963         *needs_clflush = 0;
964         if (!i915_gem_object_has_struct_page(obj))
965                 return -ENODEV;
966
967         ret = i915_gem_object_wait(obj,
968                                    I915_WAIT_INTERRUPTIBLE |
969                                    I915_WAIT_LOCKED |
970                                    I915_WAIT_ALL,
971                                    MAX_SCHEDULE_TIMEOUT,
972                                    NULL);
973         if (ret)
974                 return ret;
975
976         ret = i915_gem_object_pin_pages(obj);
977         if (ret)
978                 return ret;
979
980         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
981             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
982                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
983                 if (ret)
984                         goto err_unpin;
985                 else
986                         goto out;
987         }
988
989         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
990
991         /* If we're not in the cpu write domain, set ourself into the
992          * gtt write domain and manually flush cachelines (as required).
993          * This optimizes for the case when the gpu will use the data
994          * right away and we therefore have to clflush anyway.
995          */
996         if (!obj->cache_dirty) {
997                 *needs_clflush |= CLFLUSH_AFTER;
998
999                 /*
1000                  * Same trick applies to invalidate partially written
1001                  * cachelines read before writing.
1002                  */
1003                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
1004                         *needs_clflush |= CLFLUSH_BEFORE;
1005         }
1006
1007 out:
1008         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1009         obj->mm.dirty = true;
1010         /* return with the pages pinned */
1011         return 0;
1012
1013 err_unpin:
1014         i915_gem_object_unpin_pages(obj);
1015         return ret;
1016 }
1017
1018 static void
1019 shmem_clflush_swizzled_range(char *addr, unsigned long length,
1020                              bool swizzled)
1021 {
1022         if (unlikely(swizzled)) {
1023                 unsigned long start = (unsigned long) addr;
1024                 unsigned long end = (unsigned long) addr + length;
1025
1026                 /* For swizzling simply ensure that we always flush both
1027                  * channels. Lame, but simple and it works. Swizzled
1028                  * pwrite/pread is far from a hotpath - current userspace
1029                  * doesn't use it at all. */
1030                 start = round_down(start, 128);
1031                 end = round_up(end, 128);
1032
1033                 drm_clflush_virt_range((void *)start, end - start);
1034         } else {
1035                 drm_clflush_virt_range(addr, length);
1036         }
1037
1038 }
1039
1040 /* Only difference to the fast-path function is that this can handle bit17
1041  * and uses non-atomic copy and kmap functions. */
1042 static int
1043 shmem_pread_slow(struct page *page, int offset, int length,
1044                  char __user *user_data,
1045                  bool page_do_bit17_swizzling, bool needs_clflush)
1046 {
1047         char *vaddr;
1048         int ret;
1049
1050         vaddr = kmap(page);
1051         if (needs_clflush)
1052                 shmem_clflush_swizzled_range(vaddr + offset, length,
1053                                              page_do_bit17_swizzling);
1054
1055         if (page_do_bit17_swizzling)
1056                 ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
1057         else
1058                 ret = __copy_to_user(user_data, vaddr + offset, length);
1059         kunmap(page);
1060
1061         return ret ? - EFAULT : 0;
1062 }
1063
1064 static int
1065 shmem_pread(struct page *page, int offset, int length, char __user *user_data,
1066             bool page_do_bit17_swizzling, bool needs_clflush)
1067 {
1068         int ret;
1069
1070         ret = -ENODEV;
1071         if (!page_do_bit17_swizzling) {
1072                 char *vaddr = kmap_atomic(page);
1073
1074                 if (needs_clflush)
1075                         drm_clflush_virt_range(vaddr + offset, length);
1076                 ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
1077                 kunmap_atomic(vaddr);
1078         }
1079         if (ret == 0)
1080                 return 0;
1081
1082         return shmem_pread_slow(page, offset, length, user_data,
1083                                 page_do_bit17_swizzling, needs_clflush);
1084 }
1085
1086 static int
1087 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1088                      struct drm_i915_gem_pread *args)
1089 {
1090         char __user *user_data;
1091         u64 remain;
1092         unsigned int obj_do_bit17_swizzling;
1093         unsigned int needs_clflush;
1094         unsigned int idx, offset;
1095         int ret;
1096
1097         obj_do_bit17_swizzling = 0;
1098         if (i915_gem_object_needs_bit17_swizzle(obj))
1099                 obj_do_bit17_swizzling = BIT(17);
1100
1101         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1102         if (ret)
1103                 return ret;
1104
1105         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1106         mutex_unlock(&obj->base.dev->struct_mutex);
1107         if (ret)
1108                 return ret;
1109
1110         remain = args->size;
1111         user_data = u64_to_user_ptr(args->data_ptr);
1112         offset = offset_in_page(args->offset);
1113         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1114                 struct page *page = i915_gem_object_get_page(obj, idx);
1115                 int length;
1116
1117                 length = remain;
1118                 if (offset + length > PAGE_SIZE)
1119                         length = PAGE_SIZE - offset;
1120
1121                 ret = shmem_pread(page, offset, length, user_data,
1122                                   page_to_phys(page) & obj_do_bit17_swizzling,
1123                                   needs_clflush);
1124                 if (ret)
1125                         break;
1126
1127                 remain -= length;
1128                 user_data += length;
1129                 offset = 0;
1130         }
1131
1132         i915_gem_obj_finish_shmem_access(obj);
1133         return ret;
1134 }
1135
1136 static inline bool
1137 gtt_user_read(struct io_mapping *mapping,
1138               loff_t base, int offset,
1139               char __user *user_data, int length)
1140 {
1141         void __iomem *vaddr;
1142         unsigned long unwritten;
1143
1144         /* We can use the cpu mem copy function because this is X86. */
1145         vaddr = io_mapping_map_atomic_wc(mapping, base);
1146         unwritten = __copy_to_user_inatomic(user_data,
1147                                             (void __force *)vaddr + offset,
1148                                             length);
1149         io_mapping_unmap_atomic(vaddr);
1150         if (unwritten) {
1151                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1152                 unwritten = copy_to_user(user_data,
1153                                          (void __force *)vaddr + offset,
1154                                          length);
1155                 io_mapping_unmap(vaddr);
1156         }
1157         return unwritten;
1158 }
1159
1160 static int
1161 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1162                    const struct drm_i915_gem_pread *args)
1163 {
1164         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1165         struct i915_ggtt *ggtt = &i915->ggtt;
1166         struct drm_mm_node node;
1167         struct i915_vma *vma;
1168         void __user *user_data;
1169         u64 remain, offset;
1170         int ret;
1171
1172         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1173         if (ret)
1174                 return ret;
1175
1176         intel_runtime_pm_get(i915);
1177         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1178                                        PIN_MAPPABLE |
1179                                        PIN_NONFAULT |
1180                                        PIN_NONBLOCK);
1181         if (!IS_ERR(vma)) {
1182                 node.start = i915_ggtt_offset(vma);
1183                 node.allocated = false;
1184                 ret = i915_vma_put_fence(vma);
1185                 if (ret) {
1186                         i915_vma_unpin(vma);
1187                         vma = ERR_PTR(ret);
1188                 }
1189         }
1190         if (IS_ERR(vma)) {
1191                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1192                 if (ret)
1193                         goto out_unlock;
1194                 GEM_BUG_ON(!node.allocated);
1195         }
1196
1197         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1198         if (ret)
1199                 goto out_unpin;
1200
1201         mutex_unlock(&i915->drm.struct_mutex);
1202
1203         user_data = u64_to_user_ptr(args->data_ptr);
1204         remain = args->size;
1205         offset = args->offset;
1206
1207         while (remain > 0) {
1208                 /* Operation in this page
1209                  *
1210                  * page_base = page offset within aperture
1211                  * page_offset = offset within page
1212                  * page_length = bytes to copy for this page
1213                  */
1214                 u32 page_base = node.start;
1215                 unsigned page_offset = offset_in_page(offset);
1216                 unsigned page_length = PAGE_SIZE - page_offset;
1217                 page_length = remain < page_length ? remain : page_length;
1218                 if (node.allocated) {
1219                         wmb();
1220                         ggtt->base.insert_page(&ggtt->base,
1221                                                i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1222                                                node.start, I915_CACHE_NONE, 0);
1223                         wmb();
1224                 } else {
1225                         page_base += offset & PAGE_MASK;
1226                 }
1227
1228                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1229                                   user_data, page_length)) {
1230                         ret = -EFAULT;
1231                         break;
1232                 }
1233
1234                 remain -= page_length;
1235                 user_data += page_length;
1236                 offset += page_length;
1237         }
1238
1239         mutex_lock(&i915->drm.struct_mutex);
1240 out_unpin:
1241         if (node.allocated) {
1242                 wmb();
1243                 ggtt->base.clear_range(&ggtt->base,
1244                                        node.start, node.size);
1245                 remove_mappable_node(&node);
1246         } else {
1247                 i915_vma_unpin(vma);
1248         }
1249 out_unlock:
1250         intel_runtime_pm_put(i915);
1251         mutex_unlock(&i915->drm.struct_mutex);
1252
1253         return ret;
1254 }
1255
1256 /**
1257  * Reads data from the object referenced by handle.
1258  * @dev: drm device pointer
1259  * @data: ioctl data blob
1260  * @file: drm file pointer
1261  *
1262  * On error, the contents of *data are undefined.
1263  */
1264 int
1265 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1266                      struct drm_file *file)
1267 {
1268         struct drm_i915_gem_pread *args = data;
1269         struct drm_i915_gem_object *obj;
1270         int ret;
1271
1272         if (args->size == 0)
1273                 return 0;
1274
1275         if (!access_ok(VERIFY_WRITE,
1276                        u64_to_user_ptr(args->data_ptr),
1277                        args->size))
1278                 return -EFAULT;
1279
1280         obj = i915_gem_object_lookup(file, args->handle);
1281         if (!obj)
1282                 return -ENOENT;
1283
1284         /* Bounds check source.  */
1285         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1286                 ret = -EINVAL;
1287                 goto out;
1288         }
1289
1290         trace_i915_gem_object_pread(obj, args->offset, args->size);
1291
1292         ret = i915_gem_object_wait(obj,
1293                                    I915_WAIT_INTERRUPTIBLE,
1294                                    MAX_SCHEDULE_TIMEOUT,
1295                                    to_rps_client(file));
1296         if (ret)
1297                 goto out;
1298
1299         ret = i915_gem_object_pin_pages(obj);
1300         if (ret)
1301                 goto out;
1302
1303         ret = i915_gem_shmem_pread(obj, args);
1304         if (ret == -EFAULT || ret == -ENODEV)
1305                 ret = i915_gem_gtt_pread(obj, args);
1306
1307         i915_gem_object_unpin_pages(obj);
1308 out:
1309         i915_gem_object_put(obj);
1310         return ret;
1311 }
1312
1313 /* This is the fast write path which cannot handle
1314  * page faults in the source data
1315  */
1316
1317 static inline bool
1318 ggtt_write(struct io_mapping *mapping,
1319            loff_t base, int offset,
1320            char __user *user_data, int length)
1321 {
1322         void __iomem *vaddr;
1323         unsigned long unwritten;
1324
1325         /* We can use the cpu mem copy function because this is X86. */
1326         vaddr = io_mapping_map_atomic_wc(mapping, base);
1327         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1328                                                       user_data, length);
1329         io_mapping_unmap_atomic(vaddr);
1330         if (unwritten) {
1331                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1332                 unwritten = copy_from_user((void __force *)vaddr + offset,
1333                                            user_data, length);
1334                 io_mapping_unmap(vaddr);
1335         }
1336
1337         return unwritten;
1338 }
1339
1340 /**
1341  * This is the fast pwrite path, where we copy the data directly from the
1342  * user into the GTT, uncached.
1343  * @obj: i915 GEM object
1344  * @args: pwrite arguments structure
1345  */
1346 static int
1347 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1348                          const struct drm_i915_gem_pwrite *args)
1349 {
1350         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1351         struct i915_ggtt *ggtt = &i915->ggtt;
1352         struct drm_mm_node node;
1353         struct i915_vma *vma;
1354         u64 remain, offset;
1355         void __user *user_data;
1356         int ret;
1357
1358         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1359         if (ret)
1360                 return ret;
1361
1362         if (i915_gem_object_has_struct_page(obj)) {
1363                 /*
1364                  * Avoid waking the device up if we can fallback, as
1365                  * waking/resuming is very slow (worst-case 10-100 ms
1366                  * depending on PCI sleeps and our own resume time).
1367                  * This easily dwarfs any performance advantage from
1368                  * using the cache bypass of indirect GGTT access.
1369                  */
1370                 if (!intel_runtime_pm_get_if_in_use(i915)) {
1371                         ret = -EFAULT;
1372                         goto out_unlock;
1373                 }
1374         } else {
1375                 /* No backing pages, no fallback, we must force GGTT access */
1376                 intel_runtime_pm_get(i915);
1377         }
1378
1379         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1380                                        PIN_MAPPABLE |
1381                                        PIN_NONFAULT |
1382                                        PIN_NONBLOCK);
1383         if (!IS_ERR(vma)) {
1384                 node.start = i915_ggtt_offset(vma);
1385                 node.allocated = false;
1386                 ret = i915_vma_put_fence(vma);
1387                 if (ret) {
1388                         i915_vma_unpin(vma);
1389                         vma = ERR_PTR(ret);
1390                 }
1391         }
1392         if (IS_ERR(vma)) {
1393                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1394                 if (ret)
1395                         goto out_rpm;
1396                 GEM_BUG_ON(!node.allocated);
1397         }
1398
1399         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1400         if (ret)
1401                 goto out_unpin;
1402
1403         mutex_unlock(&i915->drm.struct_mutex);
1404
1405         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1406
1407         user_data = u64_to_user_ptr(args->data_ptr);
1408         offset = args->offset;
1409         remain = args->size;
1410         while (remain) {
1411                 /* Operation in this page
1412                  *
1413                  * page_base = page offset within aperture
1414                  * page_offset = offset within page
1415                  * page_length = bytes to copy for this page
1416                  */
1417                 u32 page_base = node.start;
1418                 unsigned int page_offset = offset_in_page(offset);
1419                 unsigned int page_length = PAGE_SIZE - page_offset;
1420                 page_length = remain < page_length ? remain : page_length;
1421                 if (node.allocated) {
1422                         wmb(); /* flush the write before we modify the GGTT */
1423                         ggtt->base.insert_page(&ggtt->base,
1424                                                i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1425                                                node.start, I915_CACHE_NONE, 0);
1426                         wmb(); /* flush modifications to the GGTT (insert_page) */
1427                 } else {
1428                         page_base += offset & PAGE_MASK;
1429                 }
1430                 /* If we get a fault while copying data, then (presumably) our
1431                  * source page isn't available.  Return the error and we'll
1432                  * retry in the slow path.
1433                  * If the object is non-shmem backed, we retry again with the
1434                  * path that handles page fault.
1435                  */
1436                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1437                                user_data, page_length)) {
1438                         ret = -EFAULT;
1439                         break;
1440                 }
1441
1442                 remain -= page_length;
1443                 user_data += page_length;
1444                 offset += page_length;
1445         }
1446         intel_fb_obj_flush(obj, ORIGIN_CPU);
1447
1448         mutex_lock(&i915->drm.struct_mutex);
1449 out_unpin:
1450         if (node.allocated) {
1451                 wmb();
1452                 ggtt->base.clear_range(&ggtt->base,
1453                                        node.start, node.size);
1454                 remove_mappable_node(&node);
1455         } else {
1456                 i915_vma_unpin(vma);
1457         }
1458 out_rpm:
1459         intel_runtime_pm_put(i915);
1460 out_unlock:
1461         mutex_unlock(&i915->drm.struct_mutex);
1462         return ret;
1463 }
1464
1465 static int
1466 shmem_pwrite_slow(struct page *page, int offset, int length,
1467                   char __user *user_data,
1468                   bool page_do_bit17_swizzling,
1469                   bool needs_clflush_before,
1470                   bool needs_clflush_after)
1471 {
1472         char *vaddr;
1473         int ret;
1474
1475         vaddr = kmap(page);
1476         if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
1477                 shmem_clflush_swizzled_range(vaddr + offset, length,
1478                                              page_do_bit17_swizzling);
1479         if (page_do_bit17_swizzling)
1480                 ret = __copy_from_user_swizzled(vaddr, offset, user_data,
1481                                                 length);
1482         else
1483                 ret = __copy_from_user(vaddr + offset, user_data, length);
1484         if (needs_clflush_after)
1485                 shmem_clflush_swizzled_range(vaddr + offset, length,
1486                                              page_do_bit17_swizzling);
1487         kunmap(page);
1488
1489         return ret ? -EFAULT : 0;
1490 }
1491
1492 /* Per-page copy function for the shmem pwrite fastpath.
1493  * Flushes invalid cachelines before writing to the target if
1494  * needs_clflush_before is set and flushes out any written cachelines after
1495  * writing if needs_clflush is set.
1496  */
1497 static int
1498 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1499              bool page_do_bit17_swizzling,
1500              bool needs_clflush_before,
1501              bool needs_clflush_after)
1502 {
1503         int ret;
1504
1505         ret = -ENODEV;
1506         if (!page_do_bit17_swizzling) {
1507                 char *vaddr = kmap_atomic(page);
1508
1509                 if (needs_clflush_before)
1510                         drm_clflush_virt_range(vaddr + offset, len);
1511                 ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
1512                 if (needs_clflush_after)
1513                         drm_clflush_virt_range(vaddr + offset, len);
1514
1515                 kunmap_atomic(vaddr);
1516         }
1517         if (ret == 0)
1518                 return ret;
1519
1520         return shmem_pwrite_slow(page, offset, len, user_data,
1521                                  page_do_bit17_swizzling,
1522                                  needs_clflush_before,
1523                                  needs_clflush_after);
1524 }
1525
1526 static int
1527 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1528                       const struct drm_i915_gem_pwrite *args)
1529 {
1530         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1531         void __user *user_data;
1532         u64 remain;
1533         unsigned int obj_do_bit17_swizzling;
1534         unsigned int partial_cacheline_write;
1535         unsigned int needs_clflush;
1536         unsigned int offset, idx;
1537         int ret;
1538
1539         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1540         if (ret)
1541                 return ret;
1542
1543         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1544         mutex_unlock(&i915->drm.struct_mutex);
1545         if (ret)
1546                 return ret;
1547
1548         obj_do_bit17_swizzling = 0;
1549         if (i915_gem_object_needs_bit17_swizzle(obj))
1550                 obj_do_bit17_swizzling = BIT(17);
1551
1552         /* If we don't overwrite a cacheline completely we need to be
1553          * careful to have up-to-date data by first clflushing. Don't
1554          * overcomplicate things and flush the entire patch.
1555          */
1556         partial_cacheline_write = 0;
1557         if (needs_clflush & CLFLUSH_BEFORE)
1558                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1559
1560         user_data = u64_to_user_ptr(args->data_ptr);
1561         remain = args->size;
1562         offset = offset_in_page(args->offset);
1563         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1564                 struct page *page = i915_gem_object_get_page(obj, idx);
1565                 int length;
1566
1567                 length = remain;
1568                 if (offset + length > PAGE_SIZE)
1569                         length = PAGE_SIZE - offset;
1570
1571                 ret = shmem_pwrite(page, offset, length, user_data,
1572                                    page_to_phys(page) & obj_do_bit17_swizzling,
1573                                    (offset | length) & partial_cacheline_write,
1574                                    needs_clflush & CLFLUSH_AFTER);
1575                 if (ret)
1576                         break;
1577
1578                 remain -= length;
1579                 user_data += length;
1580                 offset = 0;
1581         }
1582
1583         intel_fb_obj_flush(obj, ORIGIN_CPU);
1584         i915_gem_obj_finish_shmem_access(obj);
1585         return ret;
1586 }
1587
1588 /**
1589  * Writes data to the object referenced by handle.
1590  * @dev: drm device
1591  * @data: ioctl data blob
1592  * @file: drm file
1593  *
1594  * On error, the contents of the buffer that were to be modified are undefined.
1595  */
1596 int
1597 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1598                       struct drm_file *file)
1599 {
1600         struct drm_i915_gem_pwrite *args = data;
1601         struct drm_i915_gem_object *obj;
1602         int ret;
1603
1604         if (args->size == 0)
1605                 return 0;
1606
1607         if (!access_ok(VERIFY_READ,
1608                        u64_to_user_ptr(args->data_ptr),
1609                        args->size))
1610                 return -EFAULT;
1611
1612         obj = i915_gem_object_lookup(file, args->handle);
1613         if (!obj)
1614                 return -ENOENT;
1615
1616         /* Bounds check destination. */
1617         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1618                 ret = -EINVAL;
1619                 goto err;
1620         }
1621
1622         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1623
1624         ret = -ENODEV;
1625         if (obj->ops->pwrite)
1626                 ret = obj->ops->pwrite(obj, args);
1627         if (ret != -ENODEV)
1628                 goto err;
1629
1630         ret = i915_gem_object_wait(obj,
1631                                    I915_WAIT_INTERRUPTIBLE |
1632                                    I915_WAIT_ALL,
1633                                    MAX_SCHEDULE_TIMEOUT,
1634                                    to_rps_client(file));
1635         if (ret)
1636                 goto err;
1637
1638         ret = i915_gem_object_pin_pages(obj);
1639         if (ret)
1640                 goto err;
1641
1642         ret = -EFAULT;
1643         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1644          * it would end up going through the fenced access, and we'll get
1645          * different detiling behavior between reading and writing.
1646          * pread/pwrite currently are reading and writing from the CPU
1647          * perspective, requiring manual detiling by the client.
1648          */
1649         if (!i915_gem_object_has_struct_page(obj) ||
1650             cpu_write_needs_clflush(obj))
1651                 /* Note that the gtt paths might fail with non-page-backed user
1652                  * pointers (e.g. gtt mappings when moving data between
1653                  * textures). Fallback to the shmem path in that case.
1654                  */
1655                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1656
1657         if (ret == -EFAULT || ret == -ENOSPC) {
1658                 if (obj->phys_handle)
1659                         ret = i915_gem_phys_pwrite(obj, args, file);
1660                 else
1661                         ret = i915_gem_shmem_pwrite(obj, args);
1662         }
1663
1664         i915_gem_object_unpin_pages(obj);
1665 err:
1666         i915_gem_object_put(obj);
1667         return ret;
1668 }
1669
1670 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1671 {
1672         struct drm_i915_private *i915;
1673         struct list_head *list;
1674         struct i915_vma *vma;
1675
1676         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1677
1678         for_each_ggtt_vma(vma, obj) {
1679                 if (i915_vma_is_active(vma))
1680                         continue;
1681
1682                 if (!drm_mm_node_allocated(&vma->node))
1683                         continue;
1684
1685                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1686         }
1687
1688         i915 = to_i915(obj->base.dev);
1689         spin_lock(&i915->mm.obj_lock);
1690         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1691         list_move_tail(&obj->mm.link, list);
1692         spin_unlock(&i915->mm.obj_lock);
1693 }
1694
1695 /**
1696  * Called when user space prepares to use an object with the CPU, either
1697  * through the mmap ioctl's mapping or a GTT mapping.
1698  * @dev: drm device
1699  * @data: ioctl data blob
1700  * @file: drm file
1701  */
1702 int
1703 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1704                           struct drm_file *file)
1705 {
1706         struct drm_i915_gem_set_domain *args = data;
1707         struct drm_i915_gem_object *obj;
1708         uint32_t read_domains = args->read_domains;
1709         uint32_t write_domain = args->write_domain;
1710         int err;
1711
1712         /* Only handle setting domains to types used by the CPU. */
1713         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1714                 return -EINVAL;
1715
1716         /* Having something in the write domain implies it's in the read
1717          * domain, and only that read domain.  Enforce that in the request.
1718          */
1719         if (write_domain != 0 && read_domains != write_domain)
1720                 return -EINVAL;
1721
1722         obj = i915_gem_object_lookup(file, args->handle);
1723         if (!obj)
1724                 return -ENOENT;
1725
1726         /* Try to flush the object off the GPU without holding the lock.
1727          * We will repeat the flush holding the lock in the normal manner
1728          * to catch cases where we are gazumped.
1729          */
1730         err = i915_gem_object_wait(obj,
1731                                    I915_WAIT_INTERRUPTIBLE |
1732                                    (write_domain ? I915_WAIT_ALL : 0),
1733                                    MAX_SCHEDULE_TIMEOUT,
1734                                    to_rps_client(file));
1735         if (err)
1736                 goto out;
1737
1738         /*
1739          * Proxy objects do not control access to the backing storage, ergo
1740          * they cannot be used as a means to manipulate the cache domain
1741          * tracking for that backing storage. The proxy object is always
1742          * considered to be outside of any cache domain.
1743          */
1744         if (i915_gem_object_is_proxy(obj)) {
1745                 err = -ENXIO;
1746                 goto out;
1747         }
1748
1749         /*
1750          * Flush and acquire obj->pages so that we are coherent through
1751          * direct access in memory with previous cached writes through
1752          * shmemfs and that our cache domain tracking remains valid.
1753          * For example, if the obj->filp was moved to swap without us
1754          * being notified and releasing the pages, we would mistakenly
1755          * continue to assume that the obj remained out of the CPU cached
1756          * domain.
1757          */
1758         err = i915_gem_object_pin_pages(obj);
1759         if (err)
1760                 goto out;
1761
1762         err = i915_mutex_lock_interruptible(dev);
1763         if (err)
1764                 goto out_unpin;
1765
1766         if (read_domains & I915_GEM_DOMAIN_WC)
1767                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1768         else if (read_domains & I915_GEM_DOMAIN_GTT)
1769                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1770         else
1771                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1772
1773         /* And bump the LRU for this access */
1774         i915_gem_object_bump_inactive_ggtt(obj);
1775
1776         mutex_unlock(&dev->struct_mutex);
1777
1778         if (write_domain != 0)
1779                 intel_fb_obj_invalidate(obj,
1780                                         fb_write_origin(obj, write_domain));
1781
1782 out_unpin:
1783         i915_gem_object_unpin_pages(obj);
1784 out:
1785         i915_gem_object_put(obj);
1786         return err;
1787 }
1788
1789 /**
1790  * Called when user space has done writes to this buffer
1791  * @dev: drm device
1792  * @data: ioctl data blob
1793  * @file: drm file
1794  */
1795 int
1796 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1797                          struct drm_file *file)
1798 {
1799         struct drm_i915_gem_sw_finish *args = data;
1800         struct drm_i915_gem_object *obj;
1801
1802         obj = i915_gem_object_lookup(file, args->handle);
1803         if (!obj)
1804                 return -ENOENT;
1805
1806         /*
1807          * Proxy objects are barred from CPU access, so there is no
1808          * need to ban sw_finish as it is a nop.
1809          */
1810
1811         /* Pinned buffers may be scanout, so flush the cache */
1812         i915_gem_object_flush_if_display(obj);
1813         i915_gem_object_put(obj);
1814
1815         return 0;
1816 }
1817
1818 /**
1819  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1820  *                       it is mapped to.
1821  * @dev: drm device
1822  * @data: ioctl data blob
1823  * @file: drm file
1824  *
1825  * While the mapping holds a reference on the contents of the object, it doesn't
1826  * imply a ref on the object itself.
1827  *
1828  * IMPORTANT:
1829  *
1830  * DRM driver writers who look a this function as an example for how to do GEM
1831  * mmap support, please don't implement mmap support like here. The modern way
1832  * to implement DRM mmap support is with an mmap offset ioctl (like
1833  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1834  * That way debug tooling like valgrind will understand what's going on, hiding
1835  * the mmap call in a driver private ioctl will break that. The i915 driver only
1836  * does cpu mmaps this way because we didn't know better.
1837  */
1838 int
1839 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1840                     struct drm_file *file)
1841 {
1842         struct drm_i915_gem_mmap *args = data;
1843         struct drm_i915_gem_object *obj;
1844         unsigned long addr;
1845
1846         if (args->flags & ~(I915_MMAP_WC))
1847                 return -EINVAL;
1848
1849         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1850                 return -ENODEV;
1851
1852         obj = i915_gem_object_lookup(file, args->handle);
1853         if (!obj)
1854                 return -ENOENT;
1855
1856         /* prime objects have no backing filp to GEM mmap
1857          * pages from.
1858          */
1859         if (!obj->base.filp) {
1860                 i915_gem_object_put(obj);
1861                 return -ENXIO;
1862         }
1863
1864         addr = vm_mmap(obj->base.filp, 0, args->size,
1865                        PROT_READ | PROT_WRITE, MAP_SHARED,
1866                        args->offset);
1867         if (args->flags & I915_MMAP_WC) {
1868                 struct mm_struct *mm = current->mm;
1869                 struct vm_area_struct *vma;
1870
1871                 if (down_write_killable(&mm->mmap_sem)) {
1872                         i915_gem_object_put(obj);
1873                         return -EINTR;
1874                 }
1875                 vma = find_vma(mm, addr);
1876                 if (vma)
1877                         vma->vm_page_prot =
1878                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1879                 else
1880                         addr = -ENOMEM;
1881                 up_write(&mm->mmap_sem);
1882
1883                 /* This may race, but that's ok, it only gets set */
1884                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1885         }
1886         i915_gem_object_put(obj);
1887         if (IS_ERR((void *)addr))
1888                 return addr;
1889
1890         args->addr_ptr = (uint64_t) addr;
1891
1892         return 0;
1893 }
1894
1895 static unsigned int tile_row_pages(struct drm_i915_gem_object *obj)
1896 {
1897         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1898 }
1899
1900 /**
1901  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1902  *
1903  * A history of the GTT mmap interface:
1904  *
1905  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1906  *     aligned and suitable for fencing, and still fit into the available
1907  *     mappable space left by the pinned display objects. A classic problem
1908  *     we called the page-fault-of-doom where we would ping-pong between
1909  *     two objects that could not fit inside the GTT and so the memcpy
1910  *     would page one object in at the expense of the other between every
1911  *     single byte.
1912  *
1913  * 1 - Objects can be any size, and have any compatible fencing (X Y, or none
1914  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1915  *     object is too large for the available space (or simply too large
1916  *     for the mappable aperture!), a view is created instead and faulted
1917  *     into userspace. (This view is aligned and sized appropriately for
1918  *     fenced access.)
1919  *
1920  * 2 - Recognise WC as a separate cache domain so that we can flush the
1921  *     delayed writes via GTT before performing direct access via WC.
1922  *
1923  * Restrictions:
1924  *
1925  *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1926  *    hangs on some architectures, corruption on others. An attempt to service
1927  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1928  *
1929  *  * the object must be able to fit into RAM (physical memory, though no
1930  *    limited to the mappable aperture).
1931  *
1932  *
1933  * Caveats:
1934  *
1935  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1936  *    all data to system memory. Subsequent access will not be synchronized.
1937  *
1938  *  * all mappings are revoked on runtime device suspend.
1939  *
1940  *  * there are only 8, 16 or 32 fence registers to share between all users
1941  *    (older machines require fence register for display and blitter access
1942  *    as well). Contention of the fence registers will cause the previous users
1943  *    to be unmapped and any new access will generate new page faults.
1944  *
1945  *  * running out of memory while servicing a fault may generate a SIGBUS,
1946  *    rather than the expected SIGSEGV.
1947  */
1948 int i915_gem_mmap_gtt_version(void)
1949 {
1950         return 2;
1951 }
1952
1953 static inline struct i915_ggtt_view
1954 compute_partial_view(struct drm_i915_gem_object *obj,
1955                      pgoff_t page_offset,
1956                      unsigned int chunk)
1957 {
1958         struct i915_ggtt_view view;
1959
1960         if (i915_gem_object_is_tiled(obj))
1961                 chunk = roundup(chunk, tile_row_pages(obj));
1962
1963         view.type = I915_GGTT_VIEW_PARTIAL;
1964         view.partial.offset = rounddown(page_offset, chunk);
1965         view.partial.size =
1966                 min_t(unsigned int, chunk,
1967                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1968
1969         /* If the partial covers the entire object, just create a normal VMA. */
1970         if (chunk >= obj->base.size >> PAGE_SHIFT)
1971                 view.type = I915_GGTT_VIEW_NORMAL;
1972
1973         return view;
1974 }
1975
1976 /**
1977  * i915_gem_fault - fault a page into the GTT
1978  * @vmf: fault info
1979  *
1980  * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
1981  * from userspace.  The fault handler takes care of binding the object to
1982  * the GTT (if needed), allocating and programming a fence register (again,
1983  * only if needed based on whether the old reg is still valid or the object
1984  * is tiled) and inserting a new PTE into the faulting process.
1985  *
1986  * Note that the faulting process may involve evicting existing objects
1987  * from the GTT and/or fence registers to make room.  So performance may
1988  * suffer if the GTT working set is large or there are few fence registers
1989  * left.
1990  *
1991  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1992  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1993  */
1994 int i915_gem_fault(struct vm_fault *vmf)
1995 {
1996 #define MIN_CHUNK_PAGES ((1 << 20) >> PAGE_SHIFT) /* 1 MiB */
1997         struct vm_area_struct *area = vmf->vma;
1998         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1999         struct drm_device *dev = obj->base.dev;
2000         struct drm_i915_private *dev_priv = to_i915(dev);
2001         struct i915_ggtt *ggtt = &dev_priv->ggtt;
2002         bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
2003         struct i915_vma *vma;
2004         pgoff_t page_offset;
2005         unsigned int flags;
2006         int ret;
2007
2008         /* We don't use vmf->pgoff since that has the fake offset */
2009         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
2010
2011         trace_i915_gem_object_fault(obj, page_offset, true, write);
2012
2013         /* Try to flush the object off the GPU first without holding the lock.
2014          * Upon acquiring the lock, we will perform our sanity checks and then
2015          * repeat the flush holding the lock in the normal manner to catch cases
2016          * where we are gazumped.
2017          */
2018         ret = i915_gem_object_wait(obj,
2019                                    I915_WAIT_INTERRUPTIBLE,
2020                                    MAX_SCHEDULE_TIMEOUT,
2021                                    NULL);
2022         if (ret)
2023                 goto err;
2024
2025         ret = i915_gem_object_pin_pages(obj);
2026         if (ret)
2027                 goto err;
2028
2029         intel_runtime_pm_get(dev_priv);
2030
2031         ret = i915_mutex_lock_interruptible(dev);
2032         if (ret)
2033                 goto err_rpm;
2034
2035         /* Access to snoopable pages through the GTT is incoherent. */
2036         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
2037                 ret = -EFAULT;
2038                 goto err_unlock;
2039         }
2040
2041         /* If the object is smaller than a couple of partial vma, it is
2042          * not worth only creating a single partial vma - we may as well
2043          * clear enough space for the full object.
2044          */
2045         flags = PIN_MAPPABLE;
2046         if (obj->base.size > 2 * MIN_CHUNK_PAGES << PAGE_SHIFT)
2047                 flags |= PIN_NONBLOCK | PIN_NONFAULT;
2048
2049         /* Now pin it into the GTT as needed */
2050         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, flags);
2051         if (IS_ERR(vma)) {
2052                 /* Use a partial view if it is bigger than available space */
2053                 struct i915_ggtt_view view =
2054                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
2055
2056                 /* Userspace is now writing through an untracked VMA, abandon
2057                  * all hope that the hardware is able to track future writes.
2058                  */
2059                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
2060
2061                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, PIN_MAPPABLE);
2062         }
2063         if (IS_ERR(vma)) {
2064                 ret = PTR_ERR(vma);
2065                 goto err_unlock;
2066         }
2067
2068         ret = i915_gem_object_set_to_gtt_domain(obj, write);
2069         if (ret)
2070                 goto err_unpin;
2071
2072         ret = i915_vma_pin_fence(vma);
2073         if (ret)
2074                 goto err_unpin;
2075
2076         /* Finally, remap it using the new GTT offset */
2077         ret = remap_io_mapping(area,
2078                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
2079                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
2080                                min_t(u64, vma->size, area->vm_end - area->vm_start),
2081                                &ggtt->iomap);
2082         if (ret)
2083                 goto err_fence;
2084
2085         /* Mark as being mmapped into userspace for later revocation */
2086         assert_rpm_wakelock_held(dev_priv);
2087         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
2088                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
2089         GEM_BUG_ON(!obj->userfault_count);
2090
2091         i915_vma_set_ggtt_write(vma);
2092
2093 err_fence:
2094         i915_vma_unpin_fence(vma);
2095 err_unpin:
2096         __i915_vma_unpin(vma);
2097 err_unlock:
2098         mutex_unlock(&dev->struct_mutex);
2099 err_rpm:
2100         intel_runtime_pm_put(dev_priv);
2101         i915_gem_object_unpin_pages(obj);
2102 err:
2103         switch (ret) {
2104         case -EIO:
2105                 /*
2106                  * We eat errors when the gpu is terminally wedged to avoid
2107                  * userspace unduly crashing (gl has no provisions for mmaps to
2108                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
2109                  * and so needs to be reported.
2110                  */
2111                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
2112                         ret = VM_FAULT_SIGBUS;
2113                         break;
2114                 }
2115         case -EAGAIN:
2116                 /*
2117                  * EAGAIN means the gpu is hung and we'll wait for the error
2118                  * handler to reset everything when re-faulting in
2119                  * i915_mutex_lock_interruptible.
2120                  */
2121         case 0:
2122         case -ERESTARTSYS:
2123         case -EINTR:
2124         case -EBUSY:
2125                 /*
2126                  * EBUSY is ok: this just means that another thread
2127                  * already did the job.
2128                  */
2129                 ret = VM_FAULT_NOPAGE;
2130                 break;
2131         case -ENOMEM:
2132                 ret = VM_FAULT_OOM;
2133                 break;
2134         case -ENOSPC:
2135         case -EFAULT:
2136                 ret = VM_FAULT_SIGBUS;
2137                 break;
2138         default:
2139                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2140                 ret = VM_FAULT_SIGBUS;
2141                 break;
2142         }
2143         return ret;
2144 }
2145
2146 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2147 {
2148         struct i915_vma *vma;
2149
2150         GEM_BUG_ON(!obj->userfault_count);
2151
2152         obj->userfault_count = 0;
2153         list_del(&obj->userfault_link);
2154         drm_vma_node_unmap(&obj->base.vma_node,
2155                            obj->base.dev->anon_inode->i_mapping);
2156
2157         for_each_ggtt_vma(vma, obj)
2158                 i915_vma_unset_userfault(vma);
2159 }
2160
2161 /**
2162  * i915_gem_release_mmap - remove physical page mappings
2163  * @obj: obj in question
2164  *
2165  * Preserve the reservation of the mmapping with the DRM core code, but
2166  * relinquish ownership of the pages back to the system.
2167  *
2168  * It is vital that we remove the page mapping if we have mapped a tiled
2169  * object through the GTT and then lose the fence register due to
2170  * resource pressure. Similarly if the object has been moved out of the
2171  * aperture, than pages mapped into userspace must be revoked. Removing the
2172  * mapping will then trigger a page fault on the next user access, allowing
2173  * fixup by i915_gem_fault().
2174  */
2175 void
2176 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2177 {
2178         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2179
2180         /* Serialisation between user GTT access and our code depends upon
2181          * revoking the CPU's PTE whilst the mutex is held. The next user
2182          * pagefault then has to wait until we release the mutex.
2183          *
2184          * Note that RPM complicates somewhat by adding an additional
2185          * requirement that operations to the GGTT be made holding the RPM
2186          * wakeref.
2187          */
2188         lockdep_assert_held(&i915->drm.struct_mutex);
2189         intel_runtime_pm_get(i915);
2190
2191         if (!obj->userfault_count)
2192                 goto out;
2193
2194         __i915_gem_object_release_mmap(obj);
2195
2196         /* Ensure that the CPU's PTE are revoked and there are not outstanding
2197          * memory transactions from userspace before we return. The TLB
2198          * flushing implied above by changing the PTE above *should* be
2199          * sufficient, an extra barrier here just provides us with a bit
2200          * of paranoid documentation about our requirement to serialise
2201          * memory writes before touching registers / GSM.
2202          */
2203         wmb();
2204
2205 out:
2206         intel_runtime_pm_put(i915);
2207 }
2208
2209 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2210 {
2211         struct drm_i915_gem_object *obj, *on;
2212         int i;
2213
2214         /*
2215          * Only called during RPM suspend. All users of the userfault_list
2216          * must be holding an RPM wakeref to ensure that this can not
2217          * run concurrently with themselves (and use the struct_mutex for
2218          * protection between themselves).
2219          */
2220
2221         list_for_each_entry_safe(obj, on,
2222                                  &dev_priv->mm.userfault_list, userfault_link)
2223                 __i915_gem_object_release_mmap(obj);
2224
2225         /* The fence will be lost when the device powers down. If any were
2226          * in use by hardware (i.e. they are pinned), we should not be powering
2227          * down! All other fences will be reacquired by the user upon waking.
2228          */
2229         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2230                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2231
2232                 /* Ideally we want to assert that the fence register is not
2233                  * live at this point (i.e. that no piece of code will be
2234                  * trying to write through fence + GTT, as that both violates
2235                  * our tracking of activity and associated locking/barriers,
2236                  * but also is illegal given that the hw is powered down).
2237                  *
2238                  * Previously we used reg->pin_count as a "liveness" indicator.
2239                  * That is not sufficient, and we need a more fine-grained
2240                  * tool if we want to have a sanity check here.
2241                  */
2242
2243                 if (!reg->vma)
2244                         continue;
2245
2246                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2247                 reg->dirty = true;
2248         }
2249 }
2250
2251 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2252 {
2253         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2254         int err;
2255
2256         err = drm_gem_create_mmap_offset(&obj->base);
2257         if (likely(!err))
2258                 return 0;
2259
2260         /* Attempt to reap some mmap space from dead objects */
2261         do {
2262                 err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
2263                 if (err)
2264                         break;
2265
2266                 i915_gem_drain_freed_objects(dev_priv);
2267                 err = drm_gem_create_mmap_offset(&obj->base);
2268                 if (!err)
2269                         break;
2270
2271         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2272
2273         return err;
2274 }
2275
2276 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2277 {
2278         drm_gem_free_mmap_offset(&obj->base);
2279 }
2280
2281 int
2282 i915_gem_mmap_gtt(struct drm_file *file,
2283                   struct drm_device *dev,
2284                   uint32_t handle,
2285                   uint64_t *offset)
2286 {
2287         struct drm_i915_gem_object *obj;
2288         int ret;
2289
2290         obj = i915_gem_object_lookup(file, handle);
2291         if (!obj)
2292                 return -ENOENT;
2293
2294         ret = i915_gem_object_create_mmap_offset(obj);
2295         if (ret == 0)
2296                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2297
2298         i915_gem_object_put(obj);
2299         return ret;
2300 }
2301
2302 /**
2303  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2304  * @dev: DRM device
2305  * @data: GTT mapping ioctl data
2306  * @file: GEM object info
2307  *
2308  * Simply returns the fake offset to userspace so it can mmap it.
2309  * The mmap call will end up in drm_gem_mmap(), which will set things
2310  * up so we can get faults in the handler above.
2311  *
2312  * The fault handler will take care of binding the object into the GTT
2313  * (since it may have been evicted to make room for something), allocating
2314  * a fence register, and mapping the appropriate aperture address into
2315  * userspace.
2316  */
2317 int
2318 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2319                         struct drm_file *file)
2320 {
2321         struct drm_i915_gem_mmap_gtt *args = data;
2322
2323         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2324 }
2325
2326 /* Immediately discard the backing storage */
2327 static void
2328 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2329 {
2330         i915_gem_object_free_mmap_offset(obj);
2331
2332         if (obj->base.filp == NULL)
2333                 return;
2334
2335         /* Our goal here is to return as much of the memory as
2336          * is possible back to the system as we are called from OOM.
2337          * To do this we must instruct the shmfs to drop all of its
2338          * backing pages, *now*.
2339          */
2340         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2341         obj->mm.madv = __I915_MADV_PURGED;
2342         obj->mm.pages = ERR_PTR(-EFAULT);
2343 }
2344
2345 /* Try to discard unwanted pages */
2346 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2347 {
2348         struct address_space *mapping;
2349
2350         lockdep_assert_held(&obj->mm.lock);
2351         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2352
2353         switch (obj->mm.madv) {
2354         case I915_MADV_DONTNEED:
2355                 i915_gem_object_truncate(obj);
2356         case __I915_MADV_PURGED:
2357                 return;
2358         }
2359
2360         if (obj->base.filp == NULL)
2361                 return;
2362
2363         mapping = obj->base.filp->f_mapping,
2364         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2365 }
2366
2367 static void
2368 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2369                               struct sg_table *pages)
2370 {
2371         struct sgt_iter sgt_iter;
2372         struct page *page;
2373
2374         __i915_gem_object_release_shmem(obj, pages, true);
2375
2376         i915_gem_gtt_finish_pages(obj, pages);
2377
2378         if (i915_gem_object_needs_bit17_swizzle(obj))
2379                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2380
2381         for_each_sgt_page(page, sgt_iter, pages) {
2382                 if (obj->mm.dirty)
2383                         set_page_dirty(page);
2384
2385                 if (obj->mm.madv == I915_MADV_WILLNEED)
2386                         mark_page_accessed(page);
2387
2388                 put_page(page);
2389         }
2390         obj->mm.dirty = false;
2391
2392         sg_free_table(pages);
2393         kfree(pages);
2394 }
2395
2396 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2397 {
2398         struct radix_tree_iter iter;
2399         void __rcu **slot;
2400
2401         rcu_read_lock();
2402         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2403                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2404         rcu_read_unlock();
2405 }
2406
2407 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2408                                  enum i915_mm_subclass subclass)
2409 {
2410         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2411         struct sg_table *pages;
2412
2413         if (i915_gem_object_has_pinned_pages(obj))
2414                 return;
2415
2416         GEM_BUG_ON(obj->bind_count);
2417         if (!i915_gem_object_has_pages(obj))
2418                 return;
2419
2420         /* May be called by shrinker from within get_pages() (on another bo) */
2421         mutex_lock_nested(&obj->mm.lock, subclass);
2422         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2423                 goto unlock;
2424
2425         /* ->put_pages might need to allocate memory for the bit17 swizzle
2426          * array, hence protect them from being reaped by removing them from gtt
2427          * lists early. */
2428         pages = fetch_and_zero(&obj->mm.pages);
2429         GEM_BUG_ON(!pages);
2430
2431         spin_lock(&i915->mm.obj_lock);
2432         list_del(&obj->mm.link);
2433         spin_unlock(&i915->mm.obj_lock);
2434
2435         if (obj->mm.mapping) {
2436                 void *ptr;
2437
2438                 ptr = page_mask_bits(obj->mm.mapping);
2439                 if (is_vmalloc_addr(ptr))
2440                         vunmap(ptr);
2441                 else
2442                         kunmap(kmap_to_page(ptr));
2443
2444                 obj->mm.mapping = NULL;
2445         }
2446
2447         __i915_gem_object_reset_page_iter(obj);
2448
2449         if (!IS_ERR(pages))
2450                 obj->ops->put_pages(obj, pages);
2451
2452         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2453
2454 unlock:
2455         mutex_unlock(&obj->mm.lock);
2456 }
2457
2458 static bool i915_sg_trim(struct sg_table *orig_st)
2459 {
2460         struct sg_table new_st;
2461         struct scatterlist *sg, *new_sg;
2462         unsigned int i;
2463
2464         if (orig_st->nents == orig_st->orig_nents)
2465                 return false;
2466
2467         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2468                 return false;
2469
2470         new_sg = new_st.sgl;
2471         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2472                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2473                 /* called before being DMA mapped, no need to copy sg->dma_* */
2474                 new_sg = sg_next(new_sg);
2475         }
2476         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2477
2478         sg_free_table(orig_st);
2479
2480         *orig_st = new_st;
2481         return true;
2482 }
2483
2484 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2485 {
2486         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2487         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2488         unsigned long i;
2489         struct address_space *mapping;
2490         struct sg_table *st;
2491         struct scatterlist *sg;
2492         struct sgt_iter sgt_iter;
2493         struct page *page;
2494         unsigned long last_pfn = 0;     /* suppress gcc warning */
2495         unsigned int max_segment = i915_sg_segment_size();
2496         unsigned int sg_page_sizes;
2497         gfp_t noreclaim;
2498         int ret;
2499
2500         /* Assert that the object is not currently in any GPU domain. As it
2501          * wasn't in the GTT, there shouldn't be any way it could have been in
2502          * a GPU cache
2503          */
2504         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2505         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2506
2507         st = kmalloc(sizeof(*st), GFP_KERNEL);
2508         if (st == NULL)
2509                 return -ENOMEM;
2510
2511 rebuild_st:
2512         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2513                 kfree(st);
2514                 return -ENOMEM;
2515         }
2516
2517         /* Get the list of pages out of our struct file.  They'll be pinned
2518          * at this point until we release them.
2519          *
2520          * Fail silently without starting the shrinker
2521          */
2522         mapping = obj->base.filp->f_mapping;
2523         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2524         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2525
2526         sg = st->sgl;
2527         st->nents = 0;
2528         sg_page_sizes = 0;
2529         for (i = 0; i < page_count; i++) {
2530                 const unsigned int shrink[] = {
2531                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2532                         0,
2533                 }, *s = shrink;
2534                 gfp_t gfp = noreclaim;
2535
2536                 do {
2537                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2538                         if (likely(!IS_ERR(page)))
2539                                 break;
2540
2541                         if (!*s) {
2542                                 ret = PTR_ERR(page);
2543                                 goto err_sg;
2544                         }
2545
2546                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2547                         cond_resched();
2548
2549                         /* We've tried hard to allocate the memory by reaping
2550                          * our own buffer, now let the real VM do its job and
2551                          * go down in flames if truly OOM.
2552                          *
2553                          * However, since graphics tend to be disposable,
2554                          * defer the oom here by reporting the ENOMEM back
2555                          * to userspace.
2556                          */
2557                         if (!*s) {
2558                                 /* reclaim and warn, but no oom */
2559                                 gfp = mapping_gfp_mask(mapping);
2560
2561                                 /* Our bo are always dirty and so we require
2562                                  * kswapd to reclaim our pages (direct reclaim
2563                                  * does not effectively begin pageout of our
2564                                  * buffers on its own). However, direct reclaim
2565                                  * only waits for kswapd when under allocation
2566                                  * congestion. So as a result __GFP_RECLAIM is
2567                                  * unreliable and fails to actually reclaim our
2568                                  * dirty pages -- unless you try over and over
2569                                  * again with !__GFP_NORETRY. However, we still
2570                                  * want to fail this allocation rather than
2571                                  * trigger the out-of-memory killer and for
2572                                  * this we want __GFP_RETRY_MAYFAIL.
2573                                  */
2574                                 gfp |= __GFP_RETRY_MAYFAIL;
2575                         }
2576                 } while (1);
2577
2578                 if (!i ||
2579                     sg->length >= max_segment ||
2580                     page_to_pfn(page) != last_pfn + 1) {
2581                         if (i) {
2582                                 sg_page_sizes |= sg->length;
2583                                 sg = sg_next(sg);
2584                         }
2585                         st->nents++;
2586                         sg_set_page(sg, page, PAGE_SIZE, 0);
2587                 } else {
2588                         sg->length += PAGE_SIZE;
2589                 }
2590                 last_pfn = page_to_pfn(page);
2591
2592                 /* Check that the i965g/gm workaround works. */
2593                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2594         }
2595         if (sg) { /* loop terminated early; short sg table */
2596                 sg_page_sizes |= sg->length;
2597                 sg_mark_end(sg);
2598         }
2599
2600         /* Trim unused sg entries to avoid wasting memory. */
2601         i915_sg_trim(st);
2602
2603         ret = i915_gem_gtt_prepare_pages(obj, st);
2604         if (ret) {
2605                 /* DMA remapping failed? One possible cause is that
2606                  * it could not reserve enough large entries, asking
2607                  * for PAGE_SIZE chunks instead may be helpful.
2608                  */
2609                 if (max_segment > PAGE_SIZE) {
2610                         for_each_sgt_page(page, sgt_iter, st)
2611                                 put_page(page);
2612                         sg_free_table(st);
2613
2614                         max_segment = PAGE_SIZE;
2615                         goto rebuild_st;
2616                 } else {
2617                         dev_warn(&dev_priv->drm.pdev->dev,
2618                                  "Failed to DMA remap %lu pages\n",
2619                                  page_count);
2620                         goto err_pages;
2621                 }
2622         }
2623
2624         if (i915_gem_object_needs_bit17_swizzle(obj))
2625                 i915_gem_object_do_bit_17_swizzle(obj, st);
2626
2627         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2628
2629         return 0;
2630
2631 err_sg:
2632         sg_mark_end(sg);
2633 err_pages:
2634         for_each_sgt_page(page, sgt_iter, st)
2635                 put_page(page);
2636         sg_free_table(st);
2637         kfree(st);
2638
2639         /* shmemfs first checks if there is enough memory to allocate the page
2640          * and reports ENOSPC should there be insufficient, along with the usual
2641          * ENOMEM for a genuine allocation failure.
2642          *
2643          * We use ENOSPC in our driver to mean that we have run out of aperture
2644          * space and so want to translate the error from shmemfs back to our
2645          * usual understanding of ENOMEM.
2646          */
2647         if (ret == -ENOSPC)
2648                 ret = -ENOMEM;
2649
2650         return ret;
2651 }
2652
2653 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2654                                  struct sg_table *pages,
2655                                  unsigned int sg_page_sizes)
2656 {
2657         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2658         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2659         int i;
2660
2661         lockdep_assert_held(&obj->mm.lock);
2662
2663         obj->mm.get_page.sg_pos = pages->sgl;
2664         obj->mm.get_page.sg_idx = 0;
2665
2666         obj->mm.pages = pages;
2667
2668         if (i915_gem_object_is_tiled(obj) &&
2669             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2670                 GEM_BUG_ON(obj->mm.quirked);
2671                 __i915_gem_object_pin_pages(obj);
2672                 obj->mm.quirked = true;
2673         }
2674
2675         GEM_BUG_ON(!sg_page_sizes);
2676         obj->mm.page_sizes.phys = sg_page_sizes;
2677
2678         /*
2679          * Calculate the supported page-sizes which fit into the given
2680          * sg_page_sizes. This will give us the page-sizes which we may be able
2681          * to use opportunistically when later inserting into the GTT. For
2682          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2683          * 64K or 4K pages, although in practice this will depend on a number of
2684          * other factors.
2685          */
2686         obj->mm.page_sizes.sg = 0;
2687         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2688                 if (obj->mm.page_sizes.phys & ~0u << i)
2689                         obj->mm.page_sizes.sg |= BIT(i);
2690         }
2691         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2692
2693         spin_lock(&i915->mm.obj_lock);
2694         list_add(&obj->mm.link, &i915->mm.unbound_list);
2695         spin_unlock(&i915->mm.obj_lock);
2696 }
2697
2698 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2699 {
2700         int err;
2701
2702         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2703                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2704                 return -EFAULT;
2705         }
2706
2707         err = obj->ops->get_pages(obj);
2708         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2709
2710         return err;
2711 }
2712
2713 /* Ensure that the associated pages are gathered from the backing storage
2714  * and pinned into our object. i915_gem_object_pin_pages() may be called
2715  * multiple times before they are released by a single call to
2716  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2717  * either as a result of memory pressure (reaping pages under the shrinker)
2718  * or as the object is itself released.
2719  */
2720 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2721 {
2722         int err;
2723
2724         err = mutex_lock_interruptible(&obj->mm.lock);
2725         if (err)
2726                 return err;
2727
2728         if (unlikely(!i915_gem_object_has_pages(obj))) {
2729                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2730
2731                 err = ____i915_gem_object_get_pages(obj);
2732                 if (err)
2733                         goto unlock;
2734
2735                 smp_mb__before_atomic();
2736         }
2737         atomic_inc(&obj->mm.pages_pin_count);
2738
2739 unlock:
2740         mutex_unlock(&obj->mm.lock);
2741         return err;
2742 }
2743
2744 /* The 'mapping' part of i915_gem_object_pin_map() below */
2745 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2746                                  enum i915_map_type type)
2747 {
2748         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2749         struct sg_table *sgt = obj->mm.pages;
2750         struct sgt_iter sgt_iter;
2751         struct page *page;
2752         struct page *stack_pages[32];
2753         struct page **pages = stack_pages;
2754         unsigned long i = 0;
2755         pgprot_t pgprot;
2756         void *addr;
2757
2758         /* A single page can always be kmapped */
2759         if (n_pages == 1 && type == I915_MAP_WB)
2760                 return kmap(sg_page(sgt->sgl));
2761
2762         if (n_pages > ARRAY_SIZE(stack_pages)) {
2763                 /* Too big for stack -- allocate temporary array instead */
2764                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2765                 if (!pages)
2766                         return NULL;
2767         }
2768
2769         for_each_sgt_page(page, sgt_iter, sgt)
2770                 pages[i++] = page;
2771
2772         /* Check that we have the expected number of pages */
2773         GEM_BUG_ON(i != n_pages);
2774
2775         switch (type) {
2776         default:
2777                 MISSING_CASE(type);
2778                 /* fallthrough to use PAGE_KERNEL anyway */
2779         case I915_MAP_WB:
2780                 pgprot = PAGE_KERNEL;
2781                 break;
2782         case I915_MAP_WC:
2783                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2784                 break;
2785         }
2786         addr = vmap(pages, n_pages, 0, pgprot);
2787
2788         if (pages != stack_pages)
2789                 kvfree(pages);
2790
2791         return addr;
2792 }
2793
2794 /* get, pin, and map the pages of the object into kernel space */
2795 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2796                               enum i915_map_type type)
2797 {
2798         enum i915_map_type has_type;
2799         bool pinned;
2800         void *ptr;
2801         int ret;
2802
2803         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2804                 return ERR_PTR(-ENXIO);
2805
2806         ret = mutex_lock_interruptible(&obj->mm.lock);
2807         if (ret)
2808                 return ERR_PTR(ret);
2809
2810         pinned = !(type & I915_MAP_OVERRIDE);
2811         type &= ~I915_MAP_OVERRIDE;
2812
2813         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2814                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2815                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2816
2817                         ret = ____i915_gem_object_get_pages(obj);
2818                         if (ret)
2819                                 goto err_unlock;
2820
2821                         smp_mb__before_atomic();
2822                 }
2823                 atomic_inc(&obj->mm.pages_pin_count);
2824                 pinned = false;
2825         }
2826         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2827
2828         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2829         if (ptr && has_type != type) {
2830                 if (pinned) {
2831                         ret = -EBUSY;
2832                         goto err_unpin;
2833                 }
2834
2835                 if (is_vmalloc_addr(ptr))
2836                         vunmap(ptr);
2837                 else
2838                         kunmap(kmap_to_page(ptr));
2839
2840                 ptr = obj->mm.mapping = NULL;
2841         }
2842
2843         if (!ptr) {
2844                 ptr = i915_gem_object_map(obj, type);
2845                 if (!ptr) {
2846                         ret = -ENOMEM;
2847                         goto err_unpin;
2848                 }
2849
2850                 obj->mm.mapping = page_pack_bits(ptr, type);
2851         }
2852
2853 out_unlock:
2854         mutex_unlock(&obj->mm.lock);
2855         return ptr;
2856
2857 err_unpin:
2858         atomic_dec(&obj->mm.pages_pin_count);
2859 err_unlock:
2860         ptr = ERR_PTR(ret);
2861         goto out_unlock;
2862 }
2863
2864 static int
2865 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2866                            const struct drm_i915_gem_pwrite *arg)
2867 {
2868         struct address_space *mapping = obj->base.filp->f_mapping;
2869         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2870         u64 remain, offset;
2871         unsigned int pg;
2872
2873         /* Before we instantiate/pin the backing store for our use, we
2874          * can prepopulate the shmemfs filp efficiently using a write into
2875          * the pagecache. We avoid the penalty of instantiating all the
2876          * pages, important if the user is just writing to a few and never
2877          * uses the object on the GPU, and using a direct write into shmemfs
2878          * allows it to avoid the cost of retrieving a page (either swapin
2879          * or clearing-before-use) before it is overwritten.
2880          */
2881         if (i915_gem_object_has_pages(obj))
2882                 return -ENODEV;
2883
2884         if (obj->mm.madv != I915_MADV_WILLNEED)
2885                 return -EFAULT;
2886
2887         /* Before the pages are instantiated the object is treated as being
2888          * in the CPU domain. The pages will be clflushed as required before
2889          * use, and we can freely write into the pages directly. If userspace
2890          * races pwrite with any other operation; corruption will ensue -
2891          * that is userspace's prerogative!
2892          */
2893
2894         remain = arg->size;
2895         offset = arg->offset;
2896         pg = offset_in_page(offset);
2897
2898         do {
2899                 unsigned int len, unwritten;
2900                 struct page *page;
2901                 void *data, *vaddr;
2902                 int err;
2903
2904                 len = PAGE_SIZE - pg;
2905                 if (len > remain)
2906                         len = remain;
2907
2908                 err = pagecache_write_begin(obj->base.filp, mapping,
2909                                             offset, len, 0,
2910                                             &page, &data);
2911                 if (err < 0)
2912                         return err;
2913
2914                 vaddr = kmap(page);
2915                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2916                 kunmap(page);
2917
2918                 err = pagecache_write_end(obj->base.filp, mapping,
2919                                           offset, len, len - unwritten,
2920                                           page, data);
2921                 if (err < 0)
2922                         return err;
2923
2924                 if (unwritten)
2925                         return -EFAULT;
2926
2927                 remain -= len;
2928                 user_data += len;
2929                 offset += len;
2930                 pg = 0;
2931         } while (remain);
2932
2933         return 0;
2934 }
2935
2936 static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
2937                                         const struct i915_gem_context *ctx)
2938 {
2939         unsigned int score;
2940         unsigned long prev_hang;
2941
2942         if (i915_gem_context_is_banned(ctx))
2943                 score = I915_CLIENT_SCORE_CONTEXT_BAN;
2944         else
2945                 score = 0;
2946
2947         prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
2948         if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
2949                 score += I915_CLIENT_SCORE_HANG_FAST;
2950
2951         if (score) {
2952                 atomic_add(score, &file_priv->ban_score);
2953
2954                 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
2955                                  ctx->name, score,
2956                                  atomic_read(&file_priv->ban_score));
2957         }
2958 }
2959
2960 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
2961 {
2962         unsigned int score;
2963         bool banned, bannable;
2964
2965         atomic_inc(&ctx->guilty_count);
2966
2967         bannable = i915_gem_context_is_bannable(ctx);
2968         score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
2969         banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
2970
2971         DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, ban %s\n",
2972                          ctx->name, atomic_read(&ctx->guilty_count),
2973                          score, yesno(banned && bannable));
2974
2975         /* Cool contexts don't accumulate client ban score */
2976         if (!bannable)
2977                 return;
2978
2979         if (banned)
2980                 i915_gem_context_set_banned(ctx);
2981
2982         if (!IS_ERR_OR_NULL(ctx->file_priv))
2983                 i915_gem_client_mark_guilty(ctx->file_priv, ctx);
2984 }
2985
2986 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
2987 {
2988         atomic_inc(&ctx->active_count);
2989 }
2990
2991 struct i915_request *
2992 i915_gem_find_active_request(struct intel_engine_cs *engine)
2993 {
2994         struct i915_request *request, *active = NULL;
2995         unsigned long flags;
2996
2997         /*
2998          * We are called by the error capture, reset and to dump engine
2999          * state at random points in time. In particular, note that neither is
3000          * crucially ordered with an interrupt. After a hang, the GPU is dead
3001          * and we assume that no more writes can happen (we waited long enough
3002          * for all writes that were in transaction to be flushed) - adding an
3003          * extra delay for a recent interrupt is pointless. Hence, we do
3004          * not need an engine->irq_seqno_barrier() before the seqno reads.
3005          * At all other times, we must assume the GPU is still running, but
3006          * we only care about the snapshot of this moment.
3007          */
3008         spin_lock_irqsave(&engine->timeline.lock, flags);
3009         list_for_each_entry(request, &engine->timeline.requests, link) {
3010                 if (__i915_request_completed(request, request->global_seqno))
3011                         continue;
3012
3013                 active = request;
3014                 break;
3015         }
3016         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3017
3018         return active;
3019 }
3020
3021 /*
3022  * Ensure irq handler finishes, and not run again.
3023  * Also return the active request so that we only search for it once.
3024  */
3025 struct i915_request *
3026 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
3027 {
3028         struct i915_request *request = NULL;
3029
3030         /*
3031          * During the reset sequence, we must prevent the engine from
3032          * entering RC6. As the context state is undefined until we restart
3033          * the engine, if it does enter RC6 during the reset, the state
3034          * written to the powercontext is undefined and so we may lose
3035          * GPU state upon resume, i.e. fail to restart after a reset.
3036          */
3037         intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
3038
3039         /*
3040          * Prevent the signaler thread from updating the request
3041          * state (by calling dma_fence_signal) as we are processing
3042          * the reset. The write from the GPU of the seqno is
3043          * asynchronous and the signaler thread may see a different
3044          * value to us and declare the request complete, even though
3045          * the reset routine have picked that request as the active
3046          * (incomplete) request. This conflict is not handled
3047          * gracefully!
3048          */
3049         kthread_park(engine->breadcrumbs.signaler);
3050
3051         /*
3052          * Prevent request submission to the hardware until we have
3053          * completed the reset in i915_gem_reset_finish(). If a request
3054          * is completed by one engine, it may then queue a request
3055          * to a second via its execlists->tasklet *just* as we are
3056          * calling engine->init_hw() and also writing the ELSP.
3057          * Turning off the execlists->tasklet until the reset is over
3058          * prevents the race.
3059          *
3060          * Note that this needs to be a single atomic operation on the
3061          * tasklet (flush existing tasks, prevent new tasks) to prevent
3062          * a race between reset and set-wedged. It is not, so we do the best
3063          * we can atm and make sure we don't lock the machine up in the more
3064          * common case of recursively being called from set-wedged from inside
3065          * i915_reset.
3066          */
3067         if (!atomic_read(&engine->execlists.tasklet.count))
3068                 tasklet_kill(&engine->execlists.tasklet);
3069         tasklet_disable(&engine->execlists.tasklet);
3070
3071         /*
3072          * We're using worker to queue preemption requests from the tasklet in
3073          * GuC submission mode.
3074          * Even though tasklet was disabled, we may still have a worker queued.
3075          * Let's make sure that all workers scheduled before disabling the
3076          * tasklet are completed before continuing with the reset.
3077          */
3078         if (engine->i915->guc.preempt_wq)
3079                 flush_workqueue(engine->i915->guc.preempt_wq);
3080
3081         if (engine->irq_seqno_barrier)
3082                 engine->irq_seqno_barrier(engine);
3083
3084         request = i915_gem_find_active_request(engine);
3085         if (request && request->fence.error == -EIO)
3086                 request = ERR_PTR(-EIO); /* Previous reset failed! */
3087
3088         return request;
3089 }
3090
3091 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
3092 {
3093         struct intel_engine_cs *engine;
3094         struct i915_request *request;
3095         enum intel_engine_id id;
3096         int err = 0;
3097
3098         for_each_engine(engine, dev_priv, id) {
3099                 request = i915_gem_reset_prepare_engine(engine);
3100                 if (IS_ERR(request)) {
3101                         err = PTR_ERR(request);
3102                         continue;
3103                 }
3104
3105                 engine->hangcheck.active_request = request;
3106         }
3107
3108         i915_gem_revoke_fences(dev_priv);
3109         intel_uc_sanitize(dev_priv);
3110
3111         return err;
3112 }
3113
3114 static void skip_request(struct i915_request *request)
3115 {
3116         void *vaddr = request->ring->vaddr;
3117         u32 head;
3118
3119         /* As this request likely depends on state from the lost
3120          * context, clear out all the user operations leaving the
3121          * breadcrumb at the end (so we get the fence notifications).
3122          */
3123         head = request->head;
3124         if (request->postfix < head) {
3125                 memset(vaddr + head, 0, request->ring->size - head);
3126                 head = 0;
3127         }
3128         memset(vaddr + head, 0, request->postfix - head);
3129
3130         dma_fence_set_error(&request->fence, -EIO);
3131 }
3132
3133 static void engine_skip_context(struct i915_request *request)
3134 {
3135         struct intel_engine_cs *engine = request->engine;
3136         struct i915_gem_context *hung_ctx = request->ctx;
3137         struct i915_timeline *timeline = request->timeline;
3138         unsigned long flags;
3139
3140         GEM_BUG_ON(timeline == &engine->timeline);
3141
3142         spin_lock_irqsave(&engine->timeline.lock, flags);
3143         spin_lock_nested(&timeline->lock, SINGLE_DEPTH_NESTING);
3144
3145         list_for_each_entry_continue(request, &engine->timeline.requests, link)
3146                 if (request->ctx == hung_ctx)
3147                         skip_request(request);
3148
3149         list_for_each_entry(request, &timeline->requests, link)
3150                 skip_request(request);
3151
3152         spin_unlock(&timeline->lock);
3153         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3154 }
3155
3156 /* Returns the request if it was guilty of the hang */
3157 static struct i915_request *
3158 i915_gem_reset_request(struct intel_engine_cs *engine,
3159                        struct i915_request *request,
3160                        bool stalled)
3161 {
3162         /* The guilty request will get skipped on a hung engine.
3163          *
3164          * Users of client default contexts do not rely on logical
3165          * state preserved between batches so it is safe to execute
3166          * queued requests following the hang. Non default contexts
3167          * rely on preserved state, so skipping a batch loses the
3168          * evolution of the state and it needs to be considered corrupted.
3169          * Executing more queued batches on top of corrupted state is
3170          * risky. But we take the risk by trying to advance through
3171          * the queued requests in order to make the client behaviour
3172          * more predictable around resets, by not throwing away random
3173          * amount of batches it has prepared for execution. Sophisticated
3174          * clients can use gem_reset_stats_ioctl and dma fence status
3175          * (exported via sync_file info ioctl on explicit fences) to observe
3176          * when it loses the context state and should rebuild accordingly.
3177          *
3178          * The context ban, and ultimately the client ban, mechanism are safety
3179          * valves if client submission ends up resulting in nothing more than
3180          * subsequent hangs.
3181          */
3182
3183         if (i915_request_completed(request)) {
3184                 GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
3185                           engine->name, request->global_seqno,
3186                           request->fence.context, request->fence.seqno,
3187                           intel_engine_get_seqno(engine));
3188                 stalled = false;
3189         }
3190
3191         if (stalled) {
3192                 i915_gem_context_mark_guilty(request->ctx);
3193                 skip_request(request);
3194
3195                 /* If this context is now banned, skip all pending requests. */
3196                 if (i915_gem_context_is_banned(request->ctx))
3197                         engine_skip_context(request);
3198         } else {
3199                 /*
3200                  * Since this is not the hung engine, it may have advanced
3201                  * since the hang declaration. Double check by refinding
3202                  * the active request at the time of the reset.
3203                  */
3204                 request = i915_gem_find_active_request(engine);
3205                 if (request) {
3206                         i915_gem_context_mark_innocent(request->ctx);
3207                         dma_fence_set_error(&request->fence, -EAGAIN);
3208
3209                         /* Rewind the engine to replay the incomplete rq */
3210                         spin_lock_irq(&engine->timeline.lock);
3211                         request = list_prev_entry(request, link);
3212                         if (&request->link == &engine->timeline.requests)
3213                                 request = NULL;
3214                         spin_unlock_irq(&engine->timeline.lock);
3215                 }
3216         }
3217
3218         return request;
3219 }
3220
3221 void i915_gem_reset_engine(struct intel_engine_cs *engine,
3222                            struct i915_request *request,
3223                            bool stalled)
3224 {
3225         /*
3226          * Make sure this write is visible before we re-enable the interrupt
3227          * handlers on another CPU, as tasklet_enable() resolves to just
3228          * a compiler barrier which is insufficient for our purpose here.
3229          */
3230         smp_store_mb(engine->irq_posted, 0);
3231
3232         if (request)
3233                 request = i915_gem_reset_request(engine, request, stalled);
3234
3235         if (request) {
3236                 DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
3237                                  engine->name, request->global_seqno);
3238         }
3239
3240         /* Setup the CS to resume from the breadcrumb of the hung request */
3241         engine->reset_hw(engine, request);
3242 }
3243
3244 void i915_gem_reset(struct drm_i915_private *dev_priv,
3245                     unsigned int stalled_mask)
3246 {
3247         struct intel_engine_cs *engine;
3248         enum intel_engine_id id;
3249
3250         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3251
3252         i915_retire_requests(dev_priv);
3253
3254         for_each_engine(engine, dev_priv, id) {
3255                 struct i915_gem_context *ctx;
3256
3257                 i915_gem_reset_engine(engine,
3258                                       engine->hangcheck.active_request,
3259                                       stalled_mask & ENGINE_MASK(id));
3260                 ctx = fetch_and_zero(&engine->last_retired_context);
3261                 if (ctx)
3262                         intel_context_unpin(ctx, engine);
3263
3264                 /*
3265                  * Ostensibily, we always want a context loaded for powersaving,
3266                  * so if the engine is idle after the reset, send a request
3267                  * to load our scratch kernel_context.
3268                  *
3269                  * More mysteriously, if we leave the engine idle after a reset,
3270                  * the next userspace batch may hang, with what appears to be
3271                  * an incoherent read by the CS (presumably stale TLB). An
3272                  * empty request appears sufficient to paper over the glitch.
3273                  */
3274                 if (intel_engine_is_idle(engine)) {
3275                         struct i915_request *rq;
3276
3277                         rq = i915_request_alloc(engine,
3278                                                 dev_priv->kernel_context);
3279                         if (!IS_ERR(rq))
3280                                 __i915_request_add(rq, false);
3281                 }
3282         }
3283
3284         i915_gem_restore_fences(dev_priv);
3285 }
3286
3287 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3288 {
3289         tasklet_enable(&engine->execlists.tasklet);
3290         kthread_unpark(engine->breadcrumbs.signaler);
3291
3292         intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3293 }
3294
3295 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3296 {
3297         struct intel_engine_cs *engine;
3298         enum intel_engine_id id;
3299
3300         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3301
3302         for_each_engine(engine, dev_priv, id) {
3303                 engine->hangcheck.active_request = NULL;
3304                 i915_gem_reset_finish_engine(engine);
3305         }
3306 }
3307
3308 static void nop_submit_request(struct i915_request *request)
3309 {
3310         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3311                   request->engine->name,
3312                   request->fence.context, request->fence.seqno);
3313         dma_fence_set_error(&request->fence, -EIO);
3314
3315         i915_request_submit(request);
3316 }
3317
3318 static void nop_complete_submit_request(struct i915_request *request)
3319 {
3320         unsigned long flags;
3321
3322         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3323                   request->engine->name,
3324                   request->fence.context, request->fence.seqno);
3325         dma_fence_set_error(&request->fence, -EIO);
3326
3327         spin_lock_irqsave(&request->engine->timeline.lock, flags);
3328         __i915_request_submit(request);
3329         intel_engine_init_global_seqno(request->engine, request->global_seqno);
3330         spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3331 }
3332
3333 void i915_gem_set_wedged(struct drm_i915_private *i915)
3334 {
3335         struct intel_engine_cs *engine;
3336         enum intel_engine_id id;
3337
3338         GEM_TRACE("start\n");
3339
3340         if (GEM_SHOW_DEBUG()) {
3341                 struct drm_printer p = drm_debug_printer(__func__);
3342
3343                 for_each_engine(engine, i915, id)
3344                         intel_engine_dump(engine, &p, "%s\n", engine->name);
3345         }
3346
3347         set_bit(I915_WEDGED, &i915->gpu_error.flags);
3348         smp_mb__after_atomic();
3349
3350         /*
3351          * First, stop submission to hw, but do not yet complete requests by
3352          * rolling the global seqno forward (since this would complete requests
3353          * for which we haven't set the fence error to EIO yet).
3354          */
3355         for_each_engine(engine, i915, id) {
3356                 i915_gem_reset_prepare_engine(engine);
3357
3358                 engine->submit_request = nop_submit_request;
3359                 engine->schedule = NULL;
3360         }
3361         i915->caps.scheduler = 0;
3362
3363         /* Even if the GPU reset fails, it should still stop the engines */
3364         intel_gpu_reset(i915, ALL_ENGINES);
3365
3366         /*
3367          * Make sure no one is running the old callback before we proceed with
3368          * cancelling requests and resetting the completion tracking. Otherwise
3369          * we might submit a request to the hardware which never completes.
3370          */
3371         synchronize_rcu();
3372
3373         for_each_engine(engine, i915, id) {
3374                 /* Mark all executing requests as skipped */
3375                 engine->cancel_requests(engine);
3376
3377                 /*
3378                  * Only once we've force-cancelled all in-flight requests can we
3379                  * start to complete all requests.
3380                  */
3381                 engine->submit_request = nop_complete_submit_request;
3382         }
3383
3384         /*
3385          * Make sure no request can slip through without getting completed by
3386          * either this call here to intel_engine_init_global_seqno, or the one
3387          * in nop_complete_submit_request.
3388          */
3389         synchronize_rcu();
3390
3391         for_each_engine(engine, i915, id) {
3392                 unsigned long flags;
3393
3394                 /*
3395                  * Mark all pending requests as complete so that any concurrent
3396                  * (lockless) lookup doesn't try and wait upon the request as we
3397                  * reset it.
3398                  */
3399                 spin_lock_irqsave(&engine->timeline.lock, flags);
3400                 intel_engine_init_global_seqno(engine,
3401                                                intel_engine_last_submit(engine));
3402                 spin_unlock_irqrestore(&engine->timeline.lock, flags);
3403
3404                 i915_gem_reset_finish_engine(engine);
3405         }
3406
3407         GEM_TRACE("end\n");
3408
3409         wake_up_all(&i915->gpu_error.reset_queue);
3410 }
3411
3412 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3413 {
3414         struct i915_timeline *tl;
3415
3416         lockdep_assert_held(&i915->drm.struct_mutex);
3417         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3418                 return true;
3419
3420         GEM_TRACE("start\n");
3421
3422         /*
3423          * Before unwedging, make sure that all pending operations
3424          * are flushed and errored out - we may have requests waiting upon
3425          * third party fences. We marked all inflight requests as EIO, and
3426          * every execbuf since returned EIO, for consistency we want all
3427          * the currently pending requests to also be marked as EIO, which
3428          * is done inside our nop_submit_request - and so we must wait.
3429          *
3430          * No more can be submitted until we reset the wedged bit.
3431          */
3432         list_for_each_entry(tl, &i915->gt.timelines, link) {
3433                 struct i915_request *rq;
3434
3435                 rq = i915_gem_active_peek(&tl->last_request,
3436                                           &i915->drm.struct_mutex);
3437                 if (!rq)
3438                         continue;
3439
3440                 /*
3441                  * We can't use our normal waiter as we want to
3442                  * avoid recursively trying to handle the current
3443                  * reset. The basic dma_fence_default_wait() installs
3444                  * a callback for dma_fence_signal(), which is
3445                  * triggered by our nop handler (indirectly, the
3446                  * callback enables the signaler thread which is
3447                  * woken by the nop_submit_request() advancing the seqno
3448                  * and when the seqno passes the fence, the signaler
3449                  * then signals the fence waking us up).
3450                  */
3451                 if (dma_fence_default_wait(&rq->fence, true,
3452                                            MAX_SCHEDULE_TIMEOUT) < 0)
3453                         return false;
3454         }
3455         i915_retire_requests(i915);
3456         GEM_BUG_ON(i915->gt.active_requests);
3457
3458         /*
3459          * Undo nop_submit_request. We prevent all new i915 requests from
3460          * being queued (by disallowing execbuf whilst wedged) so having
3461          * waited for all active requests above, we know the system is idle
3462          * and do not have to worry about a thread being inside
3463          * engine->submit_request() as we swap over. So unlike installing
3464          * the nop_submit_request on reset, we can do this from normal
3465          * context and do not require stop_machine().
3466          */
3467         intel_engines_reset_default_submission(i915);
3468         i915_gem_contexts_lost(i915);
3469
3470         GEM_TRACE("end\n");
3471
3472         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3473         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3474
3475         return true;
3476 }
3477
3478 static void
3479 i915_gem_retire_work_handler(struct work_struct *work)
3480 {
3481         struct drm_i915_private *dev_priv =
3482                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3483         struct drm_device *dev = &dev_priv->drm;
3484
3485         /* Come back later if the device is busy... */
3486         if (mutex_trylock(&dev->struct_mutex)) {
3487                 i915_retire_requests(dev_priv);
3488                 mutex_unlock(&dev->struct_mutex);
3489         }
3490
3491         /*
3492          * Keep the retire handler running until we are finally idle.
3493          * We do not need to do this test under locking as in the worst-case
3494          * we queue the retire worker once too often.
3495          */
3496         if (READ_ONCE(dev_priv->gt.awake))
3497                 queue_delayed_work(dev_priv->wq,
3498                                    &dev_priv->gt.retire_work,
3499                                    round_jiffies_up_relative(HZ));
3500 }
3501
3502 static void shrink_caches(struct drm_i915_private *i915)
3503 {
3504         /*
3505          * kmem_cache_shrink() discards empty slabs and reorders partially
3506          * filled slabs to prioritise allocating from the mostly full slabs,
3507          * with the aim of reducing fragmentation.
3508          */
3509         kmem_cache_shrink(i915->priorities);
3510         kmem_cache_shrink(i915->dependencies);
3511         kmem_cache_shrink(i915->requests);
3512         kmem_cache_shrink(i915->luts);
3513         kmem_cache_shrink(i915->vmas);
3514         kmem_cache_shrink(i915->objects);
3515 }
3516
3517 struct sleep_rcu_work {
3518         union {
3519                 struct rcu_head rcu;
3520                 struct work_struct work;
3521         };
3522         struct drm_i915_private *i915;
3523         unsigned int epoch;
3524 };
3525
3526 static inline bool
3527 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3528 {
3529         /*
3530          * There is a small chance that the epoch wrapped since we started
3531          * sleeping. If we assume that epoch is at least a u32, then it will
3532          * take at least 2^32 * 100ms for it to wrap, or about 326 years.
3533          */
3534         return epoch == READ_ONCE(i915->gt.epoch);
3535 }
3536
3537 static void __sleep_work(struct work_struct *work)
3538 {
3539         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3540         struct drm_i915_private *i915 = s->i915;
3541         unsigned int epoch = s->epoch;
3542
3543         kfree(s);
3544         if (same_epoch(i915, epoch))
3545                 shrink_caches(i915);
3546 }
3547
3548 static void __sleep_rcu(struct rcu_head *rcu)
3549 {
3550         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3551         struct drm_i915_private *i915 = s->i915;
3552
3553         if (same_epoch(i915, s->epoch)) {
3554                 INIT_WORK(&s->work, __sleep_work);
3555                 queue_work(i915->wq, &s->work);
3556         } else {
3557                 kfree(s);
3558         }
3559 }
3560
3561 static inline bool
3562 new_requests_since_last_retire(const struct drm_i915_private *i915)
3563 {
3564         return (READ_ONCE(i915->gt.active_requests) ||
3565                 work_pending(&i915->gt.idle_work.work));
3566 }
3567
3568 static void
3569 i915_gem_idle_work_handler(struct work_struct *work)
3570 {
3571         struct drm_i915_private *dev_priv =
3572                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3573         unsigned int epoch = I915_EPOCH_INVALID;
3574         bool rearm_hangcheck;
3575
3576         if (!READ_ONCE(dev_priv->gt.awake))
3577                 return;
3578
3579         /*
3580          * Wait for last execlists context complete, but bail out in case a
3581          * new request is submitted. As we don't trust the hardware, we
3582          * continue on if the wait times out. This is necessary to allow
3583          * the machine to suspend even if the hardware dies, and we will
3584          * try to recover in resume (after depriving the hardware of power,
3585          * it may be in a better mmod).
3586          */
3587         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3588                    intel_engines_are_idle(dev_priv),
3589                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3590                    10, 500);
3591
3592         rearm_hangcheck =
3593                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3594
3595         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3596                 /* Currently busy, come back later */
3597                 mod_delayed_work(dev_priv->wq,
3598                                  &dev_priv->gt.idle_work,
3599                                  msecs_to_jiffies(50));
3600                 goto out_rearm;
3601         }
3602
3603         /*
3604          * New request retired after this work handler started, extend active
3605          * period until next instance of the work.
3606          */
3607         if (new_requests_since_last_retire(dev_priv))
3608                 goto out_unlock;
3609
3610         epoch = __i915_gem_park(dev_priv);
3611
3612         rearm_hangcheck = false;
3613 out_unlock:
3614         mutex_unlock(&dev_priv->drm.struct_mutex);
3615
3616 out_rearm:
3617         if (rearm_hangcheck) {
3618                 GEM_BUG_ON(!dev_priv->gt.awake);
3619                 i915_queue_hangcheck(dev_priv);
3620         }
3621
3622         /*
3623          * When we are idle, it is an opportune time to reap our caches.
3624          * However, we have many objects that utilise RCU and the ordered
3625          * i915->wq that this work is executing on. To try and flush any
3626          * pending frees now we are idle, we first wait for an RCU grace
3627          * period, and then queue a task (that will run last on the wq) to
3628          * shrink and re-optimize the caches.
3629          */
3630         if (same_epoch(dev_priv, epoch)) {
3631                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3632                 if (s) {
3633                         s->i915 = dev_priv;
3634                         s->epoch = epoch;
3635                         call_rcu(&s->rcu, __sleep_rcu);
3636                 }
3637         }
3638 }
3639
3640 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3641 {
3642         struct drm_i915_private *i915 = to_i915(gem->dev);
3643         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3644         struct drm_i915_file_private *fpriv = file->driver_priv;
3645         struct i915_lut_handle *lut, *ln;
3646
3647         mutex_lock(&i915->drm.struct_mutex);
3648
3649         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3650                 struct i915_gem_context *ctx = lut->ctx;
3651                 struct i915_vma *vma;
3652
3653                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3654                 if (ctx->file_priv != fpriv)
3655                         continue;
3656
3657                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3658                 GEM_BUG_ON(vma->obj != obj);
3659
3660                 /* We allow the process to have multiple handles to the same
3661                  * vma, in the same fd namespace, by virtue of flink/open.
3662                  */
3663                 GEM_BUG_ON(!vma->open_count);
3664                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3665                         i915_vma_close(vma);
3666
3667                 list_del(&lut->obj_link);
3668                 list_del(&lut->ctx_link);
3669
3670                 kmem_cache_free(i915->luts, lut);
3671                 __i915_gem_object_release_unless_active(obj);
3672         }
3673
3674         mutex_unlock(&i915->drm.struct_mutex);
3675 }
3676
3677 static unsigned long to_wait_timeout(s64 timeout_ns)
3678 {
3679         if (timeout_ns < 0)
3680                 return MAX_SCHEDULE_TIMEOUT;
3681
3682         if (timeout_ns == 0)
3683                 return 0;
3684
3685         return nsecs_to_jiffies_timeout(timeout_ns);
3686 }
3687
3688 /**
3689  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3690  * @dev: drm device pointer
3691  * @data: ioctl data blob
3692  * @file: drm file pointer
3693  *
3694  * Returns 0 if successful, else an error is returned with the remaining time in
3695  * the timeout parameter.
3696  *  -ETIME: object is still busy after timeout
3697  *  -ERESTARTSYS: signal interrupted the wait
3698  *  -ENONENT: object doesn't exist
3699  * Also possible, but rare:
3700  *  -EAGAIN: incomplete, restart syscall
3701  *  -ENOMEM: damn
3702  *  -ENODEV: Internal IRQ fail
3703  *  -E?: The add request failed
3704  *
3705  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3706  * non-zero timeout parameter the wait ioctl will wait for the given number of
3707  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3708  * without holding struct_mutex the object may become re-busied before this
3709  * function completes. A similar but shorter * race condition exists in the busy
3710  * ioctl
3711  */
3712 int
3713 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3714 {
3715         struct drm_i915_gem_wait *args = data;
3716         struct drm_i915_gem_object *obj;
3717         ktime_t start;
3718         long ret;
3719
3720         if (args->flags != 0)
3721                 return -EINVAL;
3722
3723         obj = i915_gem_object_lookup(file, args->bo_handle);
3724         if (!obj)
3725                 return -ENOENT;
3726
3727         start = ktime_get();
3728
3729         ret = i915_gem_object_wait(obj,
3730                                    I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
3731                                    to_wait_timeout(args->timeout_ns),
3732                                    to_rps_client(file));
3733
3734         if (args->timeout_ns > 0) {
3735                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3736                 if (args->timeout_ns < 0)
3737                         args->timeout_ns = 0;
3738
3739                 /*
3740                  * Apparently ktime isn't accurate enough and occasionally has a
3741                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3742                  * things up to make the test happy. We allow up to 1 jiffy.
3743                  *
3744                  * This is a regression from the timespec->ktime conversion.
3745                  */
3746                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3747                         args->timeout_ns = 0;
3748
3749                 /* Asked to wait beyond the jiffie/scheduler precision? */
3750                 if (ret == -ETIME && args->timeout_ns)
3751                         ret = -EAGAIN;
3752         }
3753
3754         i915_gem_object_put(obj);
3755         return ret;
3756 }
3757
3758 static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
3759 {
3760         return i915_gem_active_wait(&tl->last_request, flags);
3761 }
3762
3763 static int wait_for_engines(struct drm_i915_private *i915)
3764 {
3765         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3766                 dev_err(i915->drm.dev,
3767                         "Failed to idle engines, declaring wedged!\n");
3768                 GEM_TRACE_DUMP();
3769                 i915_gem_set_wedged(i915);
3770                 return -EIO;
3771         }
3772
3773         return 0;
3774 }
3775
3776 int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
3777 {
3778         /* If the device is asleep, we have no requests outstanding */
3779         if (!READ_ONCE(i915->gt.awake))
3780                 return 0;
3781
3782         if (flags & I915_WAIT_LOCKED) {
3783                 struct i915_timeline *tl;
3784                 int err;
3785
3786                 lockdep_assert_held(&i915->drm.struct_mutex);
3787
3788                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3789                         err = wait_for_timeline(tl, flags);
3790                         if (err)
3791                                 return err;
3792                 }
3793                 i915_retire_requests(i915);
3794
3795                 return wait_for_engines(i915);
3796         } else {
3797                 struct intel_engine_cs *engine;
3798                 enum intel_engine_id id;
3799                 int err;
3800
3801                 for_each_engine(engine, i915, id) {
3802                         err = wait_for_timeline(&engine->timeline, flags);
3803                         if (err)
3804                                 return err;
3805                 }
3806
3807                 return 0;
3808         }
3809 }
3810
3811 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3812 {
3813         /*
3814          * We manually flush the CPU domain so that we can override and
3815          * force the flush for the display, and perform it asyncrhonously.
3816          */
3817         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3818         if (obj->cache_dirty)
3819                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3820         obj->write_domain = 0;
3821 }
3822
3823 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3824 {
3825         if (!READ_ONCE(obj->pin_global))
3826                 return;
3827
3828         mutex_lock(&obj->base.dev->struct_mutex);
3829         __i915_gem_object_flush_for_display(obj);
3830         mutex_unlock(&obj->base.dev->struct_mutex);
3831 }
3832
3833 /**
3834  * Moves a single object to the WC read, and possibly write domain.
3835  * @obj: object to act on
3836  * @write: ask for write access or read only
3837  *
3838  * This function returns when the move is complete, including waiting on
3839  * flushes to occur.
3840  */
3841 int
3842 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3843 {
3844         int ret;
3845
3846         lockdep_assert_held(&obj->base.dev->struct_mutex);
3847
3848         ret = i915_gem_object_wait(obj,
3849                                    I915_WAIT_INTERRUPTIBLE |
3850                                    I915_WAIT_LOCKED |
3851                                    (write ? I915_WAIT_ALL : 0),
3852                                    MAX_SCHEDULE_TIMEOUT,
3853                                    NULL);
3854         if (ret)
3855                 return ret;
3856
3857         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3858                 return 0;
3859
3860         /* Flush and acquire obj->pages so that we are coherent through
3861          * direct access in memory with previous cached writes through
3862          * shmemfs and that our cache domain tracking remains valid.
3863          * For example, if the obj->filp was moved to swap without us
3864          * being notified and releasing the pages, we would mistakenly
3865          * continue to assume that the obj remained out of the CPU cached
3866          * domain.
3867          */
3868         ret = i915_gem_object_pin_pages(obj);
3869         if (ret)
3870                 return ret;
3871
3872         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3873
3874         /* Serialise direct access to this object with the barriers for
3875          * coherent writes from the GPU, by effectively invalidating the
3876          * WC domain upon first access.
3877          */
3878         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3879                 mb();
3880
3881         /* It should now be out of any other write domains, and we can update
3882          * the domain values for our changes.
3883          */
3884         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3885         obj->read_domains |= I915_GEM_DOMAIN_WC;
3886         if (write) {
3887                 obj->read_domains = I915_GEM_DOMAIN_WC;
3888                 obj->write_domain = I915_GEM_DOMAIN_WC;
3889                 obj->mm.dirty = true;
3890         }
3891
3892         i915_gem_object_unpin_pages(obj);
3893         return 0;
3894 }
3895
3896 /**
3897  * Moves a single object to the GTT read, and possibly write domain.
3898  * @obj: object to act on
3899  * @write: ask for write access or read only
3900  *
3901  * This function returns when the move is complete, including waiting on
3902  * flushes to occur.
3903  */
3904 int
3905 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3906 {
3907         int ret;
3908
3909         lockdep_assert_held(&obj->base.dev->struct_mutex);
3910
3911         ret = i915_gem_object_wait(obj,
3912                                    I915_WAIT_INTERRUPTIBLE |
3913                                    I915_WAIT_LOCKED |
3914                                    (write ? I915_WAIT_ALL : 0),
3915                                    MAX_SCHEDULE_TIMEOUT,
3916                                    NULL);
3917         if (ret)
3918                 return ret;
3919
3920         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3921                 return 0;
3922
3923         /* Flush and acquire obj->pages so that we are coherent through
3924          * direct access in memory with previous cached writes through
3925          * shmemfs and that our cache domain tracking remains valid.
3926          * For example, if the obj->filp was moved to swap without us
3927          * being notified and releasing the pages, we would mistakenly
3928          * continue to assume that the obj remained out of the CPU cached
3929          * domain.
3930          */
3931         ret = i915_gem_object_pin_pages(obj);
3932         if (ret)
3933                 return ret;
3934
3935         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3936
3937         /* Serialise direct access to this object with the barriers for
3938          * coherent writes from the GPU, by effectively invalidating the
3939          * GTT domain upon first access.
3940          */
3941         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3942                 mb();
3943
3944         /* It should now be out of any other write domains, and we can update
3945          * the domain values for our changes.
3946          */
3947         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3948         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3949         if (write) {
3950                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3951                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3952                 obj->mm.dirty = true;
3953         }
3954
3955         i915_gem_object_unpin_pages(obj);
3956         return 0;
3957 }
3958
3959 /**
3960  * Changes the cache-level of an object across all VMA.
3961  * @obj: object to act on
3962  * @cache_level: new cache level to set for the object
3963  *
3964  * After this function returns, the object will be in the new cache-level
3965  * across all GTT and the contents of the backing storage will be coherent,
3966  * with respect to the new cache-level. In order to keep the backing storage
3967  * coherent for all users, we only allow a single cache level to be set
3968  * globally on the object and prevent it from being changed whilst the
3969  * hardware is reading from the object. That is if the object is currently
3970  * on the scanout it will be set to uncached (or equivalent display
3971  * cache coherency) and all non-MOCS GPU access will also be uncached so
3972  * that all direct access to the scanout remains coherent.
3973  */
3974 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3975                                     enum i915_cache_level cache_level)
3976 {
3977         struct i915_vma *vma;
3978         int ret;
3979
3980         lockdep_assert_held(&obj->base.dev->struct_mutex);
3981
3982         if (obj->cache_level == cache_level)
3983                 return 0;
3984
3985         /* Inspect the list of currently bound VMA and unbind any that would
3986          * be invalid given the new cache-level. This is principally to
3987          * catch the issue of the CS prefetch crossing page boundaries and
3988          * reading an invalid PTE on older architectures.
3989          */
3990 restart:
3991         list_for_each_entry(vma, &obj->vma_list, obj_link) {
3992                 if (!drm_mm_node_allocated(&vma->node))
3993                         continue;
3994
3995                 if (i915_vma_is_pinned(vma)) {
3996                         DRM_DEBUG("can not change the cache level of pinned objects\n");
3997                         return -EBUSY;
3998                 }
3999
4000                 if (!i915_vma_is_closed(vma) &&
4001                     i915_gem_valid_gtt_space(vma, cache_level))
4002                         continue;
4003
4004                 ret = i915_vma_unbind(vma);
4005                 if (ret)
4006                         return ret;
4007
4008                 /* As unbinding may affect other elements in the
4009                  * obj->vma_list (due to side-effects from retiring
4010                  * an active vma), play safe and restart the iterator.
4011                  */
4012                 goto restart;
4013         }
4014
4015         /* We can reuse the existing drm_mm nodes but need to change the
4016          * cache-level on the PTE. We could simply unbind them all and
4017          * rebind with the correct cache-level on next use. However since
4018          * we already have a valid slot, dma mapping, pages etc, we may as
4019          * rewrite the PTE in the belief that doing so tramples upon less
4020          * state and so involves less work.
4021          */
4022         if (obj->bind_count) {
4023                 /* Before we change the PTE, the GPU must not be accessing it.
4024                  * If we wait upon the object, we know that all the bound
4025                  * VMA are no longer active.
4026                  */
4027                 ret = i915_gem_object_wait(obj,
4028                                            I915_WAIT_INTERRUPTIBLE |
4029                                            I915_WAIT_LOCKED |
4030                                            I915_WAIT_ALL,
4031                                            MAX_SCHEDULE_TIMEOUT,
4032                                            NULL);
4033                 if (ret)
4034                         return ret;
4035
4036                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
4037                     cache_level != I915_CACHE_NONE) {
4038                         /* Access to snoopable pages through the GTT is
4039                          * incoherent and on some machines causes a hard
4040                          * lockup. Relinquish the CPU mmaping to force
4041                          * userspace to refault in the pages and we can
4042                          * then double check if the GTT mapping is still
4043                          * valid for that pointer access.
4044                          */
4045                         i915_gem_release_mmap(obj);
4046
4047                         /* As we no longer need a fence for GTT access,
4048                          * we can relinquish it now (and so prevent having
4049                          * to steal a fence from someone else on the next
4050                          * fence request). Note GPU activity would have
4051                          * dropped the fence as all snoopable access is
4052                          * supposed to be linear.
4053                          */
4054                         for_each_ggtt_vma(vma, obj) {
4055                                 ret = i915_vma_put_fence(vma);
4056                                 if (ret)
4057                                         return ret;
4058                         }
4059                 } else {
4060                         /* We either have incoherent backing store and
4061                          * so no GTT access or the architecture is fully
4062                          * coherent. In such cases, existing GTT mmaps
4063                          * ignore the cache bit in the PTE and we can
4064                          * rewrite it without confusing the GPU or having
4065                          * to force userspace to fault back in its mmaps.
4066                          */
4067                 }
4068
4069                 list_for_each_entry(vma, &obj->vma_list, obj_link) {
4070                         if (!drm_mm_node_allocated(&vma->node))
4071                                 continue;
4072
4073                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
4074                         if (ret)
4075                                 return ret;
4076                 }
4077         }
4078
4079         list_for_each_entry(vma, &obj->vma_list, obj_link)
4080                 vma->node.color = cache_level;
4081         i915_gem_object_set_cache_coherency(obj, cache_level);
4082         obj->cache_dirty = true; /* Always invalidate stale cachelines */
4083
4084         return 0;
4085 }
4086
4087 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
4088                                struct drm_file *file)
4089 {
4090         struct drm_i915_gem_caching *args = data;
4091         struct drm_i915_gem_object *obj;
4092         int err = 0;
4093
4094         rcu_read_lock();
4095         obj = i915_gem_object_lookup_rcu(file, args->handle);
4096         if (!obj) {
4097                 err = -ENOENT;
4098                 goto out;
4099         }
4100
4101         switch (obj->cache_level) {
4102         case I915_CACHE_LLC:
4103         case I915_CACHE_L3_LLC:
4104                 args->caching = I915_CACHING_CACHED;
4105                 break;
4106
4107         case I915_CACHE_WT:
4108                 args->caching = I915_CACHING_DISPLAY;
4109                 break;
4110
4111         default:
4112                 args->caching = I915_CACHING_NONE;
4113                 break;
4114         }
4115 out:
4116         rcu_read_unlock();
4117         return err;
4118 }
4119
4120 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4121                                struct drm_file *file)
4122 {
4123         struct drm_i915_private *i915 = to_i915(dev);
4124         struct drm_i915_gem_caching *args = data;
4125         struct drm_i915_gem_object *obj;
4126         enum i915_cache_level level;
4127         int ret = 0;
4128
4129         switch (args->caching) {
4130         case I915_CACHING_NONE:
4131                 level = I915_CACHE_NONE;
4132                 break;
4133         case I915_CACHING_CACHED:
4134                 /*
4135                  * Due to a HW issue on BXT A stepping, GPU stores via a
4136                  * snooped mapping may leave stale data in a corresponding CPU
4137                  * cacheline, whereas normally such cachelines would get
4138                  * invalidated.
4139                  */
4140                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4141                         return -ENODEV;
4142
4143                 level = I915_CACHE_LLC;
4144                 break;
4145         case I915_CACHING_DISPLAY:
4146                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4147                 break;
4148         default:
4149                 return -EINVAL;
4150         }
4151
4152         obj = i915_gem_object_lookup(file, args->handle);
4153         if (!obj)
4154                 return -ENOENT;
4155
4156         /*
4157          * The caching mode of proxy object is handled by its generator, and
4158          * not allowed to be changed by userspace.
4159          */
4160         if (i915_gem_object_is_proxy(obj)) {
4161                 ret = -ENXIO;
4162                 goto out;
4163         }
4164
4165         if (obj->cache_level == level)
4166                 goto out;
4167
4168         ret = i915_gem_object_wait(obj,
4169                                    I915_WAIT_INTERRUPTIBLE,
4170                                    MAX_SCHEDULE_TIMEOUT,
4171                                    to_rps_client(file));
4172         if (ret)
4173                 goto out;
4174
4175         ret = i915_mutex_lock_interruptible(dev);
4176         if (ret)
4177                 goto out;
4178
4179         ret = i915_gem_object_set_cache_level(obj, level);
4180         mutex_unlock(&dev->struct_mutex);
4181
4182 out:
4183         i915_gem_object_put(obj);
4184         return ret;
4185 }
4186
4187 /*
4188  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4189  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4190  * (for pageflips). We only flush the caches while preparing the buffer for
4191  * display, the callers are responsible for frontbuffer flush.
4192  */
4193 struct i915_vma *
4194 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4195                                      u32 alignment,
4196                                      const struct i915_ggtt_view *view,
4197                                      unsigned int flags)
4198 {
4199         struct i915_vma *vma;
4200         int ret;
4201
4202         lockdep_assert_held(&obj->base.dev->struct_mutex);
4203
4204         /* Mark the global pin early so that we account for the
4205          * display coherency whilst setting up the cache domains.
4206          */
4207         obj->pin_global++;
4208
4209         /* The display engine is not coherent with the LLC cache on gen6.  As
4210          * a result, we make sure that the pinning that is about to occur is
4211          * done with uncached PTEs. This is lowest common denominator for all
4212          * chipsets.
4213          *
4214          * However for gen6+, we could do better by using the GFDT bit instead
4215          * of uncaching, which would allow us to flush all the LLC-cached data
4216          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4217          */
4218         ret = i915_gem_object_set_cache_level(obj,
4219                                               HAS_WT(to_i915(obj->base.dev)) ?
4220                                               I915_CACHE_WT : I915_CACHE_NONE);
4221         if (ret) {
4222                 vma = ERR_PTR(ret);
4223                 goto err_unpin_global;
4224         }
4225
4226         /* As the user may map the buffer once pinned in the display plane
4227          * (e.g. libkms for the bootup splash), we have to ensure that we
4228          * always use map_and_fenceable for all scanout buffers. However,
4229          * it may simply be too big to fit into mappable, in which case
4230          * put it anyway and hope that userspace can cope (but always first
4231          * try to preserve the existing ABI).
4232          */
4233         vma = ERR_PTR(-ENOSPC);
4234         if ((flags & PIN_MAPPABLE) == 0 &&
4235             (!view || view->type == I915_GGTT_VIEW_NORMAL))
4236                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4237                                                flags |
4238                                                PIN_MAPPABLE |
4239                                                PIN_NONBLOCK);
4240         if (IS_ERR(vma))
4241                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4242         if (IS_ERR(vma))
4243                 goto err_unpin_global;
4244
4245         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4246
4247         __i915_gem_object_flush_for_display(obj);
4248
4249         /* It should now be out of any other write domains, and we can update
4250          * the domain values for our changes.
4251          */
4252         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4253
4254         return vma;
4255
4256 err_unpin_global:
4257         obj->pin_global--;
4258         return vma;
4259 }
4260
4261 void
4262 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4263 {
4264         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4265
4266         if (WARN_ON(vma->obj->pin_global == 0))
4267                 return;
4268
4269         if (--vma->obj->pin_global == 0)
4270                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4271
4272         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4273         i915_gem_object_bump_inactive_ggtt(vma->obj);
4274
4275         i915_vma_unpin(vma);
4276 }
4277
4278 /**
4279  * Moves a single object to the CPU read, and possibly write domain.
4280  * @obj: object to act on
4281  * @write: requesting write or read-only access
4282  *
4283  * This function returns when the move is complete, including waiting on
4284  * flushes to occur.
4285  */
4286 int
4287 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4288 {
4289         int ret;
4290
4291         lockdep_assert_held(&obj->base.dev->struct_mutex);
4292
4293         ret = i915_gem_object_wait(obj,
4294                                    I915_WAIT_INTERRUPTIBLE |
4295                                    I915_WAIT_LOCKED |
4296                                    (write ? I915_WAIT_ALL : 0),
4297                                    MAX_SCHEDULE_TIMEOUT,
4298                                    NULL);
4299         if (ret)
4300                 return ret;
4301
4302         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4303
4304         /* Flush the CPU cache if it's still invalid. */
4305         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4306                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4307                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4308         }
4309
4310         /* It should now be out of any other write domains, and we can update
4311          * the domain values for our changes.
4312          */
4313         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4314
4315         /* If we're writing through the CPU, then the GPU read domains will
4316          * need to be invalidated at next use.
4317          */
4318         if (write)
4319                 __start_cpu_write(obj);
4320
4321         return 0;
4322 }
4323
4324 /* Throttle our rendering by waiting until the ring has completed our requests
4325  * emitted over 20 msec ago.
4326  *
4327  * Note that if we were to use the current jiffies each time around the loop,
4328  * we wouldn't escape the function with any frames outstanding if the time to
4329  * render a frame was over 20ms.
4330  *
4331  * This should get us reasonable parallelism between CPU and GPU but also
4332  * relatively low latency when blocking on a particular request to finish.
4333  */
4334 static int
4335 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4336 {
4337         struct drm_i915_private *dev_priv = to_i915(dev);
4338         struct drm_i915_file_private *file_priv = file->driver_priv;
4339         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4340         struct i915_request *request, *target = NULL;
4341         long ret;
4342
4343         /* ABI: return -EIO if already wedged */
4344         if (i915_terminally_wedged(&dev_priv->gpu_error))
4345                 return -EIO;
4346
4347         spin_lock(&file_priv->mm.lock);
4348         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4349                 if (time_after_eq(request->emitted_jiffies, recent_enough))
4350                         break;
4351
4352                 if (target) {
4353                         list_del(&target->client_link);
4354                         target->file_priv = NULL;
4355                 }
4356
4357                 target = request;
4358         }
4359         if (target)
4360                 i915_request_get(target);
4361         spin_unlock(&file_priv->mm.lock);
4362
4363         if (target == NULL)
4364                 return 0;
4365
4366         ret = i915_request_wait(target,
4367                                 I915_WAIT_INTERRUPTIBLE,
4368                                 MAX_SCHEDULE_TIMEOUT);
4369         i915_request_put(target);
4370
4371         return ret < 0 ? ret : 0;
4372 }
4373
4374 struct i915_vma *
4375 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4376                          const struct i915_ggtt_view *view,
4377                          u64 size,
4378                          u64 alignment,
4379                          u64 flags)
4380 {
4381         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4382         struct i915_address_space *vm = &dev_priv->ggtt.base;
4383         struct i915_vma *vma;
4384         int ret;
4385
4386         lockdep_assert_held(&obj->base.dev->struct_mutex);
4387
4388         if (flags & PIN_MAPPABLE &&
4389             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4390                 /* If the required space is larger than the available
4391                  * aperture, we will not able to find a slot for the
4392                  * object and unbinding the object now will be in
4393                  * vain. Worse, doing so may cause us to ping-pong
4394                  * the object in and out of the Global GTT and
4395                  * waste a lot of cycles under the mutex.
4396                  */
4397                 if (obj->base.size > dev_priv->ggtt.mappable_end)
4398                         return ERR_PTR(-E2BIG);
4399
4400                 /* If NONBLOCK is set the caller is optimistically
4401                  * trying to cache the full object within the mappable
4402                  * aperture, and *must* have a fallback in place for
4403                  * situations where we cannot bind the object. We
4404                  * can be a little more lax here and use the fallback
4405                  * more often to avoid costly migrations of ourselves
4406                  * and other objects within the aperture.
4407                  *
4408                  * Half-the-aperture is used as a simple heuristic.
4409                  * More interesting would to do search for a free
4410                  * block prior to making the commitment to unbind.
4411                  * That caters for the self-harm case, and with a
4412                  * little more heuristics (e.g. NOFAULT, NOEVICT)
4413                  * we could try to minimise harm to others.
4414                  */
4415                 if (flags & PIN_NONBLOCK &&
4416                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
4417                         return ERR_PTR(-ENOSPC);
4418         }
4419
4420         vma = i915_vma_instance(obj, vm, view);
4421         if (unlikely(IS_ERR(vma)))
4422                 return vma;
4423
4424         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4425                 if (flags & PIN_NONBLOCK) {
4426                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4427                                 return ERR_PTR(-ENOSPC);
4428
4429                         if (flags & PIN_MAPPABLE &&
4430                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4431                                 return ERR_PTR(-ENOSPC);
4432                 }
4433
4434                 WARN(i915_vma_is_pinned(vma),
4435                      "bo is already pinned in ggtt with incorrect alignment:"
4436                      " offset=%08x, req.alignment=%llx,"
4437                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4438                      i915_ggtt_offset(vma), alignment,
4439                      !!(flags & PIN_MAPPABLE),
4440                      i915_vma_is_map_and_fenceable(vma));
4441                 ret = i915_vma_unbind(vma);
4442                 if (ret)
4443                         return ERR_PTR(ret);
4444         }
4445
4446         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
4447         if (ret)
4448                 return ERR_PTR(ret);
4449
4450         return vma;
4451 }
4452
4453 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4454 {
4455         /* Note that we could alias engines in the execbuf API, but
4456          * that would be very unwise as it prevents userspace from
4457          * fine control over engine selection. Ahem.
4458          *
4459          * This should be something like EXEC_MAX_ENGINE instead of
4460          * I915_NUM_ENGINES.
4461          */
4462         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4463         return 0x10000 << id;
4464 }
4465
4466 static __always_inline unsigned int __busy_write_id(unsigned int id)
4467 {
4468         /* The uABI guarantees an active writer is also amongst the read
4469          * engines. This would be true if we accessed the activity tracking
4470          * under the lock, but as we perform the lookup of the object and
4471          * its activity locklessly we can not guarantee that the last_write
4472          * being active implies that we have set the same engine flag from
4473          * last_read - hence we always set both read and write busy for
4474          * last_write.
4475          */
4476         return id | __busy_read_flag(id);
4477 }
4478
4479 static __always_inline unsigned int
4480 __busy_set_if_active(const struct dma_fence *fence,
4481                      unsigned int (*flag)(unsigned int id))
4482 {
4483         struct i915_request *rq;
4484
4485         /* We have to check the current hw status of the fence as the uABI
4486          * guarantees forward progress. We could rely on the idle worker
4487          * to eventually flush us, but to minimise latency just ask the
4488          * hardware.
4489          *
4490          * Note we only report on the status of native fences.
4491          */
4492         if (!dma_fence_is_i915(fence))
4493                 return 0;
4494
4495         /* opencode to_request() in order to avoid const warnings */
4496         rq = container_of(fence, struct i915_request, fence);
4497         if (i915_request_completed(rq))
4498                 return 0;
4499
4500         return flag(rq->engine->uabi_id);
4501 }
4502
4503 static __always_inline unsigned int
4504 busy_check_reader(const struct dma_fence *fence)
4505 {
4506         return __busy_set_if_active(fence, __busy_read_flag);
4507 }
4508
4509 static __always_inline unsigned int
4510 busy_check_writer(const struct dma_fence *fence)
4511 {
4512         if (!fence)
4513                 return 0;
4514
4515         return __busy_set_if_active(fence, __busy_write_id);
4516 }
4517
4518 int
4519 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4520                     struct drm_file *file)
4521 {
4522         struct drm_i915_gem_busy *args = data;
4523         struct drm_i915_gem_object *obj;
4524         struct reservation_object_list *list;
4525         unsigned int seq;
4526         int err;
4527
4528         err = -ENOENT;
4529         rcu_read_lock();
4530         obj = i915_gem_object_lookup_rcu(file, args->handle);
4531         if (!obj)
4532                 goto out;
4533
4534         /* A discrepancy here is that we do not report the status of
4535          * non-i915 fences, i.e. even though we may report the object as idle,
4536          * a call to set-domain may still stall waiting for foreign rendering.
4537          * This also means that wait-ioctl may report an object as busy,
4538          * where busy-ioctl considers it idle.
4539          *
4540          * We trade the ability to warn of foreign fences to report on which
4541          * i915 engines are active for the object.
4542          *
4543          * Alternatively, we can trade that extra information on read/write
4544          * activity with
4545          *      args->busy =
4546          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4547          * to report the overall busyness. This is what the wait-ioctl does.
4548          *
4549          */
4550 retry:
4551         seq = raw_read_seqcount(&obj->resv->seq);
4552
4553         /* Translate the exclusive fence to the READ *and* WRITE engine */
4554         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4555
4556         /* Translate shared fences to READ set of engines */
4557         list = rcu_dereference(obj->resv->fence);
4558         if (list) {
4559                 unsigned int shared_count = list->shared_count, i;
4560
4561                 for (i = 0; i < shared_count; ++i) {
4562                         struct dma_fence *fence =
4563                                 rcu_dereference(list->shared[i]);
4564
4565                         args->busy |= busy_check_reader(fence);
4566                 }
4567         }
4568
4569         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4570                 goto retry;
4571
4572         err = 0;
4573 out:
4574         rcu_read_unlock();
4575         return err;
4576 }
4577
4578 int
4579 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4580                         struct drm_file *file_priv)
4581 {
4582         return i915_gem_ring_throttle(dev, file_priv);
4583 }
4584
4585 int
4586 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4587                        struct drm_file *file_priv)
4588 {
4589         struct drm_i915_private *dev_priv = to_i915(dev);
4590         struct drm_i915_gem_madvise *args = data;
4591         struct drm_i915_gem_object *obj;
4592         int err;
4593
4594         switch (args->madv) {
4595         case I915_MADV_DONTNEED:
4596         case I915_MADV_WILLNEED:
4597             break;
4598         default:
4599             return -EINVAL;
4600         }
4601
4602         obj = i915_gem_object_lookup(file_priv, args->handle);
4603         if (!obj)
4604                 return -ENOENT;
4605
4606         err = mutex_lock_interruptible(&obj->mm.lock);
4607         if (err)
4608                 goto out;
4609
4610         if (i915_gem_object_has_pages(obj) &&
4611             i915_gem_object_is_tiled(obj) &&
4612             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4613                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4614                         GEM_BUG_ON(!obj->mm.quirked);
4615                         __i915_gem_object_unpin_pages(obj);
4616                         obj->mm.quirked = false;
4617                 }
4618                 if (args->madv == I915_MADV_WILLNEED) {
4619                         GEM_BUG_ON(obj->mm.quirked);
4620                         __i915_gem_object_pin_pages(obj);
4621                         obj->mm.quirked = true;
4622                 }
4623         }
4624
4625         if (obj->mm.madv != __I915_MADV_PURGED)
4626                 obj->mm.madv = args->madv;
4627
4628         /* if the object is no longer attached, discard its backing storage */
4629         if (obj->mm.madv == I915_MADV_DONTNEED &&
4630             !i915_gem_object_has_pages(obj))
4631                 i915_gem_object_truncate(obj);
4632
4633         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4634         mutex_unlock(&obj->mm.lock);
4635
4636 out:
4637         i915_gem_object_put(obj);
4638         return err;
4639 }
4640
4641 static void
4642 frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4643 {
4644         struct drm_i915_gem_object *obj =
4645                 container_of(active, typeof(*obj), frontbuffer_write);
4646
4647         intel_fb_obj_flush(obj, ORIGIN_CS);
4648 }
4649
4650 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4651                           const struct drm_i915_gem_object_ops *ops)
4652 {
4653         mutex_init(&obj->mm.lock);
4654
4655         INIT_LIST_HEAD(&obj->vma_list);
4656         INIT_LIST_HEAD(&obj->lut_list);
4657         INIT_LIST_HEAD(&obj->batch_pool_link);
4658
4659         obj->ops = ops;
4660
4661         reservation_object_init(&obj->__builtin_resv);
4662         obj->resv = &obj->__builtin_resv;
4663
4664         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4665         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4666
4667         obj->mm.madv = I915_MADV_WILLNEED;
4668         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4669         mutex_init(&obj->mm.get_page.lock);
4670
4671         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4672 }
4673
4674 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4675         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4676                  I915_GEM_OBJECT_IS_SHRINKABLE,
4677
4678         .get_pages = i915_gem_object_get_pages_gtt,
4679         .put_pages = i915_gem_object_put_pages_gtt,
4680
4681         .pwrite = i915_gem_object_pwrite_gtt,
4682 };
4683
4684 static int i915_gem_object_create_shmem(struct drm_device *dev,
4685                                         struct drm_gem_object *obj,
4686                                         size_t size)
4687 {
4688         struct drm_i915_private *i915 = to_i915(dev);
4689         unsigned long flags = VM_NORESERVE;
4690         struct file *filp;
4691
4692         drm_gem_private_object_init(dev, obj, size);
4693
4694         if (i915->mm.gemfs)
4695                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4696                                                  flags);
4697         else
4698                 filp = shmem_file_setup("i915", size, flags);
4699
4700         if (IS_ERR(filp))
4701                 return PTR_ERR(filp);
4702
4703         obj->filp = filp;
4704
4705         return 0;
4706 }
4707
4708 struct drm_i915_gem_object *
4709 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4710 {
4711         struct drm_i915_gem_object *obj;
4712         struct address_space *mapping;
4713         unsigned int cache_level;
4714         gfp_t mask;
4715         int ret;
4716
4717         /* There is a prevalence of the assumption that we fit the object's
4718          * page count inside a 32bit _signed_ variable. Let's document this and
4719          * catch if we ever need to fix it. In the meantime, if you do spot
4720          * such a local variable, please consider fixing!
4721          */
4722         if (size >> PAGE_SHIFT > INT_MAX)
4723                 return ERR_PTR(-E2BIG);
4724
4725         if (overflows_type(size, obj->base.size))
4726                 return ERR_PTR(-E2BIG);
4727
4728         obj = i915_gem_object_alloc(dev_priv);
4729         if (obj == NULL)
4730                 return ERR_PTR(-ENOMEM);
4731
4732         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4733         if (ret)
4734                 goto fail;
4735
4736         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4737         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4738                 /* 965gm cannot relocate objects above 4GiB. */
4739                 mask &= ~__GFP_HIGHMEM;
4740                 mask |= __GFP_DMA32;
4741         }
4742
4743         mapping = obj->base.filp->f_mapping;
4744         mapping_set_gfp_mask(mapping, mask);
4745         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4746
4747         i915_gem_object_init(obj, &i915_gem_object_ops);
4748
4749         obj->write_domain = I915_GEM_DOMAIN_CPU;
4750         obj->read_domains = I915_GEM_DOMAIN_CPU;
4751
4752         if (HAS_LLC(dev_priv))
4753                 /* On some devices, we can have the GPU use the LLC (the CPU
4754                  * cache) for about a 10% performance improvement
4755                  * compared to uncached.  Graphics requests other than
4756                  * display scanout are coherent with the CPU in
4757                  * accessing this cache.  This means in this mode we
4758                  * don't need to clflush on the CPU side, and on the
4759                  * GPU side we only need to flush internal caches to
4760                  * get data visible to the CPU.
4761                  *
4762                  * However, we maintain the display planes as UC, and so
4763                  * need to rebind when first used as such.
4764                  */
4765                 cache_level = I915_CACHE_LLC;
4766         else
4767                 cache_level = I915_CACHE_NONE;
4768
4769         i915_gem_object_set_cache_coherency(obj, cache_level);
4770
4771         trace_i915_gem_object_create(obj);
4772
4773         return obj;
4774
4775 fail:
4776         i915_gem_object_free(obj);
4777         return ERR_PTR(ret);
4778 }
4779
4780 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4781 {
4782         /* If we are the last user of the backing storage (be it shmemfs
4783          * pages or stolen etc), we know that the pages are going to be
4784          * immediately released. In this case, we can then skip copying
4785          * back the contents from the GPU.
4786          */
4787
4788         if (obj->mm.madv != I915_MADV_WILLNEED)
4789                 return false;
4790
4791         if (obj->base.filp == NULL)
4792                 return true;
4793
4794         /* At first glance, this looks racy, but then again so would be
4795          * userspace racing mmap against close. However, the first external
4796          * reference to the filp can only be obtained through the
4797          * i915_gem_mmap_ioctl() which safeguards us against the user
4798          * acquiring such a reference whilst we are in the middle of
4799          * freeing the object.
4800          */
4801         return atomic_long_read(&obj->base.filp->f_count) == 1;
4802 }
4803
4804 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4805                                     struct llist_node *freed)
4806 {
4807         struct drm_i915_gem_object *obj, *on;
4808
4809         intel_runtime_pm_get(i915);
4810         llist_for_each_entry_safe(obj, on, freed, freed) {
4811                 struct i915_vma *vma, *vn;
4812
4813                 trace_i915_gem_object_destroy(obj);
4814
4815                 mutex_lock(&i915->drm.struct_mutex);
4816
4817                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4818                 list_for_each_entry_safe(vma, vn,
4819                                          &obj->vma_list, obj_link) {
4820                         GEM_BUG_ON(i915_vma_is_active(vma));
4821                         vma->flags &= ~I915_VMA_PIN_MASK;
4822                         i915_vma_destroy(vma);
4823                 }
4824                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4825                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4826
4827                 /* This serializes freeing with the shrinker. Since the free
4828                  * is delayed, first by RCU then by the workqueue, we want the
4829                  * shrinker to be able to free pages of unreferenced objects,
4830                  * or else we may oom whilst there are plenty of deferred
4831                  * freed objects.
4832                  */
4833                 if (i915_gem_object_has_pages(obj)) {
4834                         spin_lock(&i915->mm.obj_lock);
4835                         list_del_init(&obj->mm.link);
4836                         spin_unlock(&i915->mm.obj_lock);
4837                 }
4838
4839                 mutex_unlock(&i915->drm.struct_mutex);
4840
4841                 GEM_BUG_ON(obj->bind_count);
4842                 GEM_BUG_ON(obj->userfault_count);
4843                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4844                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4845
4846                 if (obj->ops->release)
4847                         obj->ops->release(obj);
4848
4849                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4850                         atomic_set(&obj->mm.pages_pin_count, 0);
4851                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4852                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4853
4854                 if (obj->base.import_attach)
4855                         drm_prime_gem_destroy(&obj->base, NULL);
4856
4857                 reservation_object_fini(&obj->__builtin_resv);
4858                 drm_gem_object_release(&obj->base);
4859                 i915_gem_info_remove_obj(i915, obj->base.size);
4860
4861                 kfree(obj->bit_17);
4862                 i915_gem_object_free(obj);
4863
4864                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4865                 atomic_dec(&i915->mm.free_count);
4866
4867                 if (on)
4868                         cond_resched();
4869         }
4870         intel_runtime_pm_put(i915);
4871 }
4872
4873 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4874 {
4875         struct llist_node *freed;
4876
4877         /* Free the oldest, most stale object to keep the free_list short */
4878         freed = NULL;
4879         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4880                 /* Only one consumer of llist_del_first() allowed */
4881                 spin_lock(&i915->mm.free_lock);
4882                 freed = llist_del_first(&i915->mm.free_list);
4883                 spin_unlock(&i915->mm.free_lock);
4884         }
4885         if (unlikely(freed)) {
4886                 freed->next = NULL;
4887                 __i915_gem_free_objects(i915, freed);
4888         }
4889 }
4890
4891 static void __i915_gem_free_work(struct work_struct *work)
4892 {
4893         struct drm_i915_private *i915 =
4894                 container_of(work, struct drm_i915_private, mm.free_work);
4895         struct llist_node *freed;
4896
4897         /*
4898          * All file-owned VMA should have been released by this point through
4899          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4900          * However, the object may also be bound into the global GTT (e.g.
4901          * older GPUs without per-process support, or for direct access through
4902          * the GTT either for the user or for scanout). Those VMA still need to
4903          * unbound now.
4904          */
4905
4906         spin_lock(&i915->mm.free_lock);
4907         while ((freed = llist_del_all(&i915->mm.free_list))) {
4908                 spin_unlock(&i915->mm.free_lock);
4909
4910                 __i915_gem_free_objects(i915, freed);
4911                 if (need_resched())
4912                         return;
4913
4914                 spin_lock(&i915->mm.free_lock);
4915         }
4916         spin_unlock(&i915->mm.free_lock);
4917 }
4918
4919 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4920 {
4921         struct drm_i915_gem_object *obj =
4922                 container_of(head, typeof(*obj), rcu);
4923         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4924
4925         /*
4926          * Since we require blocking on struct_mutex to unbind the freed
4927          * object from the GPU before releasing resources back to the
4928          * system, we can not do that directly from the RCU callback (which may
4929          * be a softirq context), but must instead then defer that work onto a
4930          * kthread. We use the RCU callback rather than move the freed object
4931          * directly onto the work queue so that we can mix between using the
4932          * worker and performing frees directly from subsequent allocations for
4933          * crude but effective memory throttling.
4934          */
4935         if (llist_add(&obj->freed, &i915->mm.free_list))
4936                 queue_work(i915->wq, &i915->mm.free_work);
4937 }
4938
4939 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4940 {
4941         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4942
4943         if (obj->mm.quirked)
4944                 __i915_gem_object_unpin_pages(obj);
4945
4946         if (discard_backing_storage(obj))
4947                 obj->mm.madv = I915_MADV_DONTNEED;
4948
4949         /*
4950          * Before we free the object, make sure any pure RCU-only
4951          * read-side critical sections are complete, e.g.
4952          * i915_gem_busy_ioctl(). For the corresponding synchronized
4953          * lookup see i915_gem_object_lookup_rcu().
4954          */
4955         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4956         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4957 }
4958
4959 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4960 {
4961         lockdep_assert_held(&obj->base.dev->struct_mutex);
4962
4963         if (!i915_gem_object_has_active_reference(obj) &&
4964             i915_gem_object_is_active(obj))
4965                 i915_gem_object_set_active_reference(obj);
4966         else
4967                 i915_gem_object_put(obj);
4968 }
4969
4970 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
4971 {
4972         struct i915_gem_context *kernel_context = i915->kernel_context;
4973         struct intel_engine_cs *engine;
4974         enum intel_engine_id id;
4975
4976         for_each_engine(engine, i915, id) {
4977                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
4978                 GEM_BUG_ON(engine->last_retired_context != kernel_context);
4979         }
4980 }
4981
4982 void i915_gem_sanitize(struct drm_i915_private *i915)
4983 {
4984         if (i915_terminally_wedged(&i915->gpu_error)) {
4985                 mutex_lock(&i915->drm.struct_mutex);
4986                 i915_gem_unset_wedged(i915);
4987                 mutex_unlock(&i915->drm.struct_mutex);
4988         }
4989
4990         /*
4991          * If we inherit context state from the BIOS or earlier occupants
4992          * of the GPU, the GPU may be in an inconsistent state when we
4993          * try to take over. The only way to remove the earlier state
4994          * is by resetting. However, resetting on earlier gen is tricky as
4995          * it may impact the display and we are uncertain about the stability
4996          * of the reset, so this could be applied to even earlier gen.
4997          */
4998         if (INTEL_GEN(i915) >= 5 && intel_has_gpu_reset(i915))
4999                 WARN_ON(intel_gpu_reset(i915, ALL_ENGINES));
5000 }
5001
5002 int i915_gem_suspend(struct drm_i915_private *dev_priv)
5003 {
5004         struct drm_device *dev = &dev_priv->drm;
5005         int ret;
5006
5007         intel_runtime_pm_get(dev_priv);
5008         intel_suspend_gt_powersave(dev_priv);
5009
5010         mutex_lock(&dev->struct_mutex);
5011
5012         /* We have to flush all the executing contexts to main memory so
5013          * that they can saved in the hibernation image. To ensure the last
5014          * context image is coherent, we have to switch away from it. That
5015          * leaves the dev_priv->kernel_context still active when
5016          * we actually suspend, and its image in memory may not match the GPU
5017          * state. Fortunately, the kernel_context is disposable and we do
5018          * not rely on its state.
5019          */
5020         if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5021                 ret = i915_gem_switch_to_kernel_context(dev_priv);
5022                 if (ret)
5023                         goto err_unlock;
5024
5025                 ret = i915_gem_wait_for_idle(dev_priv,
5026                                              I915_WAIT_INTERRUPTIBLE |
5027                                              I915_WAIT_LOCKED);
5028                 if (ret && ret != -EIO)
5029                         goto err_unlock;
5030
5031                 assert_kernel_context_is_current(dev_priv);
5032         }
5033         i915_gem_contexts_lost(dev_priv);
5034         mutex_unlock(&dev->struct_mutex);
5035
5036         intel_uc_suspend(dev_priv);
5037
5038         cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
5039         cancel_delayed_work_sync(&dev_priv->gt.retire_work);
5040
5041         /* As the idle_work is rearming if it detects a race, play safe and
5042          * repeat the flush until it is definitely idle.
5043          */
5044         drain_delayed_work(&dev_priv->gt.idle_work);
5045
5046         /* Assert that we sucessfully flushed all the work and
5047          * reset the GPU back to its idle, low power state.
5048          */
5049         WARN_ON(dev_priv->gt.awake);
5050         if (WARN_ON(!intel_engines_are_idle(dev_priv)))
5051                 i915_gem_set_wedged(dev_priv); /* no hope, discard everything */
5052
5053         /*
5054          * Neither the BIOS, ourselves or any other kernel
5055          * expects the system to be in execlists mode on startup,
5056          * so we need to reset the GPU back to legacy mode. And the only
5057          * known way to disable logical contexts is through a GPU reset.
5058          *
5059          * So in order to leave the system in a known default configuration,
5060          * always reset the GPU upon unload and suspend. Afterwards we then
5061          * clean up the GEM state tracking, flushing off the requests and
5062          * leaving the system in a known idle state.
5063          *
5064          * Note that is of the upmost importance that the GPU is idle and
5065          * all stray writes are flushed *before* we dismantle the backing
5066          * storage for the pinned objects.
5067          *
5068          * However, since we are uncertain that resetting the GPU on older
5069          * machines is a good idea, we don't - just in case it leaves the
5070          * machine in an unusable condition.
5071          */
5072         intel_uc_sanitize(dev_priv);
5073         i915_gem_sanitize(dev_priv);
5074
5075         intel_runtime_pm_put(dev_priv);
5076         return 0;
5077
5078 err_unlock:
5079         mutex_unlock(&dev->struct_mutex);
5080         intel_runtime_pm_put(dev_priv);
5081         return ret;
5082 }
5083
5084 void i915_gem_resume(struct drm_i915_private *i915)
5085 {
5086         WARN_ON(i915->gt.awake);
5087
5088         mutex_lock(&i915->drm.struct_mutex);
5089         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5090
5091         i915_gem_restore_gtt_mappings(i915);
5092         i915_gem_restore_fences(i915);
5093
5094         /*
5095          * As we didn't flush the kernel context before suspend, we cannot
5096          * guarantee that the context image is complete. So let's just reset
5097          * it and start again.
5098          */
5099         i915->gt.resume(i915);
5100
5101         if (i915_gem_init_hw(i915))
5102                 goto err_wedged;
5103
5104         intel_uc_resume(i915);
5105
5106         /* Always reload a context for powersaving. */
5107         if (i915_gem_switch_to_kernel_context(i915))
5108                 goto err_wedged;
5109
5110 out_unlock:
5111         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5112         mutex_unlock(&i915->drm.struct_mutex);
5113         return;
5114
5115 err_wedged:
5116         if (!i915_terminally_wedged(&i915->gpu_error)) {
5117                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5118                 i915_gem_set_wedged(i915);
5119         }
5120         goto out_unlock;
5121 }
5122
5123 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5124 {
5125         if (INTEL_GEN(dev_priv) < 5 ||
5126             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5127                 return;
5128
5129         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5130                                  DISP_TILE_SURFACE_SWIZZLING);
5131
5132         if (IS_GEN5(dev_priv))
5133                 return;
5134
5135         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5136         if (IS_GEN6(dev_priv))
5137                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5138         else if (IS_GEN7(dev_priv))
5139                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5140         else if (IS_GEN8(dev_priv))
5141                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5142         else
5143                 BUG();
5144 }
5145
5146 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5147 {
5148         I915_WRITE(RING_CTL(base), 0);
5149         I915_WRITE(RING_HEAD(base), 0);
5150         I915_WRITE(RING_TAIL(base), 0);
5151         I915_WRITE(RING_START(base), 0);
5152 }
5153
5154 static void init_unused_rings(struct drm_i915_private *dev_priv)
5155 {
5156         if (IS_I830(dev_priv)) {
5157                 init_unused_ring(dev_priv, PRB1_BASE);
5158                 init_unused_ring(dev_priv, SRB0_BASE);
5159                 init_unused_ring(dev_priv, SRB1_BASE);
5160                 init_unused_ring(dev_priv, SRB2_BASE);
5161                 init_unused_ring(dev_priv, SRB3_BASE);
5162         } else if (IS_GEN2(dev_priv)) {
5163                 init_unused_ring(dev_priv, SRB0_BASE);
5164                 init_unused_ring(dev_priv, SRB1_BASE);
5165         } else if (IS_GEN3(dev_priv)) {
5166                 init_unused_ring(dev_priv, PRB1_BASE);
5167                 init_unused_ring(dev_priv, PRB2_BASE);
5168         }
5169 }
5170
5171 static int __i915_gem_restart_engines(void *data)
5172 {
5173         struct drm_i915_private *i915 = data;
5174         struct intel_engine_cs *engine;
5175         enum intel_engine_id id;
5176         int err;
5177
5178         for_each_engine(engine, i915, id) {
5179                 err = engine->init_hw(engine);
5180                 if (err) {
5181                         DRM_ERROR("Failed to restart %s (%d)\n",
5182                                   engine->name, err);
5183                         return err;
5184                 }
5185         }
5186
5187         return 0;
5188 }
5189
5190 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5191 {
5192         int ret;
5193
5194         dev_priv->gt.last_init_time = ktime_get();
5195
5196         /* Double layer security blanket, see i915_gem_init() */
5197         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5198
5199         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5200                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5201
5202         if (IS_HASWELL(dev_priv))
5203                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5204                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5205
5206         if (HAS_PCH_NOP(dev_priv)) {
5207                 if (IS_IVYBRIDGE(dev_priv)) {
5208                         u32 temp = I915_READ(GEN7_MSG_CTL);
5209                         temp &= ~(WAIT_FOR_PCH_FLR_ACK | WAIT_FOR_PCH_RESET_ACK);
5210                         I915_WRITE(GEN7_MSG_CTL, temp);
5211                 } else if (INTEL_GEN(dev_priv) >= 7) {
5212                         u32 temp = I915_READ(HSW_NDE_RSTWRN_OPT);
5213                         temp &= ~RESET_PCH_HANDSHAKE_ENABLE;
5214                         I915_WRITE(HSW_NDE_RSTWRN_OPT, temp);
5215                 }
5216         }
5217
5218         intel_gt_workarounds_apply(dev_priv);
5219
5220         i915_gem_init_swizzling(dev_priv);
5221
5222         /*
5223          * At least 830 can leave some of the unused rings
5224          * "active" (ie. head != tail) after resume which
5225          * will prevent c3 entry. Makes sure all unused rings
5226          * are totally idle.
5227          */
5228         init_unused_rings(dev_priv);
5229
5230         BUG_ON(!dev_priv->kernel_context);
5231         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5232                 ret = -EIO;
5233                 goto out;
5234         }
5235
5236         ret = i915_ppgtt_init_hw(dev_priv);
5237         if (ret) {
5238                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5239                 goto out;
5240         }
5241
5242         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5243         if (ret) {
5244                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5245                 goto out;
5246         }
5247
5248         /* We can't enable contexts until all firmware is loaded */
5249         ret = intel_uc_init_hw(dev_priv);
5250         if (ret) {
5251                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5252                 goto out;
5253         }
5254
5255         intel_mocs_init_l3cc_table(dev_priv);
5256
5257         /* Only when the HW is re-initialised, can we replay the requests */
5258         ret = __i915_gem_restart_engines(dev_priv);
5259 out:
5260         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5261         return ret;
5262 }
5263
5264 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5265 {
5266         struct i915_gem_context *ctx;
5267         struct intel_engine_cs *engine;
5268         enum intel_engine_id id;
5269         int err;
5270
5271         /*
5272          * As we reset the gpu during very early sanitisation, the current
5273          * register state on the GPU should reflect its defaults values.
5274          * We load a context onto the hw (with restore-inhibit), then switch
5275          * over to a second context to save that default register state. We
5276          * can then prime every new context with that state so they all start
5277          * from the same default HW values.
5278          */
5279
5280         ctx = i915_gem_context_create_kernel(i915, 0);
5281         if (IS_ERR(ctx))
5282                 return PTR_ERR(ctx);
5283
5284         for_each_engine(engine, i915, id) {
5285                 struct i915_request *rq;
5286
5287                 rq = i915_request_alloc(engine, ctx);
5288                 if (IS_ERR(rq)) {
5289                         err = PTR_ERR(rq);
5290                         goto out_ctx;
5291                 }
5292
5293                 err = 0;
5294                 if (engine->init_context)
5295                         err = engine->init_context(rq);
5296
5297                 __i915_request_add(rq, true);
5298                 if (err)
5299                         goto err_active;
5300         }
5301
5302         err = i915_gem_switch_to_kernel_context(i915);
5303         if (err)
5304                 goto err_active;
5305
5306         err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
5307         if (err)
5308                 goto err_active;
5309
5310         assert_kernel_context_is_current(i915);
5311
5312         for_each_engine(engine, i915, id) {
5313                 struct i915_vma *state;
5314
5315                 state = to_intel_context(ctx, engine)->state;
5316                 if (!state)
5317                         continue;
5318
5319                 /*
5320                  * As we will hold a reference to the logical state, it will
5321                  * not be torn down with the context, and importantly the
5322                  * object will hold onto its vma (making it possible for a
5323                  * stray GTT write to corrupt our defaults). Unmap the vma
5324                  * from the GTT to prevent such accidents and reclaim the
5325                  * space.
5326                  */
5327                 err = i915_vma_unbind(state);
5328                 if (err)
5329                         goto err_active;
5330
5331                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5332                 if (err)
5333                         goto err_active;
5334
5335                 engine->default_state = i915_gem_object_get(state->obj);
5336         }
5337
5338         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5339                 unsigned int found = intel_engines_has_context_isolation(i915);
5340
5341                 /*
5342                  * Make sure that classes with multiple engine instances all
5343                  * share the same basic configuration.
5344                  */
5345                 for_each_engine(engine, i915, id) {
5346                         unsigned int bit = BIT(engine->uabi_class);
5347                         unsigned int expected = engine->default_state ? bit : 0;
5348
5349                         if ((found & bit) != expected) {
5350                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5351                                           engine->uabi_class, engine->name);
5352                         }
5353                 }
5354         }
5355
5356 out_ctx:
5357         i915_gem_context_set_closed(ctx);
5358         i915_gem_context_put(ctx);
5359         return err;
5360
5361 err_active:
5362         /*
5363          * If we have to abandon now, we expect the engines to be idle
5364          * and ready to be torn-down. First try to flush any remaining
5365          * request, ensure we are pointing at the kernel context and
5366          * then remove it.
5367          */
5368         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5369                 goto out_ctx;
5370
5371         if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
5372                 goto out_ctx;
5373
5374         i915_gem_contexts_lost(i915);
5375         goto out_ctx;
5376 }
5377
5378 int i915_gem_init(struct drm_i915_private *dev_priv)
5379 {
5380         int ret;
5381
5382         /*
5383          * We need to fallback to 4K pages since gvt gtt handling doesn't
5384          * support huge page entries - we will need to check either hypervisor
5385          * mm can support huge guest page or just do emulation in gvt.
5386          */
5387         if (intel_vgpu_active(dev_priv))
5388                 mkwrite_device_info(dev_priv)->page_sizes =
5389                         I915_GTT_PAGE_SIZE_4K;
5390
5391         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5392
5393         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5394                 dev_priv->gt.resume = intel_lr_context_resume;
5395                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5396         } else {
5397                 dev_priv->gt.resume = intel_legacy_submission_resume;
5398                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5399         }
5400
5401         ret = i915_gem_init_userptr(dev_priv);
5402         if (ret)
5403                 return ret;
5404
5405         ret = intel_wopcm_init(&dev_priv->wopcm);
5406         if (ret)
5407                 return ret;
5408
5409         ret = intel_uc_init_misc(dev_priv);
5410         if (ret)
5411                 return ret;
5412
5413         /* This is just a security blanket to placate dragons.
5414          * On some systems, we very sporadically observe that the first TLBs
5415          * used by the CS may be stale, despite us poking the TLB reset. If
5416          * we hold the forcewake during initialisation these problems
5417          * just magically go away.
5418          */
5419         mutex_lock(&dev_priv->drm.struct_mutex);
5420         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5421
5422         ret = i915_gem_init_ggtt(dev_priv);
5423         if (ret) {
5424                 GEM_BUG_ON(ret == -EIO);
5425                 goto err_unlock;
5426         }
5427
5428         ret = i915_gem_contexts_init(dev_priv);
5429         if (ret) {
5430                 GEM_BUG_ON(ret == -EIO);
5431                 goto err_ggtt;
5432         }
5433
5434         ret = intel_engines_init(dev_priv);
5435         if (ret) {
5436                 GEM_BUG_ON(ret == -EIO);
5437                 goto err_context;
5438         }
5439
5440         intel_init_gt_powersave(dev_priv);
5441
5442         ret = intel_uc_init(dev_priv);
5443         if (ret)
5444                 goto err_pm;
5445
5446         ret = i915_gem_init_hw(dev_priv);
5447         if (ret)
5448                 goto err_uc_init;
5449
5450         /*
5451          * Despite its name intel_init_clock_gating applies both display
5452          * clock gating workarounds; GT mmio workarounds and the occasional
5453          * GT power context workaround. Worse, sometimes it includes a context
5454          * register workaround which we need to apply before we record the
5455          * default HW state for all contexts.
5456          *
5457          * FIXME: break up the workarounds and apply them at the right time!
5458          */
5459         intel_init_clock_gating(dev_priv);
5460
5461         ret = __intel_engines_record_defaults(dev_priv);
5462         if (ret)
5463                 goto err_init_hw;
5464
5465         if (i915_inject_load_failure()) {
5466                 ret = -ENODEV;
5467                 goto err_init_hw;
5468         }
5469
5470         if (i915_inject_load_failure()) {
5471                 ret = -EIO;
5472                 goto err_init_hw;
5473         }
5474
5475         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5476         mutex_unlock(&dev_priv->drm.struct_mutex);
5477
5478         return 0;
5479
5480         /*
5481          * Unwinding is complicated by that we want to handle -EIO to mean
5482          * disable GPU submission but keep KMS alive. We want to mark the
5483          * HW as irrevisibly wedged, but keep enough state around that the
5484          * driver doesn't explode during runtime.
5485          */
5486 err_init_hw:
5487         i915_gem_wait_for_idle(dev_priv, I915_WAIT_LOCKED);
5488         i915_gem_contexts_lost(dev_priv);
5489         intel_uc_fini_hw(dev_priv);
5490 err_uc_init:
5491         intel_uc_fini(dev_priv);
5492 err_pm:
5493         if (ret != -EIO) {
5494                 intel_cleanup_gt_powersave(dev_priv);
5495                 i915_gem_cleanup_engines(dev_priv);
5496         }
5497 err_context:
5498         if (ret != -EIO)
5499                 i915_gem_contexts_fini(dev_priv);
5500 err_ggtt:
5501 err_unlock:
5502         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5503         mutex_unlock(&dev_priv->drm.struct_mutex);
5504
5505         intel_uc_fini_misc(dev_priv);
5506
5507         if (ret != -EIO)
5508                 i915_gem_cleanup_userptr(dev_priv);
5509
5510         if (ret == -EIO) {
5511                 /*
5512                  * Allow engine initialisation to fail by marking the GPU as
5513                  * wedged. But we only want to do this where the GPU is angry,
5514                  * for all other failure, such as an allocation failure, bail.
5515                  */
5516                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5517                         DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
5518                         i915_gem_set_wedged(dev_priv);
5519                 }
5520                 ret = 0;
5521         }
5522
5523         i915_gem_drain_freed_objects(dev_priv);
5524         return ret;
5525 }
5526
5527 void i915_gem_init_mmio(struct drm_i915_private *i915)
5528 {
5529         i915_gem_sanitize(i915);
5530 }
5531
5532 void
5533 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5534 {
5535         struct intel_engine_cs *engine;
5536         enum intel_engine_id id;
5537
5538         for_each_engine(engine, dev_priv, id)
5539                 dev_priv->gt.cleanup_engine(engine);
5540 }
5541
5542 void
5543 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5544 {
5545         int i;
5546
5547         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5548             !IS_CHERRYVIEW(dev_priv))
5549                 dev_priv->num_fence_regs = 32;
5550         else if (INTEL_GEN(dev_priv) >= 4 ||
5551                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5552                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5553                 dev_priv->num_fence_regs = 16;
5554         else
5555                 dev_priv->num_fence_regs = 8;
5556
5557         if (intel_vgpu_active(dev_priv))
5558                 dev_priv->num_fence_regs =
5559                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5560
5561         /* Initialize fence registers to zero */
5562         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5563                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5564
5565                 fence->i915 = dev_priv;
5566                 fence->id = i;
5567                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5568         }
5569         i915_gem_restore_fences(dev_priv);
5570
5571         i915_gem_detect_bit_6_swizzle(dev_priv);
5572 }
5573
5574 static void i915_gem_init__mm(struct drm_i915_private *i915)
5575 {
5576         spin_lock_init(&i915->mm.object_stat_lock);
5577         spin_lock_init(&i915->mm.obj_lock);
5578         spin_lock_init(&i915->mm.free_lock);
5579
5580         init_llist_head(&i915->mm.free_list);
5581
5582         INIT_LIST_HEAD(&i915->mm.unbound_list);
5583         INIT_LIST_HEAD(&i915->mm.bound_list);
5584         INIT_LIST_HEAD(&i915->mm.fence_list);
5585         INIT_LIST_HEAD(&i915->mm.userfault_list);
5586
5587         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5588 }
5589
5590 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5591 {
5592         int err = -ENOMEM;
5593
5594         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5595         if (!dev_priv->objects)
5596                 goto err_out;
5597
5598         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5599         if (!dev_priv->vmas)
5600                 goto err_objects;
5601
5602         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5603         if (!dev_priv->luts)
5604                 goto err_vmas;
5605
5606         dev_priv->requests = KMEM_CACHE(i915_request,
5607                                         SLAB_HWCACHE_ALIGN |
5608                                         SLAB_RECLAIM_ACCOUNT |
5609                                         SLAB_TYPESAFE_BY_RCU);
5610         if (!dev_priv->requests)
5611                 goto err_luts;
5612
5613         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5614                                             SLAB_HWCACHE_ALIGN |
5615                                             SLAB_RECLAIM_ACCOUNT);
5616         if (!dev_priv->dependencies)
5617                 goto err_requests;
5618
5619         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5620         if (!dev_priv->priorities)
5621                 goto err_dependencies;
5622
5623         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5624         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5625         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5626
5627         i915_gem_init__mm(dev_priv);
5628
5629         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5630                           i915_gem_retire_work_handler);
5631         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5632                           i915_gem_idle_work_handler);
5633         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5634         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5635
5636         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5637
5638         spin_lock_init(&dev_priv->fb_tracking.lock);
5639
5640         err = i915_gemfs_init(dev_priv);
5641         if (err)
5642                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5643
5644         return 0;
5645
5646 err_dependencies:
5647         kmem_cache_destroy(dev_priv->dependencies);
5648 err_requests:
5649         kmem_cache_destroy(dev_priv->requests);
5650 err_luts:
5651         kmem_cache_destroy(dev_priv->luts);
5652 err_vmas:
5653         kmem_cache_destroy(dev_priv->vmas);
5654 err_objects:
5655         kmem_cache_destroy(dev_priv->objects);
5656 err_out:
5657         return err;
5658 }
5659
5660 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5661 {
5662         i915_gem_drain_freed_objects(dev_priv);
5663         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5664         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5665         WARN_ON(dev_priv->mm.object_count);
5666         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5667
5668         kmem_cache_destroy(dev_priv->priorities);
5669         kmem_cache_destroy(dev_priv->dependencies);
5670         kmem_cache_destroy(dev_priv->requests);
5671         kmem_cache_destroy(dev_priv->luts);
5672         kmem_cache_destroy(dev_priv->vmas);
5673         kmem_cache_destroy(dev_priv->objects);
5674
5675         /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
5676         rcu_barrier();
5677
5678         i915_gemfs_fini(dev_priv);
5679 }
5680
5681 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5682 {
5683         /* Discard all purgeable objects, let userspace recover those as
5684          * required after resuming.
5685          */
5686         i915_gem_shrink_all(dev_priv);
5687
5688         return 0;
5689 }
5690
5691 int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
5692 {
5693         struct drm_i915_gem_object *obj;
5694         struct list_head *phases[] = {
5695                 &dev_priv->mm.unbound_list,
5696                 &dev_priv->mm.bound_list,
5697                 NULL
5698         }, **p;
5699
5700         /* Called just before we write the hibernation image.
5701          *
5702          * We need to update the domain tracking to reflect that the CPU
5703          * will be accessing all the pages to create and restore from the
5704          * hibernation, and so upon restoration those pages will be in the
5705          * CPU domain.
5706          *
5707          * To make sure the hibernation image contains the latest state,
5708          * we update that state just before writing out the image.
5709          *
5710          * To try and reduce the hibernation image, we manually shrink
5711          * the objects as well, see i915_gem_freeze()
5712          */
5713
5714         i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND);
5715         i915_gem_drain_freed_objects(dev_priv);
5716
5717         spin_lock(&dev_priv->mm.obj_lock);
5718         for (p = phases; *p; p++) {
5719                 list_for_each_entry(obj, *p, mm.link)
5720                         __start_cpu_write(obj);
5721         }
5722         spin_unlock(&dev_priv->mm.obj_lock);
5723
5724         return 0;
5725 }
5726
5727 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5728 {
5729         struct drm_i915_file_private *file_priv = file->driver_priv;
5730         struct i915_request *request;
5731
5732         /* Clean up our request list when the client is going away, so that
5733          * later retire_requests won't dereference our soon-to-be-gone
5734          * file_priv.
5735          */
5736         spin_lock(&file_priv->mm.lock);
5737         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5738                 request->file_priv = NULL;
5739         spin_unlock(&file_priv->mm.lock);
5740 }
5741
5742 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5743 {
5744         struct drm_i915_file_private *file_priv;
5745         int ret;
5746
5747         DRM_DEBUG("\n");
5748
5749         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5750         if (!file_priv)
5751                 return -ENOMEM;
5752
5753         file->driver_priv = file_priv;
5754         file_priv->dev_priv = i915;
5755         file_priv->file = file;
5756
5757         spin_lock_init(&file_priv->mm.lock);
5758         INIT_LIST_HEAD(&file_priv->mm.request_list);
5759
5760         file_priv->bsd_engine = -1;
5761         file_priv->hang_timestamp = jiffies;
5762
5763         ret = i915_gem_context_open(i915, file);
5764         if (ret)
5765                 kfree(file_priv);
5766
5767         return ret;
5768 }
5769
5770 /**
5771  * i915_gem_track_fb - update frontbuffer tracking
5772  * @old: current GEM buffer for the frontbuffer slots
5773  * @new: new GEM buffer for the frontbuffer slots
5774  * @frontbuffer_bits: bitmask of frontbuffer slots
5775  *
5776  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5777  * from @old and setting them in @new. Both @old and @new can be NULL.
5778  */
5779 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5780                        struct drm_i915_gem_object *new,
5781                        unsigned frontbuffer_bits)
5782 {
5783         /* Control of individual bits within the mask are guarded by
5784          * the owning plane->mutex, i.e. we can never see concurrent
5785          * manipulation of individual bits. But since the bitfield as a whole
5786          * is updated using RMW, we need to use atomics in order to update
5787          * the bits.
5788          */
5789         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5790                      sizeof(atomic_t) * BITS_PER_BYTE);
5791
5792         if (old) {
5793                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5794                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5795         }
5796
5797         if (new) {
5798                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5799                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5800         }
5801 }
5802
5803 /* Allocate a new GEM object and fill it with the supplied data */
5804 struct drm_i915_gem_object *
5805 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5806                                  const void *data, size_t size)
5807 {
5808         struct drm_i915_gem_object *obj;
5809         struct file *file;
5810         size_t offset;
5811         int err;
5812
5813         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5814         if (IS_ERR(obj))
5815                 return obj;
5816
5817         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5818
5819         file = obj->base.filp;
5820         offset = 0;
5821         do {
5822                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5823                 struct page *page;
5824                 void *pgdata, *vaddr;
5825
5826                 err = pagecache_write_begin(file, file->f_mapping,
5827                                             offset, len, 0,
5828                                             &page, &pgdata);
5829                 if (err < 0)
5830                         goto fail;
5831
5832                 vaddr = kmap(page);
5833                 memcpy(vaddr, data, len);
5834                 kunmap(page);
5835
5836                 err = pagecache_write_end(file, file->f_mapping,
5837                                           offset, len, len,
5838                                           page, pgdata);
5839                 if (err < 0)
5840                         goto fail;
5841
5842                 size -= len;
5843                 data += len;
5844                 offset += len;
5845         } while (size);
5846
5847         return obj;
5848
5849 fail:
5850         i915_gem_object_put(obj);
5851         return ERR_PTR(err);
5852 }
5853
5854 struct scatterlist *
5855 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5856                        unsigned int n,
5857                        unsigned int *offset)
5858 {
5859         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5860         struct scatterlist *sg;
5861         unsigned int idx, count;
5862
5863         might_sleep();
5864         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5865         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5866
5867         /* As we iterate forward through the sg, we record each entry in a
5868          * radixtree for quick repeated (backwards) lookups. If we have seen
5869          * this index previously, we will have an entry for it.
5870          *
5871          * Initial lookup is O(N), but this is amortized to O(1) for
5872          * sequential page access (where each new request is consecutive
5873          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5874          * i.e. O(1) with a large constant!
5875          */
5876         if (n < READ_ONCE(iter->sg_idx))
5877                 goto lookup;
5878
5879         mutex_lock(&iter->lock);
5880
5881         /* We prefer to reuse the last sg so that repeated lookup of this
5882          * (or the subsequent) sg are fast - comparing against the last
5883          * sg is faster than going through the radixtree.
5884          */
5885
5886         sg = iter->sg_pos;
5887         idx = iter->sg_idx;
5888         count = __sg_page_count(sg);
5889
5890         while (idx + count <= n) {
5891                 unsigned long exception, i;
5892                 int ret;
5893
5894                 /* If we cannot allocate and insert this entry, or the
5895                  * individual pages from this range, cancel updating the
5896                  * sg_idx so that on this lookup we are forced to linearly
5897                  * scan onwards, but on future lookups we will try the
5898                  * insertion again (in which case we need to be careful of
5899                  * the error return reporting that we have already inserted
5900                  * this index).
5901                  */
5902                 ret = radix_tree_insert(&iter->radix, idx, sg);
5903                 if (ret && ret != -EEXIST)
5904                         goto scan;
5905
5906                 exception =
5907                         RADIX_TREE_EXCEPTIONAL_ENTRY |
5908                         idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
5909                 for (i = 1; i < count; i++) {
5910                         ret = radix_tree_insert(&iter->radix, idx + i,
5911                                                 (void *)exception);
5912                         if (ret && ret != -EEXIST)
5913                                 goto scan;
5914                 }
5915
5916                 idx += count;
5917                 sg = ____sg_next(sg);
5918                 count = __sg_page_count(sg);
5919         }
5920
5921 scan:
5922         iter->sg_pos = sg;
5923         iter->sg_idx = idx;
5924
5925         mutex_unlock(&iter->lock);
5926
5927         if (unlikely(n < idx)) /* insertion completed by another thread */
5928                 goto lookup;
5929
5930         /* In case we failed to insert the entry into the radixtree, we need
5931          * to look beyond the current sg.
5932          */
5933         while (idx + count <= n) {
5934                 idx += count;
5935                 sg = ____sg_next(sg);
5936                 count = __sg_page_count(sg);
5937         }
5938
5939         *offset = n - idx;
5940         return sg;
5941
5942 lookup:
5943         rcu_read_lock();
5944
5945         sg = radix_tree_lookup(&iter->radix, n);
5946         GEM_BUG_ON(!sg);
5947
5948         /* If this index is in the middle of multi-page sg entry,
5949          * the radixtree will contain an exceptional entry that points
5950          * to the start of that range. We will return the pointer to
5951          * the base page and the offset of this page within the
5952          * sg entry's range.
5953          */
5954         *offset = 0;
5955         if (unlikely(radix_tree_exception(sg))) {
5956                 unsigned long base =
5957                         (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
5958
5959                 sg = radix_tree_lookup(&iter->radix, base);
5960                 GEM_BUG_ON(!sg);
5961
5962                 *offset = n - base;
5963         }
5964
5965         rcu_read_unlock();
5966
5967         return sg;
5968 }
5969
5970 struct page *
5971 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
5972 {
5973         struct scatterlist *sg;
5974         unsigned int offset;
5975
5976         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
5977
5978         sg = i915_gem_object_get_sg(obj, n, &offset);
5979         return nth_page(sg_page(sg), offset);
5980 }
5981
5982 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
5983 struct page *
5984 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
5985                                unsigned int n)
5986 {
5987         struct page *page;
5988
5989         page = i915_gem_object_get_page(obj, n);
5990         if (!obj->mm.dirty)
5991                 set_page_dirty(page);
5992
5993         return page;
5994 }
5995
5996 dma_addr_t
5997 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
5998                                 unsigned long n)
5999 {
6000         struct scatterlist *sg;
6001         unsigned int offset;
6002
6003         sg = i915_gem_object_get_sg(obj, n, &offset);
6004         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
6005 }
6006
6007 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
6008 {
6009         struct sg_table *pages;
6010         int err;
6011
6012         if (align > obj->base.size)
6013                 return -EINVAL;
6014
6015         if (obj->ops == &i915_gem_phys_ops)
6016                 return 0;
6017
6018         if (obj->ops != &i915_gem_object_ops)
6019                 return -EINVAL;
6020
6021         err = i915_gem_object_unbind(obj);
6022         if (err)
6023                 return err;
6024
6025         mutex_lock(&obj->mm.lock);
6026
6027         if (obj->mm.madv != I915_MADV_WILLNEED) {
6028                 err = -EFAULT;
6029                 goto err_unlock;
6030         }
6031
6032         if (obj->mm.quirked) {
6033                 err = -EFAULT;
6034                 goto err_unlock;
6035         }
6036
6037         if (obj->mm.mapping) {
6038                 err = -EBUSY;
6039                 goto err_unlock;
6040         }
6041
6042         pages = fetch_and_zero(&obj->mm.pages);
6043         if (pages) {
6044                 struct drm_i915_private *i915 = to_i915(obj->base.dev);
6045
6046                 __i915_gem_object_reset_page_iter(obj);
6047
6048                 spin_lock(&i915->mm.obj_lock);
6049                 list_del(&obj->mm.link);
6050                 spin_unlock(&i915->mm.obj_lock);
6051         }
6052
6053         obj->ops = &i915_gem_phys_ops;
6054
6055         err = ____i915_gem_object_get_pages(obj);
6056         if (err)
6057                 goto err_xfer;
6058
6059         /* Perma-pin (until release) the physical set of pages */
6060         __i915_gem_object_pin_pages(obj);
6061
6062         if (!IS_ERR_OR_NULL(pages))
6063                 i915_gem_object_ops.put_pages(obj, pages);
6064         mutex_unlock(&obj->mm.lock);
6065         return 0;
6066
6067 err_xfer:
6068         obj->ops = &i915_gem_object_ops;
6069         obj->mm.pages = pages;
6070 err_unlock:
6071         mutex_unlock(&obj->mm.lock);
6072         return err;
6073 }
6074
6075 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6076 #include "selftests/scatterlist.c"
6077 #include "selftests/mock_gem_device.c"
6078 #include "selftests/huge_gem_object.c"
6079 #include "selftests/huge_pages.c"
6080 #include "selftests/i915_gem_object.c"
6081 #include "selftests/i915_gem_coherency.c"
6082 #endif