Merge tag 'vfio-v4.18-rc1' of git://github.com/awilliam/linux-vfio
[sfrench/cifs-2.6.git] / drivers / gpu / drm / i915 / i915_gem.c
1 /*
2  * Copyright © 2008-2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27
28 #include <drm/drmP.h>
29 #include <drm/drm_vma_manager.h>
30 #include <drm/i915_drm.h>
31 #include "i915_drv.h"
32 #include "i915_gem_clflush.h"
33 #include "i915_vgpu.h"
34 #include "i915_trace.h"
35 #include "intel_drv.h"
36 #include "intel_frontbuffer.h"
37 #include "intel_mocs.h"
38 #include "intel_workarounds.h"
39 #include "i915_gemfs.h"
40 #include <linux/dma-fence-array.h>
41 #include <linux/kthread.h>
42 #include <linux/reservation.h>
43 #include <linux/shmem_fs.h>
44 #include <linux/slab.h>
45 #include <linux/stop_machine.h>
46 #include <linux/swap.h>
47 #include <linux/pci.h>
48 #include <linux/dma-buf.h>
49
50 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
51
52 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
53 {
54         if (obj->cache_dirty)
55                 return false;
56
57         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
58                 return true;
59
60         return obj->pin_global; /* currently in use by HW, keep flushed */
61 }
62
63 static int
64 insert_mappable_node(struct i915_ggtt *ggtt,
65                      struct drm_mm_node *node, u32 size)
66 {
67         memset(node, 0, sizeof(*node));
68         return drm_mm_insert_node_in_range(&ggtt->base.mm, node,
69                                            size, 0, I915_COLOR_UNEVICTABLE,
70                                            0, ggtt->mappable_end,
71                                            DRM_MM_INSERT_LOW);
72 }
73
74 static void
75 remove_mappable_node(struct drm_mm_node *node)
76 {
77         drm_mm_remove_node(node);
78 }
79
80 /* some bookkeeping */
81 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
82                                   u64 size)
83 {
84         spin_lock(&dev_priv->mm.object_stat_lock);
85         dev_priv->mm.object_count++;
86         dev_priv->mm.object_memory += size;
87         spin_unlock(&dev_priv->mm.object_stat_lock);
88 }
89
90 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
91                                      u64 size)
92 {
93         spin_lock(&dev_priv->mm.object_stat_lock);
94         dev_priv->mm.object_count--;
95         dev_priv->mm.object_memory -= size;
96         spin_unlock(&dev_priv->mm.object_stat_lock);
97 }
98
99 static int
100 i915_gem_wait_for_error(struct i915_gpu_error *error)
101 {
102         int ret;
103
104         might_sleep();
105
106         /*
107          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
108          * userspace. If it takes that long something really bad is going on and
109          * we should simply try to bail out and fail as gracefully as possible.
110          */
111         ret = wait_event_interruptible_timeout(error->reset_queue,
112                                                !i915_reset_backoff(error),
113                                                I915_RESET_TIMEOUT);
114         if (ret == 0) {
115                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
116                 return -EIO;
117         } else if (ret < 0) {
118                 return ret;
119         } else {
120                 return 0;
121         }
122 }
123
124 int i915_mutex_lock_interruptible(struct drm_device *dev)
125 {
126         struct drm_i915_private *dev_priv = to_i915(dev);
127         int ret;
128
129         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
130         if (ret)
131                 return ret;
132
133         ret = mutex_lock_interruptible(&dev->struct_mutex);
134         if (ret)
135                 return ret;
136
137         return 0;
138 }
139
140 static u32 __i915_gem_park(struct drm_i915_private *i915)
141 {
142         lockdep_assert_held(&i915->drm.struct_mutex);
143         GEM_BUG_ON(i915->gt.active_requests);
144         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
145
146         if (!i915->gt.awake)
147                 return I915_EPOCH_INVALID;
148
149         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
150
151         /*
152          * Be paranoid and flush a concurrent interrupt to make sure
153          * we don't reactivate any irq tasklets after parking.
154          *
155          * FIXME: Note that even though we have waited for execlists to be idle,
156          * there may still be an in-flight interrupt even though the CSB
157          * is now empty. synchronize_irq() makes sure that a residual interrupt
158          * is completed before we continue, but it doesn't prevent the HW from
159          * raising a spurious interrupt later. To complete the shield we should
160          * coordinate disabling the CS irq with flushing the interrupts.
161          */
162         synchronize_irq(i915->drm.irq);
163
164         intel_engines_park(i915);
165         i915_timelines_park(i915);
166
167         i915_pmu_gt_parked(i915);
168         i915_vma_parked(i915);
169
170         i915->gt.awake = false;
171
172         if (INTEL_GEN(i915) >= 6)
173                 gen6_rps_idle(i915);
174
175         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);
176
177         intel_runtime_pm_put(i915);
178
179         return i915->gt.epoch;
180 }
181
182 void i915_gem_park(struct drm_i915_private *i915)
183 {
184         lockdep_assert_held(&i915->drm.struct_mutex);
185         GEM_BUG_ON(i915->gt.active_requests);
186
187         if (!i915->gt.awake)
188                 return;
189
190         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
191         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
192 }
193
194 void i915_gem_unpark(struct drm_i915_private *i915)
195 {
196         lockdep_assert_held(&i915->drm.struct_mutex);
197         GEM_BUG_ON(!i915->gt.active_requests);
198
199         if (i915->gt.awake)
200                 return;
201
202         intel_runtime_pm_get_noresume(i915);
203
204         /*
205          * It seems that the DMC likes to transition between the DC states a lot
206          * when there are no connected displays (no active power domains) during
207          * command submission.
208          *
209          * This activity has negative impact on the performance of the chip with
210          * huge latencies observed in the interrupt handler and elsewhere.
211          *
212          * Work around it by grabbing a GT IRQ power domain whilst there is any
213          * GT activity, preventing any DC state transitions.
214          */
215         intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
216
217         i915->gt.awake = true;
218         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
219                 i915->gt.epoch = 1;
220
221         intel_enable_gt_powersave(i915);
222         i915_update_gfx_val(i915);
223         if (INTEL_GEN(i915) >= 6)
224                 gen6_rps_busy(i915);
225         i915_pmu_gt_unparked(i915);
226
227         intel_engines_unpark(i915);
228
229         i915_queue_hangcheck(i915);
230
231         queue_delayed_work(i915->wq,
232                            &i915->gt.retire_work,
233                            round_jiffies_up_relative(HZ));
234 }
235
236 int
237 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
238                             struct drm_file *file)
239 {
240         struct drm_i915_private *dev_priv = to_i915(dev);
241         struct i915_ggtt *ggtt = &dev_priv->ggtt;
242         struct drm_i915_gem_get_aperture *args = data;
243         struct i915_vma *vma;
244         u64 pinned;
245
246         pinned = ggtt->base.reserved;
247         mutex_lock(&dev->struct_mutex);
248         list_for_each_entry(vma, &ggtt->base.active_list, vm_link)
249                 if (i915_vma_is_pinned(vma))
250                         pinned += vma->node.size;
251         list_for_each_entry(vma, &ggtt->base.inactive_list, vm_link)
252                 if (i915_vma_is_pinned(vma))
253                         pinned += vma->node.size;
254         mutex_unlock(&dev->struct_mutex);
255
256         args->aper_size = ggtt->base.total;
257         args->aper_available_size = args->aper_size - pinned;
258
259         return 0;
260 }
261
262 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
263 {
264         struct address_space *mapping = obj->base.filp->f_mapping;
265         drm_dma_handle_t *phys;
266         struct sg_table *st;
267         struct scatterlist *sg;
268         char *vaddr;
269         int i;
270         int err;
271
272         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
273                 return -EINVAL;
274
275         /* Always aligning to the object size, allows a single allocation
276          * to handle all possible callers, and given typical object sizes,
277          * the alignment of the buddy allocation will naturally match.
278          */
279         phys = drm_pci_alloc(obj->base.dev,
280                              roundup_pow_of_two(obj->base.size),
281                              roundup_pow_of_two(obj->base.size));
282         if (!phys)
283                 return -ENOMEM;
284
285         vaddr = phys->vaddr;
286         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
287                 struct page *page;
288                 char *src;
289
290                 page = shmem_read_mapping_page(mapping, i);
291                 if (IS_ERR(page)) {
292                         err = PTR_ERR(page);
293                         goto err_phys;
294                 }
295
296                 src = kmap_atomic(page);
297                 memcpy(vaddr, src, PAGE_SIZE);
298                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
299                 kunmap_atomic(src);
300
301                 put_page(page);
302                 vaddr += PAGE_SIZE;
303         }
304
305         i915_gem_chipset_flush(to_i915(obj->base.dev));
306
307         st = kmalloc(sizeof(*st), GFP_KERNEL);
308         if (!st) {
309                 err = -ENOMEM;
310                 goto err_phys;
311         }
312
313         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
314                 kfree(st);
315                 err = -ENOMEM;
316                 goto err_phys;
317         }
318
319         sg = st->sgl;
320         sg->offset = 0;
321         sg->length = obj->base.size;
322
323         sg_dma_address(sg) = phys->busaddr;
324         sg_dma_len(sg) = obj->base.size;
325
326         obj->phys_handle = phys;
327
328         __i915_gem_object_set_pages(obj, st, sg->length);
329
330         return 0;
331
332 err_phys:
333         drm_pci_free(obj->base.dev, phys);
334
335         return err;
336 }
337
338 static void __start_cpu_write(struct drm_i915_gem_object *obj)
339 {
340         obj->read_domains = I915_GEM_DOMAIN_CPU;
341         obj->write_domain = I915_GEM_DOMAIN_CPU;
342         if (cpu_write_needs_clflush(obj))
343                 obj->cache_dirty = true;
344 }
345
346 static void
347 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
348                                 struct sg_table *pages,
349                                 bool needs_clflush)
350 {
351         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
352
353         if (obj->mm.madv == I915_MADV_DONTNEED)
354                 obj->mm.dirty = false;
355
356         if (needs_clflush &&
357             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
358             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
359                 drm_clflush_sg(pages);
360
361         __start_cpu_write(obj);
362 }
363
364 static void
365 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
366                                struct sg_table *pages)
367 {
368         __i915_gem_object_release_shmem(obj, pages, false);
369
370         if (obj->mm.dirty) {
371                 struct address_space *mapping = obj->base.filp->f_mapping;
372                 char *vaddr = obj->phys_handle->vaddr;
373                 int i;
374
375                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
376                         struct page *page;
377                         char *dst;
378
379                         page = shmem_read_mapping_page(mapping, i);
380                         if (IS_ERR(page))
381                                 continue;
382
383                         dst = kmap_atomic(page);
384                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
385                         memcpy(dst, vaddr, PAGE_SIZE);
386                         kunmap_atomic(dst);
387
388                         set_page_dirty(page);
389                         if (obj->mm.madv == I915_MADV_WILLNEED)
390                                 mark_page_accessed(page);
391                         put_page(page);
392                         vaddr += PAGE_SIZE;
393                 }
394                 obj->mm.dirty = false;
395         }
396
397         sg_free_table(pages);
398         kfree(pages);
399
400         drm_pci_free(obj->base.dev, obj->phys_handle);
401 }
402
403 static void
404 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
405 {
406         i915_gem_object_unpin_pages(obj);
407 }
408
409 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
410         .get_pages = i915_gem_object_get_pages_phys,
411         .put_pages = i915_gem_object_put_pages_phys,
412         .release = i915_gem_object_release_phys,
413 };
414
415 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
416
417 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
418 {
419         struct i915_vma *vma;
420         LIST_HEAD(still_in_list);
421         int ret;
422
423         lockdep_assert_held(&obj->base.dev->struct_mutex);
424
425         /* Closed vma are removed from the obj->vma_list - but they may
426          * still have an active binding on the object. To remove those we
427          * must wait for all rendering to complete to the object (as unbinding
428          * must anyway), and retire the requests.
429          */
430         ret = i915_gem_object_set_to_cpu_domain(obj, false);
431         if (ret)
432                 return ret;
433
434         while ((vma = list_first_entry_or_null(&obj->vma_list,
435                                                struct i915_vma,
436                                                obj_link))) {
437                 list_move_tail(&vma->obj_link, &still_in_list);
438                 ret = i915_vma_unbind(vma);
439                 if (ret)
440                         break;
441         }
442         list_splice(&still_in_list, &obj->vma_list);
443
444         return ret;
445 }
446
447 static long
448 i915_gem_object_wait_fence(struct dma_fence *fence,
449                            unsigned int flags,
450                            long timeout,
451                            struct intel_rps_client *rps_client)
452 {
453         struct i915_request *rq;
454
455         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
456
457         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
458                 return timeout;
459
460         if (!dma_fence_is_i915(fence))
461                 return dma_fence_wait_timeout(fence,
462                                               flags & I915_WAIT_INTERRUPTIBLE,
463                                               timeout);
464
465         rq = to_request(fence);
466         if (i915_request_completed(rq))
467                 goto out;
468
469         /*
470          * This client is about to stall waiting for the GPU. In many cases
471          * this is undesirable and limits the throughput of the system, as
472          * many clients cannot continue processing user input/output whilst
473          * blocked. RPS autotuning may take tens of milliseconds to respond
474          * to the GPU load and thus incurs additional latency for the client.
475          * We can circumvent that by promoting the GPU frequency to maximum
476          * before we wait. This makes the GPU throttle up much more quickly
477          * (good for benchmarks and user experience, e.g. window animations),
478          * but at a cost of spending more power processing the workload
479          * (bad for battery). Not all clients even want their results
480          * immediately and for them we should just let the GPU select its own
481          * frequency to maximise efficiency. To prevent a single client from
482          * forcing the clocks too high for the whole system, we only allow
483          * each client to waitboost once in a busy period.
484          */
485         if (rps_client && !i915_request_started(rq)) {
486                 if (INTEL_GEN(rq->i915) >= 6)
487                         gen6_rps_boost(rq, rps_client);
488         }
489
490         timeout = i915_request_wait(rq, flags, timeout);
491
492 out:
493         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
494                 i915_request_retire_upto(rq);
495
496         return timeout;
497 }
498
499 static long
500 i915_gem_object_wait_reservation(struct reservation_object *resv,
501                                  unsigned int flags,
502                                  long timeout,
503                                  struct intel_rps_client *rps_client)
504 {
505         unsigned int seq = __read_seqcount_begin(&resv->seq);
506         struct dma_fence *excl;
507         bool prune_fences = false;
508
509         if (flags & I915_WAIT_ALL) {
510                 struct dma_fence **shared;
511                 unsigned int count, i;
512                 int ret;
513
514                 ret = reservation_object_get_fences_rcu(resv,
515                                                         &excl, &count, &shared);
516                 if (ret)
517                         return ret;
518
519                 for (i = 0; i < count; i++) {
520                         timeout = i915_gem_object_wait_fence(shared[i],
521                                                              flags, timeout,
522                                                              rps_client);
523                         if (timeout < 0)
524                                 break;
525
526                         dma_fence_put(shared[i]);
527                 }
528
529                 for (; i < count; i++)
530                         dma_fence_put(shared[i]);
531                 kfree(shared);
532
533                 /*
534                  * If both shared fences and an exclusive fence exist,
535                  * then by construction the shared fences must be later
536                  * than the exclusive fence. If we successfully wait for
537                  * all the shared fences, we know that the exclusive fence
538                  * must all be signaled. If all the shared fences are
539                  * signaled, we can prune the array and recover the
540                  * floating references on the fences/requests.
541                  */
542                 prune_fences = count && timeout >= 0;
543         } else {
544                 excl = reservation_object_get_excl_rcu(resv);
545         }
546
547         if (excl && timeout >= 0)
548                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
549                                                      rps_client);
550
551         dma_fence_put(excl);
552
553         /*
554          * Opportunistically prune the fences iff we know they have *all* been
555          * signaled and that the reservation object has not been changed (i.e.
556          * no new fences have been added).
557          */
558         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
559                 if (reservation_object_trylock(resv)) {
560                         if (!__read_seqcount_retry(&resv->seq, seq))
561                                 reservation_object_add_excl_fence(resv, NULL);
562                         reservation_object_unlock(resv);
563                 }
564         }
565
566         return timeout;
567 }
568
569 static void __fence_set_priority(struct dma_fence *fence,
570                                  const struct i915_sched_attr *attr)
571 {
572         struct i915_request *rq;
573         struct intel_engine_cs *engine;
574
575         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
576                 return;
577
578         rq = to_request(fence);
579         engine = rq->engine;
580
581         local_bh_disable();
582         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
583         if (engine->schedule)
584                 engine->schedule(rq, attr);
585         rcu_read_unlock();
586         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
587 }
588
589 static void fence_set_priority(struct dma_fence *fence,
590                                const struct i915_sched_attr *attr)
591 {
592         /* Recurse once into a fence-array */
593         if (dma_fence_is_array(fence)) {
594                 struct dma_fence_array *array = to_dma_fence_array(fence);
595                 int i;
596
597                 for (i = 0; i < array->num_fences; i++)
598                         __fence_set_priority(array->fences[i], attr);
599         } else {
600                 __fence_set_priority(fence, attr);
601         }
602 }
603
604 int
605 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
606                               unsigned int flags,
607                               const struct i915_sched_attr *attr)
608 {
609         struct dma_fence *excl;
610
611         if (flags & I915_WAIT_ALL) {
612                 struct dma_fence **shared;
613                 unsigned int count, i;
614                 int ret;
615
616                 ret = reservation_object_get_fences_rcu(obj->resv,
617                                                         &excl, &count, &shared);
618                 if (ret)
619                         return ret;
620
621                 for (i = 0; i < count; i++) {
622                         fence_set_priority(shared[i], attr);
623                         dma_fence_put(shared[i]);
624                 }
625
626                 kfree(shared);
627         } else {
628                 excl = reservation_object_get_excl_rcu(obj->resv);
629         }
630
631         if (excl) {
632                 fence_set_priority(excl, attr);
633                 dma_fence_put(excl);
634         }
635         return 0;
636 }
637
638 /**
639  * Waits for rendering to the object to be completed
640  * @obj: i915 gem object
641  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
642  * @timeout: how long to wait
643  * @rps_client: client (user process) to charge for any waitboosting
644  */
645 int
646 i915_gem_object_wait(struct drm_i915_gem_object *obj,
647                      unsigned int flags,
648                      long timeout,
649                      struct intel_rps_client *rps_client)
650 {
651         might_sleep();
652 #if IS_ENABLED(CONFIG_LOCKDEP)
653         GEM_BUG_ON(debug_locks &&
654                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
655                    !!(flags & I915_WAIT_LOCKED));
656 #endif
657         GEM_BUG_ON(timeout < 0);
658
659         timeout = i915_gem_object_wait_reservation(obj->resv,
660                                                    flags, timeout,
661                                                    rps_client);
662         return timeout < 0 ? timeout : 0;
663 }
664
665 static struct intel_rps_client *to_rps_client(struct drm_file *file)
666 {
667         struct drm_i915_file_private *fpriv = file->driver_priv;
668
669         return &fpriv->rps_client;
670 }
671
672 static int
673 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
674                      struct drm_i915_gem_pwrite *args,
675                      struct drm_file *file)
676 {
677         void *vaddr = obj->phys_handle->vaddr + args->offset;
678         char __user *user_data = u64_to_user_ptr(args->data_ptr);
679
680         /* We manually control the domain here and pretend that it
681          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
682          */
683         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
684         if (copy_from_user(vaddr, user_data, args->size))
685                 return -EFAULT;
686
687         drm_clflush_virt_range(vaddr, args->size);
688         i915_gem_chipset_flush(to_i915(obj->base.dev));
689
690         intel_fb_obj_flush(obj, ORIGIN_CPU);
691         return 0;
692 }
693
694 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
695 {
696         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
697 }
698
699 void i915_gem_object_free(struct drm_i915_gem_object *obj)
700 {
701         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
702         kmem_cache_free(dev_priv->objects, obj);
703 }
704
705 static int
706 i915_gem_create(struct drm_file *file,
707                 struct drm_i915_private *dev_priv,
708                 uint64_t size,
709                 uint32_t *handle_p)
710 {
711         struct drm_i915_gem_object *obj;
712         int ret;
713         u32 handle;
714
715         size = roundup(size, PAGE_SIZE);
716         if (size == 0)
717                 return -EINVAL;
718
719         /* Allocate the new object */
720         obj = i915_gem_object_create(dev_priv, size);
721         if (IS_ERR(obj))
722                 return PTR_ERR(obj);
723
724         ret = drm_gem_handle_create(file, &obj->base, &handle);
725         /* drop reference from allocate - handle holds it now */
726         i915_gem_object_put(obj);
727         if (ret)
728                 return ret;
729
730         *handle_p = handle;
731         return 0;
732 }
733
734 int
735 i915_gem_dumb_create(struct drm_file *file,
736                      struct drm_device *dev,
737                      struct drm_mode_create_dumb *args)
738 {
739         /* have to work out size/pitch and return them */
740         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
741         args->size = args->pitch * args->height;
742         return i915_gem_create(file, to_i915(dev),
743                                args->size, &args->handle);
744 }
745
746 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
747 {
748         return !(obj->cache_level == I915_CACHE_NONE ||
749                  obj->cache_level == I915_CACHE_WT);
750 }
751
752 /**
753  * Creates a new mm object and returns a handle to it.
754  * @dev: drm device pointer
755  * @data: ioctl data blob
756  * @file: drm file pointer
757  */
758 int
759 i915_gem_create_ioctl(struct drm_device *dev, void *data,
760                       struct drm_file *file)
761 {
762         struct drm_i915_private *dev_priv = to_i915(dev);
763         struct drm_i915_gem_create *args = data;
764
765         i915_gem_flush_free_objects(dev_priv);
766
767         return i915_gem_create(file, dev_priv,
768                                args->size, &args->handle);
769 }
770
771 static inline enum fb_op_origin
772 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
773 {
774         return (domain == I915_GEM_DOMAIN_GTT ?
775                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
776 }
777
778 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
779 {
780         /*
781          * No actual flushing is required for the GTT write domain for reads
782          * from the GTT domain. Writes to it "immediately" go to main memory
783          * as far as we know, so there's no chipset flush. It also doesn't
784          * land in the GPU render cache.
785          *
786          * However, we do have to enforce the order so that all writes through
787          * the GTT land before any writes to the device, such as updates to
788          * the GATT itself.
789          *
790          * We also have to wait a bit for the writes to land from the GTT.
791          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
792          * timing. This issue has only been observed when switching quickly
793          * between GTT writes and CPU reads from inside the kernel on recent hw,
794          * and it appears to only affect discrete GTT blocks (i.e. on LLC
795          * system agents we cannot reproduce this behaviour, until Cannonlake
796          * that was!).
797          */
798
799         wmb();
800
801         intel_runtime_pm_get(dev_priv);
802         spin_lock_irq(&dev_priv->uncore.lock);
803
804         POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
805
806         spin_unlock_irq(&dev_priv->uncore.lock);
807         intel_runtime_pm_put(dev_priv);
808 }
809
810 static void
811 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
812 {
813         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
814         struct i915_vma *vma;
815
816         if (!(obj->write_domain & flush_domains))
817                 return;
818
819         switch (obj->write_domain) {
820         case I915_GEM_DOMAIN_GTT:
821                 i915_gem_flush_ggtt_writes(dev_priv);
822
823                 intel_fb_obj_flush(obj,
824                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
825
826                 for_each_ggtt_vma(vma, obj) {
827                         if (vma->iomap)
828                                 continue;
829
830                         i915_vma_unset_ggtt_write(vma);
831                 }
832                 break;
833
834         case I915_GEM_DOMAIN_CPU:
835                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
836                 break;
837
838         case I915_GEM_DOMAIN_RENDER:
839                 if (gpu_write_needs_clflush(obj))
840                         obj->cache_dirty = true;
841                 break;
842         }
843
844         obj->write_domain = 0;
845 }
846
847 static inline int
848 __copy_to_user_swizzled(char __user *cpu_vaddr,
849                         const char *gpu_vaddr, int gpu_offset,
850                         int length)
851 {
852         int ret, cpu_offset = 0;
853
854         while (length > 0) {
855                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
856                 int this_length = min(cacheline_end - gpu_offset, length);
857                 int swizzled_gpu_offset = gpu_offset ^ 64;
858
859                 ret = __copy_to_user(cpu_vaddr + cpu_offset,
860                                      gpu_vaddr + swizzled_gpu_offset,
861                                      this_length);
862                 if (ret)
863                         return ret + length;
864
865                 cpu_offset += this_length;
866                 gpu_offset += this_length;
867                 length -= this_length;
868         }
869
870         return 0;
871 }
872
873 static inline int
874 __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
875                           const char __user *cpu_vaddr,
876                           int length)
877 {
878         int ret, cpu_offset = 0;
879
880         while (length > 0) {
881                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
882                 int this_length = min(cacheline_end - gpu_offset, length);
883                 int swizzled_gpu_offset = gpu_offset ^ 64;
884
885                 ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
886                                        cpu_vaddr + cpu_offset,
887                                        this_length);
888                 if (ret)
889                         return ret + length;
890
891                 cpu_offset += this_length;
892                 gpu_offset += this_length;
893                 length -= this_length;
894         }
895
896         return 0;
897 }
898
899 /*
900  * Pins the specified object's pages and synchronizes the object with
901  * GPU accesses. Sets needs_clflush to non-zero if the caller should
902  * flush the object from the CPU cache.
903  */
904 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
905                                     unsigned int *needs_clflush)
906 {
907         int ret;
908
909         lockdep_assert_held(&obj->base.dev->struct_mutex);
910
911         *needs_clflush = 0;
912         if (!i915_gem_object_has_struct_page(obj))
913                 return -ENODEV;
914
915         ret = i915_gem_object_wait(obj,
916                                    I915_WAIT_INTERRUPTIBLE |
917                                    I915_WAIT_LOCKED,
918                                    MAX_SCHEDULE_TIMEOUT,
919                                    NULL);
920         if (ret)
921                 return ret;
922
923         ret = i915_gem_object_pin_pages(obj);
924         if (ret)
925                 return ret;
926
927         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
928             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
929                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
930                 if (ret)
931                         goto err_unpin;
932                 else
933                         goto out;
934         }
935
936         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
937
938         /* If we're not in the cpu read domain, set ourself into the gtt
939          * read domain and manually flush cachelines (if required). This
940          * optimizes for the case when the gpu will dirty the data
941          * anyway again before the next pread happens.
942          */
943         if (!obj->cache_dirty &&
944             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
945                 *needs_clflush = CLFLUSH_BEFORE;
946
947 out:
948         /* return with the pages pinned */
949         return 0;
950
951 err_unpin:
952         i915_gem_object_unpin_pages(obj);
953         return ret;
954 }
955
956 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
957                                      unsigned int *needs_clflush)
958 {
959         int ret;
960
961         lockdep_assert_held(&obj->base.dev->struct_mutex);
962
963         *needs_clflush = 0;
964         if (!i915_gem_object_has_struct_page(obj))
965                 return -ENODEV;
966
967         ret = i915_gem_object_wait(obj,
968                                    I915_WAIT_INTERRUPTIBLE |
969                                    I915_WAIT_LOCKED |
970                                    I915_WAIT_ALL,
971                                    MAX_SCHEDULE_TIMEOUT,
972                                    NULL);
973         if (ret)
974                 return ret;
975
976         ret = i915_gem_object_pin_pages(obj);
977         if (ret)
978                 return ret;
979
980         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
981             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
982                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
983                 if (ret)
984                         goto err_unpin;
985                 else
986                         goto out;
987         }
988
989         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
990
991         /* If we're not in the cpu write domain, set ourself into the
992          * gtt write domain and manually flush cachelines (as required).
993          * This optimizes for the case when the gpu will use the data
994          * right away and we therefore have to clflush anyway.
995          */
996         if (!obj->cache_dirty) {
997                 *needs_clflush |= CLFLUSH_AFTER;
998
999                 /*
1000                  * Same trick applies to invalidate partially written
1001                  * cachelines read before writing.
1002                  */
1003                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
1004                         *needs_clflush |= CLFLUSH_BEFORE;
1005         }
1006
1007 out:
1008         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1009         obj->mm.dirty = true;
1010         /* return with the pages pinned */
1011         return 0;
1012
1013 err_unpin:
1014         i915_gem_object_unpin_pages(obj);
1015         return ret;
1016 }
1017
1018 static void
1019 shmem_clflush_swizzled_range(char *addr, unsigned long length,
1020                              bool swizzled)
1021 {
1022         if (unlikely(swizzled)) {
1023                 unsigned long start = (unsigned long) addr;
1024                 unsigned long end = (unsigned long) addr + length;
1025
1026                 /* For swizzling simply ensure that we always flush both
1027                  * channels. Lame, but simple and it works. Swizzled
1028                  * pwrite/pread is far from a hotpath - current userspace
1029                  * doesn't use it at all. */
1030                 start = round_down(start, 128);
1031                 end = round_up(end, 128);
1032
1033                 drm_clflush_virt_range((void *)start, end - start);
1034         } else {
1035                 drm_clflush_virt_range(addr, length);
1036         }
1037
1038 }
1039
1040 /* Only difference to the fast-path function is that this can handle bit17
1041  * and uses non-atomic copy and kmap functions. */
1042 static int
1043 shmem_pread_slow(struct page *page, int offset, int length,
1044                  char __user *user_data,
1045                  bool page_do_bit17_swizzling, bool needs_clflush)
1046 {
1047         char *vaddr;
1048         int ret;
1049
1050         vaddr = kmap(page);
1051         if (needs_clflush)
1052                 shmem_clflush_swizzled_range(vaddr + offset, length,
1053                                              page_do_bit17_swizzling);
1054
1055         if (page_do_bit17_swizzling)
1056                 ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
1057         else
1058                 ret = __copy_to_user(user_data, vaddr + offset, length);
1059         kunmap(page);
1060
1061         return ret ? - EFAULT : 0;
1062 }
1063
1064 static int
1065 shmem_pread(struct page *page, int offset, int length, char __user *user_data,
1066             bool page_do_bit17_swizzling, bool needs_clflush)
1067 {
1068         int ret;
1069
1070         ret = -ENODEV;
1071         if (!page_do_bit17_swizzling) {
1072                 char *vaddr = kmap_atomic(page);
1073
1074                 if (needs_clflush)
1075                         drm_clflush_virt_range(vaddr + offset, length);
1076                 ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
1077                 kunmap_atomic(vaddr);
1078         }
1079         if (ret == 0)
1080                 return 0;
1081
1082         return shmem_pread_slow(page, offset, length, user_data,
1083                                 page_do_bit17_swizzling, needs_clflush);
1084 }
1085
1086 static int
1087 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1088                      struct drm_i915_gem_pread *args)
1089 {
1090         char __user *user_data;
1091         u64 remain;
1092         unsigned int obj_do_bit17_swizzling;
1093         unsigned int needs_clflush;
1094         unsigned int idx, offset;
1095         int ret;
1096
1097         obj_do_bit17_swizzling = 0;
1098         if (i915_gem_object_needs_bit17_swizzle(obj))
1099                 obj_do_bit17_swizzling = BIT(17);
1100
1101         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1102         if (ret)
1103                 return ret;
1104
1105         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1106         mutex_unlock(&obj->base.dev->struct_mutex);
1107         if (ret)
1108                 return ret;
1109
1110         remain = args->size;
1111         user_data = u64_to_user_ptr(args->data_ptr);
1112         offset = offset_in_page(args->offset);
1113         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1114                 struct page *page = i915_gem_object_get_page(obj, idx);
1115                 int length;
1116
1117                 length = remain;
1118                 if (offset + length > PAGE_SIZE)
1119                         length = PAGE_SIZE - offset;
1120
1121                 ret = shmem_pread(page, offset, length, user_data,
1122                                   page_to_phys(page) & obj_do_bit17_swizzling,
1123                                   needs_clflush);
1124                 if (ret)
1125                         break;
1126
1127                 remain -= length;
1128                 user_data += length;
1129                 offset = 0;
1130         }
1131
1132         i915_gem_obj_finish_shmem_access(obj);
1133         return ret;
1134 }
1135
1136 static inline bool
1137 gtt_user_read(struct io_mapping *mapping,
1138               loff_t base, int offset,
1139               char __user *user_data, int length)
1140 {
1141         void __iomem *vaddr;
1142         unsigned long unwritten;
1143
1144         /* We can use the cpu mem copy function because this is X86. */
1145         vaddr = io_mapping_map_atomic_wc(mapping, base);
1146         unwritten = __copy_to_user_inatomic(user_data,
1147                                             (void __force *)vaddr + offset,
1148                                             length);
1149         io_mapping_unmap_atomic(vaddr);
1150         if (unwritten) {
1151                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1152                 unwritten = copy_to_user(user_data,
1153                                          (void __force *)vaddr + offset,
1154                                          length);
1155                 io_mapping_unmap(vaddr);
1156         }
1157         return unwritten;
1158 }
1159
1160 static int
1161 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1162                    const struct drm_i915_gem_pread *args)
1163 {
1164         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1165         struct i915_ggtt *ggtt = &i915->ggtt;
1166         struct drm_mm_node node;
1167         struct i915_vma *vma;
1168         void __user *user_data;
1169         u64 remain, offset;
1170         int ret;
1171
1172         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1173         if (ret)
1174                 return ret;
1175
1176         intel_runtime_pm_get(i915);
1177         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1178                                        PIN_MAPPABLE |
1179                                        PIN_NONFAULT |
1180                                        PIN_NONBLOCK);
1181         if (!IS_ERR(vma)) {
1182                 node.start = i915_ggtt_offset(vma);
1183                 node.allocated = false;
1184                 ret = i915_vma_put_fence(vma);
1185                 if (ret) {
1186                         i915_vma_unpin(vma);
1187                         vma = ERR_PTR(ret);
1188                 }
1189         }
1190         if (IS_ERR(vma)) {
1191                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1192                 if (ret)
1193                         goto out_unlock;
1194                 GEM_BUG_ON(!node.allocated);
1195         }
1196
1197         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1198         if (ret)
1199                 goto out_unpin;
1200
1201         mutex_unlock(&i915->drm.struct_mutex);
1202
1203         user_data = u64_to_user_ptr(args->data_ptr);
1204         remain = args->size;
1205         offset = args->offset;
1206
1207         while (remain > 0) {
1208                 /* Operation in this page
1209                  *
1210                  * page_base = page offset within aperture
1211                  * page_offset = offset within page
1212                  * page_length = bytes to copy for this page
1213                  */
1214                 u32 page_base = node.start;
1215                 unsigned page_offset = offset_in_page(offset);
1216                 unsigned page_length = PAGE_SIZE - page_offset;
1217                 page_length = remain < page_length ? remain : page_length;
1218                 if (node.allocated) {
1219                         wmb();
1220                         ggtt->base.insert_page(&ggtt->base,
1221                                                i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1222                                                node.start, I915_CACHE_NONE, 0);
1223                         wmb();
1224                 } else {
1225                         page_base += offset & PAGE_MASK;
1226                 }
1227
1228                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1229                                   user_data, page_length)) {
1230                         ret = -EFAULT;
1231                         break;
1232                 }
1233
1234                 remain -= page_length;
1235                 user_data += page_length;
1236                 offset += page_length;
1237         }
1238
1239         mutex_lock(&i915->drm.struct_mutex);
1240 out_unpin:
1241         if (node.allocated) {
1242                 wmb();
1243                 ggtt->base.clear_range(&ggtt->base,
1244                                        node.start, node.size);
1245                 remove_mappable_node(&node);
1246         } else {
1247                 i915_vma_unpin(vma);
1248         }
1249 out_unlock:
1250         intel_runtime_pm_put(i915);
1251         mutex_unlock(&i915->drm.struct_mutex);
1252
1253         return ret;
1254 }
1255
1256 /**
1257  * Reads data from the object referenced by handle.
1258  * @dev: drm device pointer
1259  * @data: ioctl data blob
1260  * @file: drm file pointer
1261  *
1262  * On error, the contents of *data are undefined.
1263  */
1264 int
1265 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1266                      struct drm_file *file)
1267 {
1268         struct drm_i915_gem_pread *args = data;
1269         struct drm_i915_gem_object *obj;
1270         int ret;
1271
1272         if (args->size == 0)
1273                 return 0;
1274
1275         if (!access_ok(VERIFY_WRITE,
1276                        u64_to_user_ptr(args->data_ptr),
1277                        args->size))
1278                 return -EFAULT;
1279
1280         obj = i915_gem_object_lookup(file, args->handle);
1281         if (!obj)
1282                 return -ENOENT;
1283
1284         /* Bounds check source.  */
1285         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1286                 ret = -EINVAL;
1287                 goto out;
1288         }
1289
1290         trace_i915_gem_object_pread(obj, args->offset, args->size);
1291
1292         ret = i915_gem_object_wait(obj,
1293                                    I915_WAIT_INTERRUPTIBLE,
1294                                    MAX_SCHEDULE_TIMEOUT,
1295                                    to_rps_client(file));
1296         if (ret)
1297                 goto out;
1298
1299         ret = i915_gem_object_pin_pages(obj);
1300         if (ret)
1301                 goto out;
1302
1303         ret = i915_gem_shmem_pread(obj, args);
1304         if (ret == -EFAULT || ret == -ENODEV)
1305                 ret = i915_gem_gtt_pread(obj, args);
1306
1307         i915_gem_object_unpin_pages(obj);
1308 out:
1309         i915_gem_object_put(obj);
1310         return ret;
1311 }
1312
1313 /* This is the fast write path which cannot handle
1314  * page faults in the source data
1315  */
1316
1317 static inline bool
1318 ggtt_write(struct io_mapping *mapping,
1319            loff_t base, int offset,
1320            char __user *user_data, int length)
1321 {
1322         void __iomem *vaddr;
1323         unsigned long unwritten;
1324
1325         /* We can use the cpu mem copy function because this is X86. */
1326         vaddr = io_mapping_map_atomic_wc(mapping, base);
1327         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1328                                                       user_data, length);
1329         io_mapping_unmap_atomic(vaddr);
1330         if (unwritten) {
1331                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1332                 unwritten = copy_from_user((void __force *)vaddr + offset,
1333                                            user_data, length);
1334                 io_mapping_unmap(vaddr);
1335         }
1336
1337         return unwritten;
1338 }
1339
1340 /**
1341  * This is the fast pwrite path, where we copy the data directly from the
1342  * user into the GTT, uncached.
1343  * @obj: i915 GEM object
1344  * @args: pwrite arguments structure
1345  */
1346 static int
1347 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1348                          const struct drm_i915_gem_pwrite *args)
1349 {
1350         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1351         struct i915_ggtt *ggtt = &i915->ggtt;
1352         struct drm_mm_node node;
1353         struct i915_vma *vma;
1354         u64 remain, offset;
1355         void __user *user_data;
1356         int ret;
1357
1358         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1359         if (ret)
1360                 return ret;
1361
1362         if (i915_gem_object_has_struct_page(obj)) {
1363                 /*
1364                  * Avoid waking the device up if we can fallback, as
1365                  * waking/resuming is very slow (worst-case 10-100 ms
1366                  * depending on PCI sleeps and our own resume time).
1367                  * This easily dwarfs any performance advantage from
1368                  * using the cache bypass of indirect GGTT access.
1369                  */
1370                 if (!intel_runtime_pm_get_if_in_use(i915)) {
1371                         ret = -EFAULT;
1372                         goto out_unlock;
1373                 }
1374         } else {
1375                 /* No backing pages, no fallback, we must force GGTT access */
1376                 intel_runtime_pm_get(i915);
1377         }
1378
1379         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1380                                        PIN_MAPPABLE |
1381                                        PIN_NONFAULT |
1382                                        PIN_NONBLOCK);
1383         if (!IS_ERR(vma)) {
1384                 node.start = i915_ggtt_offset(vma);
1385                 node.allocated = false;
1386                 ret = i915_vma_put_fence(vma);
1387                 if (ret) {
1388                         i915_vma_unpin(vma);
1389                         vma = ERR_PTR(ret);
1390                 }
1391         }
1392         if (IS_ERR(vma)) {
1393                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1394                 if (ret)
1395                         goto out_rpm;
1396                 GEM_BUG_ON(!node.allocated);
1397         }
1398
1399         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1400         if (ret)
1401                 goto out_unpin;
1402
1403         mutex_unlock(&i915->drm.struct_mutex);
1404
1405         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1406
1407         user_data = u64_to_user_ptr(args->data_ptr);
1408         offset = args->offset;
1409         remain = args->size;
1410         while (remain) {
1411                 /* Operation in this page
1412                  *
1413                  * page_base = page offset within aperture
1414                  * page_offset = offset within page
1415                  * page_length = bytes to copy for this page
1416                  */
1417                 u32 page_base = node.start;
1418                 unsigned int page_offset = offset_in_page(offset);
1419                 unsigned int page_length = PAGE_SIZE - page_offset;
1420                 page_length = remain < page_length ? remain : page_length;
1421                 if (node.allocated) {
1422                         wmb(); /* flush the write before we modify the GGTT */
1423                         ggtt->base.insert_page(&ggtt->base,
1424                                                i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1425                                                node.start, I915_CACHE_NONE, 0);
1426                         wmb(); /* flush modifications to the GGTT (insert_page) */
1427                 } else {
1428                         page_base += offset & PAGE_MASK;
1429                 }
1430                 /* If we get a fault while copying data, then (presumably) our
1431                  * source page isn't available.  Return the error and we'll
1432                  * retry in the slow path.
1433                  * If the object is non-shmem backed, we retry again with the
1434                  * path that handles page fault.
1435                  */
1436                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1437                                user_data, page_length)) {
1438                         ret = -EFAULT;
1439                         break;
1440                 }
1441
1442                 remain -= page_length;
1443                 user_data += page_length;
1444                 offset += page_length;
1445         }
1446         intel_fb_obj_flush(obj, ORIGIN_CPU);
1447
1448         mutex_lock(&i915->drm.struct_mutex);
1449 out_unpin:
1450         if (node.allocated) {
1451                 wmb();
1452                 ggtt->base.clear_range(&ggtt->base,
1453                                        node.start, node.size);
1454                 remove_mappable_node(&node);
1455         } else {
1456                 i915_vma_unpin(vma);
1457         }
1458 out_rpm:
1459         intel_runtime_pm_put(i915);
1460 out_unlock:
1461         mutex_unlock(&i915->drm.struct_mutex);
1462         return ret;
1463 }
1464
1465 static int
1466 shmem_pwrite_slow(struct page *page, int offset, int length,
1467                   char __user *user_data,
1468                   bool page_do_bit17_swizzling,
1469                   bool needs_clflush_before,
1470                   bool needs_clflush_after)
1471 {
1472         char *vaddr;
1473         int ret;
1474
1475         vaddr = kmap(page);
1476         if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
1477                 shmem_clflush_swizzled_range(vaddr + offset, length,
1478                                              page_do_bit17_swizzling);
1479         if (page_do_bit17_swizzling)
1480                 ret = __copy_from_user_swizzled(vaddr, offset, user_data,
1481                                                 length);
1482         else
1483                 ret = __copy_from_user(vaddr + offset, user_data, length);
1484         if (needs_clflush_after)
1485                 shmem_clflush_swizzled_range(vaddr + offset, length,
1486                                              page_do_bit17_swizzling);
1487         kunmap(page);
1488
1489         return ret ? -EFAULT : 0;
1490 }
1491
1492 /* Per-page copy function for the shmem pwrite fastpath.
1493  * Flushes invalid cachelines before writing to the target if
1494  * needs_clflush_before is set and flushes out any written cachelines after
1495  * writing if needs_clflush is set.
1496  */
1497 static int
1498 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1499              bool page_do_bit17_swizzling,
1500              bool needs_clflush_before,
1501              bool needs_clflush_after)
1502 {
1503         int ret;
1504
1505         ret = -ENODEV;
1506         if (!page_do_bit17_swizzling) {
1507                 char *vaddr = kmap_atomic(page);
1508
1509                 if (needs_clflush_before)
1510                         drm_clflush_virt_range(vaddr + offset, len);
1511                 ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
1512                 if (needs_clflush_after)
1513                         drm_clflush_virt_range(vaddr + offset, len);
1514
1515                 kunmap_atomic(vaddr);
1516         }
1517         if (ret == 0)
1518                 return ret;
1519
1520         return shmem_pwrite_slow(page, offset, len, user_data,
1521                                  page_do_bit17_swizzling,
1522                                  needs_clflush_before,
1523                                  needs_clflush_after);
1524 }
1525
1526 static int
1527 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1528                       const struct drm_i915_gem_pwrite *args)
1529 {
1530         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1531         void __user *user_data;
1532         u64 remain;
1533         unsigned int obj_do_bit17_swizzling;
1534         unsigned int partial_cacheline_write;
1535         unsigned int needs_clflush;
1536         unsigned int offset, idx;
1537         int ret;
1538
1539         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1540         if (ret)
1541                 return ret;
1542
1543         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1544         mutex_unlock(&i915->drm.struct_mutex);
1545         if (ret)
1546                 return ret;
1547
1548         obj_do_bit17_swizzling = 0;
1549         if (i915_gem_object_needs_bit17_swizzle(obj))
1550                 obj_do_bit17_swizzling = BIT(17);
1551
1552         /* If we don't overwrite a cacheline completely we need to be
1553          * careful to have up-to-date data by first clflushing. Don't
1554          * overcomplicate things and flush the entire patch.
1555          */
1556         partial_cacheline_write = 0;
1557         if (needs_clflush & CLFLUSH_BEFORE)
1558                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1559
1560         user_data = u64_to_user_ptr(args->data_ptr);
1561         remain = args->size;
1562         offset = offset_in_page(args->offset);
1563         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1564                 struct page *page = i915_gem_object_get_page(obj, idx);
1565                 int length;
1566
1567                 length = remain;
1568                 if (offset + length > PAGE_SIZE)
1569                         length = PAGE_SIZE - offset;
1570
1571                 ret = shmem_pwrite(page, offset, length, user_data,
1572                                    page_to_phys(page) & obj_do_bit17_swizzling,
1573                                    (offset | length) & partial_cacheline_write,
1574                                    needs_clflush & CLFLUSH_AFTER);
1575                 if (ret)
1576                         break;
1577
1578                 remain -= length;
1579                 user_data += length;
1580                 offset = 0;
1581         }
1582
1583         intel_fb_obj_flush(obj, ORIGIN_CPU);
1584         i915_gem_obj_finish_shmem_access(obj);
1585         return ret;
1586 }
1587
1588 /**
1589  * Writes data to the object referenced by handle.
1590  * @dev: drm device
1591  * @data: ioctl data blob
1592  * @file: drm file
1593  *
1594  * On error, the contents of the buffer that were to be modified are undefined.
1595  */
1596 int
1597 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1598                       struct drm_file *file)
1599 {
1600         struct drm_i915_gem_pwrite *args = data;
1601         struct drm_i915_gem_object *obj;
1602         int ret;
1603
1604         if (args->size == 0)
1605                 return 0;
1606
1607         if (!access_ok(VERIFY_READ,
1608                        u64_to_user_ptr(args->data_ptr),
1609                        args->size))
1610                 return -EFAULT;
1611
1612         obj = i915_gem_object_lookup(file, args->handle);
1613         if (!obj)
1614                 return -ENOENT;
1615
1616         /* Bounds check destination. */
1617         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1618                 ret = -EINVAL;
1619                 goto err;
1620         }
1621
1622         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1623
1624         ret = -ENODEV;
1625         if (obj->ops->pwrite)
1626                 ret = obj->ops->pwrite(obj, args);
1627         if (ret != -ENODEV)
1628                 goto err;
1629
1630         ret = i915_gem_object_wait(obj,
1631                                    I915_WAIT_INTERRUPTIBLE |
1632                                    I915_WAIT_ALL,
1633                                    MAX_SCHEDULE_TIMEOUT,
1634                                    to_rps_client(file));
1635         if (ret)
1636                 goto err;
1637
1638         ret = i915_gem_object_pin_pages(obj);
1639         if (ret)
1640                 goto err;
1641
1642         ret = -EFAULT;
1643         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1644          * it would end up going through the fenced access, and we'll get
1645          * different detiling behavior between reading and writing.
1646          * pread/pwrite currently are reading and writing from the CPU
1647          * perspective, requiring manual detiling by the client.
1648          */
1649         if (!i915_gem_object_has_struct_page(obj) ||
1650             cpu_write_needs_clflush(obj))
1651                 /* Note that the gtt paths might fail with non-page-backed user
1652                  * pointers (e.g. gtt mappings when moving data between
1653                  * textures). Fallback to the shmem path in that case.
1654                  */
1655                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1656
1657         if (ret == -EFAULT || ret == -ENOSPC) {
1658                 if (obj->phys_handle)
1659                         ret = i915_gem_phys_pwrite(obj, args, file);
1660                 else
1661                         ret = i915_gem_shmem_pwrite(obj, args);
1662         }
1663
1664         i915_gem_object_unpin_pages(obj);
1665 err:
1666         i915_gem_object_put(obj);
1667         return ret;
1668 }
1669
1670 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1671 {
1672         struct drm_i915_private *i915;
1673         struct list_head *list;
1674         struct i915_vma *vma;
1675
1676         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1677
1678         for_each_ggtt_vma(vma, obj) {
1679                 if (i915_vma_is_active(vma))
1680                         continue;
1681
1682                 if (!drm_mm_node_allocated(&vma->node))
1683                         continue;
1684
1685                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1686         }
1687
1688         i915 = to_i915(obj->base.dev);
1689         spin_lock(&i915->mm.obj_lock);
1690         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1691         list_move_tail(&obj->mm.link, list);
1692         spin_unlock(&i915->mm.obj_lock);
1693 }
1694
1695 /**
1696  * Called when user space prepares to use an object with the CPU, either
1697  * through the mmap ioctl's mapping or a GTT mapping.
1698  * @dev: drm device
1699  * @data: ioctl data blob
1700  * @file: drm file
1701  */
1702 int
1703 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1704                           struct drm_file *file)
1705 {
1706         struct drm_i915_gem_set_domain *args = data;
1707         struct drm_i915_gem_object *obj;
1708         uint32_t read_domains = args->read_domains;
1709         uint32_t write_domain = args->write_domain;
1710         int err;
1711
1712         /* Only handle setting domains to types used by the CPU. */
1713         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1714                 return -EINVAL;
1715
1716         /* Having something in the write domain implies it's in the read
1717          * domain, and only that read domain.  Enforce that in the request.
1718          */
1719         if (write_domain != 0 && read_domains != write_domain)
1720                 return -EINVAL;
1721
1722         obj = i915_gem_object_lookup(file, args->handle);
1723         if (!obj)
1724                 return -ENOENT;
1725
1726         /* Try to flush the object off the GPU without holding the lock.
1727          * We will repeat the flush holding the lock in the normal manner
1728          * to catch cases where we are gazumped.
1729          */
1730         err = i915_gem_object_wait(obj,
1731                                    I915_WAIT_INTERRUPTIBLE |
1732                                    (write_domain ? I915_WAIT_ALL : 0),
1733                                    MAX_SCHEDULE_TIMEOUT,
1734                                    to_rps_client(file));
1735         if (err)
1736                 goto out;
1737
1738         /*
1739          * Proxy objects do not control access to the backing storage, ergo
1740          * they cannot be used as a means to manipulate the cache domain
1741          * tracking for that backing storage. The proxy object is always
1742          * considered to be outside of any cache domain.
1743          */
1744         if (i915_gem_object_is_proxy(obj)) {
1745                 err = -ENXIO;
1746                 goto out;
1747         }
1748
1749         /*
1750          * Flush and acquire obj->pages so that we are coherent through
1751          * direct access in memory with previous cached writes through
1752          * shmemfs and that our cache domain tracking remains valid.
1753          * For example, if the obj->filp was moved to swap without us
1754          * being notified and releasing the pages, we would mistakenly
1755          * continue to assume that the obj remained out of the CPU cached
1756          * domain.
1757          */
1758         err = i915_gem_object_pin_pages(obj);
1759         if (err)
1760                 goto out;
1761
1762         err = i915_mutex_lock_interruptible(dev);
1763         if (err)
1764                 goto out_unpin;
1765
1766         if (read_domains & I915_GEM_DOMAIN_WC)
1767                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1768         else if (read_domains & I915_GEM_DOMAIN_GTT)
1769                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1770         else
1771                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1772
1773         /* And bump the LRU for this access */
1774         i915_gem_object_bump_inactive_ggtt(obj);
1775
1776         mutex_unlock(&dev->struct_mutex);
1777
1778         if (write_domain != 0)
1779                 intel_fb_obj_invalidate(obj,
1780                                         fb_write_origin(obj, write_domain));
1781
1782 out_unpin:
1783         i915_gem_object_unpin_pages(obj);
1784 out:
1785         i915_gem_object_put(obj);
1786         return err;
1787 }
1788
1789 /**
1790  * Called when user space has done writes to this buffer
1791  * @dev: drm device
1792  * @data: ioctl data blob
1793  * @file: drm file
1794  */
1795 int
1796 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1797                          struct drm_file *file)
1798 {
1799         struct drm_i915_gem_sw_finish *args = data;
1800         struct drm_i915_gem_object *obj;
1801
1802         obj = i915_gem_object_lookup(file, args->handle);
1803         if (!obj)
1804                 return -ENOENT;
1805
1806         /*
1807          * Proxy objects are barred from CPU access, so there is no
1808          * need to ban sw_finish as it is a nop.
1809          */
1810
1811         /* Pinned buffers may be scanout, so flush the cache */
1812         i915_gem_object_flush_if_display(obj);
1813         i915_gem_object_put(obj);
1814
1815         return 0;
1816 }
1817
1818 /**
1819  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1820  *                       it is mapped to.
1821  * @dev: drm device
1822  * @data: ioctl data blob
1823  * @file: drm file
1824  *
1825  * While the mapping holds a reference on the contents of the object, it doesn't
1826  * imply a ref on the object itself.
1827  *
1828  * IMPORTANT:
1829  *
1830  * DRM driver writers who look a this function as an example for how to do GEM
1831  * mmap support, please don't implement mmap support like here. The modern way
1832  * to implement DRM mmap support is with an mmap offset ioctl (like
1833  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1834  * That way debug tooling like valgrind will understand what's going on, hiding
1835  * the mmap call in a driver private ioctl will break that. The i915 driver only
1836  * does cpu mmaps this way because we didn't know better.
1837  */
1838 int
1839 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1840                     struct drm_file *file)
1841 {
1842         struct drm_i915_gem_mmap *args = data;
1843         struct drm_i915_gem_object *obj;
1844         unsigned long addr;
1845
1846         if (args->flags & ~(I915_MMAP_WC))
1847                 return -EINVAL;
1848
1849         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1850                 return -ENODEV;
1851
1852         obj = i915_gem_object_lookup(file, args->handle);
1853         if (!obj)
1854                 return -ENOENT;
1855
1856         /* prime objects have no backing filp to GEM mmap
1857          * pages from.
1858          */
1859         if (!obj->base.filp) {
1860                 i915_gem_object_put(obj);
1861                 return -ENXIO;
1862         }
1863
1864         addr = vm_mmap(obj->base.filp, 0, args->size,
1865                        PROT_READ | PROT_WRITE, MAP_SHARED,
1866                        args->offset);
1867         if (args->flags & I915_MMAP_WC) {
1868                 struct mm_struct *mm = current->mm;
1869                 struct vm_area_struct *vma;
1870
1871                 if (down_write_killable(&mm->mmap_sem)) {
1872                         i915_gem_object_put(obj);
1873                         return -EINTR;
1874                 }
1875                 vma = find_vma(mm, addr);
1876                 if (vma)
1877                         vma->vm_page_prot =
1878                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1879                 else
1880                         addr = -ENOMEM;
1881                 up_write(&mm->mmap_sem);
1882
1883                 /* This may race, but that's ok, it only gets set */
1884                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1885         }
1886         i915_gem_object_put(obj);
1887         if (IS_ERR((void *)addr))
1888                 return addr;
1889
1890         args->addr_ptr = (uint64_t) addr;
1891
1892         return 0;
1893 }
1894
1895 static unsigned int tile_row_pages(struct drm_i915_gem_object *obj)
1896 {
1897         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1898 }
1899
1900 /**
1901  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1902  *
1903  * A history of the GTT mmap interface:
1904  *
1905  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1906  *     aligned and suitable for fencing, and still fit into the available
1907  *     mappable space left by the pinned display objects. A classic problem
1908  *     we called the page-fault-of-doom where we would ping-pong between
1909  *     two objects that could not fit inside the GTT and so the memcpy
1910  *     would page one object in at the expense of the other between every
1911  *     single byte.
1912  *
1913  * 1 - Objects can be any size, and have any compatible fencing (X Y, or none
1914  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1915  *     object is too large for the available space (or simply too large
1916  *     for the mappable aperture!), a view is created instead and faulted
1917  *     into userspace. (This view is aligned and sized appropriately for
1918  *     fenced access.)
1919  *
1920  * 2 - Recognise WC as a separate cache domain so that we can flush the
1921  *     delayed writes via GTT before performing direct access via WC.
1922  *
1923  * Restrictions:
1924  *
1925  *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1926  *    hangs on some architectures, corruption on others. An attempt to service
1927  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1928  *
1929  *  * the object must be able to fit into RAM (physical memory, though no
1930  *    limited to the mappable aperture).
1931  *
1932  *
1933  * Caveats:
1934  *
1935  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1936  *    all data to system memory. Subsequent access will not be synchronized.
1937  *
1938  *  * all mappings are revoked on runtime device suspend.
1939  *
1940  *  * there are only 8, 16 or 32 fence registers to share between all users
1941  *    (older machines require fence register for display and blitter access
1942  *    as well). Contention of the fence registers will cause the previous users
1943  *    to be unmapped and any new access will generate new page faults.
1944  *
1945  *  * running out of memory while servicing a fault may generate a SIGBUS,
1946  *    rather than the expected SIGSEGV.
1947  */
1948 int i915_gem_mmap_gtt_version(void)
1949 {
1950         return 2;
1951 }
1952
1953 static inline struct i915_ggtt_view
1954 compute_partial_view(struct drm_i915_gem_object *obj,
1955                      pgoff_t page_offset,
1956                      unsigned int chunk)
1957 {
1958         struct i915_ggtt_view view;
1959
1960         if (i915_gem_object_is_tiled(obj))
1961                 chunk = roundup(chunk, tile_row_pages(obj));
1962
1963         view.type = I915_GGTT_VIEW_PARTIAL;
1964         view.partial.offset = rounddown(page_offset, chunk);
1965         view.partial.size =
1966                 min_t(unsigned int, chunk,
1967                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1968
1969         /* If the partial covers the entire object, just create a normal VMA. */
1970         if (chunk >= obj->base.size >> PAGE_SHIFT)
1971                 view.type = I915_GGTT_VIEW_NORMAL;
1972
1973         return view;
1974 }
1975
1976 /**
1977  * i915_gem_fault - fault a page into the GTT
1978  * @vmf: fault info
1979  *
1980  * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
1981  * from userspace.  The fault handler takes care of binding the object to
1982  * the GTT (if needed), allocating and programming a fence register (again,
1983  * only if needed based on whether the old reg is still valid or the object
1984  * is tiled) and inserting a new PTE into the faulting process.
1985  *
1986  * Note that the faulting process may involve evicting existing objects
1987  * from the GTT and/or fence registers to make room.  So performance may
1988  * suffer if the GTT working set is large or there are few fence registers
1989  * left.
1990  *
1991  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1992  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1993  */
1994 int i915_gem_fault(struct vm_fault *vmf)
1995 {
1996 #define MIN_CHUNK_PAGES ((1 << 20) >> PAGE_SHIFT) /* 1 MiB */
1997         struct vm_area_struct *area = vmf->vma;
1998         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1999         struct drm_device *dev = obj->base.dev;
2000         struct drm_i915_private *dev_priv = to_i915(dev);
2001         struct i915_ggtt *ggtt = &dev_priv->ggtt;
2002         bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
2003         struct i915_vma *vma;
2004         pgoff_t page_offset;
2005         unsigned int flags;
2006         int ret;
2007
2008         /* We don't use vmf->pgoff since that has the fake offset */
2009         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
2010
2011         trace_i915_gem_object_fault(obj, page_offset, true, write);
2012
2013         /* Try to flush the object off the GPU first without holding the lock.
2014          * Upon acquiring the lock, we will perform our sanity checks and then
2015          * repeat the flush holding the lock in the normal manner to catch cases
2016          * where we are gazumped.
2017          */
2018         ret = i915_gem_object_wait(obj,
2019                                    I915_WAIT_INTERRUPTIBLE,
2020                                    MAX_SCHEDULE_TIMEOUT,
2021                                    NULL);
2022         if (ret)
2023                 goto err;
2024
2025         ret = i915_gem_object_pin_pages(obj);
2026         if (ret)
2027                 goto err;
2028
2029         intel_runtime_pm_get(dev_priv);
2030
2031         ret = i915_mutex_lock_interruptible(dev);
2032         if (ret)
2033                 goto err_rpm;
2034
2035         /* Access to snoopable pages through the GTT is incoherent. */
2036         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
2037                 ret = -EFAULT;
2038                 goto err_unlock;
2039         }
2040
2041         /* If the object is smaller than a couple of partial vma, it is
2042          * not worth only creating a single partial vma - we may as well
2043          * clear enough space for the full object.
2044          */
2045         flags = PIN_MAPPABLE;
2046         if (obj->base.size > 2 * MIN_CHUNK_PAGES << PAGE_SHIFT)
2047                 flags |= PIN_NONBLOCK | PIN_NONFAULT;
2048
2049         /* Now pin it into the GTT as needed */
2050         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, flags);
2051         if (IS_ERR(vma)) {
2052                 /* Use a partial view if it is bigger than available space */
2053                 struct i915_ggtt_view view =
2054                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
2055
2056                 /* Userspace is now writing through an untracked VMA, abandon
2057                  * all hope that the hardware is able to track future writes.
2058                  */
2059                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
2060
2061                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, PIN_MAPPABLE);
2062         }
2063         if (IS_ERR(vma)) {
2064                 ret = PTR_ERR(vma);
2065                 goto err_unlock;
2066         }
2067
2068         ret = i915_gem_object_set_to_gtt_domain(obj, write);
2069         if (ret)
2070                 goto err_unpin;
2071
2072         ret = i915_vma_pin_fence(vma);
2073         if (ret)
2074                 goto err_unpin;
2075
2076         /* Finally, remap it using the new GTT offset */
2077         ret = remap_io_mapping(area,
2078                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
2079                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
2080                                min_t(u64, vma->size, area->vm_end - area->vm_start),
2081                                &ggtt->iomap);
2082         if (ret)
2083                 goto err_fence;
2084
2085         /* Mark as being mmapped into userspace for later revocation */
2086         assert_rpm_wakelock_held(dev_priv);
2087         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
2088                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
2089         GEM_BUG_ON(!obj->userfault_count);
2090
2091         i915_vma_set_ggtt_write(vma);
2092
2093 err_fence:
2094         i915_vma_unpin_fence(vma);
2095 err_unpin:
2096         __i915_vma_unpin(vma);
2097 err_unlock:
2098         mutex_unlock(&dev->struct_mutex);
2099 err_rpm:
2100         intel_runtime_pm_put(dev_priv);
2101         i915_gem_object_unpin_pages(obj);
2102 err:
2103         switch (ret) {
2104         case -EIO:
2105                 /*
2106                  * We eat errors when the gpu is terminally wedged to avoid
2107                  * userspace unduly crashing (gl has no provisions for mmaps to
2108                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
2109                  * and so needs to be reported.
2110                  */
2111                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
2112                         ret = VM_FAULT_SIGBUS;
2113                         break;
2114                 }
2115         case -EAGAIN:
2116                 /*
2117                  * EAGAIN means the gpu is hung and we'll wait for the error
2118                  * handler to reset everything when re-faulting in
2119                  * i915_mutex_lock_interruptible.
2120                  */
2121         case 0:
2122         case -ERESTARTSYS:
2123         case -EINTR:
2124         case -EBUSY:
2125                 /*
2126                  * EBUSY is ok: this just means that another thread
2127                  * already did the job.
2128                  */
2129                 ret = VM_FAULT_NOPAGE;
2130                 break;
2131         case -ENOMEM:
2132                 ret = VM_FAULT_OOM;
2133                 break;
2134         case -ENOSPC:
2135         case -EFAULT:
2136                 ret = VM_FAULT_SIGBUS;
2137                 break;
2138         default:
2139                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2140                 ret = VM_FAULT_SIGBUS;
2141                 break;
2142         }
2143         return ret;
2144 }
2145
2146 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2147 {
2148         struct i915_vma *vma;
2149
2150         GEM_BUG_ON(!obj->userfault_count);
2151
2152         obj->userfault_count = 0;
2153         list_del(&obj->userfault_link);
2154         drm_vma_node_unmap(&obj->base.vma_node,
2155                            obj->base.dev->anon_inode->i_mapping);
2156
2157         for_each_ggtt_vma(vma, obj)
2158                 i915_vma_unset_userfault(vma);
2159 }
2160
2161 /**
2162  * i915_gem_release_mmap - remove physical page mappings
2163  * @obj: obj in question
2164  *
2165  * Preserve the reservation of the mmapping with the DRM core code, but
2166  * relinquish ownership of the pages back to the system.
2167  *
2168  * It is vital that we remove the page mapping if we have mapped a tiled
2169  * object through the GTT and then lose the fence register due to
2170  * resource pressure. Similarly if the object has been moved out of the
2171  * aperture, than pages mapped into userspace must be revoked. Removing the
2172  * mapping will then trigger a page fault on the next user access, allowing
2173  * fixup by i915_gem_fault().
2174  */
2175 void
2176 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2177 {
2178         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2179
2180         /* Serialisation between user GTT access and our code depends upon
2181          * revoking the CPU's PTE whilst the mutex is held. The next user
2182          * pagefault then has to wait until we release the mutex.
2183          *
2184          * Note that RPM complicates somewhat by adding an additional
2185          * requirement that operations to the GGTT be made holding the RPM
2186          * wakeref.
2187          */
2188         lockdep_assert_held(&i915->drm.struct_mutex);
2189         intel_runtime_pm_get(i915);
2190
2191         if (!obj->userfault_count)
2192                 goto out;
2193
2194         __i915_gem_object_release_mmap(obj);
2195
2196         /* Ensure that the CPU's PTE are revoked and there are not outstanding
2197          * memory transactions from userspace before we return. The TLB
2198          * flushing implied above by changing the PTE above *should* be
2199          * sufficient, an extra barrier here just provides us with a bit
2200          * of paranoid documentation about our requirement to serialise
2201          * memory writes before touching registers / GSM.
2202          */
2203         wmb();
2204
2205 out:
2206         intel_runtime_pm_put(i915);
2207 }
2208
2209 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2210 {
2211         struct drm_i915_gem_object *obj, *on;
2212         int i;
2213
2214         /*
2215          * Only called during RPM suspend. All users of the userfault_list
2216          * must be holding an RPM wakeref to ensure that this can not
2217          * run concurrently with themselves (and use the struct_mutex for
2218          * protection between themselves).
2219          */
2220
2221         list_for_each_entry_safe(obj, on,
2222                                  &dev_priv->mm.userfault_list, userfault_link)
2223                 __i915_gem_object_release_mmap(obj);
2224
2225         /* The fence will be lost when the device powers down. If any were
2226          * in use by hardware (i.e. they are pinned), we should not be powering
2227          * down! All other fences will be reacquired by the user upon waking.
2228          */
2229         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2230                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2231
2232                 /* Ideally we want to assert that the fence register is not
2233                  * live at this point (i.e. that no piece of code will be
2234                  * trying to write through fence + GTT, as that both violates
2235                  * our tracking of activity and associated locking/barriers,
2236                  * but also is illegal given that the hw is powered down).
2237                  *
2238                  * Previously we used reg->pin_count as a "liveness" indicator.
2239                  * That is not sufficient, and we need a more fine-grained
2240                  * tool if we want to have a sanity check here.
2241                  */
2242
2243                 if (!reg->vma)
2244                         continue;
2245
2246                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2247                 reg->dirty = true;
2248         }
2249 }
2250
2251 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2252 {
2253         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2254         int err;
2255
2256         err = drm_gem_create_mmap_offset(&obj->base);
2257         if (likely(!err))
2258                 return 0;
2259
2260         /* Attempt to reap some mmap space from dead objects */
2261         do {
2262                 err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
2263                 if (err)
2264                         break;
2265
2266                 i915_gem_drain_freed_objects(dev_priv);
2267                 err = drm_gem_create_mmap_offset(&obj->base);
2268                 if (!err)
2269                         break;
2270
2271         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2272
2273         return err;
2274 }
2275
2276 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2277 {
2278         drm_gem_free_mmap_offset(&obj->base);
2279 }
2280
2281 int
2282 i915_gem_mmap_gtt(struct drm_file *file,
2283                   struct drm_device *dev,
2284                   uint32_t handle,
2285                   uint64_t *offset)
2286 {
2287         struct drm_i915_gem_object *obj;
2288         int ret;
2289
2290         obj = i915_gem_object_lookup(file, handle);
2291         if (!obj)
2292                 return -ENOENT;
2293
2294         ret = i915_gem_object_create_mmap_offset(obj);
2295         if (ret == 0)
2296                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2297
2298         i915_gem_object_put(obj);
2299         return ret;
2300 }
2301
2302 /**
2303  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2304  * @dev: DRM device
2305  * @data: GTT mapping ioctl data
2306  * @file: GEM object info
2307  *
2308  * Simply returns the fake offset to userspace so it can mmap it.
2309  * The mmap call will end up in drm_gem_mmap(), which will set things
2310  * up so we can get faults in the handler above.
2311  *
2312  * The fault handler will take care of binding the object into the GTT
2313  * (since it may have been evicted to make room for something), allocating
2314  * a fence register, and mapping the appropriate aperture address into
2315  * userspace.
2316  */
2317 int
2318 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2319                         struct drm_file *file)
2320 {
2321         struct drm_i915_gem_mmap_gtt *args = data;
2322
2323         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2324 }
2325
2326 /* Immediately discard the backing storage */
2327 static void
2328 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2329 {
2330         i915_gem_object_free_mmap_offset(obj);
2331
2332         if (obj->base.filp == NULL)
2333                 return;
2334
2335         /* Our goal here is to return as much of the memory as
2336          * is possible back to the system as we are called from OOM.
2337          * To do this we must instruct the shmfs to drop all of its
2338          * backing pages, *now*.
2339          */
2340         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2341         obj->mm.madv = __I915_MADV_PURGED;
2342         obj->mm.pages = ERR_PTR(-EFAULT);
2343 }
2344
2345 /* Try to discard unwanted pages */
2346 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2347 {
2348         struct address_space *mapping;
2349
2350         lockdep_assert_held(&obj->mm.lock);
2351         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2352
2353         switch (obj->mm.madv) {
2354         case I915_MADV_DONTNEED:
2355                 i915_gem_object_truncate(obj);
2356         case __I915_MADV_PURGED:
2357                 return;
2358         }
2359
2360         if (obj->base.filp == NULL)
2361                 return;
2362
2363         mapping = obj->base.filp->f_mapping,
2364         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2365 }
2366
2367 static void
2368 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2369                               struct sg_table *pages)
2370 {
2371         struct sgt_iter sgt_iter;
2372         struct page *page;
2373
2374         __i915_gem_object_release_shmem(obj, pages, true);
2375
2376         i915_gem_gtt_finish_pages(obj, pages);
2377
2378         if (i915_gem_object_needs_bit17_swizzle(obj))
2379                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2380
2381         for_each_sgt_page(page, sgt_iter, pages) {
2382                 if (obj->mm.dirty)
2383                         set_page_dirty(page);
2384
2385                 if (obj->mm.madv == I915_MADV_WILLNEED)
2386                         mark_page_accessed(page);
2387
2388                 put_page(page);
2389         }
2390         obj->mm.dirty = false;
2391
2392         sg_free_table(pages);
2393         kfree(pages);
2394 }
2395
2396 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2397 {
2398         struct radix_tree_iter iter;
2399         void __rcu **slot;
2400
2401         rcu_read_lock();
2402         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2403                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2404         rcu_read_unlock();
2405 }
2406
2407 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2408                                  enum i915_mm_subclass subclass)
2409 {
2410         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2411         struct sg_table *pages;
2412
2413         if (i915_gem_object_has_pinned_pages(obj))
2414                 return;
2415
2416         GEM_BUG_ON(obj->bind_count);
2417         if (!i915_gem_object_has_pages(obj))
2418                 return;
2419
2420         /* May be called by shrinker from within get_pages() (on another bo) */
2421         mutex_lock_nested(&obj->mm.lock, subclass);
2422         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2423                 goto unlock;
2424
2425         /* ->put_pages might need to allocate memory for the bit17 swizzle
2426          * array, hence protect them from being reaped by removing them from gtt
2427          * lists early. */
2428         pages = fetch_and_zero(&obj->mm.pages);
2429         GEM_BUG_ON(!pages);
2430
2431         spin_lock(&i915->mm.obj_lock);
2432         list_del(&obj->mm.link);
2433         spin_unlock(&i915->mm.obj_lock);
2434
2435         if (obj->mm.mapping) {
2436                 void *ptr;
2437
2438                 ptr = page_mask_bits(obj->mm.mapping);
2439                 if (is_vmalloc_addr(ptr))
2440                         vunmap(ptr);
2441                 else
2442                         kunmap(kmap_to_page(ptr));
2443
2444                 obj->mm.mapping = NULL;
2445         }
2446
2447         __i915_gem_object_reset_page_iter(obj);
2448
2449         if (!IS_ERR(pages))
2450                 obj->ops->put_pages(obj, pages);
2451
2452         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2453
2454 unlock:
2455         mutex_unlock(&obj->mm.lock);
2456 }
2457
2458 static bool i915_sg_trim(struct sg_table *orig_st)
2459 {
2460         struct sg_table new_st;
2461         struct scatterlist *sg, *new_sg;
2462         unsigned int i;
2463
2464         if (orig_st->nents == orig_st->orig_nents)
2465                 return false;
2466
2467         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2468                 return false;
2469
2470         new_sg = new_st.sgl;
2471         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2472                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2473                 /* called before being DMA mapped, no need to copy sg->dma_* */
2474                 new_sg = sg_next(new_sg);
2475         }
2476         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2477
2478         sg_free_table(orig_st);
2479
2480         *orig_st = new_st;
2481         return true;
2482 }
2483
2484 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2485 {
2486         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2487         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2488         unsigned long i;
2489         struct address_space *mapping;
2490         struct sg_table *st;
2491         struct scatterlist *sg;
2492         struct sgt_iter sgt_iter;
2493         struct page *page;
2494         unsigned long last_pfn = 0;     /* suppress gcc warning */
2495         unsigned int max_segment = i915_sg_segment_size();
2496         unsigned int sg_page_sizes;
2497         gfp_t noreclaim;
2498         int ret;
2499
2500         /* Assert that the object is not currently in any GPU domain. As it
2501          * wasn't in the GTT, there shouldn't be any way it could have been in
2502          * a GPU cache
2503          */
2504         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2505         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2506
2507         st = kmalloc(sizeof(*st), GFP_KERNEL);
2508         if (st == NULL)
2509                 return -ENOMEM;
2510
2511 rebuild_st:
2512         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2513                 kfree(st);
2514                 return -ENOMEM;
2515         }
2516
2517         /* Get the list of pages out of our struct file.  They'll be pinned
2518          * at this point until we release them.
2519          *
2520          * Fail silently without starting the shrinker
2521          */
2522         mapping = obj->base.filp->f_mapping;
2523         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2524         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2525
2526         sg = st->sgl;
2527         st->nents = 0;
2528         sg_page_sizes = 0;
2529         for (i = 0; i < page_count; i++) {
2530                 const unsigned int shrink[] = {
2531                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2532                         0,
2533                 }, *s = shrink;
2534                 gfp_t gfp = noreclaim;
2535
2536                 do {
2537                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2538                         if (likely(!IS_ERR(page)))
2539                                 break;
2540
2541                         if (!*s) {
2542                                 ret = PTR_ERR(page);
2543                                 goto err_sg;
2544                         }
2545
2546                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2547                         cond_resched();
2548
2549                         /* We've tried hard to allocate the memory by reaping
2550                          * our own buffer, now let the real VM do its job and
2551                          * go down in flames if truly OOM.
2552                          *
2553                          * However, since graphics tend to be disposable,
2554                          * defer the oom here by reporting the ENOMEM back
2555                          * to userspace.
2556                          */
2557                         if (!*s) {
2558                                 /* reclaim and warn, but no oom */
2559                                 gfp = mapping_gfp_mask(mapping);
2560
2561                                 /* Our bo are always dirty and so we require
2562                                  * kswapd to reclaim our pages (direct reclaim
2563                                  * does not effectively begin pageout of our
2564                                  * buffers on its own). However, direct reclaim
2565                                  * only waits for kswapd when under allocation
2566                                  * congestion. So as a result __GFP_RECLAIM is
2567                                  * unreliable and fails to actually reclaim our
2568                                  * dirty pages -- unless you try over and over
2569                                  * again with !__GFP_NORETRY. However, we still
2570                                  * want to fail this allocation rather than
2571                                  * trigger the out-of-memory killer and for
2572                                  * this we want __GFP_RETRY_MAYFAIL.
2573                                  */
2574                                 gfp |= __GFP_RETRY_MAYFAIL;
2575                         }
2576                 } while (1);
2577
2578                 if (!i ||
2579                     sg->length >= max_segment ||
2580                     page_to_pfn(page) != last_pfn + 1) {
2581                         if (i) {
2582                                 sg_page_sizes |= sg->length;
2583                                 sg = sg_next(sg);
2584                         }
2585                         st->nents++;
2586                         sg_set_page(sg, page, PAGE_SIZE, 0);
2587                 } else {
2588                         sg->length += PAGE_SIZE;
2589                 }
2590                 last_pfn = page_to_pfn(page);
2591
2592                 /* Check that the i965g/gm workaround works. */
2593                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2594         }
2595         if (sg) { /* loop terminated early; short sg table */
2596                 sg_page_sizes |= sg->length;
2597                 sg_mark_end(sg);
2598         }
2599
2600         /* Trim unused sg entries to avoid wasting memory. */
2601         i915_sg_trim(st);
2602
2603         ret = i915_gem_gtt_prepare_pages(obj, st);
2604         if (ret) {
2605                 /* DMA remapping failed? One possible cause is that
2606                  * it could not reserve enough large entries, asking
2607                  * for PAGE_SIZE chunks instead may be helpful.
2608                  */
2609                 if (max_segment > PAGE_SIZE) {
2610                         for_each_sgt_page(page, sgt_iter, st)
2611                                 put_page(page);
2612                         sg_free_table(st);
2613
2614                         max_segment = PAGE_SIZE;
2615                         goto rebuild_st;
2616                 } else {
2617                         dev_warn(&dev_priv->drm.pdev->dev,
2618                                  "Failed to DMA remap %lu pages\n",
2619                                  page_count);
2620                         goto err_pages;
2621                 }
2622         }
2623
2624         if (i915_gem_object_needs_bit17_swizzle(obj))
2625                 i915_gem_object_do_bit_17_swizzle(obj, st);
2626
2627         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2628
2629         return 0;
2630
2631 err_sg:
2632         sg_mark_end(sg);
2633 err_pages:
2634         for_each_sgt_page(page, sgt_iter, st)
2635                 put_page(page);
2636         sg_free_table(st);
2637         kfree(st);
2638
2639         /* shmemfs first checks if there is enough memory to allocate the page
2640          * and reports ENOSPC should there be insufficient, along with the usual
2641          * ENOMEM for a genuine allocation failure.
2642          *
2643          * We use ENOSPC in our driver to mean that we have run out of aperture
2644          * space and so want to translate the error from shmemfs back to our
2645          * usual understanding of ENOMEM.
2646          */
2647         if (ret == -ENOSPC)
2648                 ret = -ENOMEM;
2649
2650         return ret;
2651 }
2652
2653 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2654                                  struct sg_table *pages,
2655                                  unsigned int sg_page_sizes)
2656 {
2657         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2658         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2659         int i;
2660
2661         lockdep_assert_held(&obj->mm.lock);
2662
2663         obj->mm.get_page.sg_pos = pages->sgl;
2664         obj->mm.get_page.sg_idx = 0;
2665
2666         obj->mm.pages = pages;
2667
2668         if (i915_gem_object_is_tiled(obj) &&
2669             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2670                 GEM_BUG_ON(obj->mm.quirked);
2671                 __i915_gem_object_pin_pages(obj);
2672                 obj->mm.quirked = true;
2673         }
2674
2675         GEM_BUG_ON(!sg_page_sizes);
2676         obj->mm.page_sizes.phys = sg_page_sizes;
2677
2678         /*
2679          * Calculate the supported page-sizes which fit into the given
2680          * sg_page_sizes. This will give us the page-sizes which we may be able
2681          * to use opportunistically when later inserting into the GTT. For
2682          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2683          * 64K or 4K pages, although in practice this will depend on a number of
2684          * other factors.
2685          */
2686         obj->mm.page_sizes.sg = 0;
2687         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2688                 if (obj->mm.page_sizes.phys & ~0u << i)
2689                         obj->mm.page_sizes.sg |= BIT(i);
2690         }
2691         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2692
2693         spin_lock(&i915->mm.obj_lock);
2694         list_add(&obj->mm.link, &i915->mm.unbound_list);
2695         spin_unlock(&i915->mm.obj_lock);
2696 }
2697
2698 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2699 {
2700         int err;
2701
2702         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2703                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2704                 return -EFAULT;
2705         }
2706
2707         err = obj->ops->get_pages(obj);
2708         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2709
2710         return err;
2711 }
2712
2713 /* Ensure that the associated pages are gathered from the backing storage
2714  * and pinned into our object. i915_gem_object_pin_pages() may be called
2715  * multiple times before they are released by a single call to
2716  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2717  * either as a result of memory pressure (reaping pages under the shrinker)
2718  * or as the object is itself released.
2719  */
2720 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2721 {
2722         int err;
2723
2724         err = mutex_lock_interruptible(&obj->mm.lock);
2725         if (err)
2726                 return err;
2727
2728         if (unlikely(!i915_gem_object_has_pages(obj))) {
2729                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2730
2731                 err = ____i915_gem_object_get_pages(obj);
2732                 if (err)
2733                         goto unlock;
2734
2735                 smp_mb__before_atomic();
2736         }
2737         atomic_inc(&obj->mm.pages_pin_count);
2738
2739 unlock:
2740         mutex_unlock(&obj->mm.lock);
2741         return err;
2742 }
2743
2744 /* The 'mapping' part of i915_gem_object_pin_map() below */
2745 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2746                                  enum i915_map_type type)
2747 {
2748         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2749         struct sg_table *sgt = obj->mm.pages;
2750         struct sgt_iter sgt_iter;
2751         struct page *page;
2752         struct page *stack_pages[32];
2753         struct page **pages = stack_pages;
2754         unsigned long i = 0;
2755         pgprot_t pgprot;
2756         void *addr;
2757
2758         /* A single page can always be kmapped */
2759         if (n_pages == 1 && type == I915_MAP_WB)
2760                 return kmap(sg_page(sgt->sgl));
2761
2762         if (n_pages > ARRAY_SIZE(stack_pages)) {
2763                 /* Too big for stack -- allocate temporary array instead */
2764                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2765                 if (!pages)
2766                         return NULL;
2767         }
2768
2769         for_each_sgt_page(page, sgt_iter, sgt)
2770                 pages[i++] = page;
2771
2772         /* Check that we have the expected number of pages */
2773         GEM_BUG_ON(i != n_pages);
2774
2775         switch (type) {
2776         default:
2777                 MISSING_CASE(type);
2778                 /* fallthrough to use PAGE_KERNEL anyway */
2779         case I915_MAP_WB:
2780                 pgprot = PAGE_KERNEL;
2781                 break;
2782         case I915_MAP_WC:
2783                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2784                 break;
2785         }
2786         addr = vmap(pages, n_pages, 0, pgprot);
2787
2788         if (pages != stack_pages)
2789                 kvfree(pages);
2790
2791         return addr;
2792 }
2793
2794 /* get, pin, and map the pages of the object into kernel space */
2795 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2796                               enum i915_map_type type)
2797 {
2798         enum i915_map_type has_type;
2799         bool pinned;
2800         void *ptr;
2801         int ret;
2802
2803         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2804                 return ERR_PTR(-ENXIO);
2805
2806         ret = mutex_lock_interruptible(&obj->mm.lock);
2807         if (ret)
2808                 return ERR_PTR(ret);
2809
2810         pinned = !(type & I915_MAP_OVERRIDE);
2811         type &= ~I915_MAP_OVERRIDE;
2812
2813         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2814                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2815                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2816
2817                         ret = ____i915_gem_object_get_pages(obj);
2818                         if (ret)
2819                                 goto err_unlock;
2820
2821                         smp_mb__before_atomic();
2822                 }
2823                 atomic_inc(&obj->mm.pages_pin_count);
2824                 pinned = false;
2825         }
2826         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2827
2828         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2829         if (ptr && has_type != type) {
2830                 if (pinned) {
2831                         ret = -EBUSY;
2832                         goto err_unpin;
2833                 }
2834
2835                 if (is_vmalloc_addr(ptr))
2836                         vunmap(ptr);
2837                 else
2838                         kunmap(kmap_to_page(ptr));
2839
2840                 ptr = obj->mm.mapping = NULL;
2841         }
2842
2843         if (!ptr) {
2844                 ptr = i915_gem_object_map(obj, type);
2845                 if (!ptr) {
2846                         ret = -ENOMEM;
2847                         goto err_unpin;
2848                 }
2849
2850                 obj->mm.mapping = page_pack_bits(ptr, type);
2851         }
2852
2853 out_unlock:
2854         mutex_unlock(&obj->mm.lock);
2855         return ptr;
2856
2857 err_unpin:
2858         atomic_dec(&obj->mm.pages_pin_count);
2859 err_unlock:
2860         ptr = ERR_PTR(ret);
2861         goto out_unlock;
2862 }
2863
2864 static int
2865 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2866                            const struct drm_i915_gem_pwrite *arg)
2867 {
2868         struct address_space *mapping = obj->base.filp->f_mapping;
2869         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2870         u64 remain, offset;
2871         unsigned int pg;
2872
2873         /* Before we instantiate/pin the backing store for our use, we
2874          * can prepopulate the shmemfs filp efficiently using a write into
2875          * the pagecache. We avoid the penalty of instantiating all the
2876          * pages, important if the user is just writing to a few and never
2877          * uses the object on the GPU, and using a direct write into shmemfs
2878          * allows it to avoid the cost of retrieving a page (either swapin
2879          * or clearing-before-use) before it is overwritten.
2880          */
2881         if (i915_gem_object_has_pages(obj))
2882                 return -ENODEV;
2883
2884         if (obj->mm.madv != I915_MADV_WILLNEED)
2885                 return -EFAULT;
2886
2887         /* Before the pages are instantiated the object is treated as being
2888          * in the CPU domain. The pages will be clflushed as required before
2889          * use, and we can freely write into the pages directly. If userspace
2890          * races pwrite with any other operation; corruption will ensue -
2891          * that is userspace's prerogative!
2892          */
2893
2894         remain = arg->size;
2895         offset = arg->offset;
2896         pg = offset_in_page(offset);
2897
2898         do {
2899                 unsigned int len, unwritten;
2900                 struct page *page;
2901                 void *data, *vaddr;
2902                 int err;
2903
2904                 len = PAGE_SIZE - pg;
2905                 if (len > remain)
2906                         len = remain;
2907
2908                 err = pagecache_write_begin(obj->base.filp, mapping,
2909                                             offset, len, 0,
2910                                             &page, &data);
2911                 if (err < 0)
2912                         return err;
2913
2914                 vaddr = kmap(page);
2915                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2916                 kunmap(page);
2917
2918                 err = pagecache_write_end(obj->base.filp, mapping,
2919                                           offset, len, len - unwritten,
2920                                           page, data);
2921                 if (err < 0)
2922                         return err;
2923
2924                 if (unwritten)
2925                         return -EFAULT;
2926
2927                 remain -= len;
2928                 user_data += len;
2929                 offset += len;
2930                 pg = 0;
2931         } while (remain);
2932
2933         return 0;
2934 }
2935
2936 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
2937 {
2938         bool banned;
2939
2940         atomic_inc(&ctx->guilty_count);
2941
2942         banned = false;
2943         if (i915_gem_context_is_bannable(ctx)) {
2944                 unsigned int score;
2945
2946                 score = atomic_add_return(CONTEXT_SCORE_GUILTY,
2947                                           &ctx->ban_score);
2948                 banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
2949
2950                 DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
2951                                  ctx->name, score, yesno(banned));
2952         }
2953         if (!banned)
2954                 return;
2955
2956         i915_gem_context_set_banned(ctx);
2957         if (!IS_ERR_OR_NULL(ctx->file_priv)) {
2958                 atomic_inc(&ctx->file_priv->context_bans);
2959                 DRM_DEBUG_DRIVER("client %s has had %d context banned\n",
2960                                  ctx->name, atomic_read(&ctx->file_priv->context_bans));
2961         }
2962 }
2963
2964 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
2965 {
2966         atomic_inc(&ctx->active_count);
2967 }
2968
2969 struct i915_request *
2970 i915_gem_find_active_request(struct intel_engine_cs *engine)
2971 {
2972         struct i915_request *request, *active = NULL;
2973         unsigned long flags;
2974
2975         /*
2976          * We are called by the error capture, reset and to dump engine
2977          * state at random points in time. In particular, note that neither is
2978          * crucially ordered with an interrupt. After a hang, the GPU is dead
2979          * and we assume that no more writes can happen (we waited long enough
2980          * for all writes that were in transaction to be flushed) - adding an
2981          * extra delay for a recent interrupt is pointless. Hence, we do
2982          * not need an engine->irq_seqno_barrier() before the seqno reads.
2983          * At all other times, we must assume the GPU is still running, but
2984          * we only care about the snapshot of this moment.
2985          */
2986         spin_lock_irqsave(&engine->timeline.lock, flags);
2987         list_for_each_entry(request, &engine->timeline.requests, link) {
2988                 if (__i915_request_completed(request, request->global_seqno))
2989                         continue;
2990
2991                 active = request;
2992                 break;
2993         }
2994         spin_unlock_irqrestore(&engine->timeline.lock, flags);
2995
2996         return active;
2997 }
2998
2999 /*
3000  * Ensure irq handler finishes, and not run again.
3001  * Also return the active request so that we only search for it once.
3002  */
3003 struct i915_request *
3004 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
3005 {
3006         struct i915_request *request = NULL;
3007
3008         /*
3009          * During the reset sequence, we must prevent the engine from
3010          * entering RC6. As the context state is undefined until we restart
3011          * the engine, if it does enter RC6 during the reset, the state
3012          * written to the powercontext is undefined and so we may lose
3013          * GPU state upon resume, i.e. fail to restart after a reset.
3014          */
3015         intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
3016
3017         /*
3018          * Prevent the signaler thread from updating the request
3019          * state (by calling dma_fence_signal) as we are processing
3020          * the reset. The write from the GPU of the seqno is
3021          * asynchronous and the signaler thread may see a different
3022          * value to us and declare the request complete, even though
3023          * the reset routine have picked that request as the active
3024          * (incomplete) request. This conflict is not handled
3025          * gracefully!
3026          */
3027         kthread_park(engine->breadcrumbs.signaler);
3028
3029         /*
3030          * Prevent request submission to the hardware until we have
3031          * completed the reset in i915_gem_reset_finish(). If a request
3032          * is completed by one engine, it may then queue a request
3033          * to a second via its execlists->tasklet *just* as we are
3034          * calling engine->init_hw() and also writing the ELSP.
3035          * Turning off the execlists->tasklet until the reset is over
3036          * prevents the race.
3037          *
3038          * Note that this needs to be a single atomic operation on the
3039          * tasklet (flush existing tasks, prevent new tasks) to prevent
3040          * a race between reset and set-wedged. It is not, so we do the best
3041          * we can atm and make sure we don't lock the machine up in the more
3042          * common case of recursively being called from set-wedged from inside
3043          * i915_reset.
3044          */
3045         if (!atomic_read(&engine->execlists.tasklet.count))
3046                 tasklet_kill(&engine->execlists.tasklet);
3047         tasklet_disable(&engine->execlists.tasklet);
3048
3049         /*
3050          * We're using worker to queue preemption requests from the tasklet in
3051          * GuC submission mode.
3052          * Even though tasklet was disabled, we may still have a worker queued.
3053          * Let's make sure that all workers scheduled before disabling the
3054          * tasklet are completed before continuing with the reset.
3055          */
3056         if (engine->i915->guc.preempt_wq)
3057                 flush_workqueue(engine->i915->guc.preempt_wq);
3058
3059         if (engine->irq_seqno_barrier)
3060                 engine->irq_seqno_barrier(engine);
3061
3062         request = i915_gem_find_active_request(engine);
3063         if (request && request->fence.error == -EIO)
3064                 request = ERR_PTR(-EIO); /* Previous reset failed! */
3065
3066         return request;
3067 }
3068
3069 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
3070 {
3071         struct intel_engine_cs *engine;
3072         struct i915_request *request;
3073         enum intel_engine_id id;
3074         int err = 0;
3075
3076         for_each_engine(engine, dev_priv, id) {
3077                 request = i915_gem_reset_prepare_engine(engine);
3078                 if (IS_ERR(request)) {
3079                         err = PTR_ERR(request);
3080                         continue;
3081                 }
3082
3083                 engine->hangcheck.active_request = request;
3084         }
3085
3086         i915_gem_revoke_fences(dev_priv);
3087         intel_uc_sanitize(dev_priv);
3088
3089         return err;
3090 }
3091
3092 static void skip_request(struct i915_request *request)
3093 {
3094         void *vaddr = request->ring->vaddr;
3095         u32 head;
3096
3097         /* As this request likely depends on state from the lost
3098          * context, clear out all the user operations leaving the
3099          * breadcrumb at the end (so we get the fence notifications).
3100          */
3101         head = request->head;
3102         if (request->postfix < head) {
3103                 memset(vaddr + head, 0, request->ring->size - head);
3104                 head = 0;
3105         }
3106         memset(vaddr + head, 0, request->postfix - head);
3107
3108         dma_fence_set_error(&request->fence, -EIO);
3109 }
3110
3111 static void engine_skip_context(struct i915_request *request)
3112 {
3113         struct intel_engine_cs *engine = request->engine;
3114         struct i915_gem_context *hung_ctx = request->ctx;
3115         struct i915_timeline *timeline = request->timeline;
3116         unsigned long flags;
3117
3118         GEM_BUG_ON(timeline == &engine->timeline);
3119
3120         spin_lock_irqsave(&engine->timeline.lock, flags);
3121         spin_lock_nested(&timeline->lock, SINGLE_DEPTH_NESTING);
3122
3123         list_for_each_entry_continue(request, &engine->timeline.requests, link)
3124                 if (request->ctx == hung_ctx)
3125                         skip_request(request);
3126
3127         list_for_each_entry(request, &timeline->requests, link)
3128                 skip_request(request);
3129
3130         spin_unlock(&timeline->lock);
3131         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3132 }
3133
3134 /* Returns the request if it was guilty of the hang */
3135 static struct i915_request *
3136 i915_gem_reset_request(struct intel_engine_cs *engine,
3137                        struct i915_request *request,
3138                        bool stalled)
3139 {
3140         /* The guilty request will get skipped on a hung engine.
3141          *
3142          * Users of client default contexts do not rely on logical
3143          * state preserved between batches so it is safe to execute
3144          * queued requests following the hang. Non default contexts
3145          * rely on preserved state, so skipping a batch loses the
3146          * evolution of the state and it needs to be considered corrupted.
3147          * Executing more queued batches on top of corrupted state is
3148          * risky. But we take the risk by trying to advance through
3149          * the queued requests in order to make the client behaviour
3150          * more predictable around resets, by not throwing away random
3151          * amount of batches it has prepared for execution. Sophisticated
3152          * clients can use gem_reset_stats_ioctl and dma fence status
3153          * (exported via sync_file info ioctl on explicit fences) to observe
3154          * when it loses the context state and should rebuild accordingly.
3155          *
3156          * The context ban, and ultimately the client ban, mechanism are safety
3157          * valves if client submission ends up resulting in nothing more than
3158          * subsequent hangs.
3159          */
3160
3161         if (i915_request_completed(request)) {
3162                 GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
3163                           engine->name, request->global_seqno,
3164                           request->fence.context, request->fence.seqno,
3165                           intel_engine_get_seqno(engine));
3166                 stalled = false;
3167         }
3168
3169         if (stalled) {
3170                 i915_gem_context_mark_guilty(request->ctx);
3171                 skip_request(request);
3172
3173                 /* If this context is now banned, skip all pending requests. */
3174                 if (i915_gem_context_is_banned(request->ctx))
3175                         engine_skip_context(request);
3176         } else {
3177                 /*
3178                  * Since this is not the hung engine, it may have advanced
3179                  * since the hang declaration. Double check by refinding
3180                  * the active request at the time of the reset.
3181                  */
3182                 request = i915_gem_find_active_request(engine);
3183                 if (request) {
3184                         i915_gem_context_mark_innocent(request->ctx);
3185                         dma_fence_set_error(&request->fence, -EAGAIN);
3186
3187                         /* Rewind the engine to replay the incomplete rq */
3188                         spin_lock_irq(&engine->timeline.lock);
3189                         request = list_prev_entry(request, link);
3190                         if (&request->link == &engine->timeline.requests)
3191                                 request = NULL;
3192                         spin_unlock_irq(&engine->timeline.lock);
3193                 }
3194         }
3195
3196         return request;
3197 }
3198
3199 void i915_gem_reset_engine(struct intel_engine_cs *engine,
3200                            struct i915_request *request,
3201                            bool stalled)
3202 {
3203         /*
3204          * Make sure this write is visible before we re-enable the interrupt
3205          * handlers on another CPU, as tasklet_enable() resolves to just
3206          * a compiler barrier which is insufficient for our purpose here.
3207          */
3208         smp_store_mb(engine->irq_posted, 0);
3209
3210         if (request)
3211                 request = i915_gem_reset_request(engine, request, stalled);
3212
3213         if (request) {
3214                 DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
3215                                  engine->name, request->global_seqno);
3216         }
3217
3218         /* Setup the CS to resume from the breadcrumb of the hung request */
3219         engine->reset_hw(engine, request);
3220 }
3221
3222 void i915_gem_reset(struct drm_i915_private *dev_priv,
3223                     unsigned int stalled_mask)
3224 {
3225         struct intel_engine_cs *engine;
3226         enum intel_engine_id id;
3227
3228         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3229
3230         i915_retire_requests(dev_priv);
3231
3232         for_each_engine(engine, dev_priv, id) {
3233                 struct i915_gem_context *ctx;
3234
3235                 i915_gem_reset_engine(engine,
3236                                       engine->hangcheck.active_request,
3237                                       stalled_mask & ENGINE_MASK(id));
3238                 ctx = fetch_and_zero(&engine->last_retired_context);
3239                 if (ctx)
3240                         intel_context_unpin(ctx, engine);
3241
3242                 /*
3243                  * Ostensibily, we always want a context loaded for powersaving,
3244                  * so if the engine is idle after the reset, send a request
3245                  * to load our scratch kernel_context.
3246                  *
3247                  * More mysteriously, if we leave the engine idle after a reset,
3248                  * the next userspace batch may hang, with what appears to be
3249                  * an incoherent read by the CS (presumably stale TLB). An
3250                  * empty request appears sufficient to paper over the glitch.
3251                  */
3252                 if (intel_engine_is_idle(engine)) {
3253                         struct i915_request *rq;
3254
3255                         rq = i915_request_alloc(engine,
3256                                                 dev_priv->kernel_context);
3257                         if (!IS_ERR(rq))
3258                                 __i915_request_add(rq, false);
3259                 }
3260         }
3261
3262         i915_gem_restore_fences(dev_priv);
3263 }
3264
3265 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3266 {
3267         tasklet_enable(&engine->execlists.tasklet);
3268         kthread_unpark(engine->breadcrumbs.signaler);
3269
3270         intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3271 }
3272
3273 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3274 {
3275         struct intel_engine_cs *engine;
3276         enum intel_engine_id id;
3277
3278         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3279
3280         for_each_engine(engine, dev_priv, id) {
3281                 engine->hangcheck.active_request = NULL;
3282                 i915_gem_reset_finish_engine(engine);
3283         }
3284 }
3285
3286 static void nop_submit_request(struct i915_request *request)
3287 {
3288         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3289                   request->engine->name,
3290                   request->fence.context, request->fence.seqno);
3291         dma_fence_set_error(&request->fence, -EIO);
3292
3293         i915_request_submit(request);
3294 }
3295
3296 static void nop_complete_submit_request(struct i915_request *request)
3297 {
3298         unsigned long flags;
3299
3300         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3301                   request->engine->name,
3302                   request->fence.context, request->fence.seqno);
3303         dma_fence_set_error(&request->fence, -EIO);
3304
3305         spin_lock_irqsave(&request->engine->timeline.lock, flags);
3306         __i915_request_submit(request);
3307         intel_engine_init_global_seqno(request->engine, request->global_seqno);
3308         spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3309 }
3310
3311 void i915_gem_set_wedged(struct drm_i915_private *i915)
3312 {
3313         struct intel_engine_cs *engine;
3314         enum intel_engine_id id;
3315
3316         GEM_TRACE("start\n");
3317
3318         if (GEM_SHOW_DEBUG()) {
3319                 struct drm_printer p = drm_debug_printer(__func__);
3320
3321                 for_each_engine(engine, i915, id)
3322                         intel_engine_dump(engine, &p, "%s\n", engine->name);
3323         }
3324
3325         set_bit(I915_WEDGED, &i915->gpu_error.flags);
3326         smp_mb__after_atomic();
3327
3328         /*
3329          * First, stop submission to hw, but do not yet complete requests by
3330          * rolling the global seqno forward (since this would complete requests
3331          * for which we haven't set the fence error to EIO yet).
3332          */
3333         for_each_engine(engine, i915, id) {
3334                 i915_gem_reset_prepare_engine(engine);
3335
3336                 engine->submit_request = nop_submit_request;
3337                 engine->schedule = NULL;
3338         }
3339         i915->caps.scheduler = 0;
3340
3341         /* Even if the GPU reset fails, it should still stop the engines */
3342         intel_gpu_reset(i915, ALL_ENGINES);
3343
3344         /*
3345          * Make sure no one is running the old callback before we proceed with
3346          * cancelling requests and resetting the completion tracking. Otherwise
3347          * we might submit a request to the hardware which never completes.
3348          */
3349         synchronize_rcu();
3350
3351         for_each_engine(engine, i915, id) {
3352                 /* Mark all executing requests as skipped */
3353                 engine->cancel_requests(engine);
3354
3355                 /*
3356                  * Only once we've force-cancelled all in-flight requests can we
3357                  * start to complete all requests.
3358                  */
3359                 engine->submit_request = nop_complete_submit_request;
3360         }
3361
3362         /*
3363          * Make sure no request can slip through without getting completed by
3364          * either this call here to intel_engine_init_global_seqno, or the one
3365          * in nop_complete_submit_request.
3366          */
3367         synchronize_rcu();
3368
3369         for_each_engine(engine, i915, id) {
3370                 unsigned long flags;
3371
3372                 /*
3373                  * Mark all pending requests as complete so that any concurrent
3374                  * (lockless) lookup doesn't try and wait upon the request as we
3375                  * reset it.
3376                  */
3377                 spin_lock_irqsave(&engine->timeline.lock, flags);
3378                 intel_engine_init_global_seqno(engine,
3379                                                intel_engine_last_submit(engine));
3380                 spin_unlock_irqrestore(&engine->timeline.lock, flags);
3381
3382                 i915_gem_reset_finish_engine(engine);
3383         }
3384
3385         GEM_TRACE("end\n");
3386
3387         wake_up_all(&i915->gpu_error.reset_queue);
3388 }
3389
3390 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3391 {
3392         struct i915_timeline *tl;
3393
3394         lockdep_assert_held(&i915->drm.struct_mutex);
3395         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3396                 return true;
3397
3398         GEM_TRACE("start\n");
3399
3400         /*
3401          * Before unwedging, make sure that all pending operations
3402          * are flushed and errored out - we may have requests waiting upon
3403          * third party fences. We marked all inflight requests as EIO, and
3404          * every execbuf since returned EIO, for consistency we want all
3405          * the currently pending requests to also be marked as EIO, which
3406          * is done inside our nop_submit_request - and so we must wait.
3407          *
3408          * No more can be submitted until we reset the wedged bit.
3409          */
3410         list_for_each_entry(tl, &i915->gt.timelines, link) {
3411                 struct i915_request *rq;
3412
3413                 rq = i915_gem_active_peek(&tl->last_request,
3414                                           &i915->drm.struct_mutex);
3415                 if (!rq)
3416                         continue;
3417
3418                 /*
3419                  * We can't use our normal waiter as we want to
3420                  * avoid recursively trying to handle the current
3421                  * reset. The basic dma_fence_default_wait() installs
3422                  * a callback for dma_fence_signal(), which is
3423                  * triggered by our nop handler (indirectly, the
3424                  * callback enables the signaler thread which is
3425                  * woken by the nop_submit_request() advancing the seqno
3426                  * and when the seqno passes the fence, the signaler
3427                  * then signals the fence waking us up).
3428                  */
3429                 if (dma_fence_default_wait(&rq->fence, true,
3430                                            MAX_SCHEDULE_TIMEOUT) < 0)
3431                         return false;
3432         }
3433         i915_retire_requests(i915);
3434         GEM_BUG_ON(i915->gt.active_requests);
3435
3436         /*
3437          * Undo nop_submit_request. We prevent all new i915 requests from
3438          * being queued (by disallowing execbuf whilst wedged) so having
3439          * waited for all active requests above, we know the system is idle
3440          * and do not have to worry about a thread being inside
3441          * engine->submit_request() as we swap over. So unlike installing
3442          * the nop_submit_request on reset, we can do this from normal
3443          * context and do not require stop_machine().
3444          */
3445         intel_engines_reset_default_submission(i915);
3446         i915_gem_contexts_lost(i915);
3447
3448         GEM_TRACE("end\n");
3449
3450         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3451         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3452
3453         return true;
3454 }
3455
3456 static void
3457 i915_gem_retire_work_handler(struct work_struct *work)
3458 {
3459         struct drm_i915_private *dev_priv =
3460                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3461         struct drm_device *dev = &dev_priv->drm;
3462
3463         /* Come back later if the device is busy... */
3464         if (mutex_trylock(&dev->struct_mutex)) {
3465                 i915_retire_requests(dev_priv);
3466                 mutex_unlock(&dev->struct_mutex);
3467         }
3468
3469         /*
3470          * Keep the retire handler running until we are finally idle.
3471          * We do not need to do this test under locking as in the worst-case
3472          * we queue the retire worker once too often.
3473          */
3474         if (READ_ONCE(dev_priv->gt.awake))
3475                 queue_delayed_work(dev_priv->wq,
3476                                    &dev_priv->gt.retire_work,
3477                                    round_jiffies_up_relative(HZ));
3478 }
3479
3480 static void shrink_caches(struct drm_i915_private *i915)
3481 {
3482         /*
3483          * kmem_cache_shrink() discards empty slabs and reorders partially
3484          * filled slabs to prioritise allocating from the mostly full slabs,
3485          * with the aim of reducing fragmentation.
3486          */
3487         kmem_cache_shrink(i915->priorities);
3488         kmem_cache_shrink(i915->dependencies);
3489         kmem_cache_shrink(i915->requests);
3490         kmem_cache_shrink(i915->luts);
3491         kmem_cache_shrink(i915->vmas);
3492         kmem_cache_shrink(i915->objects);
3493 }
3494
3495 struct sleep_rcu_work {
3496         union {
3497                 struct rcu_head rcu;
3498                 struct work_struct work;
3499         };
3500         struct drm_i915_private *i915;
3501         unsigned int epoch;
3502 };
3503
3504 static inline bool
3505 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3506 {
3507         /*
3508          * There is a small chance that the epoch wrapped since we started
3509          * sleeping. If we assume that epoch is at least a u32, then it will
3510          * take at least 2^32 * 100ms for it to wrap, or about 326 years.
3511          */
3512         return epoch == READ_ONCE(i915->gt.epoch);
3513 }
3514
3515 static void __sleep_work(struct work_struct *work)
3516 {
3517         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3518         struct drm_i915_private *i915 = s->i915;
3519         unsigned int epoch = s->epoch;
3520
3521         kfree(s);
3522         if (same_epoch(i915, epoch))
3523                 shrink_caches(i915);
3524 }
3525
3526 static void __sleep_rcu(struct rcu_head *rcu)
3527 {
3528         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3529         struct drm_i915_private *i915 = s->i915;
3530
3531         if (same_epoch(i915, s->epoch)) {
3532                 INIT_WORK(&s->work, __sleep_work);
3533                 queue_work(i915->wq, &s->work);
3534         } else {
3535                 kfree(s);
3536         }
3537 }
3538
3539 static inline bool
3540 new_requests_since_last_retire(const struct drm_i915_private *i915)
3541 {
3542         return (READ_ONCE(i915->gt.active_requests) ||
3543                 work_pending(&i915->gt.idle_work.work));
3544 }
3545
3546 static void
3547 i915_gem_idle_work_handler(struct work_struct *work)
3548 {
3549         struct drm_i915_private *dev_priv =
3550                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3551         unsigned int epoch = I915_EPOCH_INVALID;
3552         bool rearm_hangcheck;
3553
3554         if (!READ_ONCE(dev_priv->gt.awake))
3555                 return;
3556
3557         /*
3558          * Wait for last execlists context complete, but bail out in case a
3559          * new request is submitted. As we don't trust the hardware, we
3560          * continue on if the wait times out. This is necessary to allow
3561          * the machine to suspend even if the hardware dies, and we will
3562          * try to recover in resume (after depriving the hardware of power,
3563          * it may be in a better mmod).
3564          */
3565         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3566                    intel_engines_are_idle(dev_priv),
3567                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3568                    10, 500);
3569
3570         rearm_hangcheck =
3571                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3572
3573         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3574                 /* Currently busy, come back later */
3575                 mod_delayed_work(dev_priv->wq,
3576                                  &dev_priv->gt.idle_work,
3577                                  msecs_to_jiffies(50));
3578                 goto out_rearm;
3579         }
3580
3581         /*
3582          * New request retired after this work handler started, extend active
3583          * period until next instance of the work.
3584          */
3585         if (new_requests_since_last_retire(dev_priv))
3586                 goto out_unlock;
3587
3588         epoch = __i915_gem_park(dev_priv);
3589
3590         rearm_hangcheck = false;
3591 out_unlock:
3592         mutex_unlock(&dev_priv->drm.struct_mutex);
3593
3594 out_rearm:
3595         if (rearm_hangcheck) {
3596                 GEM_BUG_ON(!dev_priv->gt.awake);
3597                 i915_queue_hangcheck(dev_priv);
3598         }
3599
3600         /*
3601          * When we are idle, it is an opportune time to reap our caches.
3602          * However, we have many objects that utilise RCU and the ordered
3603          * i915->wq that this work is executing on. To try and flush any
3604          * pending frees now we are idle, we first wait for an RCU grace
3605          * period, and then queue a task (that will run last on the wq) to
3606          * shrink and re-optimize the caches.
3607          */
3608         if (same_epoch(dev_priv, epoch)) {
3609                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3610                 if (s) {
3611                         s->i915 = dev_priv;
3612                         s->epoch = epoch;
3613                         call_rcu(&s->rcu, __sleep_rcu);
3614                 }
3615         }
3616 }
3617
3618 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3619 {
3620         struct drm_i915_private *i915 = to_i915(gem->dev);
3621         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3622         struct drm_i915_file_private *fpriv = file->driver_priv;
3623         struct i915_lut_handle *lut, *ln;
3624
3625         mutex_lock(&i915->drm.struct_mutex);
3626
3627         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3628                 struct i915_gem_context *ctx = lut->ctx;
3629                 struct i915_vma *vma;
3630
3631                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3632                 if (ctx->file_priv != fpriv)
3633                         continue;
3634
3635                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3636                 GEM_BUG_ON(vma->obj != obj);
3637
3638                 /* We allow the process to have multiple handles to the same
3639                  * vma, in the same fd namespace, by virtue of flink/open.
3640                  */
3641                 GEM_BUG_ON(!vma->open_count);
3642                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3643                         i915_vma_close(vma);
3644
3645                 list_del(&lut->obj_link);
3646                 list_del(&lut->ctx_link);
3647
3648                 kmem_cache_free(i915->luts, lut);
3649                 __i915_gem_object_release_unless_active(obj);
3650         }
3651
3652         mutex_unlock(&i915->drm.struct_mutex);
3653 }
3654
3655 static unsigned long to_wait_timeout(s64 timeout_ns)
3656 {
3657         if (timeout_ns < 0)
3658                 return MAX_SCHEDULE_TIMEOUT;
3659
3660         if (timeout_ns == 0)
3661                 return 0;
3662
3663         return nsecs_to_jiffies_timeout(timeout_ns);
3664 }
3665
3666 /**
3667  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3668  * @dev: drm device pointer
3669  * @data: ioctl data blob
3670  * @file: drm file pointer
3671  *
3672  * Returns 0 if successful, else an error is returned with the remaining time in
3673  * the timeout parameter.
3674  *  -ETIME: object is still busy after timeout
3675  *  -ERESTARTSYS: signal interrupted the wait
3676  *  -ENONENT: object doesn't exist
3677  * Also possible, but rare:
3678  *  -EAGAIN: incomplete, restart syscall
3679  *  -ENOMEM: damn
3680  *  -ENODEV: Internal IRQ fail
3681  *  -E?: The add request failed
3682  *
3683  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3684  * non-zero timeout parameter the wait ioctl will wait for the given number of
3685  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3686  * without holding struct_mutex the object may become re-busied before this
3687  * function completes. A similar but shorter * race condition exists in the busy
3688  * ioctl
3689  */
3690 int
3691 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3692 {
3693         struct drm_i915_gem_wait *args = data;
3694         struct drm_i915_gem_object *obj;
3695         ktime_t start;
3696         long ret;
3697
3698         if (args->flags != 0)
3699                 return -EINVAL;
3700
3701         obj = i915_gem_object_lookup(file, args->bo_handle);
3702         if (!obj)
3703                 return -ENOENT;
3704
3705         start = ktime_get();
3706
3707         ret = i915_gem_object_wait(obj,
3708                                    I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
3709                                    to_wait_timeout(args->timeout_ns),
3710                                    to_rps_client(file));
3711
3712         if (args->timeout_ns > 0) {
3713                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3714                 if (args->timeout_ns < 0)
3715                         args->timeout_ns = 0;
3716
3717                 /*
3718                  * Apparently ktime isn't accurate enough and occasionally has a
3719                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3720                  * things up to make the test happy. We allow up to 1 jiffy.
3721                  *
3722                  * This is a regression from the timespec->ktime conversion.
3723                  */
3724                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3725                         args->timeout_ns = 0;
3726
3727                 /* Asked to wait beyond the jiffie/scheduler precision? */
3728                 if (ret == -ETIME && args->timeout_ns)
3729                         ret = -EAGAIN;
3730         }
3731
3732         i915_gem_object_put(obj);
3733         return ret;
3734 }
3735
3736 static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
3737 {
3738         return i915_gem_active_wait(&tl->last_request, flags);
3739 }
3740
3741 static int wait_for_engines(struct drm_i915_private *i915)
3742 {
3743         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3744                 dev_err(i915->drm.dev,
3745                         "Failed to idle engines, declaring wedged!\n");
3746                 GEM_TRACE_DUMP();
3747                 i915_gem_set_wedged(i915);
3748                 return -EIO;
3749         }
3750
3751         return 0;
3752 }
3753
3754 int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
3755 {
3756         /* If the device is asleep, we have no requests outstanding */
3757         if (!READ_ONCE(i915->gt.awake))
3758                 return 0;
3759
3760         if (flags & I915_WAIT_LOCKED) {
3761                 struct i915_timeline *tl;
3762                 int err;
3763
3764                 lockdep_assert_held(&i915->drm.struct_mutex);
3765
3766                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3767                         err = wait_for_timeline(tl, flags);
3768                         if (err)
3769                                 return err;
3770                 }
3771                 i915_retire_requests(i915);
3772
3773                 return wait_for_engines(i915);
3774         } else {
3775                 struct intel_engine_cs *engine;
3776                 enum intel_engine_id id;
3777                 int err;
3778
3779                 for_each_engine(engine, i915, id) {
3780                         err = wait_for_timeline(&engine->timeline, flags);
3781                         if (err)
3782                                 return err;
3783                 }
3784
3785                 return 0;
3786         }
3787 }
3788
3789 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3790 {
3791         /*
3792          * We manually flush the CPU domain so that we can override and
3793          * force the flush for the display, and perform it asyncrhonously.
3794          */
3795         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3796         if (obj->cache_dirty)
3797                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3798         obj->write_domain = 0;
3799 }
3800
3801 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3802 {
3803         if (!READ_ONCE(obj->pin_global))
3804                 return;
3805
3806         mutex_lock(&obj->base.dev->struct_mutex);
3807         __i915_gem_object_flush_for_display(obj);
3808         mutex_unlock(&obj->base.dev->struct_mutex);
3809 }
3810
3811 /**
3812  * Moves a single object to the WC read, and possibly write domain.
3813  * @obj: object to act on
3814  * @write: ask for write access or read only
3815  *
3816  * This function returns when the move is complete, including waiting on
3817  * flushes to occur.
3818  */
3819 int
3820 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3821 {
3822         int ret;
3823
3824         lockdep_assert_held(&obj->base.dev->struct_mutex);
3825
3826         ret = i915_gem_object_wait(obj,
3827                                    I915_WAIT_INTERRUPTIBLE |
3828                                    I915_WAIT_LOCKED |
3829                                    (write ? I915_WAIT_ALL : 0),
3830                                    MAX_SCHEDULE_TIMEOUT,
3831                                    NULL);
3832         if (ret)
3833                 return ret;
3834
3835         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3836                 return 0;
3837
3838         /* Flush and acquire obj->pages so that we are coherent through
3839          * direct access in memory with previous cached writes through
3840          * shmemfs and that our cache domain tracking remains valid.
3841          * For example, if the obj->filp was moved to swap without us
3842          * being notified and releasing the pages, we would mistakenly
3843          * continue to assume that the obj remained out of the CPU cached
3844          * domain.
3845          */
3846         ret = i915_gem_object_pin_pages(obj);
3847         if (ret)
3848                 return ret;
3849
3850         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3851
3852         /* Serialise direct access to this object with the barriers for
3853          * coherent writes from the GPU, by effectively invalidating the
3854          * WC domain upon first access.
3855          */
3856         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3857                 mb();
3858
3859         /* It should now be out of any other write domains, and we can update
3860          * the domain values for our changes.
3861          */
3862         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3863         obj->read_domains |= I915_GEM_DOMAIN_WC;
3864         if (write) {
3865                 obj->read_domains = I915_GEM_DOMAIN_WC;
3866                 obj->write_domain = I915_GEM_DOMAIN_WC;
3867                 obj->mm.dirty = true;
3868         }
3869
3870         i915_gem_object_unpin_pages(obj);
3871         return 0;
3872 }
3873
3874 /**
3875  * Moves a single object to the GTT read, and possibly write domain.
3876  * @obj: object to act on
3877  * @write: ask for write access or read only
3878  *
3879  * This function returns when the move is complete, including waiting on
3880  * flushes to occur.
3881  */
3882 int
3883 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3884 {
3885         int ret;
3886
3887         lockdep_assert_held(&obj->base.dev->struct_mutex);
3888
3889         ret = i915_gem_object_wait(obj,
3890                                    I915_WAIT_INTERRUPTIBLE |
3891                                    I915_WAIT_LOCKED |
3892                                    (write ? I915_WAIT_ALL : 0),
3893                                    MAX_SCHEDULE_TIMEOUT,
3894                                    NULL);
3895         if (ret)
3896                 return ret;
3897
3898         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3899                 return 0;
3900
3901         /* Flush and acquire obj->pages so that we are coherent through
3902          * direct access in memory with previous cached writes through
3903          * shmemfs and that our cache domain tracking remains valid.
3904          * For example, if the obj->filp was moved to swap without us
3905          * being notified and releasing the pages, we would mistakenly
3906          * continue to assume that the obj remained out of the CPU cached
3907          * domain.
3908          */
3909         ret = i915_gem_object_pin_pages(obj);
3910         if (ret)
3911                 return ret;
3912
3913         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3914
3915         /* Serialise direct access to this object with the barriers for
3916          * coherent writes from the GPU, by effectively invalidating the
3917          * GTT domain upon first access.
3918          */
3919         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3920                 mb();
3921
3922         /* It should now be out of any other write domains, and we can update
3923          * the domain values for our changes.
3924          */
3925         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3926         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3927         if (write) {
3928                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3929                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3930                 obj->mm.dirty = true;
3931         }
3932
3933         i915_gem_object_unpin_pages(obj);
3934         return 0;
3935 }
3936
3937 /**
3938  * Changes the cache-level of an object across all VMA.
3939  * @obj: object to act on
3940  * @cache_level: new cache level to set for the object
3941  *
3942  * After this function returns, the object will be in the new cache-level
3943  * across all GTT and the contents of the backing storage will be coherent,
3944  * with respect to the new cache-level. In order to keep the backing storage
3945  * coherent for all users, we only allow a single cache level to be set
3946  * globally on the object and prevent it from being changed whilst the
3947  * hardware is reading from the object. That is if the object is currently
3948  * on the scanout it will be set to uncached (or equivalent display
3949  * cache coherency) and all non-MOCS GPU access will also be uncached so
3950  * that all direct access to the scanout remains coherent.
3951  */
3952 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3953                                     enum i915_cache_level cache_level)
3954 {
3955         struct i915_vma *vma;
3956         int ret;
3957
3958         lockdep_assert_held(&obj->base.dev->struct_mutex);
3959
3960         if (obj->cache_level == cache_level)
3961                 return 0;
3962
3963         /* Inspect the list of currently bound VMA and unbind any that would
3964          * be invalid given the new cache-level. This is principally to
3965          * catch the issue of the CS prefetch crossing page boundaries and
3966          * reading an invalid PTE on older architectures.
3967          */
3968 restart:
3969         list_for_each_entry(vma, &obj->vma_list, obj_link) {
3970                 if (!drm_mm_node_allocated(&vma->node))
3971                         continue;
3972
3973                 if (i915_vma_is_pinned(vma)) {
3974                         DRM_DEBUG("can not change the cache level of pinned objects\n");
3975                         return -EBUSY;
3976                 }
3977
3978                 if (!i915_vma_is_closed(vma) &&
3979                     i915_gem_valid_gtt_space(vma, cache_level))
3980                         continue;
3981
3982                 ret = i915_vma_unbind(vma);
3983                 if (ret)
3984                         return ret;
3985
3986                 /* As unbinding may affect other elements in the
3987                  * obj->vma_list (due to side-effects from retiring
3988                  * an active vma), play safe and restart the iterator.
3989                  */
3990                 goto restart;
3991         }
3992
3993         /* We can reuse the existing drm_mm nodes but need to change the
3994          * cache-level on the PTE. We could simply unbind them all and
3995          * rebind with the correct cache-level on next use. However since
3996          * we already have a valid slot, dma mapping, pages etc, we may as
3997          * rewrite the PTE in the belief that doing so tramples upon less
3998          * state and so involves less work.
3999          */
4000         if (obj->bind_count) {
4001                 /* Before we change the PTE, the GPU must not be accessing it.
4002                  * If we wait upon the object, we know that all the bound
4003                  * VMA are no longer active.
4004                  */
4005                 ret = i915_gem_object_wait(obj,
4006                                            I915_WAIT_INTERRUPTIBLE |
4007                                            I915_WAIT_LOCKED |
4008                                            I915_WAIT_ALL,
4009                                            MAX_SCHEDULE_TIMEOUT,
4010                                            NULL);
4011                 if (ret)
4012                         return ret;
4013
4014                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
4015                     cache_level != I915_CACHE_NONE) {
4016                         /* Access to snoopable pages through the GTT is
4017                          * incoherent and on some machines causes a hard
4018                          * lockup. Relinquish the CPU mmaping to force
4019                          * userspace to refault in the pages and we can
4020                          * then double check if the GTT mapping is still
4021                          * valid for that pointer access.
4022                          */
4023                         i915_gem_release_mmap(obj);
4024
4025                         /* As we no longer need a fence for GTT access,
4026                          * we can relinquish it now (and so prevent having
4027                          * to steal a fence from someone else on the next
4028                          * fence request). Note GPU activity would have
4029                          * dropped the fence as all snoopable access is
4030                          * supposed to be linear.
4031                          */
4032                         for_each_ggtt_vma(vma, obj) {
4033                                 ret = i915_vma_put_fence(vma);
4034                                 if (ret)
4035                                         return ret;
4036                         }
4037                 } else {
4038                         /* We either have incoherent backing store and
4039                          * so no GTT access or the architecture is fully
4040                          * coherent. In such cases, existing GTT mmaps
4041                          * ignore the cache bit in the PTE and we can
4042                          * rewrite it without confusing the GPU or having
4043                          * to force userspace to fault back in its mmaps.
4044                          */
4045                 }
4046
4047                 list_for_each_entry(vma, &obj->vma_list, obj_link) {
4048                         if (!drm_mm_node_allocated(&vma->node))
4049                                 continue;
4050
4051                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
4052                         if (ret)
4053                                 return ret;
4054                 }
4055         }
4056
4057         list_for_each_entry(vma, &obj->vma_list, obj_link)
4058                 vma->node.color = cache_level;
4059         i915_gem_object_set_cache_coherency(obj, cache_level);
4060         obj->cache_dirty = true; /* Always invalidate stale cachelines */
4061
4062         return 0;
4063 }
4064
4065 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
4066                                struct drm_file *file)
4067 {
4068         struct drm_i915_gem_caching *args = data;
4069         struct drm_i915_gem_object *obj;
4070         int err = 0;
4071
4072         rcu_read_lock();
4073         obj = i915_gem_object_lookup_rcu(file, args->handle);
4074         if (!obj) {
4075                 err = -ENOENT;
4076                 goto out;
4077         }
4078
4079         switch (obj->cache_level) {
4080         case I915_CACHE_LLC:
4081         case I915_CACHE_L3_LLC:
4082                 args->caching = I915_CACHING_CACHED;
4083                 break;
4084
4085         case I915_CACHE_WT:
4086                 args->caching = I915_CACHING_DISPLAY;
4087                 break;
4088
4089         default:
4090                 args->caching = I915_CACHING_NONE;
4091                 break;
4092         }
4093 out:
4094         rcu_read_unlock();
4095         return err;
4096 }
4097
4098 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4099                                struct drm_file *file)
4100 {
4101         struct drm_i915_private *i915 = to_i915(dev);
4102         struct drm_i915_gem_caching *args = data;
4103         struct drm_i915_gem_object *obj;
4104         enum i915_cache_level level;
4105         int ret = 0;
4106
4107         switch (args->caching) {
4108         case I915_CACHING_NONE:
4109                 level = I915_CACHE_NONE;
4110                 break;
4111         case I915_CACHING_CACHED:
4112                 /*
4113                  * Due to a HW issue on BXT A stepping, GPU stores via a
4114                  * snooped mapping may leave stale data in a corresponding CPU
4115                  * cacheline, whereas normally such cachelines would get
4116                  * invalidated.
4117                  */
4118                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4119                         return -ENODEV;
4120
4121                 level = I915_CACHE_LLC;
4122                 break;
4123         case I915_CACHING_DISPLAY:
4124                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4125                 break;
4126         default:
4127                 return -EINVAL;
4128         }
4129
4130         obj = i915_gem_object_lookup(file, args->handle);
4131         if (!obj)
4132                 return -ENOENT;
4133
4134         /*
4135          * The caching mode of proxy object is handled by its generator, and
4136          * not allowed to be changed by userspace.
4137          */
4138         if (i915_gem_object_is_proxy(obj)) {
4139                 ret = -ENXIO;
4140                 goto out;
4141         }
4142
4143         if (obj->cache_level == level)
4144                 goto out;
4145
4146         ret = i915_gem_object_wait(obj,
4147                                    I915_WAIT_INTERRUPTIBLE,
4148                                    MAX_SCHEDULE_TIMEOUT,
4149                                    to_rps_client(file));
4150         if (ret)
4151                 goto out;
4152
4153         ret = i915_mutex_lock_interruptible(dev);
4154         if (ret)
4155                 goto out;
4156
4157         ret = i915_gem_object_set_cache_level(obj, level);
4158         mutex_unlock(&dev->struct_mutex);
4159
4160 out:
4161         i915_gem_object_put(obj);
4162         return ret;
4163 }
4164
4165 /*
4166  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4167  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4168  * (for pageflips). We only flush the caches while preparing the buffer for
4169  * display, the callers are responsible for frontbuffer flush.
4170  */
4171 struct i915_vma *
4172 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4173                                      u32 alignment,
4174                                      const struct i915_ggtt_view *view,
4175                                      unsigned int flags)
4176 {
4177         struct i915_vma *vma;
4178         int ret;
4179
4180         lockdep_assert_held(&obj->base.dev->struct_mutex);
4181
4182         /* Mark the global pin early so that we account for the
4183          * display coherency whilst setting up the cache domains.
4184          */
4185         obj->pin_global++;
4186
4187         /* The display engine is not coherent with the LLC cache on gen6.  As
4188          * a result, we make sure that the pinning that is about to occur is
4189          * done with uncached PTEs. This is lowest common denominator for all
4190          * chipsets.
4191          *
4192          * However for gen6+, we could do better by using the GFDT bit instead
4193          * of uncaching, which would allow us to flush all the LLC-cached data
4194          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4195          */
4196         ret = i915_gem_object_set_cache_level(obj,
4197                                               HAS_WT(to_i915(obj->base.dev)) ?
4198                                               I915_CACHE_WT : I915_CACHE_NONE);
4199         if (ret) {
4200                 vma = ERR_PTR(ret);
4201                 goto err_unpin_global;
4202         }
4203
4204         /* As the user may map the buffer once pinned in the display plane
4205          * (e.g. libkms for the bootup splash), we have to ensure that we
4206          * always use map_and_fenceable for all scanout buffers. However,
4207          * it may simply be too big to fit into mappable, in which case
4208          * put it anyway and hope that userspace can cope (but always first
4209          * try to preserve the existing ABI).
4210          */
4211         vma = ERR_PTR(-ENOSPC);
4212         if ((flags & PIN_MAPPABLE) == 0 &&
4213             (!view || view->type == I915_GGTT_VIEW_NORMAL))
4214                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4215                                                flags |
4216                                                PIN_MAPPABLE |
4217                                                PIN_NONBLOCK);
4218         if (IS_ERR(vma))
4219                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4220         if (IS_ERR(vma))
4221                 goto err_unpin_global;
4222
4223         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4224
4225         __i915_gem_object_flush_for_display(obj);
4226
4227         /* It should now be out of any other write domains, and we can update
4228          * the domain values for our changes.
4229          */
4230         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4231
4232         return vma;
4233
4234 err_unpin_global:
4235         obj->pin_global--;
4236         return vma;
4237 }
4238
4239 void
4240 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4241 {
4242         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4243
4244         if (WARN_ON(vma->obj->pin_global == 0))
4245                 return;
4246
4247         if (--vma->obj->pin_global == 0)
4248                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4249
4250         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4251         i915_gem_object_bump_inactive_ggtt(vma->obj);
4252
4253         i915_vma_unpin(vma);
4254 }
4255
4256 /**
4257  * Moves a single object to the CPU read, and possibly write domain.
4258  * @obj: object to act on
4259  * @write: requesting write or read-only access
4260  *
4261  * This function returns when the move is complete, including waiting on
4262  * flushes to occur.
4263  */
4264 int
4265 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4266 {
4267         int ret;
4268
4269         lockdep_assert_held(&obj->base.dev->struct_mutex);
4270
4271         ret = i915_gem_object_wait(obj,
4272                                    I915_WAIT_INTERRUPTIBLE |
4273                                    I915_WAIT_LOCKED |
4274                                    (write ? I915_WAIT_ALL : 0),
4275                                    MAX_SCHEDULE_TIMEOUT,
4276                                    NULL);
4277         if (ret)
4278                 return ret;
4279
4280         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4281
4282         /* Flush the CPU cache if it's still invalid. */
4283         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4284                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4285                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4286         }
4287
4288         /* It should now be out of any other write domains, and we can update
4289          * the domain values for our changes.
4290          */
4291         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4292
4293         /* If we're writing through the CPU, then the GPU read domains will
4294          * need to be invalidated at next use.
4295          */
4296         if (write)
4297                 __start_cpu_write(obj);
4298
4299         return 0;
4300 }
4301
4302 /* Throttle our rendering by waiting until the ring has completed our requests
4303  * emitted over 20 msec ago.
4304  *
4305  * Note that if we were to use the current jiffies each time around the loop,
4306  * we wouldn't escape the function with any frames outstanding if the time to
4307  * render a frame was over 20ms.
4308  *
4309  * This should get us reasonable parallelism between CPU and GPU but also
4310  * relatively low latency when blocking on a particular request to finish.
4311  */
4312 static int
4313 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4314 {
4315         struct drm_i915_private *dev_priv = to_i915(dev);
4316         struct drm_i915_file_private *file_priv = file->driver_priv;
4317         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4318         struct i915_request *request, *target = NULL;
4319         long ret;
4320
4321         /* ABI: return -EIO if already wedged */
4322         if (i915_terminally_wedged(&dev_priv->gpu_error))
4323                 return -EIO;
4324
4325         spin_lock(&file_priv->mm.lock);
4326         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4327                 if (time_after_eq(request->emitted_jiffies, recent_enough))
4328                         break;
4329
4330                 if (target) {
4331                         list_del(&target->client_link);
4332                         target->file_priv = NULL;
4333                 }
4334
4335                 target = request;
4336         }
4337         if (target)
4338                 i915_request_get(target);
4339         spin_unlock(&file_priv->mm.lock);
4340
4341         if (target == NULL)
4342                 return 0;
4343
4344         ret = i915_request_wait(target,
4345                                 I915_WAIT_INTERRUPTIBLE,
4346                                 MAX_SCHEDULE_TIMEOUT);
4347         i915_request_put(target);
4348
4349         return ret < 0 ? ret : 0;
4350 }
4351
4352 struct i915_vma *
4353 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4354                          const struct i915_ggtt_view *view,
4355                          u64 size,
4356                          u64 alignment,
4357                          u64 flags)
4358 {
4359         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4360         struct i915_address_space *vm = &dev_priv->ggtt.base;
4361         struct i915_vma *vma;
4362         int ret;
4363
4364         lockdep_assert_held(&obj->base.dev->struct_mutex);
4365
4366         if (flags & PIN_MAPPABLE &&
4367             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4368                 /* If the required space is larger than the available
4369                  * aperture, we will not able to find a slot for the
4370                  * object and unbinding the object now will be in
4371                  * vain. Worse, doing so may cause us to ping-pong
4372                  * the object in and out of the Global GTT and
4373                  * waste a lot of cycles under the mutex.
4374                  */
4375                 if (obj->base.size > dev_priv->ggtt.mappable_end)
4376                         return ERR_PTR(-E2BIG);
4377
4378                 /* If NONBLOCK is set the caller is optimistically
4379                  * trying to cache the full object within the mappable
4380                  * aperture, and *must* have a fallback in place for
4381                  * situations where we cannot bind the object. We
4382                  * can be a little more lax here and use the fallback
4383                  * more often to avoid costly migrations of ourselves
4384                  * and other objects within the aperture.
4385                  *
4386                  * Half-the-aperture is used as a simple heuristic.
4387                  * More interesting would to do search for a free
4388                  * block prior to making the commitment to unbind.
4389                  * That caters for the self-harm case, and with a
4390                  * little more heuristics (e.g. NOFAULT, NOEVICT)
4391                  * we could try to minimise harm to others.
4392                  */
4393                 if (flags & PIN_NONBLOCK &&
4394                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
4395                         return ERR_PTR(-ENOSPC);
4396         }
4397
4398         vma = i915_vma_instance(obj, vm, view);
4399         if (unlikely(IS_ERR(vma)))
4400                 return vma;
4401
4402         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4403                 if (flags & PIN_NONBLOCK) {
4404                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4405                                 return ERR_PTR(-ENOSPC);
4406
4407                         if (flags & PIN_MAPPABLE &&
4408                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4409                                 return ERR_PTR(-ENOSPC);
4410                 }
4411
4412                 WARN(i915_vma_is_pinned(vma),
4413                      "bo is already pinned in ggtt with incorrect alignment:"
4414                      " offset=%08x, req.alignment=%llx,"
4415                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4416                      i915_ggtt_offset(vma), alignment,
4417                      !!(flags & PIN_MAPPABLE),
4418                      i915_vma_is_map_and_fenceable(vma));
4419                 ret = i915_vma_unbind(vma);
4420                 if (ret)
4421                         return ERR_PTR(ret);
4422         }
4423
4424         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
4425         if (ret)
4426                 return ERR_PTR(ret);
4427
4428         return vma;
4429 }
4430
4431 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4432 {
4433         /* Note that we could alias engines in the execbuf API, but
4434          * that would be very unwise as it prevents userspace from
4435          * fine control over engine selection. Ahem.
4436          *
4437          * This should be something like EXEC_MAX_ENGINE instead of
4438          * I915_NUM_ENGINES.
4439          */
4440         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4441         return 0x10000 << id;
4442 }
4443
4444 static __always_inline unsigned int __busy_write_id(unsigned int id)
4445 {
4446         /* The uABI guarantees an active writer is also amongst the read
4447          * engines. This would be true if we accessed the activity tracking
4448          * under the lock, but as we perform the lookup of the object and
4449          * its activity locklessly we can not guarantee that the last_write
4450          * being active implies that we have set the same engine flag from
4451          * last_read - hence we always set both read and write busy for
4452          * last_write.
4453          */
4454         return id | __busy_read_flag(id);
4455 }
4456
4457 static __always_inline unsigned int
4458 __busy_set_if_active(const struct dma_fence *fence,
4459                      unsigned int (*flag)(unsigned int id))
4460 {
4461         struct i915_request *rq;
4462
4463         /* We have to check the current hw status of the fence as the uABI
4464          * guarantees forward progress. We could rely on the idle worker
4465          * to eventually flush us, but to minimise latency just ask the
4466          * hardware.
4467          *
4468          * Note we only report on the status of native fences.
4469          */
4470         if (!dma_fence_is_i915(fence))
4471                 return 0;
4472
4473         /* opencode to_request() in order to avoid const warnings */
4474         rq = container_of(fence, struct i915_request, fence);
4475         if (i915_request_completed(rq))
4476                 return 0;
4477
4478         return flag(rq->engine->uabi_id);
4479 }
4480
4481 static __always_inline unsigned int
4482 busy_check_reader(const struct dma_fence *fence)
4483 {
4484         return __busy_set_if_active(fence, __busy_read_flag);
4485 }
4486
4487 static __always_inline unsigned int
4488 busy_check_writer(const struct dma_fence *fence)
4489 {
4490         if (!fence)
4491                 return 0;
4492
4493         return __busy_set_if_active(fence, __busy_write_id);
4494 }
4495
4496 int
4497 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4498                     struct drm_file *file)
4499 {
4500         struct drm_i915_gem_busy *args = data;
4501         struct drm_i915_gem_object *obj;
4502         struct reservation_object_list *list;
4503         unsigned int seq;
4504         int err;
4505
4506         err = -ENOENT;
4507         rcu_read_lock();
4508         obj = i915_gem_object_lookup_rcu(file, args->handle);
4509         if (!obj)
4510                 goto out;
4511
4512         /* A discrepancy here is that we do not report the status of
4513          * non-i915 fences, i.e. even though we may report the object as idle,
4514          * a call to set-domain may still stall waiting for foreign rendering.
4515          * This also means that wait-ioctl may report an object as busy,
4516          * where busy-ioctl considers it idle.
4517          *
4518          * We trade the ability to warn of foreign fences to report on which
4519          * i915 engines are active for the object.
4520          *
4521          * Alternatively, we can trade that extra information on read/write
4522          * activity with
4523          *      args->busy =
4524          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4525          * to report the overall busyness. This is what the wait-ioctl does.
4526          *
4527          */
4528 retry:
4529         seq = raw_read_seqcount(&obj->resv->seq);
4530
4531         /* Translate the exclusive fence to the READ *and* WRITE engine */
4532         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4533
4534         /* Translate shared fences to READ set of engines */
4535         list = rcu_dereference(obj->resv->fence);
4536         if (list) {
4537                 unsigned int shared_count = list->shared_count, i;
4538
4539                 for (i = 0; i < shared_count; ++i) {
4540                         struct dma_fence *fence =
4541                                 rcu_dereference(list->shared[i]);
4542
4543                         args->busy |= busy_check_reader(fence);
4544                 }
4545         }
4546
4547         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4548                 goto retry;
4549
4550         err = 0;
4551 out:
4552         rcu_read_unlock();
4553         return err;
4554 }
4555
4556 int
4557 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4558                         struct drm_file *file_priv)
4559 {
4560         return i915_gem_ring_throttle(dev, file_priv);
4561 }
4562
4563 int
4564 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4565                        struct drm_file *file_priv)
4566 {
4567         struct drm_i915_private *dev_priv = to_i915(dev);
4568         struct drm_i915_gem_madvise *args = data;
4569         struct drm_i915_gem_object *obj;
4570         int err;
4571
4572         switch (args->madv) {
4573         case I915_MADV_DONTNEED:
4574         case I915_MADV_WILLNEED:
4575             break;
4576         default:
4577             return -EINVAL;
4578         }
4579
4580         obj = i915_gem_object_lookup(file_priv, args->handle);
4581         if (!obj)
4582                 return -ENOENT;
4583
4584         err = mutex_lock_interruptible(&obj->mm.lock);
4585         if (err)
4586                 goto out;
4587
4588         if (i915_gem_object_has_pages(obj) &&
4589             i915_gem_object_is_tiled(obj) &&
4590             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4591                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4592                         GEM_BUG_ON(!obj->mm.quirked);
4593                         __i915_gem_object_unpin_pages(obj);
4594                         obj->mm.quirked = false;
4595                 }
4596                 if (args->madv == I915_MADV_WILLNEED) {
4597                         GEM_BUG_ON(obj->mm.quirked);
4598                         __i915_gem_object_pin_pages(obj);
4599                         obj->mm.quirked = true;
4600                 }
4601         }
4602
4603         if (obj->mm.madv != __I915_MADV_PURGED)
4604                 obj->mm.madv = args->madv;
4605
4606         /* if the object is no longer attached, discard its backing storage */
4607         if (obj->mm.madv == I915_MADV_DONTNEED &&
4608             !i915_gem_object_has_pages(obj))
4609                 i915_gem_object_truncate(obj);
4610
4611         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4612         mutex_unlock(&obj->mm.lock);
4613
4614 out:
4615         i915_gem_object_put(obj);
4616         return err;
4617 }
4618
4619 static void
4620 frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4621 {
4622         struct drm_i915_gem_object *obj =
4623                 container_of(active, typeof(*obj), frontbuffer_write);
4624
4625         intel_fb_obj_flush(obj, ORIGIN_CS);
4626 }
4627
4628 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4629                           const struct drm_i915_gem_object_ops *ops)
4630 {
4631         mutex_init(&obj->mm.lock);
4632
4633         INIT_LIST_HEAD(&obj->vma_list);
4634         INIT_LIST_HEAD(&obj->lut_list);
4635         INIT_LIST_HEAD(&obj->batch_pool_link);
4636
4637         obj->ops = ops;
4638
4639         reservation_object_init(&obj->__builtin_resv);
4640         obj->resv = &obj->__builtin_resv;
4641
4642         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4643         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4644
4645         obj->mm.madv = I915_MADV_WILLNEED;
4646         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4647         mutex_init(&obj->mm.get_page.lock);
4648
4649         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4650 }
4651
4652 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4653         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4654                  I915_GEM_OBJECT_IS_SHRINKABLE,
4655
4656         .get_pages = i915_gem_object_get_pages_gtt,
4657         .put_pages = i915_gem_object_put_pages_gtt,
4658
4659         .pwrite = i915_gem_object_pwrite_gtt,
4660 };
4661
4662 static int i915_gem_object_create_shmem(struct drm_device *dev,
4663                                         struct drm_gem_object *obj,
4664                                         size_t size)
4665 {
4666         struct drm_i915_private *i915 = to_i915(dev);
4667         unsigned long flags = VM_NORESERVE;
4668         struct file *filp;
4669
4670         drm_gem_private_object_init(dev, obj, size);
4671
4672         if (i915->mm.gemfs)
4673                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4674                                                  flags);
4675         else
4676                 filp = shmem_file_setup("i915", size, flags);
4677
4678         if (IS_ERR(filp))
4679                 return PTR_ERR(filp);
4680
4681         obj->filp = filp;
4682
4683         return 0;
4684 }
4685
4686 struct drm_i915_gem_object *
4687 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4688 {
4689         struct drm_i915_gem_object *obj;
4690         struct address_space *mapping;
4691         unsigned int cache_level;
4692         gfp_t mask;
4693         int ret;
4694
4695         /* There is a prevalence of the assumption that we fit the object's
4696          * page count inside a 32bit _signed_ variable. Let's document this and
4697          * catch if we ever need to fix it. In the meantime, if you do spot
4698          * such a local variable, please consider fixing!
4699          */
4700         if (size >> PAGE_SHIFT > INT_MAX)
4701                 return ERR_PTR(-E2BIG);
4702
4703         if (overflows_type(size, obj->base.size))
4704                 return ERR_PTR(-E2BIG);
4705
4706         obj = i915_gem_object_alloc(dev_priv);
4707         if (obj == NULL)
4708                 return ERR_PTR(-ENOMEM);
4709
4710         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4711         if (ret)
4712                 goto fail;
4713
4714         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4715         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4716                 /* 965gm cannot relocate objects above 4GiB. */
4717                 mask &= ~__GFP_HIGHMEM;
4718                 mask |= __GFP_DMA32;
4719         }
4720
4721         mapping = obj->base.filp->f_mapping;
4722         mapping_set_gfp_mask(mapping, mask);
4723         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4724
4725         i915_gem_object_init(obj, &i915_gem_object_ops);
4726
4727         obj->write_domain = I915_GEM_DOMAIN_CPU;
4728         obj->read_domains = I915_GEM_DOMAIN_CPU;
4729
4730         if (HAS_LLC(dev_priv))
4731                 /* On some devices, we can have the GPU use the LLC (the CPU
4732                  * cache) for about a 10% performance improvement
4733                  * compared to uncached.  Graphics requests other than
4734                  * display scanout are coherent with the CPU in
4735                  * accessing this cache.  This means in this mode we
4736                  * don't need to clflush on the CPU side, and on the
4737                  * GPU side we only need to flush internal caches to
4738                  * get data visible to the CPU.
4739                  *
4740                  * However, we maintain the display planes as UC, and so
4741                  * need to rebind when first used as such.
4742                  */
4743                 cache_level = I915_CACHE_LLC;
4744         else
4745                 cache_level = I915_CACHE_NONE;
4746
4747         i915_gem_object_set_cache_coherency(obj, cache_level);
4748
4749         trace_i915_gem_object_create(obj);
4750
4751         return obj;
4752
4753 fail:
4754         i915_gem_object_free(obj);
4755         return ERR_PTR(ret);
4756 }
4757
4758 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4759 {
4760         /* If we are the last user of the backing storage (be it shmemfs
4761          * pages or stolen etc), we know that the pages are going to be
4762          * immediately released. In this case, we can then skip copying
4763          * back the contents from the GPU.
4764          */
4765
4766         if (obj->mm.madv != I915_MADV_WILLNEED)
4767                 return false;
4768
4769         if (obj->base.filp == NULL)
4770                 return true;
4771
4772         /* At first glance, this looks racy, but then again so would be
4773          * userspace racing mmap against close. However, the first external
4774          * reference to the filp can only be obtained through the
4775          * i915_gem_mmap_ioctl() which safeguards us against the user
4776          * acquiring such a reference whilst we are in the middle of
4777          * freeing the object.
4778          */
4779         return atomic_long_read(&obj->base.filp->f_count) == 1;
4780 }
4781
4782 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4783                                     struct llist_node *freed)
4784 {
4785         struct drm_i915_gem_object *obj, *on;
4786
4787         intel_runtime_pm_get(i915);
4788         llist_for_each_entry_safe(obj, on, freed, freed) {
4789                 struct i915_vma *vma, *vn;
4790
4791                 trace_i915_gem_object_destroy(obj);
4792
4793                 mutex_lock(&i915->drm.struct_mutex);
4794
4795                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4796                 list_for_each_entry_safe(vma, vn,
4797                                          &obj->vma_list, obj_link) {
4798                         GEM_BUG_ON(i915_vma_is_active(vma));
4799                         vma->flags &= ~I915_VMA_PIN_MASK;
4800                         i915_vma_destroy(vma);
4801                 }
4802                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4803                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4804
4805                 /* This serializes freeing with the shrinker. Since the free
4806                  * is delayed, first by RCU then by the workqueue, we want the
4807                  * shrinker to be able to free pages of unreferenced objects,
4808                  * or else we may oom whilst there are plenty of deferred
4809                  * freed objects.
4810                  */
4811                 if (i915_gem_object_has_pages(obj)) {
4812                         spin_lock(&i915->mm.obj_lock);
4813                         list_del_init(&obj->mm.link);
4814                         spin_unlock(&i915->mm.obj_lock);
4815                 }
4816
4817                 mutex_unlock(&i915->drm.struct_mutex);
4818
4819                 GEM_BUG_ON(obj->bind_count);
4820                 GEM_BUG_ON(obj->userfault_count);
4821                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4822                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4823
4824                 if (obj->ops->release)
4825                         obj->ops->release(obj);
4826
4827                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4828                         atomic_set(&obj->mm.pages_pin_count, 0);
4829                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4830                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4831
4832                 if (obj->base.import_attach)
4833                         drm_prime_gem_destroy(&obj->base, NULL);
4834
4835                 reservation_object_fini(&obj->__builtin_resv);
4836                 drm_gem_object_release(&obj->base);
4837                 i915_gem_info_remove_obj(i915, obj->base.size);
4838
4839                 kfree(obj->bit_17);
4840                 i915_gem_object_free(obj);
4841
4842                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4843                 atomic_dec(&i915->mm.free_count);
4844
4845                 if (on)
4846                         cond_resched();
4847         }
4848         intel_runtime_pm_put(i915);
4849 }
4850
4851 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4852 {
4853         struct llist_node *freed;
4854
4855         /* Free the oldest, most stale object to keep the free_list short */
4856         freed = NULL;
4857         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4858                 /* Only one consumer of llist_del_first() allowed */
4859                 spin_lock(&i915->mm.free_lock);
4860                 freed = llist_del_first(&i915->mm.free_list);
4861                 spin_unlock(&i915->mm.free_lock);
4862         }
4863         if (unlikely(freed)) {
4864                 freed->next = NULL;
4865                 __i915_gem_free_objects(i915, freed);
4866         }
4867 }
4868
4869 static void __i915_gem_free_work(struct work_struct *work)
4870 {
4871         struct drm_i915_private *i915 =
4872                 container_of(work, struct drm_i915_private, mm.free_work);
4873         struct llist_node *freed;
4874
4875         /*
4876          * All file-owned VMA should have been released by this point through
4877          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4878          * However, the object may also be bound into the global GTT (e.g.
4879          * older GPUs without per-process support, or for direct access through
4880          * the GTT either for the user or for scanout). Those VMA still need to
4881          * unbound now.
4882          */
4883
4884         spin_lock(&i915->mm.free_lock);
4885         while ((freed = llist_del_all(&i915->mm.free_list))) {
4886                 spin_unlock(&i915->mm.free_lock);
4887
4888                 __i915_gem_free_objects(i915, freed);
4889                 if (need_resched())
4890                         return;
4891
4892                 spin_lock(&i915->mm.free_lock);
4893         }
4894         spin_unlock(&i915->mm.free_lock);
4895 }
4896
4897 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4898 {
4899         struct drm_i915_gem_object *obj =
4900                 container_of(head, typeof(*obj), rcu);
4901         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4902
4903         /*
4904          * Since we require blocking on struct_mutex to unbind the freed
4905          * object from the GPU before releasing resources back to the
4906          * system, we can not do that directly from the RCU callback (which may
4907          * be a softirq context), but must instead then defer that work onto a
4908          * kthread. We use the RCU callback rather than move the freed object
4909          * directly onto the work queue so that we can mix between using the
4910          * worker and performing frees directly from subsequent allocations for
4911          * crude but effective memory throttling.
4912          */
4913         if (llist_add(&obj->freed, &i915->mm.free_list))
4914                 queue_work(i915->wq, &i915->mm.free_work);
4915 }
4916
4917 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4918 {
4919         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4920
4921         if (obj->mm.quirked)
4922                 __i915_gem_object_unpin_pages(obj);
4923
4924         if (discard_backing_storage(obj))
4925                 obj->mm.madv = I915_MADV_DONTNEED;
4926
4927         /*
4928          * Before we free the object, make sure any pure RCU-only
4929          * read-side critical sections are complete, e.g.
4930          * i915_gem_busy_ioctl(). For the corresponding synchronized
4931          * lookup see i915_gem_object_lookup_rcu().
4932          */
4933         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4934         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4935 }
4936
4937 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4938 {
4939         lockdep_assert_held(&obj->base.dev->struct_mutex);
4940
4941         if (!i915_gem_object_has_active_reference(obj) &&
4942             i915_gem_object_is_active(obj))
4943                 i915_gem_object_set_active_reference(obj);
4944         else
4945                 i915_gem_object_put(obj);
4946 }
4947
4948 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
4949 {
4950         struct i915_gem_context *kernel_context = i915->kernel_context;
4951         struct intel_engine_cs *engine;
4952         enum intel_engine_id id;
4953
4954         for_each_engine(engine, i915, id) {
4955                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
4956                 GEM_BUG_ON(engine->last_retired_context != kernel_context);
4957         }
4958 }
4959
4960 void i915_gem_sanitize(struct drm_i915_private *i915)
4961 {
4962         if (i915_terminally_wedged(&i915->gpu_error)) {
4963                 mutex_lock(&i915->drm.struct_mutex);
4964                 i915_gem_unset_wedged(i915);
4965                 mutex_unlock(&i915->drm.struct_mutex);
4966         }
4967
4968         /*
4969          * If we inherit context state from the BIOS or earlier occupants
4970          * of the GPU, the GPU may be in an inconsistent state when we
4971          * try to take over. The only way to remove the earlier state
4972          * is by resetting. However, resetting on earlier gen is tricky as
4973          * it may impact the display and we are uncertain about the stability
4974          * of the reset, so this could be applied to even earlier gen.
4975          */
4976         if (INTEL_GEN(i915) >= 5 && intel_has_gpu_reset(i915))
4977                 WARN_ON(intel_gpu_reset(i915, ALL_ENGINES));
4978 }
4979
4980 int i915_gem_suspend(struct drm_i915_private *dev_priv)
4981 {
4982         struct drm_device *dev = &dev_priv->drm;
4983         int ret;
4984
4985         intel_runtime_pm_get(dev_priv);
4986         intel_suspend_gt_powersave(dev_priv);
4987
4988         mutex_lock(&dev->struct_mutex);
4989
4990         /* We have to flush all the executing contexts to main memory so
4991          * that they can saved in the hibernation image. To ensure the last
4992          * context image is coherent, we have to switch away from it. That
4993          * leaves the dev_priv->kernel_context still active when
4994          * we actually suspend, and its image in memory may not match the GPU
4995          * state. Fortunately, the kernel_context is disposable and we do
4996          * not rely on its state.
4997          */
4998         if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
4999                 ret = i915_gem_switch_to_kernel_context(dev_priv);
5000                 if (ret)
5001                         goto err_unlock;
5002
5003                 ret = i915_gem_wait_for_idle(dev_priv,
5004                                              I915_WAIT_INTERRUPTIBLE |
5005                                              I915_WAIT_LOCKED);
5006                 if (ret && ret != -EIO)
5007                         goto err_unlock;
5008
5009                 assert_kernel_context_is_current(dev_priv);
5010         }
5011         i915_gem_contexts_lost(dev_priv);
5012         mutex_unlock(&dev->struct_mutex);
5013
5014         intel_uc_suspend(dev_priv);
5015
5016         cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
5017         cancel_delayed_work_sync(&dev_priv->gt.retire_work);
5018
5019         /* As the idle_work is rearming if it detects a race, play safe and
5020          * repeat the flush until it is definitely idle.
5021          */
5022         drain_delayed_work(&dev_priv->gt.idle_work);
5023
5024         /* Assert that we sucessfully flushed all the work and
5025          * reset the GPU back to its idle, low power state.
5026          */
5027         WARN_ON(dev_priv->gt.awake);
5028         if (WARN_ON(!intel_engines_are_idle(dev_priv)))
5029                 i915_gem_set_wedged(dev_priv); /* no hope, discard everything */
5030
5031         /*
5032          * Neither the BIOS, ourselves or any other kernel
5033          * expects the system to be in execlists mode on startup,
5034          * so we need to reset the GPU back to legacy mode. And the only
5035          * known way to disable logical contexts is through a GPU reset.
5036          *
5037          * So in order to leave the system in a known default configuration,
5038          * always reset the GPU upon unload and suspend. Afterwards we then
5039          * clean up the GEM state tracking, flushing off the requests and
5040          * leaving the system in a known idle state.
5041          *
5042          * Note that is of the upmost importance that the GPU is idle and
5043          * all stray writes are flushed *before* we dismantle the backing
5044          * storage for the pinned objects.
5045          *
5046          * However, since we are uncertain that resetting the GPU on older
5047          * machines is a good idea, we don't - just in case it leaves the
5048          * machine in an unusable condition.
5049          */
5050         intel_uc_sanitize(dev_priv);
5051         i915_gem_sanitize(dev_priv);
5052
5053         intel_runtime_pm_put(dev_priv);
5054         return 0;
5055
5056 err_unlock:
5057         mutex_unlock(&dev->struct_mutex);
5058         intel_runtime_pm_put(dev_priv);
5059         return ret;
5060 }
5061
5062 void i915_gem_resume(struct drm_i915_private *i915)
5063 {
5064         WARN_ON(i915->gt.awake);
5065
5066         mutex_lock(&i915->drm.struct_mutex);
5067         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5068
5069         i915_gem_restore_gtt_mappings(i915);
5070         i915_gem_restore_fences(i915);
5071
5072         /*
5073          * As we didn't flush the kernel context before suspend, we cannot
5074          * guarantee that the context image is complete. So let's just reset
5075          * it and start again.
5076          */
5077         i915->gt.resume(i915);
5078
5079         if (i915_gem_init_hw(i915))
5080                 goto err_wedged;
5081
5082         intel_uc_resume(i915);
5083
5084         /* Always reload a context for powersaving. */
5085         if (i915_gem_switch_to_kernel_context(i915))
5086                 goto err_wedged;
5087
5088 out_unlock:
5089         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5090         mutex_unlock(&i915->drm.struct_mutex);
5091         return;
5092
5093 err_wedged:
5094         if (!i915_terminally_wedged(&i915->gpu_error)) {
5095                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5096                 i915_gem_set_wedged(i915);
5097         }
5098         goto out_unlock;
5099 }
5100
5101 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5102 {
5103         if (INTEL_GEN(dev_priv) < 5 ||
5104             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5105                 return;
5106
5107         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5108                                  DISP_TILE_SURFACE_SWIZZLING);
5109
5110         if (IS_GEN5(dev_priv))
5111                 return;
5112
5113         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5114         if (IS_GEN6(dev_priv))
5115                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5116         else if (IS_GEN7(dev_priv))
5117                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5118         else if (IS_GEN8(dev_priv))
5119                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5120         else
5121                 BUG();
5122 }
5123
5124 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5125 {
5126         I915_WRITE(RING_CTL(base), 0);
5127         I915_WRITE(RING_HEAD(base), 0);
5128         I915_WRITE(RING_TAIL(base), 0);
5129         I915_WRITE(RING_START(base), 0);
5130 }
5131
5132 static void init_unused_rings(struct drm_i915_private *dev_priv)
5133 {
5134         if (IS_I830(dev_priv)) {
5135                 init_unused_ring(dev_priv, PRB1_BASE);
5136                 init_unused_ring(dev_priv, SRB0_BASE);
5137                 init_unused_ring(dev_priv, SRB1_BASE);
5138                 init_unused_ring(dev_priv, SRB2_BASE);
5139                 init_unused_ring(dev_priv, SRB3_BASE);
5140         } else if (IS_GEN2(dev_priv)) {
5141                 init_unused_ring(dev_priv, SRB0_BASE);
5142                 init_unused_ring(dev_priv, SRB1_BASE);
5143         } else if (IS_GEN3(dev_priv)) {
5144                 init_unused_ring(dev_priv, PRB1_BASE);
5145                 init_unused_ring(dev_priv, PRB2_BASE);
5146         }
5147 }
5148
5149 static int __i915_gem_restart_engines(void *data)
5150 {
5151         struct drm_i915_private *i915 = data;
5152         struct intel_engine_cs *engine;
5153         enum intel_engine_id id;
5154         int err;
5155
5156         for_each_engine(engine, i915, id) {
5157                 err = engine->init_hw(engine);
5158                 if (err) {
5159                         DRM_ERROR("Failed to restart %s (%d)\n",
5160                                   engine->name, err);
5161                         return err;
5162                 }
5163         }
5164
5165         return 0;
5166 }
5167
5168 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5169 {
5170         int ret;
5171
5172         dev_priv->gt.last_init_time = ktime_get();
5173
5174         /* Double layer security blanket, see i915_gem_init() */
5175         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5176
5177         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5178                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5179
5180         if (IS_HASWELL(dev_priv))
5181                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5182                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5183
5184         if (HAS_PCH_NOP(dev_priv)) {
5185                 if (IS_IVYBRIDGE(dev_priv)) {
5186                         u32 temp = I915_READ(GEN7_MSG_CTL);
5187                         temp &= ~(WAIT_FOR_PCH_FLR_ACK | WAIT_FOR_PCH_RESET_ACK);
5188                         I915_WRITE(GEN7_MSG_CTL, temp);
5189                 } else if (INTEL_GEN(dev_priv) >= 7) {
5190                         u32 temp = I915_READ(HSW_NDE_RSTWRN_OPT);
5191                         temp &= ~RESET_PCH_HANDSHAKE_ENABLE;
5192                         I915_WRITE(HSW_NDE_RSTWRN_OPT, temp);
5193                 }
5194         }
5195
5196         intel_gt_workarounds_apply(dev_priv);
5197
5198         i915_gem_init_swizzling(dev_priv);
5199
5200         /*
5201          * At least 830 can leave some of the unused rings
5202          * "active" (ie. head != tail) after resume which
5203          * will prevent c3 entry. Makes sure all unused rings
5204          * are totally idle.
5205          */
5206         init_unused_rings(dev_priv);
5207
5208         BUG_ON(!dev_priv->kernel_context);
5209         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5210                 ret = -EIO;
5211                 goto out;
5212         }
5213
5214         ret = i915_ppgtt_init_hw(dev_priv);
5215         if (ret) {
5216                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5217                 goto out;
5218         }
5219
5220         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5221         if (ret) {
5222                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5223                 goto out;
5224         }
5225
5226         /* We can't enable contexts until all firmware is loaded */
5227         ret = intel_uc_init_hw(dev_priv);
5228         if (ret) {
5229                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5230                 goto out;
5231         }
5232
5233         intel_mocs_init_l3cc_table(dev_priv);
5234
5235         /* Only when the HW is re-initialised, can we replay the requests */
5236         ret = __i915_gem_restart_engines(dev_priv);
5237 out:
5238         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5239         return ret;
5240 }
5241
5242 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5243 {
5244         struct i915_gem_context *ctx;
5245         struct intel_engine_cs *engine;
5246         enum intel_engine_id id;
5247         int err;
5248
5249         /*
5250          * As we reset the gpu during very early sanitisation, the current
5251          * register state on the GPU should reflect its defaults values.
5252          * We load a context onto the hw (with restore-inhibit), then switch
5253          * over to a second context to save that default register state. We
5254          * can then prime every new context with that state so they all start
5255          * from the same default HW values.
5256          */
5257
5258         ctx = i915_gem_context_create_kernel(i915, 0);
5259         if (IS_ERR(ctx))
5260                 return PTR_ERR(ctx);
5261
5262         for_each_engine(engine, i915, id) {
5263                 struct i915_request *rq;
5264
5265                 rq = i915_request_alloc(engine, ctx);
5266                 if (IS_ERR(rq)) {
5267                         err = PTR_ERR(rq);
5268                         goto out_ctx;
5269                 }
5270
5271                 err = 0;
5272                 if (engine->init_context)
5273                         err = engine->init_context(rq);
5274
5275                 __i915_request_add(rq, true);
5276                 if (err)
5277                         goto err_active;
5278         }
5279
5280         err = i915_gem_switch_to_kernel_context(i915);
5281         if (err)
5282                 goto err_active;
5283
5284         err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
5285         if (err)
5286                 goto err_active;
5287
5288         assert_kernel_context_is_current(i915);
5289
5290         for_each_engine(engine, i915, id) {
5291                 struct i915_vma *state;
5292
5293                 state = to_intel_context(ctx, engine)->state;
5294                 if (!state)
5295                         continue;
5296
5297                 /*
5298                  * As we will hold a reference to the logical state, it will
5299                  * not be torn down with the context, and importantly the
5300                  * object will hold onto its vma (making it possible for a
5301                  * stray GTT write to corrupt our defaults). Unmap the vma
5302                  * from the GTT to prevent such accidents and reclaim the
5303                  * space.
5304                  */
5305                 err = i915_vma_unbind(state);
5306                 if (err)
5307                         goto err_active;
5308
5309                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5310                 if (err)
5311                         goto err_active;
5312
5313                 engine->default_state = i915_gem_object_get(state->obj);
5314         }
5315
5316         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5317                 unsigned int found = intel_engines_has_context_isolation(i915);
5318
5319                 /*
5320                  * Make sure that classes with multiple engine instances all
5321                  * share the same basic configuration.
5322                  */
5323                 for_each_engine(engine, i915, id) {
5324                         unsigned int bit = BIT(engine->uabi_class);
5325                         unsigned int expected = engine->default_state ? bit : 0;
5326
5327                         if ((found & bit) != expected) {
5328                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5329                                           engine->uabi_class, engine->name);
5330                         }
5331                 }
5332         }
5333
5334 out_ctx:
5335         i915_gem_context_set_closed(ctx);
5336         i915_gem_context_put(ctx);
5337         return err;
5338
5339 err_active:
5340         /*
5341          * If we have to abandon now, we expect the engines to be idle
5342          * and ready to be torn-down. First try to flush any remaining
5343          * request, ensure we are pointing at the kernel context and
5344          * then remove it.
5345          */
5346         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5347                 goto out_ctx;
5348
5349         if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
5350                 goto out_ctx;
5351
5352         i915_gem_contexts_lost(i915);
5353         goto out_ctx;
5354 }
5355
5356 int i915_gem_init(struct drm_i915_private *dev_priv)
5357 {
5358         int ret;
5359
5360         /*
5361          * We need to fallback to 4K pages since gvt gtt handling doesn't
5362          * support huge page entries - we will need to check either hypervisor
5363          * mm can support huge guest page or just do emulation in gvt.
5364          */
5365         if (intel_vgpu_active(dev_priv))
5366                 mkwrite_device_info(dev_priv)->page_sizes =
5367                         I915_GTT_PAGE_SIZE_4K;
5368
5369         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5370
5371         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5372                 dev_priv->gt.resume = intel_lr_context_resume;
5373                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5374         } else {
5375                 dev_priv->gt.resume = intel_legacy_submission_resume;
5376                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5377         }
5378
5379         ret = i915_gem_init_userptr(dev_priv);
5380         if (ret)
5381                 return ret;
5382
5383         ret = intel_wopcm_init(&dev_priv->wopcm);
5384         if (ret)
5385                 return ret;
5386
5387         ret = intel_uc_init_misc(dev_priv);
5388         if (ret)
5389                 return ret;
5390
5391         /* This is just a security blanket to placate dragons.
5392          * On some systems, we very sporadically observe that the first TLBs
5393          * used by the CS may be stale, despite us poking the TLB reset. If
5394          * we hold the forcewake during initialisation these problems
5395          * just magically go away.
5396          */
5397         mutex_lock(&dev_priv->drm.struct_mutex);
5398         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5399
5400         ret = i915_gem_init_ggtt(dev_priv);
5401         if (ret) {
5402                 GEM_BUG_ON(ret == -EIO);
5403                 goto err_unlock;
5404         }
5405
5406         ret = i915_gem_contexts_init(dev_priv);
5407         if (ret) {
5408                 GEM_BUG_ON(ret == -EIO);
5409                 goto err_ggtt;
5410         }
5411
5412         ret = intel_engines_init(dev_priv);
5413         if (ret) {
5414                 GEM_BUG_ON(ret == -EIO);
5415                 goto err_context;
5416         }
5417
5418         intel_init_gt_powersave(dev_priv);
5419
5420         ret = intel_uc_init(dev_priv);
5421         if (ret)
5422                 goto err_pm;
5423
5424         ret = i915_gem_init_hw(dev_priv);
5425         if (ret)
5426                 goto err_uc_init;
5427
5428         /*
5429          * Despite its name intel_init_clock_gating applies both display
5430          * clock gating workarounds; GT mmio workarounds and the occasional
5431          * GT power context workaround. Worse, sometimes it includes a context
5432          * register workaround which we need to apply before we record the
5433          * default HW state for all contexts.
5434          *
5435          * FIXME: break up the workarounds and apply them at the right time!
5436          */
5437         intel_init_clock_gating(dev_priv);
5438
5439         ret = __intel_engines_record_defaults(dev_priv);
5440         if (ret)
5441                 goto err_init_hw;
5442
5443         if (i915_inject_load_failure()) {
5444                 ret = -ENODEV;
5445                 goto err_init_hw;
5446         }
5447
5448         if (i915_inject_load_failure()) {
5449                 ret = -EIO;
5450                 goto err_init_hw;
5451         }
5452
5453         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5454         mutex_unlock(&dev_priv->drm.struct_mutex);
5455
5456         return 0;
5457
5458         /*
5459          * Unwinding is complicated by that we want to handle -EIO to mean
5460          * disable GPU submission but keep KMS alive. We want to mark the
5461          * HW as irrevisibly wedged, but keep enough state around that the
5462          * driver doesn't explode during runtime.
5463          */
5464 err_init_hw:
5465         i915_gem_wait_for_idle(dev_priv, I915_WAIT_LOCKED);
5466         i915_gem_contexts_lost(dev_priv);
5467         intel_uc_fini_hw(dev_priv);
5468 err_uc_init:
5469         intel_uc_fini(dev_priv);
5470 err_pm:
5471         if (ret != -EIO) {
5472                 intel_cleanup_gt_powersave(dev_priv);
5473                 i915_gem_cleanup_engines(dev_priv);
5474         }
5475 err_context:
5476         if (ret != -EIO)
5477                 i915_gem_contexts_fini(dev_priv);
5478 err_ggtt:
5479 err_unlock:
5480         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5481         mutex_unlock(&dev_priv->drm.struct_mutex);
5482
5483         intel_uc_fini_misc(dev_priv);
5484
5485         if (ret != -EIO)
5486                 i915_gem_cleanup_userptr(dev_priv);
5487
5488         if (ret == -EIO) {
5489                 /*
5490                  * Allow engine initialisation to fail by marking the GPU as
5491                  * wedged. But we only want to do this where the GPU is angry,
5492                  * for all other failure, such as an allocation failure, bail.
5493                  */
5494                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5495                         DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
5496                         i915_gem_set_wedged(dev_priv);
5497                 }
5498                 ret = 0;
5499         }
5500
5501         i915_gem_drain_freed_objects(dev_priv);
5502         return ret;
5503 }
5504
5505 void i915_gem_init_mmio(struct drm_i915_private *i915)
5506 {
5507         i915_gem_sanitize(i915);
5508 }
5509
5510 void
5511 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5512 {
5513         struct intel_engine_cs *engine;
5514         enum intel_engine_id id;
5515
5516         for_each_engine(engine, dev_priv, id)
5517                 dev_priv->gt.cleanup_engine(engine);
5518 }
5519
5520 void
5521 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5522 {
5523         int i;
5524
5525         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5526             !IS_CHERRYVIEW(dev_priv))
5527                 dev_priv->num_fence_regs = 32;
5528         else if (INTEL_GEN(dev_priv) >= 4 ||
5529                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5530                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5531                 dev_priv->num_fence_regs = 16;
5532         else
5533                 dev_priv->num_fence_regs = 8;
5534
5535         if (intel_vgpu_active(dev_priv))
5536                 dev_priv->num_fence_regs =
5537                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5538
5539         /* Initialize fence registers to zero */
5540         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5541                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5542
5543                 fence->i915 = dev_priv;
5544                 fence->id = i;
5545                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5546         }
5547         i915_gem_restore_fences(dev_priv);
5548
5549         i915_gem_detect_bit_6_swizzle(dev_priv);
5550 }
5551
5552 static void i915_gem_init__mm(struct drm_i915_private *i915)
5553 {
5554         spin_lock_init(&i915->mm.object_stat_lock);
5555         spin_lock_init(&i915->mm.obj_lock);
5556         spin_lock_init(&i915->mm.free_lock);
5557
5558         init_llist_head(&i915->mm.free_list);
5559
5560         INIT_LIST_HEAD(&i915->mm.unbound_list);
5561         INIT_LIST_HEAD(&i915->mm.bound_list);
5562         INIT_LIST_HEAD(&i915->mm.fence_list);
5563         INIT_LIST_HEAD(&i915->mm.userfault_list);
5564
5565         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5566 }
5567
5568 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5569 {
5570         int err = -ENOMEM;
5571
5572         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5573         if (!dev_priv->objects)
5574                 goto err_out;
5575
5576         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5577         if (!dev_priv->vmas)
5578                 goto err_objects;
5579
5580         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5581         if (!dev_priv->luts)
5582                 goto err_vmas;
5583
5584         dev_priv->requests = KMEM_CACHE(i915_request,
5585                                         SLAB_HWCACHE_ALIGN |
5586                                         SLAB_RECLAIM_ACCOUNT |
5587                                         SLAB_TYPESAFE_BY_RCU);
5588         if (!dev_priv->requests)
5589                 goto err_luts;
5590
5591         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5592                                             SLAB_HWCACHE_ALIGN |
5593                                             SLAB_RECLAIM_ACCOUNT);
5594         if (!dev_priv->dependencies)
5595                 goto err_requests;
5596
5597         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5598         if (!dev_priv->priorities)
5599                 goto err_dependencies;
5600
5601         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5602         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5603         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5604
5605         i915_gem_init__mm(dev_priv);
5606
5607         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5608                           i915_gem_retire_work_handler);
5609         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5610                           i915_gem_idle_work_handler);
5611         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5612         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5613
5614         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5615
5616         spin_lock_init(&dev_priv->fb_tracking.lock);
5617
5618         err = i915_gemfs_init(dev_priv);
5619         if (err)
5620                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5621
5622         return 0;
5623
5624 err_dependencies:
5625         kmem_cache_destroy(dev_priv->dependencies);
5626 err_requests:
5627         kmem_cache_destroy(dev_priv->requests);
5628 err_luts:
5629         kmem_cache_destroy(dev_priv->luts);
5630 err_vmas:
5631         kmem_cache_destroy(dev_priv->vmas);
5632 err_objects:
5633         kmem_cache_destroy(dev_priv->objects);
5634 err_out:
5635         return err;
5636 }
5637
5638 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5639 {
5640         i915_gem_drain_freed_objects(dev_priv);
5641         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5642         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5643         WARN_ON(dev_priv->mm.object_count);
5644         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5645
5646         kmem_cache_destroy(dev_priv->priorities);
5647         kmem_cache_destroy(dev_priv->dependencies);
5648         kmem_cache_destroy(dev_priv->requests);
5649         kmem_cache_destroy(dev_priv->luts);
5650         kmem_cache_destroy(dev_priv->vmas);
5651         kmem_cache_destroy(dev_priv->objects);
5652
5653         /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
5654         rcu_barrier();
5655
5656         i915_gemfs_fini(dev_priv);
5657 }
5658
5659 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5660 {
5661         /* Discard all purgeable objects, let userspace recover those as
5662          * required after resuming.
5663          */
5664         i915_gem_shrink_all(dev_priv);
5665
5666         return 0;
5667 }
5668
5669 int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
5670 {
5671         struct drm_i915_gem_object *obj;
5672         struct list_head *phases[] = {
5673                 &dev_priv->mm.unbound_list,
5674                 &dev_priv->mm.bound_list,
5675                 NULL
5676         }, **p;
5677
5678         /* Called just before we write the hibernation image.
5679          *
5680          * We need to update the domain tracking to reflect that the CPU
5681          * will be accessing all the pages to create and restore from the
5682          * hibernation, and so upon restoration those pages will be in the
5683          * CPU domain.
5684          *
5685          * To make sure the hibernation image contains the latest state,
5686          * we update that state just before writing out the image.
5687          *
5688          * To try and reduce the hibernation image, we manually shrink
5689          * the objects as well, see i915_gem_freeze()
5690          */
5691
5692         i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND);
5693         i915_gem_drain_freed_objects(dev_priv);
5694
5695         spin_lock(&dev_priv->mm.obj_lock);
5696         for (p = phases; *p; p++) {
5697                 list_for_each_entry(obj, *p, mm.link)
5698                         __start_cpu_write(obj);
5699         }
5700         spin_unlock(&dev_priv->mm.obj_lock);
5701
5702         return 0;
5703 }
5704
5705 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5706 {
5707         struct drm_i915_file_private *file_priv = file->driver_priv;
5708         struct i915_request *request;
5709
5710         /* Clean up our request list when the client is going away, so that
5711          * later retire_requests won't dereference our soon-to-be-gone
5712          * file_priv.
5713          */
5714         spin_lock(&file_priv->mm.lock);
5715         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5716                 request->file_priv = NULL;
5717         spin_unlock(&file_priv->mm.lock);
5718 }
5719
5720 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5721 {
5722         struct drm_i915_file_private *file_priv;
5723         int ret;
5724
5725         DRM_DEBUG("\n");
5726
5727         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5728         if (!file_priv)
5729                 return -ENOMEM;
5730
5731         file->driver_priv = file_priv;
5732         file_priv->dev_priv = i915;
5733         file_priv->file = file;
5734
5735         spin_lock_init(&file_priv->mm.lock);
5736         INIT_LIST_HEAD(&file_priv->mm.request_list);
5737
5738         file_priv->bsd_engine = -1;
5739
5740         ret = i915_gem_context_open(i915, file);
5741         if (ret)
5742                 kfree(file_priv);
5743
5744         return ret;
5745 }
5746
5747 /**
5748  * i915_gem_track_fb - update frontbuffer tracking
5749  * @old: current GEM buffer for the frontbuffer slots
5750  * @new: new GEM buffer for the frontbuffer slots
5751  * @frontbuffer_bits: bitmask of frontbuffer slots
5752  *
5753  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5754  * from @old and setting them in @new. Both @old and @new can be NULL.
5755  */
5756 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5757                        struct drm_i915_gem_object *new,
5758                        unsigned frontbuffer_bits)
5759 {
5760         /* Control of individual bits within the mask are guarded by
5761          * the owning plane->mutex, i.e. we can never see concurrent
5762          * manipulation of individual bits. But since the bitfield as a whole
5763          * is updated using RMW, we need to use atomics in order to update
5764          * the bits.
5765          */
5766         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5767                      sizeof(atomic_t) * BITS_PER_BYTE);
5768
5769         if (old) {
5770                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5771                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5772         }
5773
5774         if (new) {
5775                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5776                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5777         }
5778 }
5779
5780 /* Allocate a new GEM object and fill it with the supplied data */
5781 struct drm_i915_gem_object *
5782 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5783                                  const void *data, size_t size)
5784 {
5785         struct drm_i915_gem_object *obj;
5786         struct file *file;
5787         size_t offset;
5788         int err;
5789
5790         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5791         if (IS_ERR(obj))
5792                 return obj;
5793
5794         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5795
5796         file = obj->base.filp;
5797         offset = 0;
5798         do {
5799                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5800                 struct page *page;
5801                 void *pgdata, *vaddr;
5802
5803                 err = pagecache_write_begin(file, file->f_mapping,
5804                                             offset, len, 0,
5805                                             &page, &pgdata);
5806                 if (err < 0)
5807                         goto fail;
5808
5809                 vaddr = kmap(page);
5810                 memcpy(vaddr, data, len);
5811                 kunmap(page);
5812
5813                 err = pagecache_write_end(file, file->f_mapping,
5814                                           offset, len, len,
5815                                           page, pgdata);
5816                 if (err < 0)
5817                         goto fail;
5818
5819                 size -= len;
5820                 data += len;
5821                 offset += len;
5822         } while (size);
5823
5824         return obj;
5825
5826 fail:
5827         i915_gem_object_put(obj);
5828         return ERR_PTR(err);
5829 }
5830
5831 struct scatterlist *
5832 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5833                        unsigned int n,
5834                        unsigned int *offset)
5835 {
5836         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5837         struct scatterlist *sg;
5838         unsigned int idx, count;
5839
5840         might_sleep();
5841         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5842         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5843
5844         /* As we iterate forward through the sg, we record each entry in a
5845          * radixtree for quick repeated (backwards) lookups. If we have seen
5846          * this index previously, we will have an entry for it.
5847          *
5848          * Initial lookup is O(N), but this is amortized to O(1) for
5849          * sequential page access (where each new request is consecutive
5850          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5851          * i.e. O(1) with a large constant!
5852          */
5853         if (n < READ_ONCE(iter->sg_idx))
5854                 goto lookup;
5855
5856         mutex_lock(&iter->lock);
5857
5858         /* We prefer to reuse the last sg so that repeated lookup of this
5859          * (or the subsequent) sg are fast - comparing against the last
5860          * sg is faster than going through the radixtree.
5861          */
5862
5863         sg = iter->sg_pos;
5864         idx = iter->sg_idx;
5865         count = __sg_page_count(sg);
5866
5867         while (idx + count <= n) {
5868                 unsigned long exception, i;
5869                 int ret;
5870
5871                 /* If we cannot allocate and insert this entry, or the
5872                  * individual pages from this range, cancel updating the
5873                  * sg_idx so that on this lookup we are forced to linearly
5874                  * scan onwards, but on future lookups we will try the
5875                  * insertion again (in which case we need to be careful of
5876                  * the error return reporting that we have already inserted
5877                  * this index).
5878                  */
5879                 ret = radix_tree_insert(&iter->radix, idx, sg);
5880                 if (ret && ret != -EEXIST)
5881                         goto scan;
5882
5883                 exception =
5884                         RADIX_TREE_EXCEPTIONAL_ENTRY |
5885                         idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
5886                 for (i = 1; i < count; i++) {
5887                         ret = radix_tree_insert(&iter->radix, idx + i,
5888                                                 (void *)exception);
5889                         if (ret && ret != -EEXIST)
5890                                 goto scan;
5891                 }
5892
5893                 idx += count;
5894                 sg = ____sg_next(sg);
5895                 count = __sg_page_count(sg);
5896         }
5897
5898 scan:
5899         iter->sg_pos = sg;
5900         iter->sg_idx = idx;
5901
5902         mutex_unlock(&iter->lock);
5903
5904         if (unlikely(n < idx)) /* insertion completed by another thread */
5905                 goto lookup;
5906
5907         /* In case we failed to insert the entry into the radixtree, we need
5908          * to look beyond the current sg.
5909          */
5910         while (idx + count <= n) {
5911                 idx += count;
5912                 sg = ____sg_next(sg);
5913                 count = __sg_page_count(sg);
5914         }
5915
5916         *offset = n - idx;
5917         return sg;
5918
5919 lookup:
5920         rcu_read_lock();
5921
5922         sg = radix_tree_lookup(&iter->radix, n);
5923         GEM_BUG_ON(!sg);
5924
5925         /* If this index is in the middle of multi-page sg entry,
5926          * the radixtree will contain an exceptional entry that points
5927          * to the start of that range. We will return the pointer to
5928          * the base page and the offset of this page within the
5929          * sg entry's range.
5930          */
5931         *offset = 0;
5932         if (unlikely(radix_tree_exception(sg))) {
5933                 unsigned long base =
5934                         (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
5935
5936                 sg = radix_tree_lookup(&iter->radix, base);
5937                 GEM_BUG_ON(!sg);
5938
5939                 *offset = n - base;
5940         }
5941
5942         rcu_read_unlock();
5943
5944         return sg;
5945 }
5946
5947 struct page *
5948 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
5949 {
5950         struct scatterlist *sg;
5951         unsigned int offset;
5952
5953         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
5954
5955         sg = i915_gem_object_get_sg(obj, n, &offset);
5956         return nth_page(sg_page(sg), offset);
5957 }
5958
5959 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
5960 struct page *
5961 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
5962                                unsigned int n)
5963 {
5964         struct page *page;
5965
5966         page = i915_gem_object_get_page(obj, n);
5967         if (!obj->mm.dirty)
5968                 set_page_dirty(page);
5969
5970         return page;
5971 }
5972
5973 dma_addr_t
5974 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
5975                                 unsigned long n)
5976 {
5977         struct scatterlist *sg;
5978         unsigned int offset;
5979
5980         sg = i915_gem_object_get_sg(obj, n, &offset);
5981         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
5982 }
5983
5984 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
5985 {
5986         struct sg_table *pages;
5987         int err;
5988
5989         if (align > obj->base.size)
5990                 return -EINVAL;
5991
5992         if (obj->ops == &i915_gem_phys_ops)
5993                 return 0;
5994
5995         if (obj->ops != &i915_gem_object_ops)
5996                 return -EINVAL;
5997
5998         err = i915_gem_object_unbind(obj);
5999         if (err)
6000                 return err;
6001
6002         mutex_lock(&obj->mm.lock);
6003
6004         if (obj->mm.madv != I915_MADV_WILLNEED) {
6005                 err = -EFAULT;
6006                 goto err_unlock;
6007         }
6008
6009         if (obj->mm.quirked) {
6010                 err = -EFAULT;
6011                 goto err_unlock;
6012         }
6013
6014         if (obj->mm.mapping) {
6015                 err = -EBUSY;
6016                 goto err_unlock;
6017         }
6018
6019         pages = fetch_and_zero(&obj->mm.pages);
6020         if (pages) {
6021                 struct drm_i915_private *i915 = to_i915(obj->base.dev);
6022
6023                 __i915_gem_object_reset_page_iter(obj);
6024
6025                 spin_lock(&i915->mm.obj_lock);
6026                 list_del(&obj->mm.link);
6027                 spin_unlock(&i915->mm.obj_lock);
6028         }
6029
6030         obj->ops = &i915_gem_phys_ops;
6031
6032         err = ____i915_gem_object_get_pages(obj);
6033         if (err)
6034                 goto err_xfer;
6035
6036         /* Perma-pin (until release) the physical set of pages */
6037         __i915_gem_object_pin_pages(obj);
6038
6039         if (!IS_ERR_OR_NULL(pages))
6040                 i915_gem_object_ops.put_pages(obj, pages);
6041         mutex_unlock(&obj->mm.lock);
6042         return 0;
6043
6044 err_xfer:
6045         obj->ops = &i915_gem_object_ops;
6046         obj->mm.pages = pages;
6047 err_unlock:
6048         mutex_unlock(&obj->mm.lock);
6049         return err;
6050 }
6051
6052 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6053 #include "selftests/scatterlist.c"
6054 #include "selftests/mock_gem_device.c"
6055 #include "selftests/huge_gem_object.c"
6056 #include "selftests/huge_pages.c"
6057 #include "selftests/i915_gem_object.c"
6058 #include "selftests/i915_gem_coherency.c"
6059 #endif