/*
 * Copyright © 2008-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include <drm/drm_vma_manager.h>
#include <drm/drm_pci.h>
#include <drm/i915_drm.h>
#include <linux/dma-fence-array.h>
#include <linux/kthread.h>
#include <linux/reservation.h>
#include <linux/shmem_fs.h>
#include <linux/slab.h>
#include <linux/stop_machine.h>
#include <linux/swap.h>
#include <linux/pci.h>
#include <linux/dma-buf.h>
#include <linux/mman.h>

#include "i915_drv.h"
#include "i915_gem_clflush.h"
#include "i915_gemfs.h"
#include "i915_globals.h"
#include "i915_reset.h"
#include "i915_trace.h"
#include "i915_vgpu.h"

#include "intel_drv.h"
#include "intel_frontbuffer.h"
#include "intel_mocs.h"
#include "intel_pm.h"
#include "intel_workarounds.h"

static void i915_gem_flush_free_objects(struct drm_i915_private *i915);

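/*
 * Decide whether a CPU write must be followed by a clflush: writes to an
 * object that is not coherent for CPU writes need flushing, and an object
 * pinned for global (scanout) use is kept flushed while the HW may be
 * reading it. An object already tracked as cache_dirty will be flushed
 * later anyway, so no extra clflush is required here.
 */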
static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
{
        if (obj->cache_dirty)
                return false;

        if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
                return true;

        return obj->pin_global; /* currently in use by HW, keep flushed */
}

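/*
 * Reserve a small slot in the mappable (CPU visible) part of the global
 * GTT; used by the pread/pwrite GGTT paths as a fallback window when the
 * object itself cannot be pinned into the aperture.
 */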
static int
insert_mappable_node(struct i915_ggtt *ggtt,
                     struct drm_mm_node *node, u32 size)
{
        memset(node, 0, sizeof(*node));
        return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
                                           size, 0, I915_COLOR_UNEVICTABLE,
                                           0, ggtt->mappable_end,
                                           DRM_MM_INSERT_LOW);
}

static void
remove_mappable_node(struct drm_mm_node *node)
{
        drm_mm_remove_node(node);
}

/* some bookkeeping */
static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
                                  u64 size)
{
        spin_lock(&dev_priv->mm.object_stat_lock);
        dev_priv->mm.object_count++;
        dev_priv->mm.object_memory += size;
        spin_unlock(&dev_priv->mm.object_stat_lock);
}

static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
                                     u64 size)
{
        spin_lock(&dev_priv->mm.object_stat_lock);
        dev_priv->mm.object_count--;
        dev_priv->mm.object_memory -= size;
        spin_unlock(&dev_priv->mm.object_stat_lock);
}

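/*
 * Park the GT once the last request has been retired: quiesce any residual
 * interrupt tasklets, park the engines, timelines, PMU and VMAs, drop RPS to
 * idle and release the GT IRQ power domain wakeref taken in
 * i915_gem_unpark().
 */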
static void __i915_gem_park(struct drm_i915_private *i915)
{
        intel_wakeref_t wakeref;

        GEM_TRACE("\n");

        lockdep_assert_held(&i915->drm.struct_mutex);
        GEM_BUG_ON(i915->gt.active_requests);
        GEM_BUG_ON(!list_empty(&i915->gt.active_rings));

        if (!i915->gt.awake)
                return;

        /*
         * Be paranoid and flush a concurrent interrupt to make sure
         * we don't reactivate any irq tasklets after parking.
         *
         * FIXME: Note that even though we have waited for execlists to be idle,
         * there may still be an in-flight interrupt even though the CSB
         * is now empty. synchronize_irq() makes sure that a residual interrupt
         * is completed before we continue, but it doesn't prevent the HW from
         * raising a spurious interrupt later. To complete the shield we should
         * coordinate disabling the CS irq with flushing the interrupts.
         */
        synchronize_irq(i915->drm.irq);

        intel_engines_park(i915);
        i915_timelines_park(i915);

        i915_pmu_gt_parked(i915);
        i915_vma_parked(i915);

        wakeref = fetch_and_zero(&i915->gt.awake);
        GEM_BUG_ON(!wakeref);

        if (INTEL_GEN(i915) >= 6)
                gen6_rps_idle(i915);

        intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ, wakeref);

        i915_globals_park();
}

void i915_gem_park(struct drm_i915_private *i915)
{
        GEM_TRACE("\n");

        lockdep_assert_held(&i915->drm.struct_mutex);
        GEM_BUG_ON(i915->gt.active_requests);

        if (!i915->gt.awake)
                return;

        /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
        mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
}

void i915_gem_unpark(struct drm_i915_private *i915)
{
        GEM_TRACE("\n");

        lockdep_assert_held(&i915->drm.struct_mutex);
        GEM_BUG_ON(!i915->gt.active_requests);
        assert_rpm_wakelock_held(i915);

        if (i915->gt.awake)
                return;

        /*
         * It seems that the DMC likes to transition between the DC states a lot
         * when there are no connected displays (no active power domains) during
         * command submission.
         *
         * This activity has negative impact on the performance of the chip with
         * huge latencies observed in the interrupt handler and elsewhere.
         *
         * Work around it by grabbing a GT IRQ power domain whilst there is any
         * GT activity, preventing any DC state transitions.
         */
        i915->gt.awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
        GEM_BUG_ON(!i915->gt.awake);

        i915_globals_unpark();

        intel_enable_gt_powersave(i915);
        i915_update_gfx_val(i915);
        if (INTEL_GEN(i915) >= 6)
                gen6_rps_busy(i915);
        i915_pmu_gt_unparked(i915);

        intel_engines_unpark(i915);

        i915_queue_hangcheck(i915);

        queue_delayed_work(i915->wq,
                           &i915->gt.retire_work,
                           round_jiffies_up_relative(HZ));
}

int
i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
                            struct drm_file *file)
{
        struct i915_ggtt *ggtt = &to_i915(dev)->ggtt;
        struct drm_i915_gem_get_aperture *args = data;
        struct i915_vma *vma;
        u64 pinned;

        mutex_lock(&ggtt->vm.mutex);

        pinned = ggtt->vm.reserved;
        list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link)
                if (i915_vma_is_pinned(vma))
                        pinned += vma->node.size;

        mutex_unlock(&ggtt->vm.mutex);

        args->aper_size = ggtt->vm.total;
        args->aper_available_size = args->aper_size - pinned;

        return 0;
}

static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
{
        struct address_space *mapping = obj->base.filp->f_mapping;
        drm_dma_handle_t *phys;
        struct sg_table *st;
        struct scatterlist *sg;
        char *vaddr;
        int i;
        int err;

        if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
                return -EINVAL;

        /* Always aligning to the object size, allows a single allocation
         * to handle all possible callers, and given typical object sizes,
         * the alignment of the buddy allocation will naturally match.
         */
        phys = drm_pci_alloc(obj->base.dev,
                             roundup_pow_of_two(obj->base.size),
                             roundup_pow_of_two(obj->base.size));
        if (!phys)
                return -ENOMEM;

        vaddr = phys->vaddr;
        for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
                struct page *page;
                char *src;

                page = shmem_read_mapping_page(mapping, i);
                if (IS_ERR(page)) {
                        err = PTR_ERR(page);
                        goto err_phys;
                }

                src = kmap_atomic(page);
                memcpy(vaddr, src, PAGE_SIZE);
                drm_clflush_virt_range(vaddr, PAGE_SIZE);
                kunmap_atomic(src);

                put_page(page);
                vaddr += PAGE_SIZE;
        }

        i915_gem_chipset_flush(to_i915(obj->base.dev));

        st = kmalloc(sizeof(*st), GFP_KERNEL);
        if (!st) {
                err = -ENOMEM;
                goto err_phys;
        }

        if (sg_alloc_table(st, 1, GFP_KERNEL)) {
                kfree(st);
                err = -ENOMEM;
                goto err_phys;
        }

        sg = st->sgl;
        sg->offset = 0;
        sg->length = obj->base.size;

        sg_dma_address(sg) = phys->busaddr;
        sg_dma_len(sg) = obj->base.size;

        obj->phys_handle = phys;

        __i915_gem_object_set_pages(obj, st, sg->length);

        return 0;

err_phys:
        drm_pci_free(obj->base.dev, phys);

        return err;
}

static void __start_cpu_write(struct drm_i915_gem_object *obj)
{
        obj->read_domains = I915_GEM_DOMAIN_CPU;
        obj->write_domain = I915_GEM_DOMAIN_CPU;
        if (cpu_write_needs_clflush(obj))
                obj->cache_dirty = true;
}

void
__i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
                                struct sg_table *pages,
                                bool needs_clflush)
{
        GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);

        if (obj->mm.madv == I915_MADV_DONTNEED)
                obj->mm.dirty = false;

        if (needs_clflush &&
            (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
            !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
                drm_clflush_sg(pages);

        __start_cpu_write(obj);
}

static void
i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
                               struct sg_table *pages)
{
        __i915_gem_object_release_shmem(obj, pages, false);

        if (obj->mm.dirty) {
                struct address_space *mapping = obj->base.filp->f_mapping;
                char *vaddr = obj->phys_handle->vaddr;
                int i;

                for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
                        struct page *page;
                        char *dst;

                        page = shmem_read_mapping_page(mapping, i);
                        if (IS_ERR(page))
                                continue;

                        dst = kmap_atomic(page);
                        drm_clflush_virt_range(vaddr, PAGE_SIZE);
                        memcpy(dst, vaddr, PAGE_SIZE);
                        kunmap_atomic(dst);

                        set_page_dirty(page);
                        if (obj->mm.madv == I915_MADV_WILLNEED)
                                mark_page_accessed(page);
                        put_page(page);
                        vaddr += PAGE_SIZE;
                }
                obj->mm.dirty = false;
        }

        sg_free_table(pages);
        kfree(pages);

        drm_pci_free(obj->base.dev, obj->phys_handle);
}

static void
i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
{
        i915_gem_object_unpin_pages(obj);
}

static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
        .get_pages = i915_gem_object_get_pages_phys,
        .put_pages = i915_gem_object_put_pages_phys,
        .release = i915_gem_object_release_phys,
};

static const struct drm_i915_gem_object_ops i915_gem_object_ops;

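/* Unbind every VMA of the object, waiting for the GPU to finish with it first. */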
int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
{
        struct i915_vma *vma;
        LIST_HEAD(still_in_list);
        int ret;

        lockdep_assert_held(&obj->base.dev->struct_mutex);

        /* Closed vma are removed from the obj->vma_list - but they may
         * still have an active binding on the object. To remove those we
         * must wait for all rendering to complete to the object (as unbinding
         * must anyway), and retire the requests.
         */
        ret = i915_gem_object_set_to_cpu_domain(obj, false);
        if (ret)
                return ret;

        spin_lock(&obj->vma.lock);
        while (!ret && (vma = list_first_entry_or_null(&obj->vma.list,
                                                       struct i915_vma,
                                                       obj_link))) {
                list_move_tail(&vma->obj_link, &still_in_list);
                spin_unlock(&obj->vma.lock);

                ret = i915_vma_unbind(vma);

                spin_lock(&obj->vma.lock);
        }
        list_splice(&still_in_list, &obj->vma.list);
        spin_unlock(&obj->vma.lock);

        return ret;
}

static long
i915_gem_object_wait_fence(struct dma_fence *fence,
                           unsigned int flags,
                           long timeout)
{
        struct i915_request *rq;

        BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);

        if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
                return timeout;

        if (!dma_fence_is_i915(fence))
                return dma_fence_wait_timeout(fence,
                                              flags & I915_WAIT_INTERRUPTIBLE,
                                              timeout);

        rq = to_request(fence);
        if (i915_request_completed(rq))
                goto out;

        timeout = i915_request_wait(rq, flags, timeout);

out:
        if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
                i915_request_retire_upto(rq);

        return timeout;
}

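/*
 * Wait on the fences tracked by the reservation object: all shared fences
 * plus the exclusive fence when I915_WAIT_ALL is set, otherwise just the
 * exclusive fence. Returns the remaining timeout and opportunistically
 * prunes the fence array once everything has signaled.
 */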
static long
i915_gem_object_wait_reservation(struct reservation_object *resv,
                                 unsigned int flags,
                                 long timeout)
{
        unsigned int seq = __read_seqcount_begin(&resv->seq);
        struct dma_fence *excl;
        bool prune_fences = false;

        if (flags & I915_WAIT_ALL) {
                struct dma_fence **shared;
                unsigned int count, i;
                int ret;

                ret = reservation_object_get_fences_rcu(resv,
                                                        &excl, &count, &shared);
                if (ret)
                        return ret;

                for (i = 0; i < count; i++) {
                        timeout = i915_gem_object_wait_fence(shared[i],
                                                             flags, timeout);
                        if (timeout < 0)
                                break;

                        dma_fence_put(shared[i]);
                }

                for (; i < count; i++)
                        dma_fence_put(shared[i]);
                kfree(shared);

                /*
                 * If both shared fences and an exclusive fence exist,
                 * then by construction the shared fences must be later
                 * than the exclusive fence. If we successfully wait for
                 * all the shared fences, we know that the exclusive fence
                 * must also be signaled. If all the shared fences are
                 * signaled, we can prune the array and recover the
                 * floating references on the fences/requests.
                 */
                prune_fences = count && timeout >= 0;
        } else {
                excl = reservation_object_get_excl_rcu(resv);
        }

        if (excl && timeout >= 0)
                timeout = i915_gem_object_wait_fence(excl, flags, timeout);

        dma_fence_put(excl);

        /*
         * Opportunistically prune the fences iff we know they have *all* been
         * signaled and that the reservation object has not been changed (i.e.
         * no new fences have been added).
         */
        if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
                if (reservation_object_trylock(resv)) {
                        if (!__read_seqcount_retry(&resv->seq, seq))
                                reservation_object_add_excl_fence(resv, NULL);
                        reservation_object_unlock(resv);
                }
        }

        return timeout;
}

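/*
 * Propagate the given scheduling attributes to the request backing an i915
 * fence; already-signaled or foreign fences are left untouched.
 */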
static void __fence_set_priority(struct dma_fence *fence,
                                 const struct i915_sched_attr *attr)
{
        struct i915_request *rq;
        struct intel_engine_cs *engine;

        if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
                return;

        rq = to_request(fence);
        engine = rq->engine;

        local_bh_disable();
        rcu_read_lock(); /* RCU serialisation for set-wedged protection */
        if (engine->schedule)
                engine->schedule(rq, attr);
        rcu_read_unlock();
        local_bh_enable(); /* kick the tasklets if queues were reprioritised */
}

static void fence_set_priority(struct dma_fence *fence,
                               const struct i915_sched_attr *attr)
{
        /* Recurse once into a fence-array */
        if (dma_fence_is_array(fence)) {
                struct dma_fence_array *array = to_dma_fence_array(fence);
                int i;

                for (i = 0; i < array->num_fences; i++)
                        __fence_set_priority(array->fences[i], attr);
        } else {
                __fence_set_priority(fence, attr);
        }
}

int
i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
                              unsigned int flags,
                              const struct i915_sched_attr *attr)
{
        struct dma_fence *excl;

        if (flags & I915_WAIT_ALL) {
                struct dma_fence **shared;
                unsigned int count, i;
                int ret;

                ret = reservation_object_get_fences_rcu(obj->resv,
                                                        &excl, &count, &shared);
                if (ret)
                        return ret;

                for (i = 0; i < count; i++) {
                        fence_set_priority(shared[i], attr);
                        dma_fence_put(shared[i]);
                }

                kfree(shared);
        } else {
                excl = reservation_object_get_excl_rcu(obj->resv);
        }

        if (excl) {
                fence_set_priority(excl, attr);
                dma_fence_put(excl);
        }
        return 0;
}

/**
 * Waits for rendering to the object to be completed
 * @obj: i915 gem object
 * @flags: how to wait (under a lock, for all rendering or just for writes etc)
 * @timeout: how long to wait
 */
int
i915_gem_object_wait(struct drm_i915_gem_object *obj,
                     unsigned int flags,
                     long timeout)
{
        might_sleep();
        GEM_BUG_ON(timeout < 0);

        timeout = i915_gem_object_wait_reservation(obj->resv, flags, timeout);
        return timeout < 0 ? timeout : 0;
}

static int
i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
                     struct drm_i915_gem_pwrite *args,
                     struct drm_file *file)
{
        void *vaddr = obj->phys_handle->vaddr + args->offset;
        char __user *user_data = u64_to_user_ptr(args->data_ptr);

        /* We manually control the domain here and pretend that it
         * remains coherent i.e. in the GTT domain, like shmem_pwrite.
         */
        intel_fb_obj_invalidate(obj, ORIGIN_CPU);
        if (copy_from_user(vaddr, user_data, args->size))
                return -EFAULT;

        drm_clflush_virt_range(vaddr, args->size);
        i915_gem_chipset_flush(to_i915(obj->base.dev));

        intel_fb_obj_flush(obj, ORIGIN_CPU);
        return 0;
}

static int
i915_gem_create(struct drm_file *file,
                struct drm_i915_private *dev_priv,
                u64 *size_p,
                u32 *handle_p)
{
        struct drm_i915_gem_object *obj;
        u32 handle;
        u64 size;
        int ret;

        size = round_up(*size_p, PAGE_SIZE);
        if (size == 0)
                return -EINVAL;

        /* Allocate the new object */
        obj = i915_gem_object_create(dev_priv, size);
        if (IS_ERR(obj))
                return PTR_ERR(obj);

        ret = drm_gem_handle_create(file, &obj->base, &handle);
        /* drop reference from allocate - handle holds it now */
        i915_gem_object_put(obj);
        if (ret)
                return ret;

        *handle_p = handle;
        *size_p = size;
        return 0;
}

int
i915_gem_dumb_create(struct drm_file *file,
                     struct drm_device *dev,
                     struct drm_mode_create_dumb *args)
{
        /* have to work out size/pitch and return them */
        args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
        args->size = args->pitch * args->height;
        return i915_gem_create(file, to_i915(dev),
                               &args->size, &args->handle);
}

static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
{
        return !(obj->cache_level == I915_CACHE_NONE ||
                 obj->cache_level == I915_CACHE_WT);
}

/**
 * Creates a new mm object and returns a handle to it.
 * @dev: drm device pointer
 * @data: ioctl data blob
 * @file: drm file pointer
 */
int
i915_gem_create_ioctl(struct drm_device *dev, void *data,
                      struct drm_file *file)
{
        struct drm_i915_private *dev_priv = to_i915(dev);
        struct drm_i915_gem_create *args = data;

        i915_gem_flush_free_objects(dev_priv);

        return i915_gem_create(file, dev_priv,
                               &args->size, &args->handle);
}

static inline enum fb_op_origin
fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
{
        return (domain == I915_GEM_DOMAIN_GTT ?
                obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
}

void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
{
        intel_wakeref_t wakeref;

        /*
         * No actual flushing is required for the GTT write domain for reads
         * from the GTT domain. Writes to it "immediately" go to main memory
         * as far as we know, so there's no chipset flush. It also doesn't
         * land in the GPU render cache.
         *
         * However, we do have to enforce the order so that all writes through
         * the GTT land before any writes to the device, such as updates to
         * the GATT itself.
         *
         * We also have to wait a bit for the writes to land from the GTT.
         * An uncached read (i.e. mmio) seems to be ideal for the round-trip
         * timing. This issue has only been observed when switching quickly
         * between GTT writes and CPU reads from inside the kernel on recent hw,
         * and it appears to only affect discrete GTT blocks (i.e. on LLC
         * system agents we cannot reproduce this behaviour, until Cannonlake
         * that was!).
         */

        wmb();

        if (INTEL_INFO(dev_priv)->has_coherent_ggtt)
                return;

        i915_gem_chipset_flush(dev_priv);

        with_intel_runtime_pm(dev_priv, wakeref) {
                spin_lock_irq(&dev_priv->uncore.lock);

                POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));

                spin_unlock_irq(&dev_priv->uncore.lock);
        }
}

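/*
 * Flush any writes still outstanding in the object's current write domain
 * (GTT, WC, CPU or RENDER), provided that domain is covered by
 * @flush_domains, and then clear obj->write_domain.
 */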
static void
flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
{
        struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
        struct i915_vma *vma;

        if (!(obj->write_domain & flush_domains))
                return;

        switch (obj->write_domain) {
        case I915_GEM_DOMAIN_GTT:
                i915_gem_flush_ggtt_writes(dev_priv);

                intel_fb_obj_flush(obj,
                                   fb_write_origin(obj, I915_GEM_DOMAIN_GTT));

                for_each_ggtt_vma(vma, obj) {
                        if (vma->iomap)
                                continue;

                        i915_vma_unset_ggtt_write(vma);
                }
                break;

        case I915_GEM_DOMAIN_WC:
                wmb();
                break;

        case I915_GEM_DOMAIN_CPU:
                i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
                break;

        case I915_GEM_DOMAIN_RENDER:
                if (gpu_write_needs_clflush(obj))
                        obj->cache_dirty = true;
                break;
        }

        obj->write_domain = 0;
}

/*
 * Pins the specified object's pages and synchronizes the object with
 * GPU accesses. Sets needs_clflush to non-zero if the caller should
 * flush the object from the CPU cache.
 */
int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
                                    unsigned int *needs_clflush)
{
        int ret;

        lockdep_assert_held(&obj->base.dev->struct_mutex);

        *needs_clflush = 0;
        if (!i915_gem_object_has_struct_page(obj))
                return -ENODEV;

        ret = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE |
                                   I915_WAIT_LOCKED,
                                   MAX_SCHEDULE_TIMEOUT);
        if (ret)
                return ret;

        ret = i915_gem_object_pin_pages(obj);
        if (ret)
                return ret;

        if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
            !static_cpu_has(X86_FEATURE_CLFLUSH)) {
                ret = i915_gem_object_set_to_cpu_domain(obj, false);
                if (ret)
                        goto err_unpin;
                else
                        goto out;
        }

        flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

        /* If we're not in the cpu read domain, set ourself into the gtt
         * read domain and manually flush cachelines (if required). This
         * optimizes for the case when the gpu will dirty the data
         * anyway again before the next pread happens.
         */
        if (!obj->cache_dirty &&
            !(obj->read_domains & I915_GEM_DOMAIN_CPU))
                *needs_clflush = CLFLUSH_BEFORE;

out:
        /* return with the pages pinned */
        return 0;

err_unpin:
        i915_gem_object_unpin_pages(obj);
        return ret;
}

int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
                                     unsigned int *needs_clflush)
{
        int ret;

        lockdep_assert_held(&obj->base.dev->struct_mutex);

        *needs_clflush = 0;
        if (!i915_gem_object_has_struct_page(obj))
                return -ENODEV;

        ret = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE |
                                   I915_WAIT_LOCKED |
                                   I915_WAIT_ALL,
                                   MAX_SCHEDULE_TIMEOUT);
        if (ret)
                return ret;

        ret = i915_gem_object_pin_pages(obj);
        if (ret)
                return ret;

        if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
            !static_cpu_has(X86_FEATURE_CLFLUSH)) {
                ret = i915_gem_object_set_to_cpu_domain(obj, true);
                if (ret)
                        goto err_unpin;
                else
                        goto out;
        }

        flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

        /* If we're not in the cpu write domain, set ourself into the
         * gtt write domain and manually flush cachelines (as required).
         * This optimizes for the case when the gpu will use the data
         * right away and we therefore have to clflush anyway.
         */
        if (!obj->cache_dirty) {
                *needs_clflush |= CLFLUSH_AFTER;

                /*
                 * Same trick applies to invalidate partially written
                 * cachelines read before writing.
                 */
                if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
                        *needs_clflush |= CLFLUSH_BEFORE;
        }

out:
        intel_fb_obj_invalidate(obj, ORIGIN_CPU);
        obj->mm.dirty = true;
        /* return with the pages pinned */
        return 0;

err_unpin:
        i915_gem_object_unpin_pages(obj);
        return ret;
}

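/* Per-page copy function for the shmem pread path, clflushing first if needed. */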
static int
shmem_pread(struct page *page, int offset, int len, char __user *user_data,
            bool needs_clflush)
{
        char *vaddr;
        int ret;

        vaddr = kmap(page);

        if (needs_clflush)
                drm_clflush_virt_range(vaddr + offset, len);

        ret = __copy_to_user(user_data, vaddr + offset, len);

        kunmap(page);

        return ret ? -EFAULT : 0;
}

static int
i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
                     struct drm_i915_gem_pread *args)
{
        char __user *user_data;
        u64 remain;
        unsigned int needs_clflush;
        unsigned int idx, offset;
        int ret;

        ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
        if (ret)
                return ret;

        ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
        mutex_unlock(&obj->base.dev->struct_mutex);
        if (ret)
                return ret;

        remain = args->size;
        user_data = u64_to_user_ptr(args->data_ptr);
        offset = offset_in_page(args->offset);
        for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
                struct page *page = i915_gem_object_get_page(obj, idx);
                unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);

                ret = shmem_pread(page, offset, length, user_data,
                                  needs_clflush);
                if (ret)
                        break;

                remain -= length;
                user_data += length;
                offset = 0;
        }

        i915_gem_obj_finish_shmem_access(obj);
        return ret;
}

static inline bool
gtt_user_read(struct io_mapping *mapping,
              loff_t base, int offset,
              char __user *user_data, int length)
{
        void __iomem *vaddr;
        unsigned long unwritten;

        /* We can use the cpu mem copy function because this is X86. */
        vaddr = io_mapping_map_atomic_wc(mapping, base);
        unwritten = __copy_to_user_inatomic(user_data,
                                            (void __force *)vaddr + offset,
                                            length);
        io_mapping_unmap_atomic(vaddr);
        if (unwritten) {
                vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
                unwritten = copy_to_user(user_data,
                                         (void __force *)vaddr + offset,
                                         length);
                io_mapping_unmap(vaddr);
        }
        return unwritten;
}

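/*
 * Slow pread path through the GGTT aperture: pin the object into the
 * mappable aperture (or fall back to a temporary page-sized GGTT slot) and
 * copy to userspace page by page using uncached reads.
 */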
static int
i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
                   const struct drm_i915_gem_pread *args)
{
        struct drm_i915_private *i915 = to_i915(obj->base.dev);
        struct i915_ggtt *ggtt = &i915->ggtt;
        intel_wakeref_t wakeref;
        struct drm_mm_node node;
        struct i915_vma *vma;
        void __user *user_data;
        u64 remain, offset;
        int ret;

        ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
        if (ret)
                return ret;

        wakeref = intel_runtime_pm_get(i915);
        vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
                                       PIN_MAPPABLE |
                                       PIN_NONFAULT |
                                       PIN_NONBLOCK);
        if (!IS_ERR(vma)) {
                node.start = i915_ggtt_offset(vma);
                node.allocated = false;
                ret = i915_vma_put_fence(vma);
                if (ret) {
                        i915_vma_unpin(vma);
                        vma = ERR_PTR(ret);
                }
        }
        if (IS_ERR(vma)) {
                ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
                if (ret)
                        goto out_unlock;
                GEM_BUG_ON(!node.allocated);
        }

        ret = i915_gem_object_set_to_gtt_domain(obj, false);
        if (ret)
                goto out_unpin;

        mutex_unlock(&i915->drm.struct_mutex);

        user_data = u64_to_user_ptr(args->data_ptr);
        remain = args->size;
        offset = args->offset;

        while (remain > 0) {
                /* Operation in this page
                 *
                 * page_base = page offset within aperture
                 * page_offset = offset within page
                 * page_length = bytes to copy for this page
                 */
                u32 page_base = node.start;
                unsigned page_offset = offset_in_page(offset);
                unsigned page_length = PAGE_SIZE - page_offset;
                page_length = remain < page_length ? remain : page_length;
                if (node.allocated) {
                        wmb();
                        ggtt->vm.insert_page(&ggtt->vm,
                                             i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
                                             node.start, I915_CACHE_NONE, 0);
                        wmb();
                } else {
                        page_base += offset & PAGE_MASK;
                }

                if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
                                  user_data, page_length)) {
                        ret = -EFAULT;
                        break;
                }

                remain -= page_length;
                user_data += page_length;
                offset += page_length;
        }

        mutex_lock(&i915->drm.struct_mutex);
out_unpin:
        if (node.allocated) {
                wmb();
                ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
                remove_mappable_node(&node);
        } else {
                i915_vma_unpin(vma);
        }
out_unlock:
        intel_runtime_pm_put(i915, wakeref);
        mutex_unlock(&i915->drm.struct_mutex);

        return ret;
}

/**
 * Reads data from the object referenced by handle.
 * @dev: drm device pointer
 * @data: ioctl data blob
 * @file: drm file pointer
 *
 * On error, the contents of *data are undefined.
 */
int
i915_gem_pread_ioctl(struct drm_device *dev, void *data,
                     struct drm_file *file)
{
        struct drm_i915_gem_pread *args = data;
        struct drm_i915_gem_object *obj;
        int ret;

        if (args->size == 0)
                return 0;

        if (!access_ok(u64_to_user_ptr(args->data_ptr),
                       args->size))
                return -EFAULT;

        obj = i915_gem_object_lookup(file, args->handle);
        if (!obj)
                return -ENOENT;

        /* Bounds check source.  */
        if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
                ret = -EINVAL;
                goto out;
        }

        trace_i915_gem_object_pread(obj, args->offset, args->size);

        ret = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE,
                                   MAX_SCHEDULE_TIMEOUT);
        if (ret)
                goto out;

        ret = i915_gem_object_pin_pages(obj);
        if (ret)
                goto out;

        ret = i915_gem_shmem_pread(obj, args);
        if (ret == -EFAULT || ret == -ENODEV)
                ret = i915_gem_gtt_pread(obj, args);

        i915_gem_object_unpin_pages(obj);
out:
        i915_gem_object_put(obj);
        return ret;
}

/* This is the fast write path which cannot handle
 * page faults in the source data
 */

static inline bool
ggtt_write(struct io_mapping *mapping,
           loff_t base, int offset,
           char __user *user_data, int length)
{
        void __iomem *vaddr;
        unsigned long unwritten;

        /* We can use the cpu mem copy function because this is X86. */
        vaddr = io_mapping_map_atomic_wc(mapping, base);
        unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
                                                      user_data, length);
        io_mapping_unmap_atomic(vaddr);
        if (unwritten) {
                vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
                unwritten = copy_from_user((void __force *)vaddr + offset,
                                           user_data, length);
                io_mapping_unmap(vaddr);
        }

        return unwritten;
}

/**
 * This is the fast pwrite path, where we copy the data directly from the
 * user into the GTT, uncached.
 * @obj: i915 GEM object
 * @args: pwrite arguments structure
 */
static int
i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
                         const struct drm_i915_gem_pwrite *args)
{
        struct drm_i915_private *i915 = to_i915(obj->base.dev);
        struct i915_ggtt *ggtt = &i915->ggtt;
        intel_wakeref_t wakeref;
        struct drm_mm_node node;
        struct i915_vma *vma;
        u64 remain, offset;
        void __user *user_data;
        int ret;

        ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
        if (ret)
                return ret;

        if (i915_gem_object_has_struct_page(obj)) {
                /*
                 * Avoid waking the device up if we can fallback, as
                 * waking/resuming is very slow (worst-case 10-100 ms
                 * depending on PCI sleeps and our own resume time).
                 * This easily dwarfs any performance advantage from
                 * using the cache bypass of indirect GGTT access.
                 */
                wakeref = intel_runtime_pm_get_if_in_use(i915);
                if (!wakeref) {
                        ret = -EFAULT;
                        goto out_unlock;
                }
        } else {
                /* No backing pages, no fallback, we must force GGTT access */
                wakeref = intel_runtime_pm_get(i915);
        }

        vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
                                       PIN_MAPPABLE |
                                       PIN_NONFAULT |
                                       PIN_NONBLOCK);
        if (!IS_ERR(vma)) {
                node.start = i915_ggtt_offset(vma);
                node.allocated = false;
                ret = i915_vma_put_fence(vma);
                if (ret) {
                        i915_vma_unpin(vma);
                        vma = ERR_PTR(ret);
                }
        }
        if (IS_ERR(vma)) {
                ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
                if (ret)
                        goto out_rpm;
                GEM_BUG_ON(!node.allocated);
        }

        ret = i915_gem_object_set_to_gtt_domain(obj, true);
        if (ret)
                goto out_unpin;

        mutex_unlock(&i915->drm.struct_mutex);

        intel_fb_obj_invalidate(obj, ORIGIN_CPU);

        user_data = u64_to_user_ptr(args->data_ptr);
        offset = args->offset;
        remain = args->size;
        while (remain) {
                /* Operation in this page
                 *
                 * page_base = page offset within aperture
                 * page_offset = offset within page
                 * page_length = bytes to copy for this page
                 */
                u32 page_base = node.start;
                unsigned int page_offset = offset_in_page(offset);
                unsigned int page_length = PAGE_SIZE - page_offset;
                page_length = remain < page_length ? remain : page_length;
                if (node.allocated) {
                        wmb(); /* flush the write before we modify the GGTT */
                        ggtt->vm.insert_page(&ggtt->vm,
                                             i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
                                             node.start, I915_CACHE_NONE, 0);
                        wmb(); /* flush modifications to the GGTT (insert_page) */
                } else {
                        page_base += offset & PAGE_MASK;
                }
                /* If we get a fault while copying data, then (presumably) our
                 * source page isn't available.  Return the error and we'll
                 * retry in the slow path.
                 * If the object is non-shmem backed, we retry again with the
                 * path that handles page fault.
                 */
                if (ggtt_write(&ggtt->iomap, page_base, page_offset,
                               user_data, page_length)) {
                        ret = -EFAULT;
                        break;
                }

                remain -= page_length;
                user_data += page_length;
                offset += page_length;
        }
        intel_fb_obj_flush(obj, ORIGIN_CPU);

        mutex_lock(&i915->drm.struct_mutex);
out_unpin:
        if (node.allocated) {
                wmb();
                ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
                remove_mappable_node(&node);
        } else {
                i915_vma_unpin(vma);
        }
out_rpm:
        intel_runtime_pm_put(i915, wakeref);
out_unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        return ret;
}

/* Per-page copy function for the shmem pwrite fastpath.
 * Flushes invalid cachelines before writing to the target if
 * needs_clflush_before is set and flushes out any written cachelines after
 * writing if needs_clflush is set.
 */
static int
shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
             bool needs_clflush_before,
             bool needs_clflush_after)
{
        char *vaddr;
        int ret;

        vaddr = kmap(page);

        if (needs_clflush_before)
                drm_clflush_virt_range(vaddr + offset, len);

        ret = __copy_from_user(vaddr + offset, user_data, len);
        if (!ret && needs_clflush_after)
                drm_clflush_virt_range(vaddr + offset, len);

        kunmap(page);

        return ret ? -EFAULT : 0;
}

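/*
 * Write userspace data into the object through its shmem backing pages,
 * clflushing around the copy as indicated by the prepare_shmem_write()
 * flags.
 */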
static int
i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
                      const struct drm_i915_gem_pwrite *args)
{
        struct drm_i915_private *i915 = to_i915(obj->base.dev);
        void __user *user_data;
        u64 remain;
        unsigned int partial_cacheline_write;
        unsigned int needs_clflush;
        unsigned int offset, idx;
        int ret;

        ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
        if (ret)
                return ret;

        ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
        mutex_unlock(&i915->drm.struct_mutex);
        if (ret)
                return ret;

        /* If we don't overwrite a cacheline completely we need to be
         * careful to have up-to-date data by first clflushing. Don't
         * overcomplicate things and flush the entire patch.
         */
        partial_cacheline_write = 0;
        if (needs_clflush & CLFLUSH_BEFORE)
                partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;

        user_data = u64_to_user_ptr(args->data_ptr);
        remain = args->size;
        offset = offset_in_page(args->offset);
        for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
                struct page *page = i915_gem_object_get_page(obj, idx);
                unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);

                ret = shmem_pwrite(page, offset, length, user_data,
                                   (offset | length) & partial_cacheline_write,
                                   needs_clflush & CLFLUSH_AFTER);
                if (ret)
                        break;

                remain -= length;
                user_data += length;
                offset = 0;
        }

        intel_fb_obj_flush(obj, ORIGIN_CPU);
        i915_gem_obj_finish_shmem_access(obj);
        return ret;
}

/**
 * Writes data to the object referenced by handle.
 * @dev: drm device
 * @data: ioctl data blob
 * @file: drm file
 *
 * On error, the contents of the buffer that were to be modified are undefined.
 */
int
i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
                      struct drm_file *file)
{
        struct drm_i915_gem_pwrite *args = data;
        struct drm_i915_gem_object *obj;
        int ret;

        if (args->size == 0)
                return 0;

        if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size))
                return -EFAULT;

        obj = i915_gem_object_lookup(file, args->handle);
        if (!obj)
                return -ENOENT;

        /* Bounds check destination. */
        if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
                ret = -EINVAL;
                goto err;
        }

        /* Writes not allowed into this read-only object */
        if (i915_gem_object_is_readonly(obj)) {
                ret = -EINVAL;
                goto err;
        }

        trace_i915_gem_object_pwrite(obj, args->offset, args->size);

        ret = -ENODEV;
        if (obj->ops->pwrite)
                ret = obj->ops->pwrite(obj, args);
        if (ret != -ENODEV)
                goto err;

        ret = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE |
                                   I915_WAIT_ALL,
                                   MAX_SCHEDULE_TIMEOUT);
        if (ret)
                goto err;

        ret = i915_gem_object_pin_pages(obj);
        if (ret)
                goto err;

        ret = -EFAULT;
        /* We can only do the GTT pwrite on untiled buffers, as otherwise
         * it would end up going through the fenced access, and we'll get
         * different detiling behavior between reading and writing.
         * pread/pwrite currently are reading and writing from the CPU
         * perspective, requiring manual detiling by the client.
         */
        if (!i915_gem_object_has_struct_page(obj) ||
            cpu_write_needs_clflush(obj))
                /* Note that the gtt paths might fail with non-page-backed user
                 * pointers (e.g. gtt mappings when moving data between
                 * textures). Fallback to the shmem path in that case.
                 */
                ret = i915_gem_gtt_pwrite_fast(obj, args);

        if (ret == -EFAULT || ret == -ENOSPC) {
                if (obj->phys_handle)
                        ret = i915_gem_phys_pwrite(obj, args, file);
                else
                        ret = i915_gem_shmem_pwrite(obj, args);
        }

        i915_gem_object_unpin_pages(obj);
err:
        i915_gem_object_put(obj);
        return ret;
}

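/*
 * Mark the object as recently used: move its bound GGTT VMAs to the tail of
 * the VM's bound list and the object itself to the tail of the appropriate
 * bound/unbound list, making it the last candidate for eviction/shrinking.
 */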
1446 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1447 {
1448         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1449         struct list_head *list;
1450         struct i915_vma *vma;
1451
1452         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1453
1454         mutex_lock(&i915->ggtt.vm.mutex);
1455         for_each_ggtt_vma(vma, obj) {
1456                 if (!drm_mm_node_allocated(&vma->node))
1457                         continue;
1458
1459                 list_move_tail(&vma->vm_link, &vma->vm->bound_list);
1460         }
1461         mutex_unlock(&i915->ggtt.vm.mutex);
1462
1463         spin_lock(&i915->mm.obj_lock);
1464         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1465         list_move_tail(&obj->mm.link, list);
1466         spin_unlock(&i915->mm.obj_lock);
1467 }
1468
1469 /**
1470  * Called when user space prepares to use an object with the CPU, either
1471  * through the mmap ioctl's mapping or a GTT mapping.
1472  * @dev: drm device
1473  * @data: ioctl data blob
1474  * @file: drm file
1475  */
1476 int
1477 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1478                           struct drm_file *file)
1479 {
1480         struct drm_i915_gem_set_domain *args = data;
1481         struct drm_i915_gem_object *obj;
1482         u32 read_domains = args->read_domains;
1483         u32 write_domain = args->write_domain;
1484         int err;
1485
1486         /* Only handle setting domains to types used by the CPU. */
1487         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1488                 return -EINVAL;
1489
1490         /*
1491          * Having something in the write domain implies it's in the read
1492          * domain, and only that read domain.  Enforce that in the request.
1493          */
1494         if (write_domain && read_domains != write_domain)
1495                 return -EINVAL;
1496
1497         if (!read_domains)
1498                 return 0;
1499
1500         obj = i915_gem_object_lookup(file, args->handle);
1501         if (!obj)
1502                 return -ENOENT;
1503
1504         /*
1505          * Already in the desired write domain? Nothing for us to do!
1506          *
1507          * We apply a little bit of cunning here to catch a broader set of
1508          * no-ops. If obj->write_domain is set, we must be in the same
1509          * obj->read_domains, and only that domain. Therefore, if that
1510          * obj->write_domain matches the request read_domains, we are
1511          * already in the same read/write domain and can skip the operation,
1512          * without having to further check the requested write_domain.
1513          */
1514         if (READ_ONCE(obj->write_domain) == read_domains) {
1515                 err = 0;
1516                 goto out;
1517         }
1518
1519         /*
1520          * Try to flush the object off the GPU without holding the lock.
1521          * We will repeat the flush holding the lock in the normal manner
1522          * to catch cases where we are gazumped.
1523          */
1524         err = i915_gem_object_wait(obj,
1525                                    I915_WAIT_INTERRUPTIBLE |
1526                                    I915_WAIT_PRIORITY |
1527                                    (write_domain ? I915_WAIT_ALL : 0),
1528                                    MAX_SCHEDULE_TIMEOUT);
1529         if (err)
1530                 goto out;
1531
1532         /*
1533          * Proxy objects do not control access to the backing storage, ergo
1534          * they cannot be used as a means to manipulate the cache domain
1535          * tracking for that backing storage. The proxy object is always
1536          * considered to be outside of any cache domain.
1537          */
1538         if (i915_gem_object_is_proxy(obj)) {
1539                 err = -ENXIO;
1540                 goto out;
1541         }
1542
1543         /*
1544          * Flush and acquire obj->pages so that we are coherent through
1545          * direct access in memory with previous cached writes through
1546          * shmemfs and that our cache domain tracking remains valid.
1547          * For example, if the obj->filp was moved to swap without us
1548          * being notified and releasing the pages, we would mistakenly
1549          * continue to assume that the obj remained out of the CPU cached
1550          * domain.
1551          */
1552         err = i915_gem_object_pin_pages(obj);
1553         if (err)
1554                 goto out;
1555
1556         err = i915_mutex_lock_interruptible(dev);
1557         if (err)
1558                 goto out_unpin;
1559
1560         if (read_domains & I915_GEM_DOMAIN_WC)
1561                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1562         else if (read_domains & I915_GEM_DOMAIN_GTT)
1563                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1564         else
1565                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1566
1567         /* And bump the LRU for this access */
1568         i915_gem_object_bump_inactive_ggtt(obj);
1569
1570         mutex_unlock(&dev->struct_mutex);
1571
1572         if (write_domain != 0)
1573                 intel_fb_obj_invalidate(obj,
1574                                         fb_write_origin(obj, write_domain));
1575
1576 out_unpin:
1577         i915_gem_object_unpin_pages(obj);
1578 out:
1579         i915_gem_object_put(obj);
1580         return err;
1581 }
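
/*
 * Illustrative userspace sketch (not part of this file): invoking
 * DRM_IOCTL_I915_GEM_SET_DOMAIN to move a buffer into the GTT domain before
 * accessing it through a GTT mmap. 'fd' (an open DRM fd) and 'handle' (an
 * existing GEM handle) are assumed to come from elsewhere.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <drm/i915_drm.h>
 *
 *	static int set_gtt_domain(int fd, uint32_t handle)
 *	{
 *		struct drm_i915_gem_set_domain arg;
 *
 *		memset(&arg, 0, sizeof(arg));
 *		arg.handle = handle;
 *		arg.read_domains = I915_GEM_DOMAIN_GTT;
 *		arg.write_domain = I915_GEM_DOMAIN_GTT;
 *
 *		return ioctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &arg);
 *	}
 */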
1582
1583 /**
1584  * i915_gem_sw_finish_ioctl - Called when user space has done writes to this buffer
1585  * @dev: drm device
1586  * @data: ioctl data blob
1587  * @file: drm file
1588  */
1589 int
1590 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1591                          struct drm_file *file)
1592 {
1593         struct drm_i915_gem_sw_finish *args = data;
1594         struct drm_i915_gem_object *obj;
1595
1596         obj = i915_gem_object_lookup(file, args->handle);
1597         if (!obj)
1598                 return -ENOENT;
1599
1600         /*
1601          * Proxy objects are barred from CPU access, so there is no
1602          * need to ban sw_finish as it is a nop.
1603          */
1604
1605         /* Pinned buffers may be scanout, so flush the cache */
1606         i915_gem_object_flush_if_display(obj);
1607         i915_gem_object_put(obj);
1608
1609         return 0;
1610 }
1611
1612 static inline bool
1613 __vma_matches(struct vm_area_struct *vma, struct file *filp,
1614               unsigned long addr, unsigned long size)
1615 {
1616         if (vma->vm_file != filp)
1617                 return false;
1618
1619         return vma->vm_start == addr &&
1620                (vma->vm_end - vma->vm_start) == PAGE_ALIGN(size);
1621 }
1622
1623 /**
1624  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1625  *                       it is mapped to.
1626  * @dev: drm device
1627  * @data: ioctl data blob
1628  * @file: drm file
1629  *
1630  * While the mapping holds a reference on the contents of the object, it doesn't
1631  * imply a ref on the object itself.
1632  *
1633  * IMPORTANT:
1634  *
1635  * DRM driver writers who look at this function as an example of how to do GEM
1636  * mmap support: please don't implement mmap support like this. The modern way
1637  * to implement DRM mmap support is with an mmap offset ioctl (like
1638  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1639  * That way debug tooling like valgrind will understand what's going on; hiding
1640  * the mmap call in a driver-private ioctl will break that. The i915 driver only
1641  * does cpu mmaps this way because we didn't know better.
1642  */
1643 int
1644 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1645                     struct drm_file *file)
1646 {
1647         struct drm_i915_gem_mmap *args = data;
1648         struct drm_i915_gem_object *obj;
1649         unsigned long addr;
1650
1651         if (args->flags & ~(I915_MMAP_WC))
1652                 return -EINVAL;
1653
1654         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1655                 return -ENODEV;
1656
1657         obj = i915_gem_object_lookup(file, args->handle);
1658         if (!obj)
1659                 return -ENOENT;
1660
1661         /* prime objects have no backing filp to GEM mmap
1662          * pages from.
1663          */
1664         if (!obj->base.filp) {
1665                 addr = -ENXIO;
1666                 goto err;
1667         }
1668
1669         if (range_overflows(args->offset, args->size, (u64)obj->base.size)) {
1670                 addr = -EINVAL;
1671                 goto err;
1672         }
1673
1674         addr = vm_mmap(obj->base.filp, 0, args->size,
1675                        PROT_READ | PROT_WRITE, MAP_SHARED,
1676                        args->offset);
1677         if (IS_ERR_VALUE(addr))
1678                 goto err;
1679
1680         if (args->flags & I915_MMAP_WC) {
1681                 struct mm_struct *mm = current->mm;
1682                 struct vm_area_struct *vma;
1683
1684                 if (down_write_killable(&mm->mmap_sem)) {
1685                         addr = -EINTR;
1686                         goto err;
1687                 }
1688                 vma = find_vma(mm, addr);
1689                 if (vma && __vma_matches(vma, obj->base.filp, addr, args->size))
1690                         vma->vm_page_prot =
1691                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1692                 else
1693                         addr = -ENOMEM;
1694                 up_write(&mm->mmap_sem);
1695                 if (IS_ERR_VALUE(addr))
1696                         goto err;
1697
1698                 /* This may race, but that's ok, it only gets set */
1699                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1700         }
1701         i915_gem_object_put(obj);
1702
1703         args->addr_ptr = (u64)addr;
1704         return 0;
1705
1706 err:
1707         i915_gem_object_put(obj);
1708         return addr;
1709 }
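
/*
 * Illustrative userspace sketch of the legacy CPU mmap ioctl described above,
 * requesting a write-combining mapping. 'fd', 'handle' and 'size' (the object
 * size in bytes) are assumed to come from elsewhere.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <drm/i915_drm.h>
 *
 *	static void *cpu_mmap_wc(int fd, uint32_t handle, uint64_t size)
 *	{
 *		struct drm_i915_gem_mmap arg;
 *
 *		memset(&arg, 0, sizeof(arg));
 *		arg.handle = handle;
 *		arg.offset = 0;
 *		arg.size = size;
 *		arg.flags = I915_MMAP_WC;
 *
 *		if (ioctl(fd, DRM_IOCTL_I915_GEM_MMAP, &arg))
 *			return NULL;
 *
 *		return (void *)(uintptr_t)arg.addr_ptr;
 *	}
 */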
1710
1711 static unsigned int tile_row_pages(const struct drm_i915_gem_object *obj)
1712 {
1713         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1714 }
1715
1716 /**
1717  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1718  *
1719  * A history of the GTT mmap interface:
1720  *
1721  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to be
1722  *     aligned and suitable for fencing, and still fit into the available
1723  *     mappable space left by the pinned display objects. A classic problem
1724  *     we called the page-fault-of-doom where we would ping-pong between
1725  *     two objects that could not fit inside the GTT and so the memcpy
1726  *     would page one object in at the expense of the other between every
1727  *     single byte.
1728  *
1729  * 1 - Objects can be any size, and have any compatible fencing (X, Y, or none
1730  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1731  *     object is too large for the available space (or simply too large
1732  *     for the mappable aperture!), a view is created instead and faulted
1733  *     into userspace. (This view is aligned and sized appropriately for
1734  *     fenced access.)
1735  *
1736  * 2 - Recognise WC as a separate cache domain so that we can flush the
1737  *     delayed writes via GTT before performing direct access via WC.
1738  *
1739  * 3 - Remove implicit set-domain(GTT) and synchronisation on initial
1740  *     pagefault; swapin remains transparent.
1741  *
1742  * Restrictions:
1743  *
1744  *  * snoopable objects cannot be accessed via the GTT. Doing so can cause machine
1745  *    hangs on some architectures, corruption on others. An attempt to service
1746  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1747  *
1748  *  * the object must be able to fit into RAM (physical memory, though not
1749  *    limited to the mappable aperture).
1750  *
1751  *
1752  * Caveats:
1753  *
1754  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1755  *    all data to system memory. Subsequent access will not be synchronized.
1756  *
1757  *  * all mappings are revoked on runtime device suspend.
1758  *
1759  *  * there are only 8, 16 or 32 fence registers to share between all users
1760  *    (older machines require a fence register for display and blitter access
1761  *    as well). Contention of the fence registers will cause the previous users
1762  *    to be unmapped and any new access will generate new page faults.
1763  *
1764  *  * running out of memory while servicing a fault may generate a SIGBUS,
1765  *    rather than the expected SIGSEGV.
1766  */
1767 int i915_gem_mmap_gtt_version(void)
1768 {
1769         return 3;
1770 }
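
/*
 * Illustrative userspace sketch: the feature level documented above can be
 * queried through I915_PARAM_MMAP_GTT_VERSION. 'fd' is an assumed open DRM fd.
 *
 *	#include <sys/ioctl.h>
 *	#include <drm/i915_drm.h>
 *
 *	static int query_mmap_gtt_version(int fd)
 *	{
 *		int value = 0;
 *		struct drm_i915_getparam gp = {
 *			.param = I915_PARAM_MMAP_GTT_VERSION,
 *			.value = &value,
 *		};
 *
 *		if (ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
 *			return -1;
 *
 *		return value;
 *	}
 */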
1771
1772 static inline struct i915_ggtt_view
1773 compute_partial_view(const struct drm_i915_gem_object *obj,
1774                      pgoff_t page_offset,
1775                      unsigned int chunk)
1776 {
1777         struct i915_ggtt_view view;
1778
1779         if (i915_gem_object_is_tiled(obj))
1780                 chunk = roundup(chunk, tile_row_pages(obj));
1781
1782         view.type = I915_GGTT_VIEW_PARTIAL;
1783         view.partial.offset = rounddown(page_offset, chunk);
1784         view.partial.size =
1785                 min_t(unsigned int, chunk,
1786                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1787
1788         /* If the partial covers the entire object, just create a normal VMA. */
1789         if (chunk >= obj->base.size >> PAGE_SHIFT)
1790                 view.type = I915_GGTT_VIEW_NORMAL;
1791
1792         return view;
1793 }
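
/*
 * Worked example (illustrative, assuming 4KiB pages): for an untiled 16MiB
 * object (4096 pages), a fault at page_offset 1000 with chunk =
 * MIN_CHUNK_PAGES (1MiB >> PAGE_SHIFT = 256 pages) gives
 * view.partial.offset = rounddown(1000, 256) = 768 and
 * view.partial.size = min(256, 4096 - 768) = 256, i.e. a 1MiB window over
 * pages [768, 1024). Only if the chunk covered all 4096 pages would the view
 * collapse back to I915_GGTT_VIEW_NORMAL.
 */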
1794
1795 /**
1796  * i915_gem_fault - fault a page into the GTT
1797  * @vmf: fault info
1798  *
1799  * The fault handler is set up by drm_gem_mmap() when an object is GTT mapped
1800  * from userspace.  The fault handler takes care of binding the object to
1801  * the GTT (if needed), allocating and programming a fence register (again,
1802  * only if needed based on whether the old reg is still valid or the object
1803  * is tiled) and inserting a new PTE into the faulting process.
1804  *
1805  * Note that the faulting process may involve evicting existing objects
1806  * from the GTT and/or fence registers to make room.  So performance may
1807  * suffer if the GTT working set is large or there are few fence registers
1808  * left.
1809  *
1810  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1811  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1812  */
1813 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
1814 {
1815 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
1816         struct vm_area_struct *area = vmf->vma;
1817         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1818         struct drm_device *dev = obj->base.dev;
1819         struct drm_i915_private *dev_priv = to_i915(dev);
1820         struct i915_ggtt *ggtt = &dev_priv->ggtt;
1821         bool write = area->vm_flags & VM_WRITE;
1822         intel_wakeref_t wakeref;
1823         struct i915_vma *vma;
1824         pgoff_t page_offset;
1825         int srcu;
1826         int ret;
1827
1828         /* Sanity check that we allow writing into this object */
1829         if (i915_gem_object_is_readonly(obj) && write)
1830                 return VM_FAULT_SIGBUS;
1831
1832         /* We don't use vmf->pgoff since that has the fake offset */
1833         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
1834
1835         trace_i915_gem_object_fault(obj, page_offset, true, write);
1836
1837         ret = i915_gem_object_pin_pages(obj);
1838         if (ret)
1839                 goto err;
1840
1841         wakeref = intel_runtime_pm_get(dev_priv);
1842
1843         srcu = i915_reset_trylock(dev_priv);
1844         if (srcu < 0) {
1845                 ret = srcu;
1846                 goto err_rpm;
1847         }
1848
1849         ret = i915_mutex_lock_interruptible(dev);
1850         if (ret)
1851                 goto err_reset;
1852
1853         /* Access to snoopable pages through the GTT is incoherent. */
1854         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
1855                 ret = -EFAULT;
1856                 goto err_unlock;
1857         }
1858
1859         /* Now pin it into the GTT as needed */
1860         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1861                                        PIN_MAPPABLE |
1862                                        PIN_NONBLOCK |
1863                                        PIN_NONFAULT);
1864         if (IS_ERR(vma)) {
1865                 /* Use a partial view if it is bigger than available space */
1866                 struct i915_ggtt_view view =
1867                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
1868                 unsigned int flags;
1869
1870                 flags = PIN_MAPPABLE;
1871                 if (view.type == I915_GGTT_VIEW_NORMAL)
1872                         flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
1873
1874                 /*
1875                  * Userspace is now writing through an untracked VMA, abandon
1876                  * all hope that the hardware is able to track future writes.
1877                  */
1878                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
1879
1880                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1881                 if (IS_ERR(vma) && !view.type) {
1882                         flags = PIN_MAPPABLE;
1883                         view.type = I915_GGTT_VIEW_PARTIAL;
1884                         vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1885                 }
1886         }
1887         if (IS_ERR(vma)) {
1888                 ret = PTR_ERR(vma);
1889                 goto err_unlock;
1890         }
1891
1892         ret = i915_vma_pin_fence(vma);
1893         if (ret)
1894                 goto err_unpin;
1895
1896         /* Finally, remap it using the new GTT offset */
1897         ret = remap_io_mapping(area,
1898                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
1899                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
1900                                min_t(u64, vma->size, area->vm_end - area->vm_start),
1901                                &ggtt->iomap);
1902         if (ret)
1903                 goto err_fence;
1904
1905         /* Mark as being mmapped into userspace for later revocation */
1906         assert_rpm_wakelock_held(dev_priv);
1907         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
1908                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
1909         GEM_BUG_ON(!obj->userfault_count);
1910
1911         i915_vma_set_ggtt_write(vma);
1912
1913 err_fence:
1914         i915_vma_unpin_fence(vma);
1915 err_unpin:
1916         __i915_vma_unpin(vma);
1917 err_unlock:
1918         mutex_unlock(&dev->struct_mutex);
1919 err_reset:
1920         i915_reset_unlock(dev_priv, srcu);
1921 err_rpm:
1922         intel_runtime_pm_put(dev_priv, wakeref);
1923         i915_gem_object_unpin_pages(obj);
1924 err:
1925         switch (ret) {
1926         case -EIO:
1927                 /*
1928                  * We eat errors when the gpu is terminally wedged to avoid
1929                  * userspace unduly crashing (gl has no provisions for mmaps to
1930                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
1931                  * and so needs to be reported.
1932                  */
1933                 if (!i915_terminally_wedged(dev_priv))
1934                         return VM_FAULT_SIGBUS;
1935                 /* else: fall through */
1936         case -EAGAIN:
1937                 /*
1938                  * EAGAIN means the gpu is hung and we'll wait for the error
1939                  * handler to reset everything when re-faulting in
1940                  * i915_mutex_lock_interruptible.
1941                  */
1942         case 0:
1943         case -ERESTARTSYS:
1944         case -EINTR:
1945         case -EBUSY:
1946                 /*
1947                  * EBUSY is ok: this just means that another thread
1948                  * already did the job.
1949                  */
1950                 return VM_FAULT_NOPAGE;
1951         case -ENOMEM:
1952                 return VM_FAULT_OOM;
1953         case -ENOSPC:
1954         case -EFAULT:
1955                 return VM_FAULT_SIGBUS;
1956         default:
1957                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
1958                 return VM_FAULT_SIGBUS;
1959         }
1960 }
1961
1962 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
1963 {
1964         struct i915_vma *vma;
1965
1966         GEM_BUG_ON(!obj->userfault_count);
1967
1968         obj->userfault_count = 0;
1969         list_del(&obj->userfault_link);
1970         drm_vma_node_unmap(&obj->base.vma_node,
1971                            obj->base.dev->anon_inode->i_mapping);
1972
1973         for_each_ggtt_vma(vma, obj)
1974                 i915_vma_unset_userfault(vma);
1975 }
1976
1977 /**
1978  * i915_gem_release_mmap - remove physical page mappings
1979  * @obj: obj in question
1980  *
1981  * Preserve the reservation of the mmap offset with the DRM core code, but
1982  * relinquish ownership of the pages back to the system.
1983  *
1984  * It is vital that we remove the page mapping if we have mapped a tiled
1985  * object through the GTT and then lose the fence register due to
1986  * resource pressure. Similarly, if the object has been moved out of the
1987  * aperture, then pages mapped into userspace must be revoked. Removing the
1988  * mapping will then trigger a page fault on the next user access, allowing
1989  * fixup by i915_gem_fault().
1990  */
1991 void
1992 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
1993 {
1994         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1995         intel_wakeref_t wakeref;
1996
1997         /* Serialisation between user GTT access and our code depends upon
1998          * revoking the CPU's PTE whilst the mutex is held. The next user
1999          * pagefault then has to wait until we release the mutex.
2000          *
2001          * Note that RPM complicates somewhat by adding an additional
2002          * requirement that operations to the GGTT be made holding the RPM
2003          * wakeref.
2004          */
2005         lockdep_assert_held(&i915->drm.struct_mutex);
2006         wakeref = intel_runtime_pm_get(i915);
2007
2008         if (!obj->userfault_count)
2009                 goto out;
2010
2011         __i915_gem_object_release_mmap(obj);
2012
2013         /* Ensure that the CPU's PTEs are revoked and there are no outstanding
2014          * memory transactions from userspace before we return. The TLB
2015          * flushing implied by changing the PTE above *should* be
2016          * sufficient; an extra barrier here just provides us with a bit
2017          * of paranoid documentation about our requirement to serialise
2018          * memory writes before touching registers / GSM.
2019          */
2020         wmb();
2021
2022 out:
2023         intel_runtime_pm_put(i915, wakeref);
2024 }
2025
2026 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2027 {
2028         struct drm_i915_gem_object *obj, *on;
2029         int i;
2030
2031         /*
2032          * Only called during RPM suspend. All users of the userfault_list
2033          * must be holding an RPM wakeref to ensure that this can not
2034          * run concurrently with themselves (and use the struct_mutex for
2035          * protection between themselves).
2036          */
2037
2038         list_for_each_entry_safe(obj, on,
2039                                  &dev_priv->mm.userfault_list, userfault_link)
2040                 __i915_gem_object_release_mmap(obj);
2041
2042         /* The fences will be lost when the device powers down. If any were
2043          * in use by hardware (i.e. they are pinned), we should not be powering
2044          * down! All other fences will be reacquired by the user upon waking.
2045          */
2046         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2047                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2048
2049                 /* Ideally we want to assert that the fence register is not
2050                  * live at this point (i.e. that no piece of code will be
2051                  * trying to write through fence + GTT, as that not only violates
2052                  * our tracking of activity and the associated locking/barriers,
2053                  * but is also illegal given that the hw is powered down).
2054                  *
2055                  * Previously we used reg->pin_count as a "liveness" indicator.
2056                  * That is not sufficient, and we need a more fine-grained
2057                  * tool if we want to have a sanity check here.
2058                  */
2059
2060                 if (!reg->vma)
2061                         continue;
2062
2063                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2064                 reg->dirty = true;
2065         }
2066 }
2067
2068 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2069 {
2070         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2071         int err;
2072
2073         err = drm_gem_create_mmap_offset(&obj->base);
2074         if (likely(!err))
2075                 return 0;
2076
2077         /* Attempt to reap some mmap space from dead objects */
2078         do {
2079                 err = i915_gem_wait_for_idle(dev_priv,
2080                                              I915_WAIT_INTERRUPTIBLE,
2081                                              MAX_SCHEDULE_TIMEOUT);
2082                 if (err)
2083                         break;
2084
2085                 i915_gem_drain_freed_objects(dev_priv);
2086                 err = drm_gem_create_mmap_offset(&obj->base);
2087                 if (!err)
2088                         break;
2089
2090         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2091
2092         return err;
2093 }
2094
2095 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2096 {
2097         drm_gem_free_mmap_offset(&obj->base);
2098 }
2099
2100 int
2101 i915_gem_mmap_gtt(struct drm_file *file,
2102                   struct drm_device *dev,
2103                   u32 handle,
2104                   u64 *offset)
2105 {
2106         struct drm_i915_gem_object *obj;
2107         int ret;
2108
2109         obj = i915_gem_object_lookup(file, handle);
2110         if (!obj)
2111                 return -ENOENT;
2112
2113         ret = i915_gem_object_create_mmap_offset(obj);
2114         if (ret == 0)
2115                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2116
2117         i915_gem_object_put(obj);
2118         return ret;
2119 }
2120
2121 /**
2122  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2123  * @dev: DRM device
2124  * @data: GTT mapping ioctl data
2125  * @file: drm file
2126  *
2127  * Simply returns the fake offset to userspace so it can mmap it.
2128  * The mmap call will end up in drm_gem_mmap(), which will set things
2129  * up so we can get faults in the handler above.
2130  *
2131  * The fault handler will take care of binding the object into the GTT
2132  * (since it may have been evicted to make room for something), allocating
2133  * a fence register, and mapping the appropriate aperture address into
2134  * userspace.
2135  */
2136 int
2137 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2138                         struct drm_file *file)
2139 {
2140         struct drm_i915_gem_mmap_gtt *args = data;
2141
2142         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2143 }
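
/*
 * Illustrative userspace sketch of the pattern described above: fetch the
 * fake offset for a GEM handle and then mmap the DRM fd itself. 'fd',
 * 'handle' and 'size' are assumed to come from elsewhere; the caller checks
 * the returned pointer against MAP_FAILED.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/mman.h>
 *	#include <drm/i915_drm.h>
 *
 *	static void *gtt_mmap(int fd, uint32_t handle, uint64_t size)
 *	{
 *		struct drm_i915_gem_mmap_gtt arg;
 *
 *		memset(&arg, 0, sizeof(arg));
 *		arg.handle = handle;
 *
 *		if (ioctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &arg))
 *			return MAP_FAILED;
 *
 *		return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *			    fd, arg.offset);
 *	}
 */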
2144
2145 /* Immediately discard the backing storage */
2146 static void
2147 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2148 {
2149         i915_gem_object_free_mmap_offset(obj);
2150
2151         if (obj->base.filp == NULL)
2152                 return;
2153
2154         /* Our goal here is to return as much of the memory as
2155          * possible back to the system as we are called from OOM.
2156          * To do this we must instruct the shmfs to drop all of its
2157          * backing pages, *now*.
2158          */
2159         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2160         obj->mm.madv = __I915_MADV_PURGED;
2161         obj->mm.pages = ERR_PTR(-EFAULT);
2162 }
2163
2164 /* Try to discard unwanted pages */
2165 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2166 {
2167         struct address_space *mapping;
2168
2169         lockdep_assert_held(&obj->mm.lock);
2170         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2171
2172         switch (obj->mm.madv) {
2173         case I915_MADV_DONTNEED:
2174                 i915_gem_object_truncate(obj);
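		/* fall through: a truncated object is treated as purged */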
2175         case __I915_MADV_PURGED:
2176                 return;
2177         }
2178
2179         if (obj->base.filp == NULL)
2180                 return;
2181
2182         mapping = obj->base.filp->f_mapping;
2183         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2184 }
2185
2186 /*
2187  * Move pages to appropriate lru and release the pagevec, decrementing the
2188  * ref count of those pages.
2189  */
2190 static void check_release_pagevec(struct pagevec *pvec)
2191 {
2192         check_move_unevictable_pages(pvec);
2193         __pagevec_release(pvec);
2194         cond_resched();
2195 }
2196
2197 static void
2198 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2199                               struct sg_table *pages)
2200 {
2201         struct sgt_iter sgt_iter;
2202         struct pagevec pvec;
2203         struct page *page;
2204
2205         __i915_gem_object_release_shmem(obj, pages, true);
2206         i915_gem_gtt_finish_pages(obj, pages);
2207
2208         if (i915_gem_object_needs_bit17_swizzle(obj))
2209                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2210
2211         mapping_clear_unevictable(file_inode(obj->base.filp)->i_mapping);
2212
2213         pagevec_init(&pvec);
2214         for_each_sgt_page(page, sgt_iter, pages) {
2215                 if (obj->mm.dirty)
2216                         set_page_dirty(page);
2217
2218                 if (obj->mm.madv == I915_MADV_WILLNEED)
2219                         mark_page_accessed(page);
2220
2221                 if (!pagevec_add(&pvec, page))
2222                         check_release_pagevec(&pvec);
2223         }
2224         if (pagevec_count(&pvec))
2225                 check_release_pagevec(&pvec);
2226         obj->mm.dirty = false;
2227
2228         sg_free_table(pages);
2229         kfree(pages);
2230 }
2231
2232 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2233 {
2234         struct radix_tree_iter iter;
2235         void __rcu **slot;
2236
2237         rcu_read_lock();
2238         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2239                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2240         rcu_read_unlock();
2241 }
2242
2243 static struct sg_table *
2244 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2245 {
2246         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2247         struct sg_table *pages;
2248
2249         pages = fetch_and_zero(&obj->mm.pages);
2250         if (IS_ERR_OR_NULL(pages))
2251                 return pages;
2252
2253         spin_lock(&i915->mm.obj_lock);
2254         list_del(&obj->mm.link);
2255         spin_unlock(&i915->mm.obj_lock);
2256
2257         if (obj->mm.mapping) {
2258                 void *ptr;
2259
2260                 ptr = page_mask_bits(obj->mm.mapping);
2261                 if (is_vmalloc_addr(ptr))
2262                         vunmap(ptr);
2263                 else
2264                         kunmap(kmap_to_page(ptr));
2265
2266                 obj->mm.mapping = NULL;
2267         }
2268
2269         __i915_gem_object_reset_page_iter(obj);
2270         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2271
2272         return pages;
2273 }
2274
2275 int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2276                                 enum i915_mm_subclass subclass)
2277 {
2278         struct sg_table *pages;
2279         int ret;
2280
2281         if (i915_gem_object_has_pinned_pages(obj))
2282                 return -EBUSY;
2283
2284         GEM_BUG_ON(obj->bind_count);
2285
2286         /* May be called by shrinker from within get_pages() (on another bo) */
2287         mutex_lock_nested(&obj->mm.lock, subclass);
2288         if (unlikely(atomic_read(&obj->mm.pages_pin_count))) {
2289                 ret = -EBUSY;
2290                 goto unlock;
2291         }
2292
2293         /*
2294          * ->put_pages might need to allocate memory for the bit17 swizzle
2295          * array, hence protect them from being reaped by removing them from gtt
2296          * lists early.
2297          */
2298         pages = __i915_gem_object_unset_pages(obj);
2299
2300         /*
2301          * XXX Temporary hijinx to avoid updating all backends to handle
2302          * NULL pages. In the future, when we have more asynchronous
2303          * get_pages backends we should be better able to handle the
2304          * cancellation of the async task in a more uniform manner.
2305          */
2306         if (!pages && !i915_gem_object_needs_async_cancel(obj))
2307                 pages = ERR_PTR(-EINVAL);
2308
2309         if (!IS_ERR(pages))
2310                 obj->ops->put_pages(obj, pages);
2311
2312         ret = 0;
2313 unlock:
2314         mutex_unlock(&obj->mm.lock);
2315
2316         return ret;
2317 }
2318
2319 bool i915_sg_trim(struct sg_table *orig_st)
2320 {
2321         struct sg_table new_st;
2322         struct scatterlist *sg, *new_sg;
2323         unsigned int i;
2324
2325         if (orig_st->nents == orig_st->orig_nents)
2326                 return false;
2327
2328         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2329                 return false;
2330
2331         new_sg = new_st.sgl;
2332         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2333                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2334                 sg_dma_address(new_sg) = sg_dma_address(sg);
2335                 sg_dma_len(new_sg) = sg_dma_len(sg);
2336
2337                 new_sg = sg_next(new_sg);
2338         }
2339         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2340
2341         sg_free_table(orig_st);
2342
2343         *orig_st = new_st;
2344         return true;
2345 }
2346
2347 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2348 {
2349         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2350         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2351         unsigned long i;
2352         struct address_space *mapping;
2353         struct sg_table *st;
2354         struct scatterlist *sg;
2355         struct sgt_iter sgt_iter;
2356         struct page *page;
2357         unsigned long last_pfn = 0;     /* suppress gcc warning */
2358         unsigned int max_segment = i915_sg_segment_size();
2359         unsigned int sg_page_sizes;
2360         struct pagevec pvec;
2361         gfp_t noreclaim;
2362         int ret;
2363
2364         /*
2365          * Assert that the object is not currently in any GPU domain. As it
2366          * wasn't in the GTT, there shouldn't be any way it could have been in
2367          * a GPU cache
2368          */
2369         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2370         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2371
2372         /*
2373          * If there's no chance of allocating enough pages for the whole
2374          * object, bail early.
2375          */
2376         if (page_count > totalram_pages())
2377                 return -ENOMEM;
2378
2379         st = kmalloc(sizeof(*st), GFP_KERNEL);
2380         if (st == NULL)
2381                 return -ENOMEM;
2382
2383 rebuild_st:
2384         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2385                 kfree(st);
2386                 return -ENOMEM;
2387         }
2388
2389         /*
2390          * Get the list of pages out of our struct file.  They'll be pinned
2391          * at this point until we release them.
2392          *
2393          * Fail silently without starting the shrinker
2394          */
2395         mapping = obj->base.filp->f_mapping;
2396         mapping_set_unevictable(mapping);
2397         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2398         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2399
2400         sg = st->sgl;
2401         st->nents = 0;
2402         sg_page_sizes = 0;
2403         for (i = 0; i < page_count; i++) {
2404                 const unsigned int shrink[] = {
2405                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2406                         0,
2407                 }, *s = shrink;
2408                 gfp_t gfp = noreclaim;
2409
2410                 do {
2411                         cond_resched();
2412                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2413                         if (!IS_ERR(page))
2414                                 break;
2415
2416                         if (!*s) {
2417                                 ret = PTR_ERR(page);
2418                                 goto err_sg;
2419                         }
2420
2421                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2422
2423                         /*
2424                          * We've tried hard to allocate the memory by reaping
2425                          * our own buffer, now let the real VM do its job and
2426                          * go down in flames if truly OOM.
2427                          *
2428                          * However, since graphics tend to be disposable,
2429                          * defer the oom here by reporting the ENOMEM back
2430                          * to userspace.
2431                          */
2432                         if (!*s) {
2433                                 /* reclaim and warn, but no oom */
2434                                 gfp = mapping_gfp_mask(mapping);
2435
2436                                 /*
2437                                  * Our bo are always dirty and so we require
2438                                  * kswapd to reclaim our pages (direct reclaim
2439                                  * does not effectively begin pageout of our
2440                                  * buffers on its own). However, direct reclaim
2441                                  * only waits for kswapd when under allocation
2442                                  * congestion. So as a result __GFP_RECLAIM is
2443                                  * unreliable and fails to actually reclaim our
2444                                  * dirty pages -- unless you try over and over
2445                                  * again with !__GFP_NORETRY. However, we still
2446                                  * want to fail this allocation rather than
2447                                  * trigger the out-of-memory killer and for
2448                                  * this we want __GFP_RETRY_MAYFAIL.
2449                                  */
2450                                 gfp |= __GFP_RETRY_MAYFAIL;
2451                         }
2452                 } while (1);
2453
2454                 if (!i ||
2455                     sg->length >= max_segment ||
2456                     page_to_pfn(page) != last_pfn + 1) {
2457                         if (i) {
2458                                 sg_page_sizes |= sg->length;
2459                                 sg = sg_next(sg);
2460                         }
2461                         st->nents++;
2462                         sg_set_page(sg, page, PAGE_SIZE, 0);
2463                 } else {
2464                         sg->length += PAGE_SIZE;
2465                 }
2466                 last_pfn = page_to_pfn(page);
2467
2468                 /* Check that the i965g/gm workaround works. */
2469                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2470         }
2471         if (sg) { /* loop terminated early; short sg table */
2472                 sg_page_sizes |= sg->length;
2473                 sg_mark_end(sg);
2474         }
2475
2476         /* Trim unused sg entries to avoid wasting memory. */
2477         i915_sg_trim(st);
2478
2479         ret = i915_gem_gtt_prepare_pages(obj, st);
2480         if (ret) {
2481                 /*
2482                  * DMA remapping failed? One possible cause is that
2483                  * it could not reserve enough large entries; asking
2484                  * for PAGE_SIZE chunks instead may be helpful.
2485                  */
2486                 if (max_segment > PAGE_SIZE) {
2487                         for_each_sgt_page(page, sgt_iter, st)
2488                                 put_page(page);
2489                         sg_free_table(st);
2490
2491                         max_segment = PAGE_SIZE;
2492                         goto rebuild_st;
2493                 } else {
2494                         dev_warn(&dev_priv->drm.pdev->dev,
2495                                  "Failed to DMA remap %lu pages\n",
2496                                  page_count);
2497                         goto err_pages;
2498                 }
2499         }
2500
2501         if (i915_gem_object_needs_bit17_swizzle(obj))
2502                 i915_gem_object_do_bit_17_swizzle(obj, st);
2503
2504         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2505
2506         return 0;
2507
2508 err_sg:
2509         sg_mark_end(sg);
2510 err_pages:
2511         mapping_clear_unevictable(mapping);
2512         pagevec_init(&pvec);
2513         for_each_sgt_page(page, sgt_iter, st) {
2514                 if (!pagevec_add(&pvec, page))
2515                         check_release_pagevec(&pvec);
2516         }
2517         if (pagevec_count(&pvec))
2518                 check_release_pagevec(&pvec);
2519         sg_free_table(st);
2520         kfree(st);
2521
2522         /*
2523          * shmemfs first checks if there is enough memory to allocate the page
2524          * and reports ENOSPC if there is not, along with the usual
2525          * ENOMEM for a genuine allocation failure.
2526          *
2527          * We use ENOSPC in our driver to mean that we have run out of aperture
2528          * space and so want to translate the error from shmemfs back to our
2529          * usual understanding of ENOMEM.
2530          */
2531         if (ret == -ENOSPC)
2532                 ret = -ENOMEM;
2533
2534         return ret;
2535 }
2536
2537 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2538                                  struct sg_table *pages,
2539                                  unsigned int sg_page_sizes)
2540 {
2541         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2542         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2543         int i;
2544
2545         lockdep_assert_held(&obj->mm.lock);
2546
2547         /* Make the pages coherent with the GPU (flushing any swapin). */
2548         if (obj->cache_dirty) {
2549                 obj->write_domain = 0;
2550                 if (i915_gem_object_has_struct_page(obj))
2551                         drm_clflush_sg(pages);
2552                 obj->cache_dirty = false;
2553         }
2554
2555         obj->mm.get_page.sg_pos = pages->sgl;
2556         obj->mm.get_page.sg_idx = 0;
2557
2558         obj->mm.pages = pages;
2559
2560         if (i915_gem_object_is_tiled(obj) &&
2561             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2562                 GEM_BUG_ON(obj->mm.quirked);
2563                 __i915_gem_object_pin_pages(obj);
2564                 obj->mm.quirked = true;
2565         }
2566
2567         GEM_BUG_ON(!sg_page_sizes);
2568         obj->mm.page_sizes.phys = sg_page_sizes;
2569
2570         /*
2571          * Calculate the supported page-sizes which fit into the given
2572          * sg_page_sizes. This will give us the page-sizes which we may be able
2573          * to use opportunistically when later inserting into the GTT. For
2574          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2575          * 64K or 4K pages, although in practice this will depend on a number of
2576          * other factors.
2577          */
2578         obj->mm.page_sizes.sg = 0;
2579         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2580                 if (obj->mm.page_sizes.phys & ~0u << i)
2581                         obj->mm.page_sizes.sg |= BIT(i);
2582         }
2583         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2584
2585         spin_lock(&i915->mm.obj_lock);
2586         list_add(&obj->mm.link, &i915->mm.unbound_list);
2587         spin_unlock(&i915->mm.obj_lock);
2588 }
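
/*
 * Worked example (illustrative) of the page-size derivation above: with
 * supported = 4K | 64K | 2M and phys = 2M | 64K (one 2M segment plus one
 * 64K segment), every supported size up to the largest physical chunk passes
 * the 'phys & ~0u << i' test, so page_sizes.sg becomes 4K | 64K | 2M. Had the
 * largest segment been only 64K, the 2M bit would not be set and
 * page_sizes.sg would be 4K | 64K.
 */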
2589
2590 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2591 {
2592         int err;
2593
2594         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2595                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2596                 return -EFAULT;
2597         }
2598
2599         err = obj->ops->get_pages(obj);
2600         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2601
2602         return err;
2603 }
2604
2605 /* Ensure that the associated pages are gathered from the backing storage
2606  * and pinned into our object. i915_gem_object_pin_pages() may be called
2607  * multiple times before they are released by a single call to
2608  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2609  * either as a result of memory pressure (reaping pages under the shrinker)
2610  * or as the object is itself released.
2611  */
2612 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2613 {
2614         int err;
2615
2616         err = mutex_lock_interruptible(&obj->mm.lock);
2617         if (err)
2618                 return err;
2619
2620         if (unlikely(!i915_gem_object_has_pages(obj))) {
2621                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2622
2623                 err = ____i915_gem_object_get_pages(obj);
2624                 if (err)
2625                         goto unlock;
2626
2627                 smp_mb__before_atomic();
2628         }
2629         atomic_inc(&obj->mm.pages_pin_count);
2630
2631 unlock:
2632         mutex_unlock(&obj->mm.lock);
2633         return err;
2634 }
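
/*
 * A minimal in-kernel usage sketch (not taken from a real caller): pin the
 * backing pages around a CPU access so the shrinker cannot reclaim them,
 * then drop the pin again. 'obj' is an assumed object pointer.
 *
 *	int err;
 *
 *	err = i915_gem_object_pin_pages(obj);
 *	if (err)
 *		return err;
 *
 *	... access obj->mm.pages here ...
 *
 *	i915_gem_object_unpin_pages(obj);
 */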
2635
2636 /* The 'mapping' part of i915_gem_object_pin_map() below */
2637 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2638                                  enum i915_map_type type)
2639 {
2640         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2641         struct sg_table *sgt = obj->mm.pages;
2642         struct sgt_iter sgt_iter;
2643         struct page *page;
2644         struct page *stack_pages[32];
2645         struct page **pages = stack_pages;
2646         unsigned long i = 0;
2647         pgprot_t pgprot;
2648         void *addr;
2649
2650         /* A single page can always be kmapped */
2651         if (n_pages == 1 && type == I915_MAP_WB)
2652                 return kmap(sg_page(sgt->sgl));
2653
2654         if (n_pages > ARRAY_SIZE(stack_pages)) {
2655                 /* Too big for stack -- allocate temporary array instead */
2656                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2657                 if (!pages)
2658                         return NULL;
2659         }
2660
2661         for_each_sgt_page(page, sgt_iter, sgt)
2662                 pages[i++] = page;
2663
2664         /* Check that we have the expected number of pages */
2665         GEM_BUG_ON(i != n_pages);
2666
2667         switch (type) {
2668         default:
2669                 MISSING_CASE(type);
2670                 /* fallthrough to use PAGE_KERNEL anyway */
2671         case I915_MAP_WB:
2672                 pgprot = PAGE_KERNEL;
2673                 break;
2674         case I915_MAP_WC:
2675                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2676                 break;
2677         }
2678         addr = vmap(pages, n_pages, 0, pgprot);
2679
2680         if (pages != stack_pages)
2681                 kvfree(pages);
2682
2683         return addr;
2684 }
2685
2686 /* get, pin, and map the pages of the object into kernel space */
2687 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2688                               enum i915_map_type type)
2689 {
2690         enum i915_map_type has_type;
2691         bool pinned;
2692         void *ptr;
2693         int ret;
2694
2695         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2696                 return ERR_PTR(-ENXIO);
2697
2698         ret = mutex_lock_interruptible(&obj->mm.lock);
2699         if (ret)
2700                 return ERR_PTR(ret);
2701
2702         pinned = !(type & I915_MAP_OVERRIDE);
2703         type &= ~I915_MAP_OVERRIDE;
2704
2705         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2706                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2707                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2708
2709                         ret = ____i915_gem_object_get_pages(obj);
2710                         if (ret)
2711                                 goto err_unlock;
2712
2713                         smp_mb__before_atomic();
2714                 }
2715                 atomic_inc(&obj->mm.pages_pin_count);
2716                 pinned = false;
2717         }
2718         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2719
2720         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2721         if (ptr && has_type != type) {
2722                 if (pinned) {
2723                         ret = -EBUSY;
2724                         goto err_unpin;
2725                 }
2726
2727                 if (is_vmalloc_addr(ptr))
2728                         vunmap(ptr);
2729                 else
2730                         kunmap(kmap_to_page(ptr));
2731
2732                 ptr = obj->mm.mapping = NULL;
2733         }
2734
2735         if (!ptr) {
2736                 ptr = i915_gem_object_map(obj, type);
2737                 if (!ptr) {
2738                         ret = -ENOMEM;
2739                         goto err_unpin;
2740                 }
2741
2742                 obj->mm.mapping = page_pack_bits(ptr, type);
2743         }
2744
2745 out_unlock:
2746         mutex_unlock(&obj->mm.lock);
2747         return ptr;
2748
2749 err_unpin:
2750         atomic_dec(&obj->mm.pages_pin_count);
2751 err_unlock:
2752         ptr = ERR_PTR(ret);
2753         goto out_unlock;
2754 }
2755
2756 void __i915_gem_object_flush_map(struct drm_i915_gem_object *obj,
2757                                  unsigned long offset,
2758                                  unsigned long size)
2759 {
2760         enum i915_map_type has_type;
2761         void *ptr;
2762
2763         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
2764         GEM_BUG_ON(range_overflows_t(typeof(obj->base.size),
2765                                      offset, size, obj->base.size));
2766
2767         obj->mm.dirty = true;
2768
2769         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE)
2770                 return;
2771
2772         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2773         if (has_type == I915_MAP_WC)
2774                 return;
2775
2776         drm_clflush_virt_range(ptr + offset, size);
2777         if (size == obj->base.size) {
2778                 obj->write_domain &= ~I915_GEM_DOMAIN_CPU;
2779                 obj->cache_dirty = false;
2780         }
2781 }
2782
2783 static int
2784 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2785                            const struct drm_i915_gem_pwrite *arg)
2786 {
2787         struct address_space *mapping = obj->base.filp->f_mapping;
2788         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2789         u64 remain, offset;
2790         unsigned int pg;
2791
2792         /* Caller already validated user args */
2793         GEM_BUG_ON(!access_ok(user_data, arg->size));
2794
2795         /*
2796          * Before we instantiate/pin the backing store for our use, we
2797          * can prepopulate the shmemfs filp efficiently using a write into
2798          * the pagecache. We avoid the penalty of instantiating all the
2799          * pages, which matters if the user is just writing to a few and never
2800          * uses the object on the GPU. A direct write into shmemfs also avoids
2801          * the cost of retrieving a page (either swapin or clearing-before-use)
2802          * before it is overwritten.
2803          */
2804         if (i915_gem_object_has_pages(obj))
2805                 return -ENODEV;
2806
2807         if (obj->mm.madv != I915_MADV_WILLNEED)
2808                 return -EFAULT;
2809
2810         /*
2811          * Before the pages are instantiated the object is treated as being
2812          * in the CPU domain. The pages will be clflushed as required before
2813          * use, and we can freely write into the pages directly. If userspace
2814          * races pwrite with any other operation, corruption will ensue -
2815          * that is userspace's prerogative!
2816          */
2817
2818         remain = arg->size;
2819         offset = arg->offset;
2820         pg = offset_in_page(offset);
2821
2822         do {
2823                 unsigned int len, unwritten;
2824                 struct page *page;
2825                 void *data, *vaddr;
2826                 int err;
2827                 char c;
2828
2829                 len = PAGE_SIZE - pg;
2830                 if (len > remain)
2831                         len = remain;
2832
2833                 /* Prefault the user page to reduce potential recursion */
2834                 err = __get_user(c, user_data);
2835                 if (err)
2836                         return err;
2837
2838                 err = __get_user(c, user_data + len - 1);
2839                 if (err)
2840                         return err;
2841
2842                 err = pagecache_write_begin(obj->base.filp, mapping,
2843                                             offset, len, 0,
2844                                             &page, &data);
2845                 if (err < 0)
2846                         return err;
2847
2848                 vaddr = kmap_atomic(page);
2849                 unwritten = __copy_from_user_inatomic(vaddr + pg,
2850                                                       user_data,
2851                                                       len);
2852                 kunmap_atomic(vaddr);
2853
2854                 err = pagecache_write_end(obj->base.filp, mapping,
2855                                           offset, len, len - unwritten,
2856                                           page, data);
2857                 if (err < 0)
2858                         return err;
2859
2860                 /* We don't handle -EFAULT, leave it to the caller to check */
2861                 if (unwritten)
2862                         return -ENODEV;
2863
2864                 remain -= len;
2865                 user_data += len;
2866                 offset += len;
2867                 pg = 0;
2868         } while (remain);
2869
2870         return 0;
2871 }
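
/*
 * Illustrative userspace sketch of the pwrite ioctl whose shmemfs
 * prepopulation fast path is implemented above. 'fd' and 'handle' are
 * assumed; on a never-instantiated object the copy may be serviced entirely
 * through the pagecache shortcut.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <drm/i915_drm.h>
 *
 *	static int gem_pwrite(int fd, uint32_t handle, uint64_t offset,
 *			      const void *data, uint64_t size)
 *	{
 *		struct drm_i915_gem_pwrite arg;
 *
 *		memset(&arg, 0, sizeof(arg));
 *		arg.handle = handle;
 *		arg.offset = offset;
 *		arg.size = size;
 *		arg.data_ptr = (uintptr_t)data;
 *
 *		return ioctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &arg);
 *	}
 */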
2872
2873 static void
2874 i915_gem_retire_work_handler(struct work_struct *work)
2875 {
2876         struct drm_i915_private *dev_priv =
2877                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
2878         struct drm_device *dev = &dev_priv->drm;
2879
2880         /* Come back later if the device is busy... */
2881         if (mutex_trylock(&dev->struct_mutex)) {
2882                 i915_retire_requests(dev_priv);
2883                 mutex_unlock(&dev->struct_mutex);
2884         }
2885
2886         /*
2887          * Keep the retire handler running until we are finally idle.
2888          * We do not need to do this test under locking as in the worst-case
2889          * we queue the retire worker once too often.
2890          */
2891         if (READ_ONCE(dev_priv->gt.awake))
2892                 queue_delayed_work(dev_priv->wq,
2893                                    &dev_priv->gt.retire_work,
2894                                    round_jiffies_up_relative(HZ));
2895 }
2896
2897 static bool switch_to_kernel_context_sync(struct drm_i915_private *i915,
2898                                           unsigned long mask)
2899 {
2900         bool result = true;
2901
2902         /*
2903          * Even if we fail to switch, give whatever is running a small chance
2904          * to save itself before we report the failure. Yes, this may be a
2905          * false positive due to e.g. ENOMEM, caveat emptor!
2906          */
2907         if (i915_gem_switch_to_kernel_context(i915, mask))
2908                 result = false;
2909
2910         if (i915_gem_wait_for_idle(i915,
2911                                    I915_WAIT_LOCKED |
2912                                    I915_WAIT_FOR_IDLE_BOOST,
2913                                    I915_GEM_IDLE_TIMEOUT))
2914                 result = false;
2915
2916         if (!result) {
2917                 if (i915_modparams.reset) { /* XXX hide warning from gem_eio */
2918                         dev_err(i915->drm.dev,
2919                                 "Failed to idle engines, declaring wedged!\n");
2920                         GEM_TRACE_DUMP();
2921                 }
2922
2923                 /* Forcibly cancel outstanding work and leave the gpu quiet. */
2924                 i915_gem_set_wedged(i915);
2925         }
2926
2927         i915_retire_requests(i915); /* ensure we flush after wedging */
2928         return result;
2929 }
2930
2931 static bool load_power_context(struct drm_i915_private *i915)
2932 {
2933         /* Force loading the kernel context on all engines */
2934         if (!switch_to_kernel_context_sync(i915, ALL_ENGINES))
2935                 return false;
2936
2937         /*
2938          * Immediately park the GPU so that we enable powersaving and
2939          * treat it as idle. The next time we issue a request, we will
2940          * unpark and start using the engine->pinned_default_state, otherwise
2941          * it is in limbo and an early reset may fail.
2942          */
2943         __i915_gem_park(i915);
2944
2945         return true;
2946 }
2947
2948 static void
2949 i915_gem_idle_work_handler(struct work_struct *work)
2950 {
2951         struct drm_i915_private *i915 =
2952                 container_of(work, typeof(*i915), gt.idle_work.work);
2953         bool rearm_hangcheck;
2954
2955         if (!READ_ONCE(i915->gt.awake))
2956                 return;
2957
2958         if (READ_ONCE(i915->gt.active_requests))
2959                 return;
2960
2961         rearm_hangcheck =
2962                 cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
2963
2964         if (!mutex_trylock(&i915->drm.struct_mutex)) {
2965                 /* Currently busy, come back later */
2966                 mod_delayed_work(i915->wq,
2967                                  &i915->gt.idle_work,
2968                                  msecs_to_jiffies(50));
2969                 goto out_rearm;
2970         }
2971
2972         /*
2973          * Flush out the last user context, leaving only the pinned
2974          * kernel context resident. Should anything unfortunate happen
2975          * while we are idle (such as the GPU being power cycled), no users
2976          * will be harmed.
2977          */
2978         if (!work_pending(&i915->gt.idle_work.work) &&
2979             !i915->gt.active_requests) {
2980                 ++i915->gt.active_requests; /* don't requeue idle */
2981
2982                 switch_to_kernel_context_sync(i915, i915->gt.active_engines);
2983
2984                 if (!--i915->gt.active_requests) {
2985                         __i915_gem_park(i915);
2986                         rearm_hangcheck = false;
2987                 }
2988         }
2989
2990         mutex_unlock(&i915->drm.struct_mutex);
2991
2992 out_rearm:
2993         if (rearm_hangcheck) {
2994                 GEM_BUG_ON(!i915->gt.awake);
2995                 i915_queue_hangcheck(i915);
2996         }
2997 }
2998
2999 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3000 {
3001         struct drm_i915_private *i915 = to_i915(gem->dev);
3002         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3003         struct drm_i915_file_private *fpriv = file->driver_priv;
3004         struct i915_lut_handle *lut, *ln;
3005
3006         mutex_lock(&i915->drm.struct_mutex);
3007
3008         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3009                 struct i915_gem_context *ctx = lut->ctx;
3010                 struct i915_vma *vma;
3011
3012                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3013                 if (ctx->file_priv != fpriv)
3014                         continue;
3015
3016                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3017                 GEM_BUG_ON(vma->obj != obj);
3018
3019                 /* We allow the process to have multiple handles to the same
3020                  * vma, in the same fd namespace, by virtue of flink/open.
3021                  */
3022                 GEM_BUG_ON(!vma->open_count);
3023                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3024                         i915_vma_close(vma);
3025
3026                 list_del(&lut->obj_link);
3027                 list_del(&lut->ctx_link);
3028
3029                 i915_lut_handle_free(lut);
3030                 __i915_gem_object_release_unless_active(obj);
3031         }
3032
3033         mutex_unlock(&i915->drm.struct_mutex);
3034 }
3035
3036 static unsigned long to_wait_timeout(s64 timeout_ns)
3037 {
3038         if (timeout_ns < 0)
3039                 return MAX_SCHEDULE_TIMEOUT;
3040
3041         if (timeout_ns == 0)
3042                 return 0;
3043
3044         return nsecs_to_jiffies_timeout(timeout_ns);
3045 }
3046
3047 /**
3048  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3049  * @dev: drm device pointer
3050  * @data: ioctl data blob
3051  * @file: drm file pointer
3052  *
3053  * Returns 0 if successful, else an error is returned with the remaining time in
3054  * the timeout parameter.
3055  *  -ETIME: object is still busy after timeout
3056  *  -ERESTARTSYS: signal interrupted the wait
3057  *  -ENOENT: object doesn't exist
3058  * Also possible, but rare:
3059  *  -EAGAIN: incomplete, restart syscall
3060  *  -ENOMEM: damn
3061  *  -ENODEV: Internal IRQ fail
3062  *  -E?: The add request failed
3063  *
3064  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3065  * non-zero timeout parameter the wait ioctl will wait for the given number of
3066  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3067  * without holding struct_mutex the object may become re-busied before this
3068  * function completes. A similar but shorter race condition exists in the busy
3069  * ioctl.
3070  */
3071 int
3072 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3073 {
3074         struct drm_i915_gem_wait *args = data;
3075         struct drm_i915_gem_object *obj;
3076         ktime_t start;
3077         long ret;
3078
3079         if (args->flags != 0)
3080                 return -EINVAL;
3081
3082         obj = i915_gem_object_lookup(file, args->bo_handle);
3083         if (!obj)
3084                 return -ENOENT;
3085
3086         start = ktime_get();
3087
3088         ret = i915_gem_object_wait(obj,
3089                                    I915_WAIT_INTERRUPTIBLE |
3090                                    I915_WAIT_PRIORITY |
3091                                    I915_WAIT_ALL,
3092                                    to_wait_timeout(args->timeout_ns));
3093
3094         if (args->timeout_ns > 0) {
3095                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3096                 if (args->timeout_ns < 0)
3097                         args->timeout_ns = 0;
3098
3099                 /*
3100                  * Apparently ktime isn't accurate enough and occasionally has a
3101                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3102                  * things up to make the test happy. We allow up to 1 jiffy.
3103                  *
3104                  * This is a regression from the timespec->ktime conversion.
3105                  */
3106                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3107                         args->timeout_ns = 0;
3108
3109                 /* Asked to wait beyond the jiffie/scheduler precision? */
3110                 if (ret == -ETIME && args->timeout_ns)
3111                         ret = -EAGAIN;
3112         }
3113
3114         i915_gem_object_put(obj);
3115         return ret;
3116 }
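
/*
 * Illustrative sketch, not part of the driver: userspace typically reaches
 * this ioctl through libdrm along the lines below; the fd and handle values
 * are assumptions for the example only.
 *
 *	struct drm_i915_gem_wait wait = {
 *		.bo_handle = handle,
 *		.timeout_ns = 100 * 1000 * 1000,	// 100ms budget
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_WAIT, &wait) && errno == ETIME) {
 *		// Still busy when the budget ran out; whatever time remained
 *		// has been written back into wait.timeout_ns.
 *	}
 */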
3117
3118 static int wait_for_engines(struct drm_i915_private *i915)
3119 {
3120         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3121                 dev_err(i915->drm.dev,
3122                         "Failed to idle engines, declaring wedged!\n");
3123                 GEM_TRACE_DUMP();
3124                 i915_gem_set_wedged(i915);
3125                 return -EIO;
3126         }
3127
3128         return 0;
3129 }
3130
3131 static long
3132 wait_for_timelines(struct drm_i915_private *i915,
3133                    unsigned int flags, long timeout)
3134 {
3135         struct i915_gt_timelines *gt = &i915->gt.timelines;
3136         struct i915_timeline *tl;
3137
3138         if (!READ_ONCE(i915->gt.active_requests))
3139                 return timeout;
3140
3141         mutex_lock(&gt->mutex);
3142         list_for_each_entry(tl, &gt->active_list, link) {
3143                 struct i915_request *rq;
3144
3145                 rq = i915_active_request_get_unlocked(&tl->last_request);
3146                 if (!rq)
3147                         continue;
3148
3149                 mutex_unlock(&gt->mutex);
3150
3151                 /*
3152                  * "Race-to-idle".
3153                  *
3154          * Switching to the kernel context is often used as a synchronous
3155                  * step prior to idling, e.g. in suspend for flushing all
3156                  * current operations to memory before sleeping. These we
3157                  * want to complete as quickly as possible to avoid prolonged
3158                  * stalls, so allow the gpu to boost to maximum clocks.
3159                  */
3160                 if (flags & I915_WAIT_FOR_IDLE_BOOST)
3161                         gen6_rps_boost(rq);
3162
3163                 timeout = i915_request_wait(rq, flags, timeout);
3164                 i915_request_put(rq);
3165                 if (timeout < 0)
3166                         return timeout;
3167
3168                 /* restart after reacquiring the lock */
3169                 mutex_lock(&gt->mutex);
3170                 tl = list_entry(&gt->active_list, typeof(*tl), link);
3171         }
3172         mutex_unlock(&gt->mutex);
3173
3174         return timeout;
3175 }
3176
3177 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3178                            unsigned int flags, long timeout)
3179 {
3180         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3181                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3182                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3183
3184         /* If the device is asleep, we have no requests outstanding */
3185         if (!READ_ONCE(i915->gt.awake))
3186                 return 0;
3187
3188         timeout = wait_for_timelines(i915, flags, timeout);
3189         if (timeout < 0)
3190                 return timeout;
3191
3192         if (flags & I915_WAIT_LOCKED) {
3193                 int err;
3194
3195                 lockdep_assert_held(&i915->drm.struct_mutex);
3196
3197                 err = wait_for_engines(i915);
3198                 if (err)
3199                         return err;
3200
3201                 i915_retire_requests(i915);
3202         }
3203
3204         return 0;
3205 }
3206
3207 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3208 {
3209         /*
3210          * We manually flush the CPU domain so that we can override and
3211          * force the flush for the display, and perform it asynchronously.
3212          */
3213         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3214         if (obj->cache_dirty)
3215                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3216         obj->write_domain = 0;
3217 }
3218
3219 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3220 {
3221         if (!READ_ONCE(obj->pin_global))
3222                 return;
3223
3224         mutex_lock(&obj->base.dev->struct_mutex);
3225         __i915_gem_object_flush_for_display(obj);
3226         mutex_unlock(&obj->base.dev->struct_mutex);
3227 }
3228
3229 /**
3230  * Moves a single object to the WC read, and possibly write domain.
3231  * @obj: object to act on
3232  * @write: ask for write access or read only
3233  *
3234  * This function returns when the move is complete, including waiting on
3235  * flushes to occur.
3236  */
3237 int
3238 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3239 {
3240         int ret;
3241
3242         lockdep_assert_held(&obj->base.dev->struct_mutex);
3243
3244         ret = i915_gem_object_wait(obj,
3245                                    I915_WAIT_INTERRUPTIBLE |
3246                                    I915_WAIT_LOCKED |
3247                                    (write ? I915_WAIT_ALL : 0),
3248                                    MAX_SCHEDULE_TIMEOUT);
3249         if (ret)
3250                 return ret;
3251
3252         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3253                 return 0;
3254
3255         /* Flush and acquire obj->pages so that we are coherent through
3256          * direct access in memory with previous cached writes through
3257          * shmemfs and that our cache domain tracking remains valid.
3258          * For example, if the obj->filp was moved to swap without us
3259          * being notified and releasing the pages, we would mistakenly
3260          * continue to assume that the obj remained out of the CPU cached
3261          * domain.
3262          */
3263         ret = i915_gem_object_pin_pages(obj);
3264         if (ret)
3265                 return ret;
3266
3267         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3268
3269         /* Serialise direct access to this object with the barriers for
3270          * coherent writes from the GPU, by effectively invalidating the
3271          * WC domain upon first access.
3272          */
3273         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3274                 mb();
3275
3276         /* It should now be out of any other write domains, and we can update
3277          * the domain values for our changes.
3278          */
3279         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3280         obj->read_domains |= I915_GEM_DOMAIN_WC;
3281         if (write) {
3282                 obj->read_domains = I915_GEM_DOMAIN_WC;
3283                 obj->write_domain = I915_GEM_DOMAIN_WC;
3284                 obj->mm.dirty = true;
3285         }
3286
3287         i915_gem_object_unpin_pages(obj);
3288         return 0;
3289 }
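
/*
 * Illustrative sketch, not part of the driver: a hypothetical in-driver
 * caller that wants coherent CPU access through a WC mapping takes
 * struct_mutex (as asserted above) and moves the object first; the i915 and
 * obj variables are assumptions for the example only.
 *
 *	mutex_lock(&i915->drm.struct_mutex);
 *	err = i915_gem_object_set_to_wc_domain(obj, true);
 *	mutex_unlock(&i915->drm.struct_mutex);
 *	if (err)
 *		return err;
 *
 *	// Subsequent WC writes are now tracked against the WC domain and
 *	// will be flushed before any other domain is allowed to read them.
 */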
3290
3291 /**
3292  * Moves a single object to the GTT read, and possibly write domain.
3293  * @obj: object to act on
3294  * @write: ask for write access or read only
3295  *
3296  * This function returns when the move is complete, including waiting on
3297  * flushes to occur.
3298  */
3299 int
3300 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3301 {
3302         int ret;
3303
3304         lockdep_assert_held(&obj->base.dev->struct_mutex);
3305
3306         ret = i915_gem_object_wait(obj,
3307                                    I915_WAIT_INTERRUPTIBLE |
3308                                    I915_WAIT_LOCKED |
3309                                    (write ? I915_WAIT_ALL : 0),
3310                                    MAX_SCHEDULE_TIMEOUT);
3311         if (ret)
3312                 return ret;
3313
3314         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3315                 return 0;
3316
3317         /* Flush and acquire obj->pages so that we are coherent through
3318          * direct access in memory with previous cached writes through
3319          * shmemfs and that our cache domain tracking remains valid.
3320          * For example, if the obj->filp was moved to swap without us
3321          * being notified and releasing the pages, we would mistakenly
3322          * continue to assume that the obj remained out of the CPU cached
3323          * domain.
3324          */
3325         ret = i915_gem_object_pin_pages(obj);
3326         if (ret)
3327                 return ret;
3328
3329         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3330
3331         /* Serialise direct access to this object with the barriers for
3332          * coherent writes from the GPU, by effectively invalidating the
3333          * GTT domain upon first access.
3334          */
3335         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3336                 mb();
3337
3338         /* It should now be out of any other write domains, and we can update
3339          * the domain values for our changes.
3340          */
3341         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3342         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3343         if (write) {
3344                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3345                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3346                 obj->mm.dirty = true;
3347         }
3348
3349         i915_gem_object_unpin_pages(obj);
3350         return 0;
3351 }
3352
3353 /**
3354  * Changes the cache-level of an object across all VMA.
3355  * @obj: object to act on
3356  * @cache_level: new cache level to set for the object
3357  *
3358  * After this function returns, the object will be in the new cache-level
3359  * across all GTT and the contents of the backing storage will be coherent,
3360  * with respect to the new cache-level. In order to keep the backing storage
3361  * coherent for all users, we only allow a single cache level to be set
3362  * globally on the object and prevent it from being changed whilst the
3363  * hardware is reading from the object. That is, if the object is currently
3364  * on the scanout it will be set to uncached (or equivalent display
3365  * cache coherency) and all non-MOCS GPU access will also be uncached so
3366  * that all direct access to the scanout remains coherent.
3367  */
3368 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3369                                     enum i915_cache_level cache_level)
3370 {
3371         struct i915_vma *vma;
3372         int ret;
3373
3374         lockdep_assert_held(&obj->base.dev->struct_mutex);
3375
3376         if (obj->cache_level == cache_level)
3377                 return 0;
3378
3379         /* Inspect the list of currently bound VMA and unbind any that would
3380          * be invalid given the new cache-level. This is principally to
3381          * catch the issue of the CS prefetch crossing page boundaries and
3382          * reading an invalid PTE on older architectures.
3383          */
3384 restart:
3385         list_for_each_entry(vma, &obj->vma.list, obj_link) {
3386                 if (!drm_mm_node_allocated(&vma->node))
3387                         continue;
3388
3389                 if (i915_vma_is_pinned(vma)) {
3390                         DRM_DEBUG("can not change the cache level of pinned objects\n");
3391                         return -EBUSY;
3392                 }
3393
3394                 if (!i915_vma_is_closed(vma) &&
3395                     i915_gem_valid_gtt_space(vma, cache_level))
3396                         continue;
3397
3398                 ret = i915_vma_unbind(vma);
3399                 if (ret)
3400                         return ret;
3401
3402                 /* As unbinding may affect other elements in the
3403                  * obj->vma_list (due to side-effects from retiring
3404                  * an active vma), play safe and restart the iterator.
3405                  */
3406                 goto restart;
3407         }
3408
3409         /* We can reuse the existing drm_mm nodes but need to change the
3410          * cache-level on the PTE. We could simply unbind them all and
3411          * rebind with the correct cache-level on next use. However since
3412          * we already have a valid slot, dma mapping, pages etc, we may as well
3413          * rewrite the PTE in the belief that doing so tramples upon less
3414          * state and so involves less work.
3415          */
3416         if (obj->bind_count) {
3417                 /* Before we change the PTE, the GPU must not be accessing it.
3418                  * If we wait upon the object, we know that all the bound
3419                  * VMA are no longer active.
3420                  */
3421                 ret = i915_gem_object_wait(obj,
3422                                            I915_WAIT_INTERRUPTIBLE |
3423                                            I915_WAIT_LOCKED |
3424                                            I915_WAIT_ALL,
3425                                            MAX_SCHEDULE_TIMEOUT);
3426                 if (ret)
3427                         return ret;
3428
3429                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
3430                     cache_level != I915_CACHE_NONE) {
3431                         /* Access to snoopable pages through the GTT is
3432                          * incoherent and on some machines causes a hard
3433                          * lockup. Relinquish the CPU mmapping to force
3434                          * userspace to refault in the pages and we can
3435                          * then double check if the GTT mapping is still
3436                          * valid for that pointer access.
3437                          */
3438                         i915_gem_release_mmap(obj);
3439
3440                         /* As we no longer need a fence for GTT access,
3441                          * we can relinquish it now (and so prevent having
3442                          * to steal a fence from someone else on the next
3443                          * fence request). Note GPU activity would have
3444                          * dropped the fence as all snoopable access is
3445                          * supposed to be linear.
3446                          */
3447                         for_each_ggtt_vma(vma, obj) {
3448                                 ret = i915_vma_put_fence(vma);
3449                                 if (ret)
3450                                         return ret;
3451                         }
3452                 } else {
3453                         /* We either have incoherent backing store and
3454                          * so no GTT access or the architecture is fully
3455                          * coherent. In such cases, existing GTT mmaps
3456                          * ignore the cache bit in the PTE and we can
3457                          * rewrite it without confusing the GPU or having
3458                          * to force userspace to fault back in its mmaps.
3459                          */
3460                 }
3461
3462                 list_for_each_entry(vma, &obj->vma.list, obj_link) {
3463                         if (!drm_mm_node_allocated(&vma->node))
3464                                 continue;
3465
3466                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
3467                         if (ret)
3468                                 return ret;
3469                 }
3470         }
3471
3472         list_for_each_entry(vma, &obj->vma.list, obj_link)
3473                 vma->node.color = cache_level;
3474         i915_gem_object_set_cache_coherency(obj, cache_level);
3475         obj->cache_dirty = true; /* Always invalidate stale cachelines */
3476
3477         return 0;
3478 }
3479
3480 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
3481                                struct drm_file *file)
3482 {
3483         struct drm_i915_gem_caching *args = data;
3484         struct drm_i915_gem_object *obj;
3485         int err = 0;
3486
3487         rcu_read_lock();
3488         obj = i915_gem_object_lookup_rcu(file, args->handle);
3489         if (!obj) {
3490                 err = -ENOENT;
3491                 goto out;
3492         }
3493
3494         switch (obj->cache_level) {
3495         case I915_CACHE_LLC:
3496         case I915_CACHE_L3_LLC:
3497                 args->caching = I915_CACHING_CACHED;
3498                 break;
3499
3500         case I915_CACHE_WT:
3501                 args->caching = I915_CACHING_DISPLAY;
3502                 break;
3503
3504         default:
3505                 args->caching = I915_CACHING_NONE;
3506                 break;
3507         }
3508 out:
3509         rcu_read_unlock();
3510         return err;
3511 }
3512
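/*
 * Illustrative sketch, not part of the driver: the matching userspace side
 * of the two caching ioctls as driven through libdrm; the fd and handle
 * values are assumptions for the example only.
 *
 *	struct drm_i915_gem_caching arg = {
 *		.handle = handle,
 *		.caching = I915_CACHING_CACHED,	// request LLC snooping
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg) &&
 *	    errno == ENODEV) {
 *		// Platform has neither LLC nor snooping; fall back to NONE.
 *	}
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_CACHING, &arg);
 *	// arg.caching now reports NONE/CACHED/DISPLAY as mapped by the
 *	// get/set handlers here.
 */
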
3513 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
3514                                struct drm_file *file)
3515 {
3516         struct drm_i915_private *i915 = to_i915(dev);
3517         struct drm_i915_gem_caching *args = data;
3518         struct drm_i915_gem_object *obj;
3519         enum i915_cache_level level;
3520         int ret = 0;
3521
3522         switch (args->caching) {
3523         case I915_CACHING_NONE:
3524                 level = I915_CACHE_NONE;
3525                 break;
3526         case I915_CACHING_CACHED:
3527                 /*
3528                  * Due to a HW issue on BXT A stepping, GPU stores via a
3529                  * snooped mapping may leave stale data in a corresponding CPU
3530                  * cacheline, whereas normally such cachelines would get
3531                  * invalidated.
3532                  */
3533                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
3534                         return -ENODEV;
3535
3536                 level = I915_CACHE_LLC;
3537                 break;
3538         case I915_CACHING_DISPLAY:
3539                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
3540                 break;
3541         default:
3542                 return -EINVAL;
3543         }
3544
3545         obj = i915_gem_object_lookup(file, args->handle);
3546         if (!obj)
3547                 return -ENOENT;
3548
3549         /*
3550          * The caching mode of a proxy object is handled by its generator, and
3551          * is not allowed to be changed by userspace.
3552          */
3553         if (i915_gem_object_is_proxy(obj)) {
3554                 ret = -ENXIO;
3555                 goto out;
3556         }
3557
3558         if (obj->cache_level == level)
3559                 goto out;
3560
3561         ret = i915_gem_object_wait(obj,
3562                                    I915_WAIT_INTERRUPTIBLE,
3563                                    MAX_SCHEDULE_TIMEOUT);
3564         if (ret)
3565                 goto out;
3566
3567         ret = i915_mutex_lock_interruptible(dev);
3568         if (ret)
3569                 goto out;
3570
3571         ret = i915_gem_object_set_cache_level(obj, level);
3572         mutex_unlock(&dev->struct_mutex);
3573
3574 out:
3575         i915_gem_object_put(obj);
3576         return ret;
3577 }
3578
3579 /*
3580  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
3581  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
3582  * (for pageflips). We only flush the caches while preparing the buffer for
3583  * display, the callers are responsible for frontbuffer flush.
3584  */
3585 struct i915_vma *
3586 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
3587                                      u32 alignment,
3588                                      const struct i915_ggtt_view *view,
3589                                      unsigned int flags)
3590 {
3591         struct i915_vma *vma;
3592         int ret;
3593
3594         lockdep_assert_held(&obj->base.dev->struct_mutex);
3595
3596         /* Mark the global pin early so that we account for the
3597          * display coherency whilst setting up the cache domains.
3598          */
3599         obj->pin_global++;
3600
3601         /* The display engine is not coherent with the LLC cache on gen6.  As
3602          * a result, we make sure that the pinning that is about to occur is
3603          * done with uncached PTEs. This is lowest common denominator for all
3604          * chipsets.
3605          *
3606          * However for gen6+, we could do better by using the GFDT bit instead
3607          * of uncaching, which would allow us to flush all the LLC-cached data
3608          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
3609          */
3610         ret = i915_gem_object_set_cache_level(obj,
3611                                               HAS_WT(to_i915(obj->base.dev)) ?
3612                                               I915_CACHE_WT : I915_CACHE_NONE);
3613         if (ret) {
3614                 vma = ERR_PTR(ret);
3615                 goto err_unpin_global;
3616         }
3617
3618         /* As the user may map the buffer once pinned in the display plane
3619          * (e.g. libkms for the bootup splash), we have to ensure that we
3620          * always use map_and_fenceable for all scanout buffers. However,
3621          * it may simply be too big to fit into mappable, in which case
3622          * put it anyway and hope that userspace can cope (but always first
3623          * try to preserve the existing ABI).
3624          */
3625         vma = ERR_PTR(-ENOSPC);
3626         if ((flags & PIN_MAPPABLE) == 0 &&
3627             (!view || view->type == I915_GGTT_VIEW_NORMAL))
3628                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
3629                                                flags |
3630                                                PIN_MAPPABLE |
3631                                                PIN_NONBLOCK);
3632         if (IS_ERR(vma))
3633                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
3634         if (IS_ERR(vma))
3635                 goto err_unpin_global;
3636
3637         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
3638
3639         __i915_gem_object_flush_for_display(obj);
3640
3641         /* It should now be out of any other write domains, and we can update
3642          * the domain values for our changes.
3643          */
3644         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3645
3646         return vma;
3647
3648 err_unpin_global:
3649         obj->pin_global--;
3650         return vma;
3651 }
3652
3653 void
3654 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
3655 {
3656         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
3657
3658         if (WARN_ON(vma->obj->pin_global == 0))
3659                 return;
3660
3661         if (--vma->obj->pin_global == 0)
3662                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
3663
3664         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
3665         i915_gem_object_bump_inactive_ggtt(vma->obj);
3666
3667         i915_vma_unpin(vma);
3668 }
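
/*
 * Illustrative sketch, not part of the driver: a hypothetical display-side
 * caller pairs the two helpers above around the lifetime of a scanout
 * mapping, under struct_mutex; obj, alignment, view and flags are
 * assumptions for the example only.
 *
 *	vma = i915_gem_object_pin_to_display_plane(obj, alignment, view, flags);
 *	if (IS_ERR(vma))
 *		return PTR_ERR(vma);
 *
 *	// ... program the plane to scan out from i915_ggtt_offset(vma) ...
 *
 *	i915_gem_object_unpin_from_display_plane(vma);
 */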
3669
3670 /**
3671  * Moves a single object to the CPU read, and possibly write domain.
3672  * @obj: object to act on
3673  * @write: requesting write or read-only access
3674  *
3675  * This function returns when the move is complete, including waiting on
3676  * flushes to occur.
3677  */
3678 int
3679 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
3680 {
3681         int ret;
3682
3683         lockdep_assert_held(&obj->base.dev->struct_mutex);
3684
3685         ret = i915_gem_object_wait(obj,
3686                                    I915_WAIT_INTERRUPTIBLE |
3687                                    I915_WAIT_LOCKED |
3688                                    (write ? I915_WAIT_ALL : 0),
3689                                    MAX_SCHEDULE_TIMEOUT);
3690         if (ret)
3691                 return ret;
3692
3693         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3694
3695         /* Flush the CPU cache if it's still invalid. */
3696         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
3697                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
3698                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
3699         }
3700
3701         /* It should now be out of any other write domains, and we can update
3702          * the domain values for our changes.
3703          */
3704         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
3705
3706         /* If we're writing through the CPU, then the GPU read domains will
3707          * need to be invalidated at next use.
3708          */
3709         if (write)
3710                 __start_cpu_write(obj);
3711
3712         return 0;
3713 }
3714
3715 /* Throttle our rendering by waiting until the ring has completed our requests
3716  * emitted over 20 msec ago.
3717  *
3718  * Note that if we were to use the current jiffies each time around the loop,
3719  * we wouldn't escape the function with any frames outstanding if the time to
3720  * render a frame was over 20ms.
3721  *
3722  * This should get us reasonable parallelism between CPU and GPU but also
3723  * relatively low latency when blocking on a particular request to finish.
3724  */
3725 static int
3726 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
3727 {
3728         struct drm_i915_private *dev_priv = to_i915(dev);
3729         struct drm_i915_file_private *file_priv = file->driver_priv;
3730         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
3731         struct i915_request *request, *target = NULL;
3732         long ret;
3733
3734         /* ABI: return -EIO if already wedged */
3735         ret = i915_terminally_wedged(dev_priv);
3736         if (ret)
3737                 return ret;
3738
3739         spin_lock(&file_priv->mm.lock);
3740         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
3741                 if (time_after_eq(request->emitted_jiffies, recent_enough))
3742                         break;
3743
3744                 if (target) {
3745                         list_del(&target->client_link);
3746                         target->file_priv = NULL;
3747                 }
3748
3749                 target = request;
3750         }
3751         if (target)
3752                 i915_request_get(target);
3753         spin_unlock(&file_priv->mm.lock);
3754
3755         if (target == NULL)
3756                 return 0;
3757
3758         ret = i915_request_wait(target,
3759                                 I915_WAIT_INTERRUPTIBLE,
3760                                 MAX_SCHEDULE_TIMEOUT);
3761         i915_request_put(target);
3762
3763         return ret < 0 ? ret : 0;
3764 }
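
/*
 * Illustrative sketch, not part of the driver: the throttle ioctl takes no
 * argument, so a userspace client that wants to self-limit its queue depth
 * (e.g. once per frame) simply issues, via libdrm:
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_THROTTLE, NULL);
 *
 * A failure with errno == EIO means the GPU is terminally wedged, matching
 * the ABI note above; fd is an assumption for the example only.
 */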
3765
3766 struct i915_vma *
3767 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
3768                          const struct i915_ggtt_view *view,
3769                          u64 size,
3770                          u64 alignment,
3771                          u64 flags)
3772 {
3773         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
3774         struct i915_address_space *vm = &dev_priv->ggtt.vm;
3775         struct i915_vma *vma;
3776         int ret;
3777
3778         lockdep_assert_held(&obj->base.dev->struct_mutex);
3779
3780         if (flags & PIN_MAPPABLE &&
3781             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
3782                 /* If the required space is larger than the available
3783                  * aperture, we will not be able to find a slot for the
3784                  * object and unbinding the object now will be in
3785                  * vain. Worse, doing so may cause us to ping-pong
3786                  * the object in and out of the Global GTT and
3787                  * waste a lot of cycles under the mutex.
3788                  */
3789                 if (obj->base.size > dev_priv->ggtt.mappable_end)
3790                         return ERR_PTR(-E2BIG);
3791
3792                 /* If NONBLOCK is set the caller is optimistically
3793                  * trying to cache the full object within the mappable
3794                  * aperture, and *must* have a fallback in place for
3795                  * situations where we cannot bind the object. We
3796                  * can be a little more lax here and use the fallback
3797                  * more often to avoid costly migrations of ourselves
3798                  * and other objects within the aperture.
3799                  *
3800                  * Half-the-aperture is used as a simple heuristic.
3801                  * More interesting would be to search for a free
3802                  * block prior to making the commitment to unbind.
3803                  * That caters for the self-harm case, and with a
3804                  * little more heuristics (e.g. NOFAULT, NOEVICT)
3805                  * we could try to minimise harm to others.
3806                  */
3807                 if (flags & PIN_NONBLOCK &&
3808                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
3809                         return ERR_PTR(-ENOSPC);
3810         }
3811
3812         vma = i915_vma_instance(obj, vm, view);
3813         if (IS_ERR(vma))
3814                 return vma;
3815
3816         if (i915_vma_misplaced(vma, size, alignment, flags)) {
3817                 if (flags & PIN_NONBLOCK) {
3818                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
3819                                 return ERR_PTR(-ENOSPC);
3820
3821                         if (flags & PIN_MAPPABLE &&
3822                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
3823                                 return ERR_PTR(-ENOSPC);
3824                 }
3825
3826                 WARN(i915_vma_is_pinned(vma),
3827                      "bo is already pinned in ggtt with incorrect alignment:"
3828                      " offset=%08x, req.alignment=%llx,"
3829                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
3830                      i915_ggtt_offset(vma), alignment,
3831                      !!(flags & PIN_MAPPABLE),
3832                      i915_vma_is_map_and_fenceable(vma));
3833                 ret = i915_vma_unbind(vma);
3834                 if (ret)
3835                         return ERR_PTR(ret);
3836         }
3837
3838         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
3839         if (ret)
3840                 return ERR_PTR(ret);
3841
3842         return vma;
3843 }
3844
3845 static __always_inline u32 __busy_read_flag(u8 id)
3846 {
3847         if (id == (u8)I915_ENGINE_CLASS_INVALID)
3848                 return 0xffff0000u;
3849
3850         GEM_BUG_ON(id >= 16);
3851         return 0x10000u << id;
3852 }
3853
3854 static __always_inline u32 __busy_write_id(u8 id)
3855 {
3856         /*
3857          * The uABI guarantees an active writer is also amongst the read
3858          * engines. This would be true if we accessed the activity tracking
3859          * under the lock, but as we perform the lookup of the object and
3860          * its activity locklessly we can not guarantee that the last_write
3861          * being active implies that we have set the same engine flag from
3862          * last_read - hence we always set both read and write busy for
3863          * last_write.
3864          */
3865         if (id == (u8)I915_ENGINE_CLASS_INVALID)
3866                 return 0xffffffffu;
3867
3868         return (id + 1) | __busy_read_flag(id);
3869 }
3870
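/*
 * Illustrative sketch, not part of the driver: given the encoding above,
 * userspace decodes the value returned in drm_i915_gem_busy.busy roughly as
 * follows (busy is an assumption for the example only):
 *
 *	if (busy & 0xffff) {
 *		// Low word: engine class of the single writer, biased by one.
 *		unsigned int writer_class = (busy & 0xffff) - 1;
 *	}
 *
 *	// High word: one bit per engine class currently reading the object.
 *	for (unsigned int class = 0; class < 16; class++)
 *		if (busy & (0x10000u << class))
 *			; // an engine of this class is reading
 */
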
3871 static __always_inline unsigned int
3872 __busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u8 id))
3873 {
3874         const struct i915_request *rq;
3875
3876         /*
3877          * We have to check the current hw status of the fence as the uABI
3878          * guarantees forward progress. We could rely on the idle worker
3879          * to eventually flush us, but to minimise latency just ask the
3880          * hardware.
3881          *
3882          * Note we only report on the status of native fences.
3883          */
3884         if (!dma_fence_is_i915(fence))
3885                 return 0;
3886
3887         /* opencode to_request() in order to avoid const warnings */
3888         rq = container_of(fence, const struct i915_request, fence);
3889         if (i915_request_completed(rq))
3890                 return 0;
3891
3892         /* Beware type-expansion follies! */
3893         BUILD_BUG_ON(!typecheck(u8, rq->engine->uabi_class));
3894         return flag(rq->engine->uabi_class);
3895 }
3896
3897 static __always_inline unsigned int
3898 busy_check_reader(const struct dma_fence *fence)
3899 {
3900         return __busy_set_if_active(fence, __busy_read_flag);
3901 }
3902
3903 static __always_inline unsigned int
3904 busy_check_writer(const struct dma_fence *fence)
3905 {
3906         if (!fence)
3907                 return 0;
3908
3909         return __busy_set_if_active(fence, __busy_write_id);
3910 }
3911
3912 int
3913 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
3914                     struct drm_file *file)
3915 {
3916         struct drm_i915_gem_busy *args = data;
3917         struct drm_i915_gem_object *obj;
3918         struct reservation_object_list *list;
3919         unsigned int seq;
3920         int err;
3921
3922         err = -ENOENT;
3923         rcu_read_lock();
3924         obj = i915_gem_object_lookup_rcu(file, args->handle);
3925         if (!obj)
3926                 goto out;
3927
3928         /*
3929          * A discrepancy here is that we do not report the status of
3930          * non-i915 fences, i.e. even though we may report the object as idle,
3931          * a call to set-domain may still stall waiting for foreign rendering.
3932          * This also means that wait-ioctl may report an object as busy,
3933          * where busy-ioctl considers it idle.
3934          *
3935          * We trade the ability to warn of foreign fences to report on which
3936          * i915 engines are active for the object.
3937          *
3938          * Alternatively, we can trade that extra information on read/write
3939          * activity with
3940          *      args->busy =
3941          *              !reservation_object_test_signaled_rcu(obj->resv, true);
3942          * to report the overall busyness. This is what the wait-ioctl does.
3943          *
3944          */
3945 retry:
3946         seq = raw_read_seqcount(&obj->resv->seq);
3947
3948         /* Translate the exclusive fence to the READ *and* WRITE engine */
3949         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
3950
3951         /* Translate shared fences to READ set of engines */
3952         list = rcu_dereference(obj->resv->fence);
3953         if (list) {
3954                 unsigned int shared_count = list->shared_count, i;
3955
3956                 for (i = 0; i < shared_count; ++i) {
3957                         struct dma_fence *fence =
3958                                 rcu_dereference(list->shared[i]);
3959
3960                         args->busy |= busy_check_reader(fence);
3961                 }
3962         }
3963
3964         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
3965                 goto retry;
3966
3967         err = 0;
3968 out:
3969         rcu_read_unlock();
3970         return err;
3971 }
3972
3973 int
3974 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
3975                         struct drm_file *file_priv)
3976 {
3977         return i915_gem_ring_throttle(dev, file_priv);
3978 }
3979
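/*
 * Illustrative sketch, not part of the driver: a userspace buffer cache
 * marks idle buffers purgeable and later checks whether the backing store
 * survived; the fd and handle values are assumptions for the example only.
 *
 *	struct drm_i915_gem_madvise madv = {
 *		.handle = handle,
 *		.madv = I915_MADV_DONTNEED,	// may be reclaimed under pressure
 *	};
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_MADVISE, &madv);
 *
 *	// Before reuse, flip back and check madv.retained: zero means the
 *	// pages were purged and the old contents are lost.
 *	madv.madv = I915_MADV_WILLNEED;
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_MADVISE, &madv);
 */
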
3980 int
3981 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
3982                        struct drm_file *file_priv)
3983 {
3984         struct drm_i915_private *dev_priv = to_i915(dev);
3985         struct drm_i915_gem_madvise *args = data;
3986         struct drm_i915_gem_object *obj;
3987         int err;
3988
3989         switch (args->madv) {
3990         case I915_MADV_DONTNEED:
3991         case I915_MADV_WILLNEED:
3992             break;
3993         default:
3994             return -EINVAL;
3995         }
3996
3997         obj = i915_gem_object_lookup(file_priv, args->handle);
3998         if (!obj)
3999                 return -ENOENT;
4000
4001         err = mutex_lock_interruptible(&obj->mm.lock);
4002         if (err)
4003                 goto out;
4004
4005         if (i915_gem_object_has_pages(obj) &&
4006             i915_gem_object_is_tiled(obj) &&
4007             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4008                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4009                         GEM_BUG_ON(!obj->mm.quirked);
4010                         __i915_gem_object_unpin_pages(obj);
4011                         obj->mm.quirked = false;
4012                 }
4013                 if (args->madv == I915_MADV_WILLNEED) {
4014                         GEM_BUG_ON(obj->mm.quirked);
4015                         __i915_gem_object_pin_pages(obj);
4016                         obj->mm.quirked = true;
4017                 }
4018         }
4019
4020         if (obj->mm.madv != __I915_MADV_PURGED)
4021                 obj->mm.madv = args->madv;
4022
4023         /* if the object is no longer attached, discard its backing storage */
4024         if (obj->mm.madv == I915_MADV_DONTNEED &&
4025             !i915_gem_object_has_pages(obj))
4026                 i915_gem_object_truncate(obj);
4027
4028         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4029         mutex_unlock(&obj->mm.lock);
4030
4031 out:
4032         i915_gem_object_put(obj);
4033         return err;
4034 }
4035
4036 static void
4037 frontbuffer_retire(struct i915_active_request *active,
4038                    struct i915_request *request)
4039 {
4040         struct drm_i915_gem_object *obj =
4041                 container_of(active, typeof(*obj), frontbuffer_write);
4042
4043         intel_fb_obj_flush(obj, ORIGIN_CS);
4044 }
4045
4046 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4047                           const struct drm_i915_gem_object_ops *ops)
4048 {
4049         mutex_init(&obj->mm.lock);
4050
4051         spin_lock_init(&obj->vma.lock);
4052         INIT_LIST_HEAD(&obj->vma.list);
4053
4054         INIT_LIST_HEAD(&obj->lut_list);
4055         INIT_LIST_HEAD(&obj->batch_pool_link);
4056
4057         init_rcu_head(&obj->rcu);
4058
4059         obj->ops = ops;
4060
4061         reservation_object_init(&obj->__builtin_resv);
4062         obj->resv = &obj->__builtin_resv;
4063
4064         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4065         i915_active_request_init(&obj->frontbuffer_write,
4066                                  NULL, frontbuffer_retire);
4067
4068         obj->mm.madv = I915_MADV_WILLNEED;
4069         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4070         mutex_init(&obj->mm.get_page.lock);
4071
4072         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4073 }
4074
4075 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4076         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4077                  I915_GEM_OBJECT_IS_SHRINKABLE,
4078
4079         .get_pages = i915_gem_object_get_pages_gtt,
4080         .put_pages = i915_gem_object_put_pages_gtt,
4081
4082         .pwrite = i915_gem_object_pwrite_gtt,
4083 };
4084
4085 static int i915_gem_object_create_shmem(struct drm_device *dev,
4086                                         struct drm_gem_object *obj,
4087                                         size_t size)
4088 {
4089         struct drm_i915_private *i915 = to_i915(dev);
4090         unsigned long flags = VM_NORESERVE;
4091         struct file *filp;
4092
4093         drm_gem_private_object_init(dev, obj, size);
4094
4095         if (i915->mm.gemfs)
4096                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4097                                                  flags);
4098         else
4099                 filp = shmem_file_setup("i915", size, flags);
4100
4101         if (IS_ERR(filp))
4102                 return PTR_ERR(filp);
4103
4104         obj->filp = filp;
4105
4106         return 0;
4107 }
4108
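/*
 * Illustrative sketch, not part of the driver: a hypothetical in-driver user
 * allocates a shmem-backed object and drops its reference when done; the
 * i915 pointer is an assumption for the example only.
 *
 *	struct drm_i915_gem_object *obj;
 *
 *	obj = i915_gem_object_create(i915, SZ_4K);
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 *
 *	// ... pin pages, bind a vma, use the object ...
 *
 *	i915_gem_object_put(obj);
 */
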
4109 struct drm_i915_gem_object *
4110 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4111 {
4112         struct drm_i915_gem_object *obj;
4113         struct address_space *mapping;
4114         unsigned int cache_level;
4115         gfp_t mask;
4116         int ret;
4117
4118         /* There is a prevalence of the assumption that we fit the object's
4119          * page count inside a 32bit _signed_ variable. Let's document this and
4120          * catch if we ever need to fix it. In the meantime, if you do spot
4121          * such a local variable, please consider fixing!
4122          */
4123         if (size >> PAGE_SHIFT > INT_MAX)
4124                 return ERR_PTR(-E2BIG);
4125
4126         if (overflows_type(size, obj->base.size))
4127                 return ERR_PTR(-E2BIG);
4128
4129         obj = i915_gem_object_alloc();
4130         if (obj == NULL)
4131                 return ERR_PTR(-ENOMEM);
4132
4133         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4134         if (ret)
4135                 goto fail;
4136
4137         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4138         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4139                 /* 965gm cannot relocate objects above 4GiB. */
4140                 mask &= ~__GFP_HIGHMEM;
4141                 mask |= __GFP_DMA32;
4142         }
4143
4144         mapping = obj->base.filp->f_mapping;
4145         mapping_set_gfp_mask(mapping, mask);
4146         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4147
4148         i915_gem_object_init(obj, &i915_gem_object_ops);
4149
4150         obj->write_domain = I915_GEM_DOMAIN_CPU;
4151         obj->read_domains = I915_GEM_DOMAIN_CPU;
4152
4153         if (HAS_LLC(dev_priv))
4154                 /* On some devices, we can have the GPU use the LLC (the CPU
4155                  * cache) for about a 10% performance improvement
4156                  * compared to uncached.  Graphics requests other than
4157                  * display scanout are coherent with the CPU in
4158                  * accessing this cache.  This means in this mode we
4159                  * don't need to clflush on the CPU side, and on the
4160                  * GPU side we only need to flush internal caches to
4161                  * get data visible to the CPU.
4162                  *
4163                  * However, we maintain the display planes as UC, and so
4164                  * need to rebind when first used as such.
4165                  */
4166                 cache_level = I915_CACHE_LLC;
4167         else
4168                 cache_level = I915_CACHE_NONE;
4169
4170         i915_gem_object_set_cache_coherency(obj, cache_level);
4171
4172         trace_i915_gem_object_create(obj);
4173
4174         return obj;
4175
4176 fail:
4177         i915_gem_object_free(obj);
4178         return ERR_PTR(ret);
4179 }
4180
4181 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4182 {
4183         /* If we are the last user of the backing storage (be it shmemfs
4184          * pages or stolen etc), we know that the pages are going to be
4185          * immediately released. In this case, we can then skip copying
4186          * back the contents from the GPU.
4187          */
4188
4189         if (obj->mm.madv != I915_MADV_WILLNEED)
4190                 return false;
4191
4192         if (obj->base.filp == NULL)
4193                 return true;
4194
4195         /* At first glance, this looks racy, but then again so would be
4196          * userspace racing mmap against close. However, the first external
4197          * reference to the filp can only be obtained through the
4198          * i915_gem_mmap_ioctl() which safeguards us against the user
4199          * acquiring such a reference whilst we are in the middle of
4200          * freeing the object.
4201          */
4202         return file_count(obj->base.filp) == 1;
4203 }
4204
4205 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4206                                     struct llist_node *freed)
4207 {
4208         struct drm_i915_gem_object *obj, *on;
4209         intel_wakeref_t wakeref;
4210
4211         wakeref = intel_runtime_pm_get(i915);
4212         llist_for_each_entry_safe(obj, on, freed, freed) {
4213                 struct i915_vma *vma, *vn;
4214
4215                 trace_i915_gem_object_destroy(obj);
4216
4217                 mutex_lock(&i915->drm.struct_mutex);
4218
4219                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4220                 list_for_each_entry_safe(vma, vn, &obj->vma.list, obj_link) {
4221                         GEM_BUG_ON(i915_vma_is_active(vma));
4222                         vma->flags &= ~I915_VMA_PIN_MASK;
4223                         i915_vma_destroy(vma);
4224                 }
4225                 GEM_BUG_ON(!list_empty(&obj->vma.list));
4226                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma.tree));
4227
4228                 /* This serializes freeing with the shrinker. Since the free
4229                  * is delayed, first by RCU then by the workqueue, we want the
4230                  * shrinker to be able to free pages of unreferenced objects,
4231                  * or else we may oom whilst there are plenty of deferred
4232                  * freed objects.
4233                  */
4234                 if (i915_gem_object_has_pages(obj)) {
4235                         spin_lock(&i915->mm.obj_lock);
4236                         list_del_init(&obj->mm.link);
4237                         spin_unlock(&i915->mm.obj_lock);
4238                 }
4239
4240                 mutex_unlock(&i915->drm.struct_mutex);
4241
4242                 GEM_BUG_ON(obj->bind_count);
4243                 GEM_BUG_ON(obj->userfault_count);
4244                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4245                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4246
4247                 if (obj->ops->release)
4248                         obj->ops->release(obj);
4249
4250                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4251                         atomic_set(&obj->mm.pages_pin_count, 0);
4252                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4253                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4254
4255                 if (obj->base.import_attach)
4256                         drm_prime_gem_destroy(&obj->base, NULL);
4257
4258                 reservation_object_fini(&obj->__builtin_resv);
4259                 drm_gem_object_release(&obj->base);
4260                 i915_gem_info_remove_obj(i915, obj->base.size);
4261
4262                 bitmap_free(obj->bit_17);
4263                 i915_gem_object_free(obj);
4264
4265                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4266                 atomic_dec(&i915->mm.free_count);
4267
4268                 if (on)
4269                         cond_resched();
4270         }
4271         intel_runtime_pm_put(i915, wakeref);
4272 }
4273
4274 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4275 {
4276         struct llist_node *freed;
4277
4278         /* Free the oldest, most stale object to keep the free_list short */
4279         freed = NULL;
4280         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4281                 /* Only one consumer of llist_del_first() allowed */
4282                 spin_lock(&i915->mm.free_lock);
4283                 freed = llist_del_first(&i915->mm.free_list);
4284                 spin_unlock(&i915->mm.free_lock);
4285         }
4286         if (unlikely(freed)) {
4287                 freed->next = NULL;
4288                 __i915_gem_free_objects(i915, freed);
4289         }
4290 }
4291
4292 static void __i915_gem_free_work(struct work_struct *work)
4293 {
4294         struct drm_i915_private *i915 =
4295                 container_of(work, struct drm_i915_private, mm.free_work);
4296         struct llist_node *freed;
4297
4298         /*
4299          * All file-owned VMA should have been released by this point through
4300          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4301          * However, the object may also be bound into the global GTT (e.g.
4302          * older GPUs without per-process support, or for direct access through
4303          * the GTT either for the user or for scanout). Those VMA still need to
4304          * unbound now.
4305          * be unbound now.
4306
4307         spin_lock(&i915->mm.free_lock);
4308         while ((freed = llist_del_all(&i915->mm.free_list))) {
4309                 spin_unlock(&i915->mm.free_lock);
4310
4311                 __i915_gem_free_objects(i915, freed);
4312                 if (need_resched())
4313                         return;
4314
4315                 spin_lock(&i915->mm.free_lock);
4316         }
4317         spin_unlock(&i915->mm.free_lock);
4318 }
4319
4320 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4321 {
4322         struct drm_i915_gem_object *obj =
4323                 container_of(head, typeof(*obj), rcu);
4324         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4325
4326         /*
4327          * We reuse obj->rcu for the freed list, so we had better not treat
4328          * it like a rcu_head from this point forwards. And we expect all
4329          * objects to be freed via this path.
4330          */
4331         destroy_rcu_head(&obj->rcu);
4332
4333         /*
4334          * Since we require blocking on struct_mutex to unbind the freed
4335          * object from the GPU before releasing resources back to the
4336          * system, we can not do that directly from the RCU callback (which may
4337          * be a softirq context), but must instead then defer that work onto a
4338          * kthread. We use the RCU callback rather than move the freed object
4339          * directly onto the work queue so that we can mix between using the
4340          * worker and performing frees directly from subsequent allocations for
4341          * crude but effective memory throttling.
4342          */
4343         if (llist_add(&obj->freed, &i915->mm.free_list))
4344                 queue_work(i915->wq, &i915->mm.free_work);
4345 }
4346
4347 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4348 {
4349         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4350
4351         if (obj->mm.quirked)
4352                 __i915_gem_object_unpin_pages(obj);
4353
4354         if (discard_backing_storage(obj))
4355                 obj->mm.madv = I915_MADV_DONTNEED;
4356
4357         /*
4358          * Before we free the object, make sure any pure RCU-only
4359          * read-side critical sections are complete, e.g.
4360          * i915_gem_busy_ioctl(). For the corresponding synchronized
4361          * lookup see i915_gem_object_lookup_rcu().
4362          */
4363         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4364         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4365 }
4366
4367 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4368 {
4369         lockdep_assert_held(&obj->base.dev->struct_mutex);
4370
4371         if (!i915_gem_object_has_active_reference(obj) &&
4372             i915_gem_object_is_active(obj))
4373                 i915_gem_object_set_active_reference(obj);
4374         else
4375                 i915_gem_object_put(obj);
4376 }
4377
4378 void i915_gem_sanitize(struct drm_i915_private *i915)
4379 {
4380         intel_wakeref_t wakeref;
4381
4382         GEM_TRACE("\n");
4383
4384         wakeref = intel_runtime_pm_get(i915);
4385         intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
4386
4387         /*
4388          * As we have just resumed the machine and woken the device up from
4389          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
4390          * back to defaults, recovering from whatever wedged state we left it
4391          * in, and so it is worth trying to use the device once more.
4392          */
4393         if (i915_terminally_wedged(i915))
4394                 i915_gem_unset_wedged(i915);
4395
4396         /*
4397          * If we inherit context state from the BIOS or earlier occupants
4398          * of the GPU, the GPU may be in an inconsistent state when we
4399          * try to take over. The only way to remove the earlier state
4400          * is by resetting. However, resetting on earlier gen is tricky as
4401          * it may impact the display and we are uncertain about the stability
4402          * of the reset, so this could be applied to even earlier gen.
4403          */
4404         intel_engines_sanitize(i915, false);
4405
4406         intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);
4407         intel_runtime_pm_put(i915, wakeref);
4408
4409         mutex_lock(&i915->drm.struct_mutex);
4410         i915_gem_contexts_lost(i915);
4411         mutex_unlock(&i915->drm.struct_mutex);
4412 }
4413
4414 void i915_gem_suspend(struct drm_i915_private *i915)
4415 {
4416         intel_wakeref_t wakeref;
4417
4418         GEM_TRACE("\n");
4419
4420         wakeref = intel_runtime_pm_get(i915);
4421
4422         flush_workqueue(i915->wq);
4423
4424         mutex_lock(&i915->drm.struct_mutex);
4425
4426         /*
4427          * We have to flush all the executing contexts to main memory so
4428          * that they can be saved in the hibernation image. To ensure the last
4429          * context image is coherent, we have to switch away from it. That
4430          * leaves the i915->kernel_context still active when
4431          * we actually suspend, and its image in memory may not match the GPU
4432          * state. Fortunately, the kernel_context is disposable and we do
4433          * not rely on its state.
4434          */
4435         switch_to_kernel_context_sync(i915, i915->gt.active_engines);
4436
4437         mutex_unlock(&i915->drm.struct_mutex);
4438         i915_reset_flush(i915);
4439
4440         drain_delayed_work(&i915->gt.retire_work);
4441
4442         /*
4443          * As the idle_work rearms itself if it detects a race, play safe and
4444          * repeat the flush until it is definitely idle.
4445          */
4446         drain_delayed_work(&i915->gt.idle_work);
4447
4448         /*
4449          * Assert that we successfully flushed all the work and
4450          * reset the GPU back to its idle, low power state.
4451          */
4452         GEM_BUG_ON(i915->gt.awake);
4453
4454         intel_uc_suspend(i915);
4455
4456         intel_runtime_pm_put(i915, wakeref);
4457 }
4458
4459 void i915_gem_suspend_late(struct drm_i915_private *i915)
4460 {
4461         struct drm_i915_gem_object *obj;
4462         struct list_head *phases[] = {
4463                 &i915->mm.unbound_list,
4464                 &i915->mm.bound_list,
4465                 NULL
4466         }, **phase;
4467
4468         /*
4469          * Neither the BIOS, ourselves nor any other kernel
4470          * expects the system to be in execlists mode on startup,
4471          * so we need to reset the GPU back to legacy mode. And the only
4472          * known way to disable logical contexts is through a GPU reset.
4473          *
4474          * So in order to leave the system in a known default configuration,
4475          * always reset the GPU upon unload and suspend. Afterwards we then
4476          * clean up the GEM state tracking, flushing off the requests and
4477          * leaving the system in a known idle state.
4478          *
4479          * Note that it is of the utmost importance that the GPU is idle and
4480          * all stray writes are flushed *before* we dismantle the backing
4481          * storage for the pinned objects.
4482          *
4483          * However, since we are uncertain that resetting the GPU on older
4484          * machines is a good idea, we don't - just in case it leaves the
4485          * machine in an unusable condition.
4486          */
4487
4488         mutex_lock(&i915->drm.struct_mutex);
4489         for (phase = phases; *phase; phase++) {
4490                 list_for_each_entry(obj, *phase, mm.link)
4491                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
4492         }
4493         mutex_unlock(&i915->drm.struct_mutex);
4494
4495         intel_uc_sanitize(i915);
4496         i915_gem_sanitize(i915);
4497 }
4498
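/*
 * Bring the GPU back up after system resume: restore the global GTT mappings
 * and fence registers, re-run the hardware initialisation and uC resume, then
 * reload a context for powersaving. If any step fails, the GPU is declared
 * wedged.
 */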
4499 void i915_gem_resume(struct drm_i915_private *i915)
4500 {
4501         GEM_TRACE("\n");
4502
4503         WARN_ON(i915->gt.awake);
4504
4505         mutex_lock(&i915->drm.struct_mutex);
4506         intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
4507
4508         i915_gem_restore_gtt_mappings(i915);
4509         i915_gem_restore_fences(i915);
4510
4511         /*
4512          * As we didn't flush the kernel context before suspend, we cannot
4513          * guarantee that the context image is complete. So let's just reset
4514          * it and start again.
4515          */
4516         intel_gt_resume(i915);
4517
4518         if (i915_gem_init_hw(i915))
4519                 goto err_wedged;
4520
4521         intel_uc_resume(i915);
4522
4523         /* Always reload a context for powersaving. */
4524         if (!load_power_context(i915))
4525                 goto err_wedged;
4526
4527 out_unlock:
4528         intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);
4529         mutex_unlock(&i915->drm.struct_mutex);
4530         return;
4531
4532 err_wedged:
4533         if (!i915_reset_failed(i915)) {
4534                 dev_err(i915->drm.dev,
4535                         "Failed to re-initialize GPU, declaring it wedged!\n");
4536                 i915_gem_set_wedged(i915);
4537         }
4538         goto out_unlock;
4539 }
4540
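/*
 * Enable tiled-surface swizzling in the display and GT arbiters on gen5+
 * when bit-6 swizzling is in use; a no-op when swizzling is disabled.
 */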
4541 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
4542 {
4543         if (INTEL_GEN(dev_priv) < 5 ||
4544             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
4545                 return;
4546
4547         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
4548                                  DISP_TILE_SURFACE_SWIZZLING);
4549
4550         if (IS_GEN(dev_priv, 5))
4551                 return;
4552
4553         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
4554         if (IS_GEN(dev_priv, 6))
4555                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
4556         else if (IS_GEN(dev_priv, 7))
4557                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
4558         else if (IS_GEN(dev_priv, 8))
4559                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
4560         else
4561                 BUG();
4562 }
4563
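/*
 * Zero the control, head, tail and start registers of a ring we never use,
 * so that a stale ring cannot keep the GPU from idling (see the caller in
 * i915_gem_init_hw()).
 */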
4564 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
4565 {
4566         I915_WRITE(RING_CTL(base), 0);
4567         I915_WRITE(RING_HEAD(base), 0);
4568         I915_WRITE(RING_TAIL(base), 0);
4569         I915_WRITE(RING_START(base), 0);
4570 }
4571
4572 static void init_unused_rings(struct drm_i915_private *dev_priv)
4573 {
4574         if (IS_I830(dev_priv)) {
4575                 init_unused_ring(dev_priv, PRB1_BASE);
4576                 init_unused_ring(dev_priv, SRB0_BASE);
4577                 init_unused_ring(dev_priv, SRB1_BASE);
4578                 init_unused_ring(dev_priv, SRB2_BASE);
4579                 init_unused_ring(dev_priv, SRB3_BASE);
4580         } else if (IS_GEN(dev_priv, 2)) {
4581                 init_unused_ring(dev_priv, SRB0_BASE);
4582                 init_unused_ring(dev_priv, SRB1_BASE);
4583         } else if (IS_GEN(dev_priv, 3)) {
4584                 init_unused_ring(dev_priv, PRB1_BASE);
4585                 init_unused_ring(dev_priv, PRB2_BASE);
4586         }
4587 }
4588
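/*
 * Re-run every engine's init_hw() hook after a reset or resume; any failure
 * aborts the restart. On success, re-derive the scheduler capabilities.
 */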
4589 static int __i915_gem_restart_engines(void *data)
4590 {
4591         struct drm_i915_private *i915 = data;
4592         struct intel_engine_cs *engine;
4593         enum intel_engine_id id;
4594         int err;
4595
4596         for_each_engine(engine, i915, id) {
4597                 err = engine->init_hw(engine);
4598                 if (err) {
4599                         DRM_ERROR("Failed to restart %s (%d)\n",
4600                                   engine->name, err);
4601                         return err;
4602                 }
4603         }
4604
4605         intel_engines_set_scheduler_caps(i915);
4606
4607         return 0;
4608 }
4609
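/*
 * (Re)initialise the GPU from scratch: apply the GT workarounds and swizzling
 * setup, quiesce the unused rings, then bring up the PPGTT, WOPCM and uC
 * firmware before finally restarting the engines. Called from both the first
 * init and the resume paths.
 */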
4610 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
4611 {
4612         int ret;
4613
4614         dev_priv->gt.last_init_time = ktime_get();
4615
4616         /* Double layer security blanket, see i915_gem_init() */
4617         intel_uncore_forcewake_get(&dev_priv->uncore, FORCEWAKE_ALL);
4618
4619         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
4620                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
4621
4622         if (IS_HASWELL(dev_priv))
4623                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
4624                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
4625
4626         /* Apply the GT workarounds... */
4627         intel_gt_apply_workarounds(dev_priv);
4628         /* ...and determine whether they are sticking. */
4629         intel_gt_verify_workarounds(dev_priv, "init");
4630
4631         i915_gem_init_swizzling(dev_priv);
4632
4633         /*
4634          * At least 830 can leave some of the unused rings
4635          * "active" (i.e. head != tail) after resume which
4636          * will prevent c3 entry. Make sure all unused rings
4637          * are totally idle.
4638          */
4639         init_unused_rings(dev_priv);
4640
4641         BUG_ON(!dev_priv->kernel_context);
4642         ret = i915_terminally_wedged(dev_priv);
4643         if (ret)
4644                 goto out;
4645
4646         ret = i915_ppgtt_init_hw(dev_priv);
4647         if (ret) {
4648                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
4649                 goto out;
4650         }
4651
4652         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
4653         if (ret) {
4654                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
4655                 goto out;
4656         }
4657
4658         /* We can't enable contexts until all firmware is loaded */
4659         ret = intel_uc_init_hw(dev_priv);
4660         if (ret) {
4661                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
4662                 goto out;
4663         }
4664
4665         intel_mocs_init_l3cc_table(dev_priv);
4666
4667         /* Only when the HW is re-initialised can we replay the requests */
4668         ret = __i915_gem_restart_engines(dev_priv);
4669         if (ret)
4670                 goto cleanup_uc;
4671
4672         intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
4673
4674         return 0;
4675
4676 cleanup_uc:
4677         intel_uc_fini_hw(dev_priv);
4678 out:
4679         intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
4680
4681         return ret;
4682 }
4683
4684 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
4685 {
4686         struct i915_gem_context *ctx;
4687         struct intel_engine_cs *engine;
4688         enum intel_engine_id id;
4689         int err = 0;
4690
4691         /*
4692          * As we reset the GPU during very early sanitisation, the current
4693          * register state on the GPU should reflect its default values.
4694          * We load a context onto the hw (with restore-inhibit), then switch
4695          * over to a second context to save that default register state. We
4696          * can then prime every new context with that state so they all start
4697          * from the same default HW values.
4698          */
4699
4700         ctx = i915_gem_context_create_kernel(i915, 0);
4701         if (IS_ERR(ctx))
4702                 return PTR_ERR(ctx);
4703
4704         for_each_engine(engine, i915, id) {
4705                 struct i915_request *rq;
4706
4707                 rq = i915_request_alloc(engine, ctx);
4708                 if (IS_ERR(rq)) {
4709                         err = PTR_ERR(rq);
4710                         goto out_ctx;
4711                 }
4712
4713                 err = 0;
4714                 if (engine->init_context)
4715                         err = engine->init_context(rq);
4716
4717                 i915_request_add(rq);
4718                 if (err)
4719                         goto err_active;
4720         }
4721
4722         /* Flush the default context image to memory, and enable powersaving. */
4723         if (!load_power_context(i915)) {
4724                 err = -EIO;
4725                 goto err_active;
4726         }
4727
4728         for_each_engine(engine, i915, id) {
4729                 struct intel_context *ce;
4730                 struct i915_vma *state;
4731                 void *vaddr;
4732
4733                 ce = intel_context_lookup(ctx, engine);
4734                 if (!ce)
4735                         continue;
4736
4737                 state = ce->state;
4738                 if (!state)
4739                         continue;
4740
4741                 GEM_BUG_ON(intel_context_is_pinned(ce));
4742
4743                 /*
4744                  * As we will hold a reference to the logical state, it will
4745                  * not be torn down with the context, and importantly the
4746                  * object will hold onto its vma (making it possible for a
4747                  * stray GTT write to corrupt our defaults). Unmap the vma
4748                  * from the GTT to prevent such accidents and reclaim the
4749                  * space.
4750                  */
4751                 err = i915_vma_unbind(state);
4752                 if (err)
4753                         goto err_active;
4754
4755                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
4756                 if (err)
4757                         goto err_active;
4758
4759                 engine->default_state = i915_gem_object_get(state->obj);
4760                 i915_gem_object_set_cache_coherency(engine->default_state,
4761                                                     I915_CACHE_LLC);
4762
4763                 /* Check we can acquire the image of the context state */
4764                 vaddr = i915_gem_object_pin_map(engine->default_state,
4765                                                 I915_MAP_FORCE_WB);
4766                 if (IS_ERR(vaddr)) {
4767                         err = PTR_ERR(vaddr);
4768                         goto err_active;
4769                 }
4770
4771                 i915_gem_object_unpin_map(engine->default_state);
4772         }
4773
4774         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
4775                 unsigned int found = intel_engines_has_context_isolation(i915);
4776
4777                 /*
4778                  * Make sure that classes with multiple engine instances all
4779                  * share the same basic configuration.
4780                  */
4781                 for_each_engine(engine, i915, id) {
4782                         unsigned int bit = BIT(engine->uabi_class);
4783                         unsigned int expected = engine->default_state ? bit : 0;
4784
4785                         if ((found & bit) != expected) {
4786                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
4787                                           engine->uabi_class, engine->name);
4788                         }
4789                 }
4790         }
4791
4792 out_ctx:
4793         i915_gem_context_set_closed(ctx);
4794         i915_gem_context_put(ctx);
4795         return err;
4796
4797 err_active:
4798         /*
4799          * If we have to abandon now, we expect the engines to be idle
4800          * and ready to be torn down. The quickest way we can accomplish
4801          * this is by declaring ourselves wedged.
4802          */
4803         i915_gem_set_wedged(i915);
4804         goto out_ctx;
4805 }
4806
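/*
 * Allocate the global scratch page (preferring stolen memory, falling back
 * to an internal object) and pin it high in the global GTT.
 */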
4807 static int
4808 i915_gem_init_scratch(struct drm_i915_private *i915, unsigned int size)
4809 {
4810         struct drm_i915_gem_object *obj;
4811         struct i915_vma *vma;
4812         int ret;
4813
4814         obj = i915_gem_object_create_stolen(i915, size);
4815         if (!obj)
4816                 obj = i915_gem_object_create_internal(i915, size);
4817         if (IS_ERR(obj)) {
4818                 DRM_ERROR("Failed to allocate scratch page\n");
4819                 return PTR_ERR(obj);
4820         }
4821
4822         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
4823         if (IS_ERR(vma)) {
4824                 ret = PTR_ERR(vma);
4825                 goto err_unref;
4826         }
4827
4828         ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
4829         if (ret)
4830                 goto err_unref;
4831
4832         i915->gt.scratch = vma;
4833         return 0;
4834
4835 err_unref:
4836         i915_gem_object_put(obj);
4837         return ret;
4838 }
4839
4840 static void i915_gem_fini_scratch(struct drm_i915_private *i915)
4841 {
4842         i915_vma_unpin_and_release(&i915->gt.scratch, 0);
4843 }
4844
4845 int i915_gem_init(struct drm_i915_private *dev_priv)
4846 {
4847         int ret;
4848
4849         /* We need to fall back to 4K pages if the host doesn't support huge gtt. */
4850         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
4851                 mkwrite_device_info(dev_priv)->page_sizes =
4852                         I915_GTT_PAGE_SIZE_4K;
4853
4854         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
4855
4856         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv))
4857                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
4858         else
4859                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
4860
4861         i915_timelines_init(dev_priv);
4862
4863         ret = i915_gem_init_userptr(dev_priv);
4864         if (ret)
4865                 return ret;
4866
4867         ret = intel_uc_init_misc(dev_priv);
4868         if (ret)
4869                 return ret;
4870
4871         ret = intel_wopcm_init(&dev_priv->wopcm);
4872         if (ret)
4873                 goto err_uc_misc;
4874
4875         /* This is just a security blanket to placate dragons.
4876          * On some systems, we very sporadically observe that the first TLBs
4877          * used by the CS may be stale, despite us poking the TLB reset. If
4878          * we hold the forcewake during initialisation these problems
4879          * just magically go away.
4880          */
4881         mutex_lock(&dev_priv->drm.struct_mutex);
4882         intel_uncore_forcewake_get(&dev_priv->uncore, FORCEWAKE_ALL);
4883
4884         ret = i915_gem_init_ggtt(dev_priv);
4885         if (ret) {
4886                 GEM_BUG_ON(ret == -EIO);
4887                 goto err_unlock;
4888         }
4889
4890         ret = i915_gem_init_scratch(dev_priv,
4891                                     IS_GEN(dev_priv, 2) ? SZ_256K : PAGE_SIZE);
4892         if (ret) {
4893                 GEM_BUG_ON(ret == -EIO);
4894                 goto err_ggtt;
4895         }
4896
4897         ret = i915_gem_contexts_init(dev_priv);
4898         if (ret) {
4899                 GEM_BUG_ON(ret == -EIO);
4900                 goto err_scratch;
4901         }
4902
4903         ret = intel_engines_init(dev_priv);
4904         if (ret) {
4905                 GEM_BUG_ON(ret == -EIO);
4906                 goto err_context;
4907         }
4908
4909         intel_init_gt_powersave(dev_priv);
4910
4911         ret = intel_uc_init(dev_priv);
4912         if (ret)
4913                 goto err_pm;
4914
4915         ret = i915_gem_init_hw(dev_priv);
4916         if (ret)
4917                 goto err_uc_init;
4918
4919         /*
4920          * Despite its name, intel_init_clock_gating applies display clock
4921          * gating workarounds, GT mmio workarounds and the occasional
4922          * GT power context workaround. Worse, sometimes it includes a context
4923          * register workaround which we need to apply before we record the
4924          * default HW state for all contexts.
4925          *
4926          * FIXME: break up the workarounds and apply them at the right time!
4927          */
4928         intel_init_clock_gating(dev_priv);
4929
4930         ret = __intel_engines_record_defaults(dev_priv);
4931         if (ret)
4932                 goto err_init_hw;
4933
4934         if (i915_inject_load_failure()) {
4935                 ret = -ENODEV;
4936                 goto err_init_hw;
4937         }
4938
4939         if (i915_inject_load_failure()) {
4940                 ret = -EIO;
4941                 goto err_init_hw;
4942         }
4943
4944         intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
4945         mutex_unlock(&dev_priv->drm.struct_mutex);
4946
4947         return 0;
4948
4949         /*
4950          * Unwinding is complicated by the fact that we want to handle -EIO to
4951          * mean disable GPU submission but keep KMS alive. We want to mark the
4952          * HW as irreversibly wedged, but keep enough state around that the
4953          * driver doesn't explode during runtime.
4954          */
4955 err_init_hw:
4956         mutex_unlock(&dev_priv->drm.struct_mutex);
4957
4958         i915_gem_suspend(dev_priv);
4959         i915_gem_suspend_late(dev_priv);
4960
4961         i915_gem_drain_workqueue(dev_priv);
4962
4963         mutex_lock(&dev_priv->drm.struct_mutex);
4964         intel_uc_fini_hw(dev_priv);
4965 err_uc_init:
4966         intel_uc_fini(dev_priv);
4967 err_pm:
4968         if (ret != -EIO) {
4969                 intel_cleanup_gt_powersave(dev_priv);
4970                 i915_gem_cleanup_engines(dev_priv);
4971         }
4972 err_context:
4973         if (ret != -EIO)
4974                 i915_gem_contexts_fini(dev_priv);
4975 err_scratch:
4976         i915_gem_fini_scratch(dev_priv);
4977 err_ggtt:
4978 err_unlock:
4979         intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
4980         mutex_unlock(&dev_priv->drm.struct_mutex);
4981
4982 err_uc_misc:
4983         intel_uc_fini_misc(dev_priv);
4984
4985         if (ret != -EIO) {
4986                 i915_gem_cleanup_userptr(dev_priv);
4987                 i915_timelines_fini(dev_priv);
4988         }
4989
4990         if (ret == -EIO) {
4991                 mutex_lock(&dev_priv->drm.struct_mutex);
4992
4993                 /*
4994                  * Allow engine initialisation to fail by marking the GPU as
4995                  * wedged. But we only want to do this where the GPU is angry,
4996                  * for all other failures, such as an allocation failure, we bail.
4997                  */
4998                 if (!i915_reset_failed(dev_priv)) {
4999                         i915_load_error(dev_priv,
5000                                         "Failed to initialize GPU, declaring it wedged!\n");
5001                         i915_gem_set_wedged(dev_priv);
5002                 }
5003
5004                 /* Minimal basic recovery for KMS */
5005                 ret = i915_ggtt_enable_hw(dev_priv);
5006                 i915_gem_restore_gtt_mappings(dev_priv);
5007                 i915_gem_restore_fences(dev_priv);
5008                 intel_init_clock_gating(dev_priv);
5009
5010                 mutex_unlock(&dev_priv->drm.struct_mutex);
5011         }
5012
5013         i915_gem_drain_freed_objects(dev_priv);
5014         return ret;
5015 }
5016
5017 void i915_gem_fini(struct drm_i915_private *dev_priv)
5018 {
5019         i915_gem_suspend_late(dev_priv);
5020         intel_disable_gt_powersave(dev_priv);
5021
5022         /* Flush any outstanding unpin_work. */
5023         i915_gem_drain_workqueue(dev_priv);
5024
5025         mutex_lock(&dev_priv->drm.struct_mutex);
5026         intel_uc_fini_hw(dev_priv);
5027         intel_uc_fini(dev_priv);
5028         i915_gem_cleanup_engines(dev_priv);
5029         i915_gem_contexts_fini(dev_priv);
5030         i915_gem_fini_scratch(dev_priv);
5031         mutex_unlock(&dev_priv->drm.struct_mutex);
5032
5033         intel_wa_list_free(&dev_priv->gt_wa_list);
5034
5035         intel_cleanup_gt_powersave(dev_priv);
5036
5037         intel_uc_fini_misc(dev_priv);
5038         i915_gem_cleanup_userptr(dev_priv);
5039         i915_timelines_fini(dev_priv);
5040
5041         i915_gem_drain_freed_objects(dev_priv);
5042
5043         WARN_ON(!list_empty(&dev_priv->contexts.list));
5044 }
5045
5046 void i915_gem_init_mmio(struct drm_i915_private *i915)
5047 {
5048         i915_gem_sanitize(i915);
5049 }
5050
5051 void
5052 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5053 {
5054         struct intel_engine_cs *engine;
5055         enum intel_engine_id id;
5056
5057         for_each_engine(engine, dev_priv, id)
5058                 dev_priv->gt.cleanup_engine(engine);
5059 }
5060
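/*
 * Determine how many fence registers this platform provides (32 on gen7+
 * except VLV/CHV, 16 on gen4+ and a few gen3 variants, otherwise 8; a vGPU
 * reports its own allocation), initialise them and detect the bit-6 swizzle
 * pattern.
 */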
5061 void
5062 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5063 {
5064         int i;
5065
5066         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5067             !IS_CHERRYVIEW(dev_priv))
5068                 dev_priv->num_fence_regs = 32;
5069         else if (INTEL_GEN(dev_priv) >= 4 ||
5070                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5071                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5072                 dev_priv->num_fence_regs = 16;
5073         else
5074                 dev_priv->num_fence_regs = 8;
5075
5076         if (intel_vgpu_active(dev_priv))
5077                 dev_priv->num_fence_regs =
5078                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5079
5080         /* Initialize fence registers to zero */
5081         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5082                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5083
5084                 fence->i915 = dev_priv;
5085                 fence->id = i;
5086                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5087         }
5088         i915_gem_restore_fences(dev_priv);
5089
5090         i915_gem_detect_bit_6_swizzle(dev_priv);
5091 }
5092
5093 static void i915_gem_init__mm(struct drm_i915_private *i915)
5094 {
5095         spin_lock_init(&i915->mm.object_stat_lock);
5096         spin_lock_init(&i915->mm.obj_lock);
5097         spin_lock_init(&i915->mm.free_lock);
5098
5099         init_llist_head(&i915->mm.free_list);
5100
5101         INIT_LIST_HEAD(&i915->mm.unbound_list);
5102         INIT_LIST_HEAD(&i915->mm.bound_list);
5103         INIT_LIST_HEAD(&i915->mm.fence_list);
5104         INIT_LIST_HEAD(&i915->mm.userfault_list);
5105
5106         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5107 }
5108
5109 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5110 {
5111         int err;
5112
5113         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5114         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5115
5116         i915_gem_init__mm(dev_priv);
5117
5118         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5119                           i915_gem_retire_work_handler);
5120         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5121                           i915_gem_idle_work_handler);
5122         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5123         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5124         mutex_init(&dev_priv->gpu_error.wedge_mutex);
5125         init_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
5126
5127         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5128
5129         spin_lock_init(&dev_priv->fb_tracking.lock);
5130
5131         err = i915_gemfs_init(dev_priv);
5132         if (err)
5133                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled (%d).\n", err);
5134
5135         return 0;
5136 }
5137
5138 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5139 {
5140         i915_gem_drain_freed_objects(dev_priv);
5141         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5142         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5143         WARN_ON(dev_priv->mm.object_count);
5144
5145         cleanup_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
5146
5147         i915_gemfs_fini(dev_priv);
5148 }
5149
5150 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5151 {
5152         /* Discard all purgeable objects, let userspace recover those as
5153          * required after resuming.
5154          */
5155         i915_gem_shrink_all(dev_priv);
5156
5157         return 0;
5158 }
5159
5160 int i915_gem_freeze_late(struct drm_i915_private *i915)
5161 {
5162         struct drm_i915_gem_object *obj;
5163         struct list_head *phases[] = {
5164                 &i915->mm.unbound_list,
5165                 &i915->mm.bound_list,
5166                 NULL
5167         }, **phase;
5168
5169         /*
5170          * Called just before we write the hibernation image.
5171          *
5172          * We need to update the domain tracking to reflect that the CPU
5173          * will be accessing all the pages to create and restore from the
5174          * hibernation, and so upon restoration those pages will be in the
5175          * hibernation image, and so upon restoration those pages will be in the
5176          *
5177          * To make sure the hibernation image contains the latest state,
5178          * we update that state just before writing out the image.
5179          *
5180          * To try and reduce the hibernation image, we manually shrink
5181          * the objects as well, see i915_gem_freeze()
5182          */
5183
5184         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5185         i915_gem_drain_freed_objects(i915);
5186
5187         mutex_lock(&i915->drm.struct_mutex);
5188         for (phase = phases; *phase; phase++) {
5189                 list_for_each_entry(obj, *phase, mm.link)
5190                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5191         }
5192         mutex_unlock(&i915->drm.struct_mutex);
5193
5194         return 0;
5195 }
5196
5197 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5198 {
5199         struct drm_i915_file_private *file_priv = file->driver_priv;
5200         struct i915_request *request;
5201
5202         /* Clean up our request list when the client is going away, so that
5203          * later retire_requests won't dereference our soon-to-be-gone
5204          * file_priv.
5205          */
5206         spin_lock(&file_priv->mm.lock);
5207         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5208                 request->file_priv = NULL;
5209         spin_unlock(&file_priv->mm.lock);
5210 }
5211
5212 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5213 {
5214         struct drm_i915_file_private *file_priv;
5215         int ret;
5216
5217         DRM_DEBUG("\n");
5218
5219         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5220         if (!file_priv)
5221                 return -ENOMEM;
5222
5223         file->driver_priv = file_priv;
5224         file_priv->dev_priv = i915;
5225         file_priv->file = file;
5226
5227         spin_lock_init(&file_priv->mm.lock);
5228         INIT_LIST_HEAD(&file_priv->mm.request_list);
5229
5230         file_priv->bsd_engine = -1;
5231         file_priv->hang_timestamp = jiffies;
5232
5233         ret = i915_gem_context_open(i915, file);
5234         if (ret)
5235                 kfree(file_priv);
5236
5237         return ret;
5238 }
5239
5240 /**
5241  * i915_gem_track_fb - update frontbuffer tracking
5242  * @old: current GEM buffer for the frontbuffer slots
5243  * @new: new GEM buffer for the frontbuffer slots
5244  * @frontbuffer_bits: bitmask of frontbuffer slots
5245  *
5246  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5247  * from @old and setting them in @new. Both @old and @new can be NULL.
5248  */
5249 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5250                        struct drm_i915_gem_object *new,
5251                        unsigned frontbuffer_bits)
5252 {
5253         /* Control of individual bits within the mask is guarded by
5254          * the owning plane->mutex, i.e. we can never see concurrent
5255          * manipulation of individual bits. But since the bitfield as a whole
5256          * is updated using RMW, we need to use atomics in order to update
5257          * the bits.
5258          */
5259         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5260                      BITS_PER_TYPE(atomic_t));
5261
5262         if (old) {
5263                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5264                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5265         }
5266
5267         if (new) {
5268                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5269                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5270         }
5271 }
5272
5273 /* Allocate a new GEM object and fill it with the supplied data */
5274 struct drm_i915_gem_object *
5275 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5276                                  const void *data, size_t size)
5277 {
5278         struct drm_i915_gem_object *obj;
5279         struct file *file;
5280         size_t offset;
5281         int err;
5282
5283         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5284         if (IS_ERR(obj))
5285                 return obj;
5286
5287         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5288
5289         file = obj->base.filp;
5290         offset = 0;
5291         do {
5292                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5293                 struct page *page;
5294                 void *pgdata, *vaddr;
5295
5296                 err = pagecache_write_begin(file, file->f_mapping,
5297                                             offset, len, 0,
5298                                             &page, &pgdata);
5299                 if (err < 0)
5300                         goto fail;
5301
5302                 vaddr = kmap(page);
5303                 memcpy(vaddr, data, len);
5304                 kunmap(page);
5305
5306                 err = pagecache_write_end(file, file->f_mapping,
5307                                           offset, len, len,
5308                                           page, pgdata);
5309                 if (err < 0)
5310                         goto fail;
5311
5312                 size -= len;
5313                 data += len;
5314                 offset += len;
5315         } while (size);
5316
5317         return obj;
5318
5319 fail:
5320         i915_gem_object_put(obj);
5321         return ERR_PTR(err);
5322 }
5323
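/*
 * Look up the scatterlist entry covering page n of the object's backing
 * store, returning the offset of that page within the entry via *offset.
 * Forward walks populate a radixtree cache so that repeated (including
 * backwards) lookups stay cheap. A minimal sketch of the intended use,
 * assuming the pages are already pinned:
 *
 *	unsigned int offset;
 *	struct scatterlist *sg = i915_gem_object_get_sg(obj, n, &offset);
 *	struct page *page = nth_page(sg_page(sg), offset);
 */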
5324 struct scatterlist *
5325 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5326                        unsigned int n,
5327                        unsigned int *offset)
5328 {
5329         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5330         struct scatterlist *sg;
5331         unsigned int idx, count;
5332
5333         might_sleep();
5334         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5335         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5336
5337         /* As we iterate forward through the sg, we record each entry in a
5338          * radixtree for quick repeated (backwards) lookups. If we have seen
5339          * this index previously, we will have an entry for it.
5340          *
5341          * Initial lookup is O(N), but this is amortized to O(1) for
5342          * sequential page access (where each new request is consecutive
5343          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5344          * i.e. O(1) with a large constant!
5345          */
5346         if (n < READ_ONCE(iter->sg_idx))
5347                 goto lookup;
5348
5349         mutex_lock(&iter->lock);
5350
5351         /* We prefer to reuse the last sg so that repeated lookups of this
5352          * (or the subsequent) sg are fast - comparing against the last
5353          * sg is faster than going through the radixtree.
5354          */
5355
5356         sg = iter->sg_pos;
5357         idx = iter->sg_idx;
5358         count = __sg_page_count(sg);
5359
5360         while (idx + count <= n) {
5361                 void *entry;
5362                 unsigned long i;
5363                 int ret;
5364
5365                 /* If we cannot allocate and insert this entry, or the
5366                  * individual pages from this range, cancel updating the
5367                  * sg_idx so that on this lookup we are forced to linearly
5368                  * scan onwards, but on future lookups we will try the
5369                  * insertion again (in which case we need to be careful of
5370                  * the error return reporting that we have already inserted
5371                  * this index).
5372                  */
5373                 ret = radix_tree_insert(&iter->radix, idx, sg);
5374                 if (ret && ret != -EEXIST)
5375                         goto scan;
5376
5377                 entry = xa_mk_value(idx);
5378                 for (i = 1; i < count; i++) {
5379                         ret = radix_tree_insert(&iter->radix, idx + i, entry);
5380                         if (ret && ret != -EEXIST)
5381                                 goto scan;
5382                 }
5383
5384                 idx += count;
5385                 sg = ____sg_next(sg);
5386                 count = __sg_page_count(sg);
5387         }
5388
5389 scan:
5390         iter->sg_pos = sg;
5391         iter->sg_idx = idx;
5392
5393         mutex_unlock(&iter->lock);
5394
5395         if (unlikely(n < idx)) /* insertion completed by another thread */
5396                 goto lookup;
5397
5398         /* In case we failed to insert the entry into the radixtree, we need
5399          * to look beyond the current sg.
5400          */
5401         while (idx + count <= n) {
5402                 idx += count;
5403                 sg = ____sg_next(sg);
5404                 count = __sg_page_count(sg);
5405         }
5406
5407         *offset = n - idx;
5408         return sg;
5409
5410 lookup:
5411         rcu_read_lock();
5412
5413         sg = radix_tree_lookup(&iter->radix, n);
5414         GEM_BUG_ON(!sg);
5415
5416         /* If this index is in the middle of a multi-page sg entry,
5417          * the radix tree will contain a value entry that points
5418          * to the start of that range. We will return the pointer to
5419          * the base page and the offset of this page within the
5420          * sg entry's range.
5421          */
5422         *offset = 0;
5423         if (unlikely(xa_is_value(sg))) {
5424                 unsigned long base = xa_to_value(sg);
5425
5426                 sg = radix_tree_lookup(&iter->radix, base);
5427                 GEM_BUG_ON(!sg);
5428
5429                 *offset = n - base;
5430         }
5431
5432         rcu_read_unlock();
5433
5434         return sg;
5435 }
5436
5437 struct page *
5438 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
5439 {
5440         struct scatterlist *sg;
5441         unsigned int offset;
5442
5443         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
5444
5445         sg = i915_gem_object_get_sg(obj, n, &offset);
5446         return nth_page(sg_page(sg), offset);
5447 }
5448
5449 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
5450 struct page *
5451 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
5452                                unsigned int n)
5453 {
5454         struct page *page;
5455
5456         page = i915_gem_object_get_page(obj, n);
5457         if (!obj->mm.dirty)
5458                 set_page_dirty(page);
5459
5460         return page;
5461 }
5462
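/* Return the DMA address of page n, derived from its scatterlist entry. */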
5463 dma_addr_t
5464 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
5465                                 unsigned long n)
5466 {
5467         struct scatterlist *sg;
5468         unsigned int offset;
5469
5470         sg = i915_gem_object_get_sg(obj, n, &offset);
5471         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
5472 }
5473
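/*
 * Swap the object's backing store over to the contiguous physical-page ops.
 * The object must be a plain shmem object that is unbound, unmapped, not
 * quirked and still marked WILLNEED; on failure the original pages are
 * restored.
 */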
5474 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
5475 {
5476         struct sg_table *pages;
5477         int err;
5478
5479         if (align > obj->base.size)
5480                 return -EINVAL;
5481
5482         if (obj->ops == &i915_gem_phys_ops)
5483                 return 0;
5484
5485         if (obj->ops != &i915_gem_object_ops)
5486                 return -EINVAL;
5487
5488         err = i915_gem_object_unbind(obj);
5489         if (err)
5490                 return err;
5491
5492         mutex_lock(&obj->mm.lock);
5493
5494         if (obj->mm.madv != I915_MADV_WILLNEED) {
5495                 err = -EFAULT;
5496                 goto err_unlock;
5497         }
5498
5499         if (obj->mm.quirked) {
5500                 err = -EFAULT;
5501                 goto err_unlock;
5502         }
5503
5504         if (obj->mm.mapping) {
5505                 err = -EBUSY;
5506                 goto err_unlock;
5507         }
5508
5509         pages = __i915_gem_object_unset_pages(obj);
5510
5511         obj->ops = &i915_gem_phys_ops;
5512
5513         err = ____i915_gem_object_get_pages(obj);
5514         if (err)
5515                 goto err_xfer;
5516
5517         /* Perma-pin (until release) the physical set of pages */
5518         __i915_gem_object_pin_pages(obj);
5519
5520         if (!IS_ERR_OR_NULL(pages))
5521                 i915_gem_object_ops.put_pages(obj, pages);
5522         mutex_unlock(&obj->mm.lock);
5523         return 0;
5524
5525 err_xfer:
5526         obj->ops = &i915_gem_object_ops;
5527         if (!IS_ERR_OR_NULL(pages)) {
5528                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
5529
5530                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
5531         }
5532 err_unlock:
5533         mutex_unlock(&obj->mm.lock);
5534         return err;
5535 }
5536
5537 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5538 #include "selftests/scatterlist.c"
5539 #include "selftests/mock_gem_device.c"
5540 #include "selftests/huge_gem_object.c"
5541 #include "selftests/huge_pages.c"
5542 #include "selftests/i915_gem_object.c"
5543 #include "selftests/i915_gem_coherency.c"
5544 #include "selftests/i915_gem.c"
5545 #endif