1 /*
2  * Copyright © 2008-2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27
28 #include <drm/drmP.h>
29 #include <drm/drm_vma_manager.h>
30 #include <drm/i915_drm.h>
31 #include "i915_drv.h"
32 #include "i915_gem_clflush.h"
33 #include "i915_vgpu.h"
34 #include "i915_trace.h"
35 #include "intel_drv.h"
36 #include "intel_frontbuffer.h"
37 #include "intel_mocs.h"
38 #include "intel_workarounds.h"
39 #include "i915_gemfs.h"
40 #include <linux/dma-fence-array.h>
41 #include <linux/kthread.h>
42 #include <linux/reservation.h>
43 #include <linux/shmem_fs.h>
44 #include <linux/slab.h>
45 #include <linux/stop_machine.h>
46 #include <linux/swap.h>
47 #include <linux/pci.h>
48 #include <linux/dma-buf.h>
49
50 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
51
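/*
 * Report whether a CPU write needs to be followed by a clflush: no if a
 * flush is already pending (cache_dirty), yes if the object is not
 * cache-coherent for CPU writes, and otherwise only while it is pinned
 * for global (scanout) use and must be kept flushed for the HW.
 */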
52 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
53 {
54         if (obj->cache_dirty)
55                 return false;
56
57         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
58                 return true;
59
60         return obj->pin_global; /* currently in use by HW, keep flushed */
61 }
62
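/*
 * Reserve a node in the CPU-mappable region of the GGTT (below
 * ggtt->mappable_end), preferring low addresses.
 */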
63 static int
64 insert_mappable_node(struct i915_ggtt *ggtt,
65                      struct drm_mm_node *node, u32 size)
66 {
67         memset(node, 0, sizeof(*node));
68         return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
69                                            size, 0, I915_COLOR_UNEVICTABLE,
70                                            0, ggtt->mappable_end,
71                                            DRM_MM_INSERT_LOW);
72 }
73
74 static void
75 remove_mappable_node(struct drm_mm_node *node)
76 {
77         drm_mm_remove_node(node);
78 }
79
80 /* some bookkeeping */
81 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
82                                   u64 size)
83 {
84         spin_lock(&dev_priv->mm.object_stat_lock);
85         dev_priv->mm.object_count++;
86         dev_priv->mm.object_memory += size;
87         spin_unlock(&dev_priv->mm.object_stat_lock);
88 }
89
90 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
91                                      u64 size)
92 {
93         spin_lock(&dev_priv->mm.object_stat_lock);
94         dev_priv->mm.object_count--;
95         dev_priv->mm.object_memory -= size;
96         spin_unlock(&dev_priv->mm.object_stat_lock);
97 }
98
99 static int
100 i915_gem_wait_for_error(struct i915_gpu_error *error)
101 {
102         int ret;
103
104         might_sleep();
105
106         /*
107          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
108          * userspace. If it takes that long, something really bad is going on and
109          * we should simply try to bail out and fail as gracefully as possible.
110          */
111         ret = wait_event_interruptible_timeout(error->reset_queue,
112                                                !i915_reset_backoff(error),
113                                                I915_RESET_TIMEOUT);
114         if (ret == 0) {
115                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
116                 return -EIO;
117         } else if (ret < 0) {
118                 return ret;
119         } else {
120                 return 0;
121         }
122 }
123
124 int i915_mutex_lock_interruptible(struct drm_device *dev)
125 {
126         struct drm_i915_private *dev_priv = to_i915(dev);
127         int ret;
128
129         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
130         if (ret)
131                 return ret;
132
133         ret = mutex_lock_interruptible(&dev->struct_mutex);
134         if (ret)
135                 return ret;
136
137         return 0;
138 }
139
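/*
 * Put the GT to sleep once it is idle: flush any residual interrupt, park
 * the engines, timelines, PMU and vma bookkeeping, drop the GT IRQ power
 * domain and the runtime-pm wakeref, and return the epoch of this park
 * (or I915_EPOCH_INVALID if we were already asleep).
 */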
140 static u32 __i915_gem_park(struct drm_i915_private *i915)
141 {
142         GEM_TRACE("\n");
143
144         lockdep_assert_held(&i915->drm.struct_mutex);
145         GEM_BUG_ON(i915->gt.active_requests);
146         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
147
148         if (!i915->gt.awake)
149                 return I915_EPOCH_INVALID;
150
151         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
152
153         /*
154          * Be paranoid and flush a concurrent interrupt to make sure
155          * we don't reactivate any irq tasklets after parking.
156          *
157          * FIXME: Note that even though we have waited for execlists to be idle,
158          * there may still be an in-flight interrupt even though the CSB
159          * is now empty. synchronize_irq() makes sure that a residual interrupt
160          * is completed before we continue, but it doesn't prevent the HW from
161          * raising a spurious interrupt later. To complete the shield we should
162          * coordinate disabling the CS irq with flushing the interrupts.
163          */
164         synchronize_irq(i915->drm.irq);
165
166         intel_engines_park(i915);
167         i915_timelines_park(i915);
168
169         i915_pmu_gt_parked(i915);
170         i915_vma_parked(i915);
171
172         i915->gt.awake = false;
173
174         if (INTEL_GEN(i915) >= 6)
175                 gen6_rps_idle(i915);
176
177         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);
178
179         intel_runtime_pm_put(i915);
180
181         return i915->gt.epoch;
182 }
183
184 void i915_gem_park(struct drm_i915_private *i915)
185 {
186         GEM_TRACE("\n");
187
188         lockdep_assert_held(&i915->drm.struct_mutex);
189         GEM_BUG_ON(i915->gt.active_requests);
190
191         if (!i915->gt.awake)
192                 return;
193
194         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
195         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
196 }
197
198 void i915_gem_unpark(struct drm_i915_private *i915)
199 {
200         GEM_TRACE("\n");
201
202         lockdep_assert_held(&i915->drm.struct_mutex);
203         GEM_BUG_ON(!i915->gt.active_requests);
204
205         if (i915->gt.awake)
206                 return;
207
208         intel_runtime_pm_get_noresume(i915);
209
210         /*
211          * It seems that the DMC likes to transition between the DC states a lot
212          * when there are no connected displays (no active power domains) during
213          * command submission.
214          *
215          * This activity has a negative impact on the performance of the chip,
216          * with huge latencies observed in the interrupt handler and elsewhere.
217          *
218          * Work around it by grabbing a GT IRQ power domain whilst there is any
219          * GT activity, preventing any DC state transitions.
220          */
221         intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
222
223         i915->gt.awake = true;
224         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
225                 i915->gt.epoch = 1;
226
227         intel_enable_gt_powersave(i915);
228         i915_update_gfx_val(i915);
229         if (INTEL_GEN(i915) >= 6)
230                 gen6_rps_busy(i915);
231         i915_pmu_gt_unparked(i915);
232
233         intel_engines_unpark(i915);
234
235         i915_queue_hangcheck(i915);
236
237         queue_delayed_work(i915->wq,
238                            &i915->gt.retire_work,
239                            round_jiffies_up_relative(HZ));
240 }
241
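/*
 * Report the total GGTT size and an estimate of how much of it is still
 * available (total minus reserved space and pinned vma), so userspace can
 * size its working set.
 */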
242 int
243 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
244                             struct drm_file *file)
245 {
246         struct drm_i915_private *dev_priv = to_i915(dev);
247         struct i915_ggtt *ggtt = &dev_priv->ggtt;
248         struct drm_i915_gem_get_aperture *args = data;
249         struct i915_vma *vma;
250         u64 pinned;
251
252         pinned = ggtt->vm.reserved;
253         mutex_lock(&dev->struct_mutex);
254         list_for_each_entry(vma, &ggtt->vm.active_list, vm_link)
255                 if (i915_vma_is_pinned(vma))
256                         pinned += vma->node.size;
257         list_for_each_entry(vma, &ggtt->vm.inactive_list, vm_link)
258                 if (i915_vma_is_pinned(vma))
259                         pinned += vma->node.size;
260         mutex_unlock(&dev->struct_mutex);
261
262         args->aper_size = ggtt->vm.total;
263         args->aper_available_size = args->aper_size - pinned;
264
265         return 0;
266 }
267
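/*
 * Back the object with a single contiguous DMA allocation: copy each shmem
 * page into the new buffer, clflush it, and publish a one-entry sg_table
 * pointing at the physical allocation.
 */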
268 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
269 {
270         struct address_space *mapping = obj->base.filp->f_mapping;
271         drm_dma_handle_t *phys;
272         struct sg_table *st;
273         struct scatterlist *sg;
274         char *vaddr;
275         int i;
276         int err;
277
278         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
279                 return -EINVAL;
280
281         /* Always aligning to the object size allows a single allocation
282          * to handle all possible callers, and given typical object sizes,
283          * the alignment of the buddy allocation will naturally match.
284          */
285         phys = drm_pci_alloc(obj->base.dev,
286                              roundup_pow_of_two(obj->base.size),
287                              roundup_pow_of_two(obj->base.size));
288         if (!phys)
289                 return -ENOMEM;
290
291         vaddr = phys->vaddr;
292         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
293                 struct page *page;
294                 char *src;
295
296                 page = shmem_read_mapping_page(mapping, i);
297                 if (IS_ERR(page)) {
298                         err = PTR_ERR(page);
299                         goto err_phys;
300                 }
301
302                 src = kmap_atomic(page);
303                 memcpy(vaddr, src, PAGE_SIZE);
304                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
305                 kunmap_atomic(src);
306
307                 put_page(page);
308                 vaddr += PAGE_SIZE;
309         }
310
311         i915_gem_chipset_flush(to_i915(obj->base.dev));
312
313         st = kmalloc(sizeof(*st), GFP_KERNEL);
314         if (!st) {
315                 err = -ENOMEM;
316                 goto err_phys;
317         }
318
319         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
320                 kfree(st);
321                 err = -ENOMEM;
322                 goto err_phys;
323         }
324
325         sg = st->sgl;
326         sg->offset = 0;
327         sg->length = obj->base.size;
328
329         sg_dma_address(sg) = phys->busaddr;
330         sg_dma_len(sg) = obj->base.size;
331
332         obj->phys_handle = phys;
333
334         __i915_gem_object_set_pages(obj, st, sg->length);
335
336         return 0;
337
338 err_phys:
339         drm_pci_free(obj->base.dev, phys);
340
341         return err;
342 }
343
344 static void __start_cpu_write(struct drm_i915_gem_object *obj)
345 {
346         obj->read_domains = I915_GEM_DOMAIN_CPU;
347         obj->write_domain = I915_GEM_DOMAIN_CPU;
348         if (cpu_write_needs_clflush(obj))
349                 obj->cache_dirty = true;
350 }
351
352 static void
353 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
354                                 struct sg_table *pages,
355                                 bool needs_clflush)
356 {
357         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
358
359         if (obj->mm.madv == I915_MADV_DONTNEED)
360                 obj->mm.dirty = false;
361
362         if (needs_clflush &&
363             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
364             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
365                 drm_clflush_sg(pages);
366
367         __start_cpu_write(obj);
368 }
369
370 static void
371 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
372                                struct sg_table *pages)
373 {
374         __i915_gem_object_release_shmem(obj, pages, false);
375
376         if (obj->mm.dirty) {
377                 struct address_space *mapping = obj->base.filp->f_mapping;
378                 char *vaddr = obj->phys_handle->vaddr;
379                 int i;
380
381                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
382                         struct page *page;
383                         char *dst;
384
385                         page = shmem_read_mapping_page(mapping, i);
386                         if (IS_ERR(page))
387                                 continue;
388
389                         dst = kmap_atomic(page);
390                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
391                         memcpy(dst, vaddr, PAGE_SIZE);
392                         kunmap_atomic(dst);
393
394                         set_page_dirty(page);
395                         if (obj->mm.madv == I915_MADV_WILLNEED)
396                                 mark_page_accessed(page);
397                         put_page(page);
398                         vaddr += PAGE_SIZE;
399                 }
400                 obj->mm.dirty = false;
401         }
402
403         sg_free_table(pages);
404         kfree(pages);
405
406         drm_pci_free(obj->base.dev, obj->phys_handle);
407 }
408
409 static void
410 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
411 {
412         i915_gem_object_unpin_pages(obj);
413 }
414
415 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
416         .get_pages = i915_gem_object_get_pages_phys,
417         .put_pages = i915_gem_object_put_pages_phys,
418         .release = i915_gem_object_release_phys,
419 };
420
421 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
422
423 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
424 {
425         struct i915_vma *vma;
426         LIST_HEAD(still_in_list);
427         int ret;
428
429         lockdep_assert_held(&obj->base.dev->struct_mutex);
430
431         /* Closed vma are removed from the obj->vma_list - but they may
432          * still have an active binding on the object. To remove those we
433          * must wait for all rendering to the object to complete (as unbinding
434          * must do anyway), and retire the requests.
435          */
436         ret = i915_gem_object_set_to_cpu_domain(obj, false);
437         if (ret)
438                 return ret;
439
440         while ((vma = list_first_entry_or_null(&obj->vma_list,
441                                                struct i915_vma,
442                                                obj_link))) {
443                 list_move_tail(&vma->obj_link, &still_in_list);
444                 ret = i915_vma_unbind(vma);
445                 if (ret)
446                         break;
447         }
448         list_splice(&still_in_list, &obj->vma_list);
449
450         return ret;
451 }
452
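/*
 * Wait on a single dma-fence. Foreign fences are handed straight to
 * dma_fence_wait_timeout(); i915 requests may first be waitboosted, and are
 * retired on completion when the caller holds struct_mutex.
 */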
453 static long
454 i915_gem_object_wait_fence(struct dma_fence *fence,
455                            unsigned int flags,
456                            long timeout,
457                            struct intel_rps_client *rps_client)
458 {
459         struct i915_request *rq;
460
461         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
462
463         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
464                 return timeout;
465
466         if (!dma_fence_is_i915(fence))
467                 return dma_fence_wait_timeout(fence,
468                                               flags & I915_WAIT_INTERRUPTIBLE,
469                                               timeout);
470
471         rq = to_request(fence);
472         if (i915_request_completed(rq))
473                 goto out;
474
475         /*
476          * This client is about to stall waiting for the GPU. In many cases
477          * this is undesirable and limits the throughput of the system, as
478          * many clients cannot continue processing user input/output whilst
479          * blocked. RPS autotuning may take tens of milliseconds to respond
480          * to the GPU load and thus incurs additional latency for the client.
481          * We can circumvent that by promoting the GPU frequency to maximum
482          * before we wait. This makes the GPU throttle up much more quickly
483          * (good for benchmarks and user experience, e.g. window animations),
484          * but at a cost of spending more power processing the workload
485          * (bad for battery). Not all clients even want their results
486          * immediately and for them we should just let the GPU select its own
487          * frequency to maximise efficiency. To prevent a single client from
488          * forcing the clocks too high for the whole system, we only allow
489          * each client to waitboost once in a busy period.
490          */
491         if (rps_client && !i915_request_started(rq)) {
492                 if (INTEL_GEN(rq->i915) >= 6)
493                         gen6_rps_boost(rq, rps_client);
494         }
495
496         timeout = i915_request_wait(rq, flags, timeout);
497
498 out:
499         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
500                 i915_request_retire_upto(rq);
501
502         return timeout;
503 }
504
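/*
 * Wait on the fences tracked by a reservation object: all shared fences
 * plus the exclusive fence for I915_WAIT_ALL, otherwise just the exclusive
 * fence. If everything signaled and no new fences were added meanwhile, the
 * fence array is opportunistically pruned to drop the floating references.
 */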
505 static long
506 i915_gem_object_wait_reservation(struct reservation_object *resv,
507                                  unsigned int flags,
508                                  long timeout,
509                                  struct intel_rps_client *rps_client)
510 {
511         unsigned int seq = __read_seqcount_begin(&resv->seq);
512         struct dma_fence *excl;
513         bool prune_fences = false;
514
515         if (flags & I915_WAIT_ALL) {
516                 struct dma_fence **shared;
517                 unsigned int count, i;
518                 int ret;
519
520                 ret = reservation_object_get_fences_rcu(resv,
521                                                         &excl, &count, &shared);
522                 if (ret)
523                         return ret;
524
525                 for (i = 0; i < count; i++) {
526                         timeout = i915_gem_object_wait_fence(shared[i],
527                                                              flags, timeout,
528                                                              rps_client);
529                         if (timeout < 0)
530                                 break;
531
532                         dma_fence_put(shared[i]);
533                 }
534
535                 for (; i < count; i++)
536                         dma_fence_put(shared[i]);
537                 kfree(shared);
538
539                 /*
540                  * If both shared fences and an exclusive fence exist,
541                  * then by construction the shared fences must be later
542                  * than the exclusive fence. If we successfully wait for
543                  * all the shared fences, we know that the exclusive fence
544                  * must also be signaled. If all the shared fences are
545                  * signaled, we can prune the array and recover the
546                  * floating references on the fences/requests.
547                  */
548                 prune_fences = count && timeout >= 0;
549         } else {
550                 excl = reservation_object_get_excl_rcu(resv);
551         }
552
553         if (excl && timeout >= 0)
554                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
555                                                      rps_client);
556
557         dma_fence_put(excl);
558
559         /*
560          * Opportunistically prune the fences iff we know they have *all* been
561          * signaled and that the reservation object has not been changed (i.e.
562          * no new fences have been added).
563          */
564         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
565                 if (reservation_object_trylock(resv)) {
566                         if (!__read_seqcount_retry(&resv->seq, seq))
567                                 reservation_object_add_excl_fence(resv, NULL);
568                         reservation_object_unlock(resv);
569                 }
570         }
571
572         return timeout;
573 }
574
575 static void __fence_set_priority(struct dma_fence *fence,
576                                  const struct i915_sched_attr *attr)
577 {
578         struct i915_request *rq;
579         struct intel_engine_cs *engine;
580
581         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
582                 return;
583
584         rq = to_request(fence);
585         engine = rq->engine;
586
587         local_bh_disable();
588         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
589         if (engine->schedule)
590                 engine->schedule(rq, attr);
591         rcu_read_unlock();
592         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
593 }
594
595 static void fence_set_priority(struct dma_fence *fence,
596                                const struct i915_sched_attr *attr)
597 {
598         /* Recurse once into a fence-array */
599         if (dma_fence_is_array(fence)) {
600                 struct dma_fence_array *array = to_dma_fence_array(fence);
601                 int i;
602
603                 for (i = 0; i < array->num_fences; i++)
604                         __fence_set_priority(array->fences[i], attr);
605         } else {
606                 __fence_set_priority(fence, attr);
607         }
608 }
609
610 int
611 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
612                               unsigned int flags,
613                               const struct i915_sched_attr *attr)
614 {
615         struct dma_fence *excl;
616
617         if (flags & I915_WAIT_ALL) {
618                 struct dma_fence **shared;
619                 unsigned int count, i;
620                 int ret;
621
622                 ret = reservation_object_get_fences_rcu(obj->resv,
623                                                         &excl, &count, &shared);
624                 if (ret)
625                         return ret;
626
627                 for (i = 0; i < count; i++) {
628                         fence_set_priority(shared[i], attr);
629                         dma_fence_put(shared[i]);
630                 }
631
632                 kfree(shared);
633         } else {
634                 excl = reservation_object_get_excl_rcu(obj->resv);
635         }
636
637         if (excl) {
638                 fence_set_priority(excl, attr);
639                 dma_fence_put(excl);
640         }
641         return 0;
642 }
643
644 /**
645  * Waits for rendering to the object to be completed
646  * @obj: i915 gem object
647  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
648  * @timeout: how long to wait
649  * @rps_client: client (user process) to charge for any waitboosting
650  */
651 int
652 i915_gem_object_wait(struct drm_i915_gem_object *obj,
653                      unsigned int flags,
654                      long timeout,
655                      struct intel_rps_client *rps_client)
656 {
657         might_sleep();
658 #if IS_ENABLED(CONFIG_LOCKDEP)
659         GEM_BUG_ON(debug_locks &&
660                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
661                    !!(flags & I915_WAIT_LOCKED));
662 #endif
663         GEM_BUG_ON(timeout < 0);
664
665         timeout = i915_gem_object_wait_reservation(obj->resv,
666                                                    flags, timeout,
667                                                    rps_client);
668         return timeout < 0 ? timeout : 0;
669 }
670
671 static struct intel_rps_client *to_rps_client(struct drm_file *file)
672 {
673         struct drm_i915_file_private *fpriv = file->driver_priv;
674
675         return &fpriv->rps_client;
676 }
677
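/* Write userspace data straight into the contiguous phys backing store. */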
678 static int
679 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
680                      struct drm_i915_gem_pwrite *args,
681                      struct drm_file *file)
682 {
683         void *vaddr = obj->phys_handle->vaddr + args->offset;
684         char __user *user_data = u64_to_user_ptr(args->data_ptr);
685
686         /* We manually control the domain here and pretend that it
687          * remains coherent, i.e. in the GTT domain, like shmem_pwrite.
688          */
689         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
690         if (copy_from_user(vaddr, user_data, args->size))
691                 return -EFAULT;
692
693         drm_clflush_virt_range(vaddr, args->size);
694         i915_gem_chipset_flush(to_i915(obj->base.dev));
695
696         intel_fb_obj_flush(obj, ORIGIN_CPU);
697         return 0;
698 }
699
700 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
701 {
702         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
703 }
704
705 void i915_gem_object_free(struct drm_i915_gem_object *obj)
706 {
707         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
708         kmem_cache_free(dev_priv->objects, obj);
709 }
710
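/*
 * Common creation path for the create and dumb-create ioctls: round the
 * size up to a whole page, allocate the object and return a new handle.
 */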
711 static int
712 i915_gem_create(struct drm_file *file,
713                 struct drm_i915_private *dev_priv,
714                 uint64_t size,
715                 uint32_t *handle_p)
716 {
717         struct drm_i915_gem_object *obj;
718         int ret;
719         u32 handle;
720
721         size = roundup(size, PAGE_SIZE);
722         if (size == 0)
723                 return -EINVAL;
724
725         /* Allocate the new object */
726         obj = i915_gem_object_create(dev_priv, size);
727         if (IS_ERR(obj))
728                 return PTR_ERR(obj);
729
730         ret = drm_gem_handle_create(file, &obj->base, &handle);
731         /* drop reference from allocate - handle holds it now */
732         i915_gem_object_put(obj);
733         if (ret)
734                 return ret;
735
736         *handle_p = handle;
737         return 0;
738 }
739
740 int
741 i915_gem_dumb_create(struct drm_file *file,
742                      struct drm_device *dev,
743                      struct drm_mode_create_dumb *args)
744 {
745         /* have to work out size/pitch and return them */
746         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
747         args->size = args->pitch * args->height;
748         return i915_gem_create(file, to_i915(dev),
749                                args->size, &args->handle);
750 }
751
752 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
753 {
754         return !(obj->cache_level == I915_CACHE_NONE ||
755                  obj->cache_level == I915_CACHE_WT);
756 }
757
758 /**
759  * Creates a new mm object and returns a handle to it.
760  * @dev: drm device pointer
761  * @data: ioctl data blob
762  * @file: drm file pointer
763  */
764 int
765 i915_gem_create_ioctl(struct drm_device *dev, void *data,
766                       struct drm_file *file)
767 {
768         struct drm_i915_private *dev_priv = to_i915(dev);
769         struct drm_i915_gem_create *args = data;
770
771         i915_gem_flush_free_objects(dev_priv);
772
773         return i915_gem_create(file, dev_priv,
774                                args->size, &args->handle);
775 }
776
777 static inline enum fb_op_origin
778 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
779 {
780         return (domain == I915_GEM_DOMAIN_GTT ?
781                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
782 }
783
784 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
785 {
786         /*
787          * No actual flushing is required for the GTT write domain for reads
788          * from the GTT domain. Writes to it "immediately" go to main memory
789          * as far as we know, so there's no chipset flush. It also doesn't
790          * land in the GPU render cache.
791          *
792          * However, we do have to enforce the order so that all writes through
793          * the GTT land before any writes to the device, such as updates to
794          * the GATT itself.
795          *
796          * We also have to wait a bit for the writes to land from the GTT.
797          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
798          * timing. This issue has only been observed when switching quickly
799          * between GTT writes and CPU reads from inside the kernel on recent hw,
800          * and it appears to only affect discrete GTT blocks (i.e. on LLC
801          * system agents we could not reproduce this behaviour, at least
802          * until Cannonlake came along!).
803          */
804
805         wmb();
806
807         if (INTEL_INFO(dev_priv)->has_coherent_ggtt)
808                 return;
809
810         i915_gem_chipset_flush(dev_priv);
811
812         intel_runtime_pm_get(dev_priv);
813         spin_lock_irq(&dev_priv->uncore.lock);
814
815         POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
816
817         spin_unlock_irq(&dev_priv->uncore.lock);
818         intel_runtime_pm_put(dev_priv);
819 }
820
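/*
 * Flush any CPU-visible writes for the given write domains before the
 * object changes domain: a GGTT write needs a chipset flush and frontbuffer
 * tracking update, a WC write just a memory barrier, a CPU write a clflush,
 * and a GPU render write is noted by marking the cache dirty.
 */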
821 static void
822 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
823 {
824         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
825         struct i915_vma *vma;
826
827         if (!(obj->write_domain & flush_domains))
828                 return;
829
830         switch (obj->write_domain) {
831         case I915_GEM_DOMAIN_GTT:
832                 i915_gem_flush_ggtt_writes(dev_priv);
833
834                 intel_fb_obj_flush(obj,
835                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
836
837                 for_each_ggtt_vma(vma, obj) {
838                         if (vma->iomap)
839                                 continue;
840
841                         i915_vma_unset_ggtt_write(vma);
842                 }
843                 break;
844
845         case I915_GEM_DOMAIN_WC:
846                 wmb();
847                 break;
848
849         case I915_GEM_DOMAIN_CPU:
850                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
851                 break;
852
853         case I915_GEM_DOMAIN_RENDER:
854                 if (gpu_write_needs_clflush(obj))
855                         obj->cache_dirty = true;
856                 break;
857         }
858
859         obj->write_domain = 0;
860 }
861
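/*
 * Copy to/from userspace while undoing bit17 swizzling: work one 64-byte
 * cacheline at a time, flipping bit 6 of the GPU offset to pick the
 * swizzled channel.
 */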
862 static inline int
863 __copy_to_user_swizzled(char __user *cpu_vaddr,
864                         const char *gpu_vaddr, int gpu_offset,
865                         int length)
866 {
867         int ret, cpu_offset = 0;
868
869         while (length > 0) {
870                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
871                 int this_length = min(cacheline_end - gpu_offset, length);
872                 int swizzled_gpu_offset = gpu_offset ^ 64;
873
874                 ret = __copy_to_user(cpu_vaddr + cpu_offset,
875                                      gpu_vaddr + swizzled_gpu_offset,
876                                      this_length);
877                 if (ret)
878                         return ret + length;
879
880                 cpu_offset += this_length;
881                 gpu_offset += this_length;
882                 length -= this_length;
883         }
884
885         return 0;
886 }
887
888 static inline int
889 __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
890                           const char __user *cpu_vaddr,
891                           int length)
892 {
893         int ret, cpu_offset = 0;
894
895         while (length > 0) {
896                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
897                 int this_length = min(cacheline_end - gpu_offset, length);
898                 int swizzled_gpu_offset = gpu_offset ^ 64;
899
900                 ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
901                                        cpu_vaddr + cpu_offset,
902                                        this_length);
903                 if (ret)
904                         return ret + length;
905
906                 cpu_offset += this_length;
907                 gpu_offset += this_length;
908                 length -= this_length;
909         }
910
911         return 0;
912 }
913
914 /*
915  * Pins the specified object's pages and synchronizes the object with
916  * GPU accesses. Sets needs_clflush to non-zero if the caller should
917  * flush the object from the CPU cache.
918  */
919 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
920                                     unsigned int *needs_clflush)
921 {
922         int ret;
923
924         lockdep_assert_held(&obj->base.dev->struct_mutex);
925
926         *needs_clflush = 0;
927         if (!i915_gem_object_has_struct_page(obj))
928                 return -ENODEV;
929
930         ret = i915_gem_object_wait(obj,
931                                    I915_WAIT_INTERRUPTIBLE |
932                                    I915_WAIT_LOCKED,
933                                    MAX_SCHEDULE_TIMEOUT,
934                                    NULL);
935         if (ret)
936                 return ret;
937
938         ret = i915_gem_object_pin_pages(obj);
939         if (ret)
940                 return ret;
941
942         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
943             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
944                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
945                 if (ret)
946                         goto err_unpin;
947                 else
948                         goto out;
949         }
950
951         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
952
953         /* If we're not in the cpu read domain, set ourselves into the gtt
954          * read domain and manually flush cachelines (if required). This
955          * optimizes for the case when the gpu will dirty the data
956          * anyway again before the next pread happens.
957          */
958         if (!obj->cache_dirty &&
959             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
960                 *needs_clflush = CLFLUSH_BEFORE;
961
962 out:
963         /* return with the pages pinned */
964         return 0;
965
966 err_unpin:
967         i915_gem_object_unpin_pages(obj);
968         return ret;
969 }
970
971 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
972                                      unsigned int *needs_clflush)
973 {
974         int ret;
975
976         lockdep_assert_held(&obj->base.dev->struct_mutex);
977
978         *needs_clflush = 0;
979         if (!i915_gem_object_has_struct_page(obj))
980                 return -ENODEV;
981
982         ret = i915_gem_object_wait(obj,
983                                    I915_WAIT_INTERRUPTIBLE |
984                                    I915_WAIT_LOCKED |
985                                    I915_WAIT_ALL,
986                                    MAX_SCHEDULE_TIMEOUT,
987                                    NULL);
988         if (ret)
989                 return ret;
990
991         ret = i915_gem_object_pin_pages(obj);
992         if (ret)
993                 return ret;
994
995         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
996             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
997                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
998                 if (ret)
999                         goto err_unpin;
1000                 else
1001                         goto out;
1002         }
1003
1004         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
1005
1006         /* If we're not in the cpu write domain, set ourselves into the
1007          * gtt write domain and manually flush cachelines (as required).
1008          * This optimizes for the case when the gpu will use the data
1009          * right away and we therefore have to clflush anyway.
1010          */
1011         if (!obj->cache_dirty) {
1012                 *needs_clflush |= CLFLUSH_AFTER;
1013
1014                 /*
1015                  * Same trick applies to invalidate partially written
1016                  * cachelines read before writing.
1017                  */
1018                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
1019                         *needs_clflush |= CLFLUSH_BEFORE;
1020         }
1021
1022 out:
1023         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1024         obj->mm.dirty = true;
1025         /* return with the pages pinned */
1026         return 0;
1027
1028 err_unpin:
1029         i915_gem_object_unpin_pages(obj);
1030         return ret;
1031 }
1032
1033 static void
1034 shmem_clflush_swizzled_range(char *addr, unsigned long length,
1035                              bool swizzled)
1036 {
1037         if (unlikely(swizzled)) {
1038                 unsigned long start = (unsigned long) addr;
1039                 unsigned long end = (unsigned long) addr + length;
1040
1041                 /* For swizzling simply ensure that we always flush both
1042                  * channels. Lame, but simple and it works. Swizzled
1043                  * pwrite/pread is far from a hotpath - current userspace
1044                  * doesn't use it at all. */
1045                 start = round_down(start, 128);
1046                 end = round_up(end, 128);
1047
1048                 drm_clflush_virt_range((void *)start, end - start);
1049         } else {
1050                 drm_clflush_virt_range(addr, length);
1051         }
1052
1053 }
1054
1055 /* The only difference from the fast-path function is that this one can handle
1056  * bit17 swizzling and uses non-atomic copy and kmap functions. */
1057 static int
1058 shmem_pread_slow(struct page *page, int offset, int length,
1059                  char __user *user_data,
1060                  bool page_do_bit17_swizzling, bool needs_clflush)
1061 {
1062         char *vaddr;
1063         int ret;
1064
1065         vaddr = kmap(page);
1066         if (needs_clflush)
1067                 shmem_clflush_swizzled_range(vaddr + offset, length,
1068                                              page_do_bit17_swizzling);
1069
1070         if (page_do_bit17_swizzling)
1071                 ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
1072         else
1073                 ret = __copy_to_user(user_data, vaddr + offset, length);
1074         kunmap(page);
1075
1076         return ret ? -EFAULT : 0;
1077 }
1078
1079 static int
1080 shmem_pread(struct page *page, int offset, int length, char __user *user_data,
1081             bool page_do_bit17_swizzling, bool needs_clflush)
1082 {
1083         int ret;
1084
1085         ret = -ENODEV;
1086         if (!page_do_bit17_swizzling) {
1087                 char *vaddr = kmap_atomic(page);
1088
1089                 if (needs_clflush)
1090                         drm_clflush_virt_range(vaddr + offset, length);
1091                 ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
1092                 kunmap_atomic(vaddr);
1093         }
1094         if (ret == 0)
1095                 return 0;
1096
1097         return shmem_pread_slow(page, offset, length, user_data,
1098                                 page_do_bit17_swizzling, needs_clflush);
1099 }
1100
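/* Read the object's shmem backing pages into the user buffer, page by page. */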
1101 static int
1102 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1103                      struct drm_i915_gem_pread *args)
1104 {
1105         char __user *user_data;
1106         u64 remain;
1107         unsigned int obj_do_bit17_swizzling;
1108         unsigned int needs_clflush;
1109         unsigned int idx, offset;
1110         int ret;
1111
1112         obj_do_bit17_swizzling = 0;
1113         if (i915_gem_object_needs_bit17_swizzle(obj))
1114                 obj_do_bit17_swizzling = BIT(17);
1115
1116         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1117         if (ret)
1118                 return ret;
1119
1120         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1121         mutex_unlock(&obj->base.dev->struct_mutex);
1122         if (ret)
1123                 return ret;
1124
1125         remain = args->size;
1126         user_data = u64_to_user_ptr(args->data_ptr);
1127         offset = offset_in_page(args->offset);
1128         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1129                 struct page *page = i915_gem_object_get_page(obj, idx);
1130                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1131
1132                 ret = shmem_pread(page, offset, length, user_data,
1133                                   page_to_phys(page) & obj_do_bit17_swizzling,
1134                                   needs_clflush);
1135                 if (ret)
1136                         break;
1137
1138                 remain -= length;
1139                 user_data += length;
1140                 offset = 0;
1141         }
1142
1143         i915_gem_obj_finish_shmem_access(obj);
1144         return ret;
1145 }
1146
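/*
 * Read from the GGTT aperture through an atomic WC mapping; if that faults
 * on the user buffer, fall back to a sleeping mapping and a plain
 * copy_to_user(). Returns non-zero if any bytes could not be copied.
 */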
1147 static inline bool
1148 gtt_user_read(struct io_mapping *mapping,
1149               loff_t base, int offset,
1150               char __user *user_data, int length)
1151 {
1152         void __iomem *vaddr;
1153         unsigned long unwritten;
1154
1155         /* We can use the cpu mem copy function because this is X86. */
1156         vaddr = io_mapping_map_atomic_wc(mapping, base);
1157         unwritten = __copy_to_user_inatomic(user_data,
1158                                             (void __force *)vaddr + offset,
1159                                             length);
1160         io_mapping_unmap_atomic(vaddr);
1161         if (unwritten) {
1162                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1163                 unwritten = copy_to_user(user_data,
1164                                          (void __force *)vaddr + offset,
1165                                          length);
1166                 io_mapping_unmap(vaddr);
1167         }
1168         return unwritten;
1169 }
1170
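/*
 * Fallback pread path through the GGTT: pin the object into the mappable
 * aperture, or, if that fails, map it one page at a time through a single
 * reserved PTE, and copy the data out through the uncached aperture.
 */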
1171 static int
1172 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1173                    const struct drm_i915_gem_pread *args)
1174 {
1175         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1176         struct i915_ggtt *ggtt = &i915->ggtt;
1177         struct drm_mm_node node;
1178         struct i915_vma *vma;
1179         void __user *user_data;
1180         u64 remain, offset;
1181         int ret;
1182
1183         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1184         if (ret)
1185                 return ret;
1186
1187         intel_runtime_pm_get(i915);
1188         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1189                                        PIN_MAPPABLE |
1190                                        PIN_NONFAULT |
1191                                        PIN_NONBLOCK);
1192         if (!IS_ERR(vma)) {
1193                 node.start = i915_ggtt_offset(vma);
1194                 node.allocated = false;
1195                 ret = i915_vma_put_fence(vma);
1196                 if (ret) {
1197                         i915_vma_unpin(vma);
1198                         vma = ERR_PTR(ret);
1199                 }
1200         }
1201         if (IS_ERR(vma)) {
1202                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1203                 if (ret)
1204                         goto out_unlock;
1205                 GEM_BUG_ON(!node.allocated);
1206         }
1207
1208         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1209         if (ret)
1210                 goto out_unpin;
1211
1212         mutex_unlock(&i915->drm.struct_mutex);
1213
1214         user_data = u64_to_user_ptr(args->data_ptr);
1215         remain = args->size;
1216         offset = args->offset;
1217
1218         while (remain > 0) {
1219                 /* Operation in this page
1220                  *
1221                  * page_base = page offset within aperture
1222                  * page_offset = offset within page
1223                  * page_length = bytes to copy for this page
1224                  */
1225                 u32 page_base = node.start;
1226                 unsigned page_offset = offset_in_page(offset);
1227                 unsigned page_length = PAGE_SIZE - page_offset;
1228                 page_length = remain < page_length ? remain : page_length;
1229                 if (node.allocated) {
1230                         wmb();
1231                         ggtt->vm.insert_page(&ggtt->vm,
1232                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1233                                              node.start, I915_CACHE_NONE, 0);
1234                         wmb();
1235                 } else {
1236                         page_base += offset & PAGE_MASK;
1237                 }
1238
1239                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1240                                   user_data, page_length)) {
1241                         ret = -EFAULT;
1242                         break;
1243                 }
1244
1245                 remain -= page_length;
1246                 user_data += page_length;
1247                 offset += page_length;
1248         }
1249
1250         mutex_lock(&i915->drm.struct_mutex);
1251 out_unpin:
1252         if (node.allocated) {
1253                 wmb();
1254                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1255                 remove_mappable_node(&node);
1256         } else {
1257                 i915_vma_unpin(vma);
1258         }
1259 out_unlock:
1260         intel_runtime_pm_put(i915);
1261         mutex_unlock(&i915->drm.struct_mutex);
1262
1263         return ret;
1264 }
1265
1266 /**
1267  * Reads data from the object referenced by handle.
1268  * @dev: drm device pointer
1269  * @data: ioctl data blob
1270  * @file: drm file pointer
1271  *
1272  * On error, the contents of *data are undefined.
1273  */
1274 int
1275 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1276                      struct drm_file *file)
1277 {
1278         struct drm_i915_gem_pread *args = data;
1279         struct drm_i915_gem_object *obj;
1280         int ret;
1281
1282         if (args->size == 0)
1283                 return 0;
1284
1285         if (!access_ok(u64_to_user_ptr(args->data_ptr),
1286                        args->size))
1287                 return -EFAULT;
1288
1289         obj = i915_gem_object_lookup(file, args->handle);
1290         if (!obj)
1291                 return -ENOENT;
1292
1293         /* Bounds check source.  */
1294         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1295                 ret = -EINVAL;
1296                 goto out;
1297         }
1298
1299         trace_i915_gem_object_pread(obj, args->offset, args->size);
1300
1301         ret = i915_gem_object_wait(obj,
1302                                    I915_WAIT_INTERRUPTIBLE,
1303                                    MAX_SCHEDULE_TIMEOUT,
1304                                    to_rps_client(file));
1305         if (ret)
1306                 goto out;
1307
1308         ret = i915_gem_object_pin_pages(obj);
1309         if (ret)
1310                 goto out;
1311
1312         ret = i915_gem_shmem_pread(obj, args);
1313         if (ret == -EFAULT || ret == -ENODEV)
1314                 ret = i915_gem_gtt_pread(obj, args);
1315
1316         i915_gem_object_unpin_pages(obj);
1317 out:
1318         i915_gem_object_put(obj);
1319         return ret;
1320 }
1321
1322 /* This is the fast write path which cannot handle
1323  * page faults in the source data
1324  */
1325
1326 static inline bool
1327 ggtt_write(struct io_mapping *mapping,
1328            loff_t base, int offset,
1329            char __user *user_data, int length)
1330 {
1331         void __iomem *vaddr;
1332         unsigned long unwritten;
1333
1334         /* We can use the cpu mem copy function because this is X86. */
1335         vaddr = io_mapping_map_atomic_wc(mapping, base);
1336         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1337                                                       user_data, length);
1338         io_mapping_unmap_atomic(vaddr);
1339         if (unwritten) {
1340                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1341                 unwritten = copy_from_user((void __force *)vaddr + offset,
1342                                            user_data, length);
1343                 io_mapping_unmap(vaddr);
1344         }
1345
1346         return unwritten;
1347 }
1348
1349 /**
1350  * This is the fast pwrite path, where we copy the data directly from the
1351  * user into the GTT, uncached.
1352  * @obj: i915 GEM object
1353  * @args: pwrite arguments structure
1354  */
1355 static int
1356 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1357                          const struct drm_i915_gem_pwrite *args)
1358 {
1359         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1360         struct i915_ggtt *ggtt = &i915->ggtt;
1361         struct drm_mm_node node;
1362         struct i915_vma *vma;
1363         u64 remain, offset;
1364         void __user *user_data;
1365         int ret;
1366
1367         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1368         if (ret)
1369                 return ret;
1370
1371         if (i915_gem_object_has_struct_page(obj)) {
1372                 /*
1373                  * Avoid waking the device up if we can fall back, as
1374                  * waking/resuming is very slow (worst-case 10-100 ms
1375                  * depending on PCI sleeps and our own resume time).
1376                  * This easily dwarfs any performance advantage from
1377                  * using the cache bypass of indirect GGTT access.
1378                  */
1379                 if (!intel_runtime_pm_get_if_in_use(i915)) {
1380                         ret = -EFAULT;
1381                         goto out_unlock;
1382                 }
1383         } else {
1384                 /* No backing pages, no fallback, we must force GGTT access */
1385                 intel_runtime_pm_get(i915);
1386         }
1387
1388         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1389                                        PIN_MAPPABLE |
1390                                        PIN_NONFAULT |
1391                                        PIN_NONBLOCK);
1392         if (!IS_ERR(vma)) {
1393                 node.start = i915_ggtt_offset(vma);
1394                 node.allocated = false;
1395                 ret = i915_vma_put_fence(vma);
1396                 if (ret) {
1397                         i915_vma_unpin(vma);
1398                         vma = ERR_PTR(ret);
1399                 }
1400         }
1401         if (IS_ERR(vma)) {
1402                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1403                 if (ret)
1404                         goto out_rpm;
1405                 GEM_BUG_ON(!node.allocated);
1406         }
1407
1408         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1409         if (ret)
1410                 goto out_unpin;
1411
1412         mutex_unlock(&i915->drm.struct_mutex);
1413
1414         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1415
1416         user_data = u64_to_user_ptr(args->data_ptr);
1417         offset = args->offset;
1418         remain = args->size;
1419         while (remain) {
1420                 /* Operation in this page
1421                  *
1422                  * page_base = page offset within aperture
1423                  * page_offset = offset within page
1424                  * page_length = bytes to copy for this page
1425                  */
1426                 u32 page_base = node.start;
1427                 unsigned int page_offset = offset_in_page(offset);
1428                 unsigned int page_length = PAGE_SIZE - page_offset;
1429                 page_length = remain < page_length ? remain : page_length;
1430                 if (node.allocated) {
1431                         wmb(); /* flush the write before we modify the GGTT */
1432                         ggtt->vm.insert_page(&ggtt->vm,
1433                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1434                                              node.start, I915_CACHE_NONE, 0);
1435                         wmb(); /* flush modifications to the GGTT (insert_page) */
1436                 } else {
1437                         page_base += offset & PAGE_MASK;
1438                 }
1439                 /* If we get a fault while copying data, then (presumably) our
1440                  * source page isn't available.  Return the error and we'll
1441                  * retry in the slow path.
1442                  * If the object is non-shmem backed, we retry again with the
1443                  * path that handles page fault.
1444                  */
1445                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1446                                user_data, page_length)) {
1447                         ret = -EFAULT;
1448                         break;
1449                 }
1450
1451                 remain -= page_length;
1452                 user_data += page_length;
1453                 offset += page_length;
1454         }
1455         intel_fb_obj_flush(obj, ORIGIN_CPU);
1456
1457         mutex_lock(&i915->drm.struct_mutex);
1458 out_unpin:
1459         if (node.allocated) {
1460                 wmb();
1461                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1462                 remove_mappable_node(&node);
1463         } else {
1464                 i915_vma_unpin(vma);
1465         }
1466 out_rpm:
1467         intel_runtime_pm_put(i915);
1468 out_unlock:
1469         mutex_unlock(&i915->drm.struct_mutex);
1470         return ret;
1471 }
1472
1473 static int
1474 shmem_pwrite_slow(struct page *page, int offset, int length,
1475                   char __user *user_data,
1476                   bool page_do_bit17_swizzling,
1477                   bool needs_clflush_before,
1478                   bool needs_clflush_after)
1479 {
1480         char *vaddr;
1481         int ret;
1482
1483         vaddr = kmap(page);
1484         if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
1485                 shmem_clflush_swizzled_range(vaddr + offset, length,
1486                                              page_do_bit17_swizzling);
1487         if (page_do_bit17_swizzling)
1488                 ret = __copy_from_user_swizzled(vaddr, offset, user_data,
1489                                                 length);
1490         else
1491                 ret = __copy_from_user(vaddr + offset, user_data, length);
1492         if (needs_clflush_after)
1493                 shmem_clflush_swizzled_range(vaddr + offset, length,
1494                                              page_do_bit17_swizzling);
1495         kunmap(page);
1496
1497         return ret ? -EFAULT : 0;
1498 }
1499
1500 /* Per-page copy function for the shmem pwrite fastpath.
1501  * Flushes invalid cachelines before writing to the target if
1502  * needs_clflush_before is set and flushes out any written cachelines after
1503  * writing if needs_clflush_after is set.
1504  */
1505 static int
1506 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1507              bool page_do_bit17_swizzling,
1508              bool needs_clflush_before,
1509              bool needs_clflush_after)
1510 {
1511         int ret;
1512
1513         ret = -ENODEV;
1514         if (!page_do_bit17_swizzling) {
1515                 char *vaddr = kmap_atomic(page);
1516
1517                 if (needs_clflush_before)
1518                         drm_clflush_virt_range(vaddr + offset, len);
1519                 ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
1520                 if (needs_clflush_after)
1521                         drm_clflush_virt_range(vaddr + offset, len);
1522
1523                 kunmap_atomic(vaddr);
1524         }
1525         if (ret == 0)
1526                 return ret;
1527
1528         return shmem_pwrite_slow(page, offset, len, user_data,
1529                                  page_do_bit17_swizzling,
1530                                  needs_clflush_before,
1531                                  needs_clflush_after);
1532 }
1533
1534 static int
1535 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1536                       const struct drm_i915_gem_pwrite *args)
1537 {
1538         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1539         void __user *user_data;
1540         u64 remain;
1541         unsigned int obj_do_bit17_swizzling;
1542         unsigned int partial_cacheline_write;
1543         unsigned int needs_clflush;
1544         unsigned int offset, idx;
1545         int ret;
1546
1547         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1548         if (ret)
1549                 return ret;
1550
1551         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1552         mutex_unlock(&i915->drm.struct_mutex);
1553         if (ret)
1554                 return ret;
1555
1556         obj_do_bit17_swizzling = 0;
1557         if (i915_gem_object_needs_bit17_swizzle(obj))
1558                 obj_do_bit17_swizzling = BIT(17);
1559
1560         /* If we don't overwrite a cacheline completely we need to be
1561          * careful to have up-to-date data by first clflushing. Don't
1562          * overcomplicate things and flush the entire page.
1563          */
1564         partial_cacheline_write = 0;
1565         if (needs_clflush & CLFLUSH_BEFORE)
1566                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
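        /*
         * Illustrative example: with 64-byte cachelines the mask is 0x3f, so
         * a write at offset 128 of length 64 fully covers its cachelines
         * ((128 | 64) & 0x3f == 0) and needs no pre-flush, whereas a write at
         * offset 130 leaves partially covered cachelines and does.
         */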
1567
1568         user_data = u64_to_user_ptr(args->data_ptr);
1569         remain = args->size;
1570         offset = offset_in_page(args->offset);
1571         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1572                 struct page *page = i915_gem_object_get_page(obj, idx);
1573                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1574
1575                 ret = shmem_pwrite(page, offset, length, user_data,
1576                                    page_to_phys(page) & obj_do_bit17_swizzling,
1577                                    (offset | length) & partial_cacheline_write,
1578                                    needs_clflush & CLFLUSH_AFTER);
1579                 if (ret)
1580                         break;
1581
1582                 remain -= length;
1583                 user_data += length;
1584                 offset = 0;
1585         }
1586
1587         intel_fb_obj_flush(obj, ORIGIN_CPU);
1588         i915_gem_obj_finish_shmem_access(obj);
1589         return ret;
1590 }
1591
1592 /**
1593  * i915_gem_pwrite_ioctl - Writes data to the object referenced by handle.
1594  * @dev: drm device
1595  * @data: ioctl data blob
1596  * @file: drm file
1597  *
1598  * On error, the contents of the buffer that were to be modified are undefined.
1599  */
1600 int
1601 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1602                       struct drm_file *file)
1603 {
1604         struct drm_i915_gem_pwrite *args = data;
1605         struct drm_i915_gem_object *obj;
1606         int ret;
1607
1608         if (args->size == 0)
1609                 return 0;
1610
1611         if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size))
1612                 return -EFAULT;
1613
1614         obj = i915_gem_object_lookup(file, args->handle);
1615         if (!obj)
1616                 return -ENOENT;
1617
1618         /* Bounds check destination. */
1619         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1620                 ret = -EINVAL;
1621                 goto err;
1622         }
1623
1624         /* Writes not allowed into this read-only object */
1625         if (i915_gem_object_is_readonly(obj)) {
1626                 ret = -EINVAL;
1627                 goto err;
1628         }
1629
1630         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1631
1632         ret = -ENODEV;
1633         if (obj->ops->pwrite)
1634                 ret = obj->ops->pwrite(obj, args);
1635         if (ret != -ENODEV)
1636                 goto err;
1637
1638         ret = i915_gem_object_wait(obj,
1639                                    I915_WAIT_INTERRUPTIBLE |
1640                                    I915_WAIT_ALL,
1641                                    MAX_SCHEDULE_TIMEOUT,
1642                                    to_rps_client(file));
1643         if (ret)
1644                 goto err;
1645
1646         ret = i915_gem_object_pin_pages(obj);
1647         if (ret)
1648                 goto err;
1649
1650         ret = -EFAULT;
1651         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1652          * it would end up going through the fenced access, and we'll get
1653          * different detiling behavior between reading and writing.
1654          * pread/pwrite currently are reading and writing from the CPU
1655          * perspective, requiring manual detiling by the client.
1656          */
1657         if (!i915_gem_object_has_struct_page(obj) ||
1658             cpu_write_needs_clflush(obj))
1659                 /* Note that the gtt paths might fail with non-page-backed user
1660                  * pointers (e.g. gtt mappings when moving data between
1661                  * textures). Fall back to the shmem path in that case.
1662                  */
1663                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1664
1665         if (ret == -EFAULT || ret == -ENOSPC) {
1666                 if (obj->phys_handle)
1667                         ret = i915_gem_phys_pwrite(obj, args, file);
1668                 else
1669                         ret = i915_gem_shmem_pwrite(obj, args);
1670         }
1671
1672         i915_gem_object_unpin_pages(obj);
1673 err:
1674         i915_gem_object_put(obj);
1675         return ret;
1676 }
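/*
 * Illustrative userspace sketch (not kernel code): assuming libdrm's
 * drmIoctl(), an open DRM fd and a valid GEM handle (fd, handle and data are
 * hypothetical variables), a pwrite of a small user buffer looks roughly like:
 *
 *	struct drm_i915_gem_pwrite pwrite = {
 *		.handle = handle,
 *		.offset = 0,
 *		.size = sizeof(data),
 *		.data_ptr = (uintptr_t)data,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite))
 *		err(1, "DRM_IOCTL_I915_GEM_PWRITE");
 */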
1677
1678 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1679 {
1680         struct drm_i915_private *i915;
1681         struct list_head *list;
1682         struct i915_vma *vma;
1683
1684         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1685
1686         for_each_ggtt_vma(vma, obj) {
1687                 if (i915_vma_is_active(vma))
1688                         continue;
1689
1690                 if (!drm_mm_node_allocated(&vma->node))
1691                         continue;
1692
1693                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1694         }
1695
1696         i915 = to_i915(obj->base.dev);
1697         spin_lock(&i915->mm.obj_lock);
1698         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1699         list_move_tail(&obj->mm.link, list);
1700         spin_unlock(&i915->mm.obj_lock);
1701 }
1702
1703 /**
1704  * i915_gem_set_domain_ioctl - Called when user space prepares to use an
1705  * object with the CPU, either through the mmap ioctl's mapping or a GTT mapping.
1706  * @dev: drm device
1707  * @data: ioctl data blob
1708  * @file: drm file
1709  */
1710 int
1711 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1712                           struct drm_file *file)
1713 {
1714         struct drm_i915_gem_set_domain *args = data;
1715         struct drm_i915_gem_object *obj;
1716         uint32_t read_domains = args->read_domains;
1717         uint32_t write_domain = args->write_domain;
1718         int err;
1719
1720         /* Only handle setting domains to types used by the CPU. */
1721         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1722                 return -EINVAL;
1723
1724         /* Having something in the write domain implies it's in the read
1725          * domain, and only that read domain.  Enforce that in the request.
1726          */
1727         if (write_domain != 0 && read_domains != write_domain)
1728                 return -EINVAL;
1729
1730         obj = i915_gem_object_lookup(file, args->handle);
1731         if (!obj)
1732                 return -ENOENT;
1733
1734         /* Try to flush the object off the GPU without holding the lock.
1735          * We will repeat the flush holding the lock in the normal manner
1736          * to catch cases where we are gazumped.
1737          */
1738         err = i915_gem_object_wait(obj,
1739                                    I915_WAIT_INTERRUPTIBLE |
1740                                    I915_WAIT_PRIORITY |
1741                                    (write_domain ? I915_WAIT_ALL : 0),
1742                                    MAX_SCHEDULE_TIMEOUT,
1743                                    to_rps_client(file));
1744         if (err)
1745                 goto out;
1746
1747         /*
1748          * Proxy objects do not control access to the backing storage, ergo
1749          * they cannot be used as a means to manipulate the cache domain
1750          * tracking for that backing storage. The proxy object is always
1751          * considered to be outside of any cache domain.
1752          */
1753         if (i915_gem_object_is_proxy(obj)) {
1754                 err = -ENXIO;
1755                 goto out;
1756         }
1757
1758         /*
1759          * Flush and acquire obj->pages so that we are coherent through
1760          * direct access in memory with previous cached writes through
1761          * shmemfs and that our cache domain tracking remains valid.
1762          * For example, if the obj->filp was moved to swap without us
1763          * being notified and releasing the pages, we would mistakenly
1764          * continue to assume that the obj remained out of the CPU cached
1765          * domain.
1766          */
1767         err = i915_gem_object_pin_pages(obj);
1768         if (err)
1769                 goto out;
1770
1771         err = i915_mutex_lock_interruptible(dev);
1772         if (err)
1773                 goto out_unpin;
1774
1775         if (read_domains & I915_GEM_DOMAIN_WC)
1776                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1777         else if (read_domains & I915_GEM_DOMAIN_GTT)
1778                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1779         else
1780                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1781
1782         /* And bump the LRU for this access */
1783         i915_gem_object_bump_inactive_ggtt(obj);
1784
1785         mutex_unlock(&dev->struct_mutex);
1786
1787         if (write_domain != 0)
1788                 intel_fb_obj_invalidate(obj,
1789                                         fb_write_origin(obj, write_domain));
1790
1791 out_unpin:
1792         i915_gem_object_unpin_pages(obj);
1793 out:
1794         i915_gem_object_put(obj);
1795         return err;
1796 }
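/*
 * Illustrative userspace sketch (not kernel code, assuming libdrm's drmIoctl()
 * and hypothetical fd/handle variables): a client typically moves the object
 * to the GTT domain before writing through a GTT mmap:
 *
 *	struct drm_i915_gem_set_domain sd = {
 *		.handle = handle,
 *		.read_domains = I915_GEM_DOMAIN_GTT,
 *		.write_domain = I915_GEM_DOMAIN_GTT,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd);
 */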
1797
1798 /**
1799  * i915_gem_sw_finish_ioctl - Called when user space has done writes to this buffer
1800  * @dev: drm device
1801  * @data: ioctl data blob
1802  * @file: drm file
1803  */
1804 int
1805 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1806                          struct drm_file *file)
1807 {
1808         struct drm_i915_gem_sw_finish *args = data;
1809         struct drm_i915_gem_object *obj;
1810
1811         obj = i915_gem_object_lookup(file, args->handle);
1812         if (!obj)
1813                 return -ENOENT;
1814
1815         /*
1816          * Proxy objects are barred from CPU access, so there is no
1817          * need to ban sw_finish as it is a nop.
1818          */
1819
1820         /* Pinned buffers may be scanout, so flush the cache */
1821         i915_gem_object_flush_if_display(obj);
1822         i915_gem_object_put(obj);
1823
1824         return 0;
1825 }
1826
1827 /**
1828  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1829  *                       it is mapped to.
1830  * @dev: drm device
1831  * @data: ioctl data blob
1832  * @file: drm file
1833  *
1834  * While the mapping holds a reference on the contents of the object, it doesn't
1835  * imply a ref on the object itself.
1836  *
1837  * IMPORTANT:
1838  *
1839  * DRM driver writers who look at this function as an example for how to do GEM
1840  * mmap support, please don't implement mmap support like here. The modern way
1841  * to implement DRM mmap support is with an mmap offset ioctl (like
1842  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1843  * That way debug tooling like valgrind will understand what's going on, hiding
1844  * the mmap call in a driver private ioctl will break that. The i915 driver only
1845  * does cpu mmaps this way because we didn't know better.
1846  */
1847 int
1848 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1849                     struct drm_file *file)
1850 {
1851         struct drm_i915_gem_mmap *args = data;
1852         struct drm_i915_gem_object *obj;
1853         unsigned long addr;
1854
1855         if (args->flags & ~(I915_MMAP_WC))
1856                 return -EINVAL;
1857
1858         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1859                 return -ENODEV;
1860
1861         obj = i915_gem_object_lookup(file, args->handle);
1862         if (!obj)
1863                 return -ENOENT;
1864
1865         /* prime objects have no backing filp to GEM mmap
1866          * pages from.
1867          */
1868         if (!obj->base.filp) {
1869                 i915_gem_object_put(obj);
1870                 return -ENXIO;
1871         }
1872
1873         addr = vm_mmap(obj->base.filp, 0, args->size,
1874                        PROT_READ | PROT_WRITE, MAP_SHARED,
1875                        args->offset);
1876         if (args->flags & I915_MMAP_WC) {
1877                 struct mm_struct *mm = current->mm;
1878                 struct vm_area_struct *vma;
1879
1880                 if (down_write_killable(&mm->mmap_sem)) {
1881                         i915_gem_object_put(obj);
1882                         return -EINTR;
1883                 }
1884                 vma = find_vma(mm, addr);
1885                 if (vma)
1886                         vma->vm_page_prot =
1887                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1888                 else
1889                         addr = -ENOMEM;
1890                 up_write(&mm->mmap_sem);
1891
1892                 /* This may race, but that's ok, it only gets set */
1893                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1894         }
1895         i915_gem_object_put(obj);
1896         if (IS_ERR((void *)addr))
1897                 return addr;
1898
1899         args->addr_ptr = (uint64_t) addr;
1900
1901         return 0;
1902 }
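/*
 * Illustrative userspace sketch (not kernel code, assuming libdrm's drmIoctl()
 * and hypothetical fd/handle/size/ptr variables): despite the caveat above,
 * existing clients use this legacy CPU mmap roughly as follows:
 *
 *	struct drm_i915_gem_mmap mm = {
 *		.handle = handle,
 *		.size = size,
 *		.flags = I915_MMAP_WC,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP, &mm) == 0)
 *		ptr = (void *)(uintptr_t)mm.addr_ptr;
 */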
1903
1904 static unsigned int tile_row_pages(const struct drm_i915_gem_object *obj)
1905 {
1906         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1907 }
1908
1909 /**
1910  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1911  *
1912  * A history of the GTT mmap interface:
1913  *
1914  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1915  *     be aligned and suitable for fencing, and still fit into the available
1916  *     mappable space left by the pinned display objects. A classic problem
1917  *     we called the page-fault-of-doom where we would ping-pong between
1918  *     two objects that could not fit inside the GTT and so the memcpy
1919  *     would page one object in at the expense of the other between every
1920  *     single byte.
1921  *
1922  * 1 - Objects can be any size, and have any compatible fencing (X, Y, or none
1923  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1924  *     object is too large for the available space (or simply too large
1925  *     for the mappable aperture!), a view is created instead and faulted
1926  *     into userspace. (This view is aligned and sized appropriately for
1927  *     fenced access.)
1928  *
1929  * 2 - Recognise WC as a separate cache domain so that we can flush the
1930  *     delayed writes via GTT before performing direct access via WC.
1931  *
1932  * Restrictions:
1933  *
1934  *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1935  *    hangs on some architectures, corruption on others. An attempt to service
1936  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1937  *
1938  *  * the object must be able to fit into RAM (physical memory, though not
1939  *    limited to the mappable aperture).
1940  *
1942  * Caveats:
1943  *
1944  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1945  *    all data to system memory. Subsequent access will not be synchronized.
1946  *
1947  *  * all mappings are revoked on runtime device suspend.
1948  *
1949  *  * there are only 8, 16 or 32 fence registers to share between all users
1950  *    (older machines require a fence register for display and blitter access
1951  *    as well). Contention of the fence registers will cause the previous users
1952  *    to be unmapped and any new access will generate new page faults.
1953  *
1954  *  * running out of memory while servicing a fault may generate a SIGBUS,
1955  *    rather than the expected SIGSEGV.
1956  */
1957 int i915_gem_mmap_gtt_version(void)
1958 {
1959         return 2;
1960 }
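/*
 * Illustrative userspace sketch (not kernel code, assuming libdrm's drmIoctl()
 * and a hypothetical fd variable): the version above is reported through
 * GETPARAM:
 *
 *	int version = 0;
 *	struct drm_i915_getparam gp = {
 *		.param = I915_PARAM_MMAP_GTT_VERSION,
 *		.value = &version,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
 */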
1961
1962 static inline struct i915_ggtt_view
1963 compute_partial_view(const struct drm_i915_gem_object *obj,
1964                      pgoff_t page_offset,
1965                      unsigned int chunk)
1966 {
1967         struct i915_ggtt_view view;
1968
1969         if (i915_gem_object_is_tiled(obj))
1970                 chunk = roundup(chunk, tile_row_pages(obj));
1971
1972         view.type = I915_GGTT_VIEW_PARTIAL;
1973         view.partial.offset = rounddown(page_offset, chunk);
1974         view.partial.size =
1975                 min_t(unsigned int, chunk,
1976                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1977
1978         /* If the partial covers the entire object, just create a normal VMA. */
1979         if (chunk >= obj->base.size >> PAGE_SHIFT)
1980                 view.type = I915_GGTT_VIEW_NORMAL;
1981
1982         return view;
1983 }
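/*
 * Worked example (illustrative numbers): with MIN_CHUNK_PAGES == 256 (1MiB)
 * and an untiled 16MiB (4096 page) object, a fault at page_offset 1000 yields
 * partial.offset == rounddown(1000, 256) == 768 and partial.size == 256,
 * i.e. only the 1MiB-aligned chunk containing the faulting page is bound
 * into the aperture.
 */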
1984
1985 /**
1986  * i915_gem_fault - fault a page into the GTT
1987  * @vmf: fault info
1988  *
1989  * The fault handler is set up by drm_gem_mmap() when an object is GTT mapped
1990  * from userspace.  The fault handler takes care of binding the object to
1991  * the GTT (if needed), allocating and programming a fence register (again,
1992  * only if needed based on whether the old reg is still valid or the object
1993  * is tiled) and inserting a new PTE into the faulting process.
1994  *
1995  * Note that the faulting process may involve evicting existing objects
1996  * from the GTT and/or fence registers to make room.  So performance may
1997  * suffer if the GTT working set is large or there are few fence registers
1998  * left.
1999  *
2000  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
2001  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
2002  */
2003 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
2004 {
2005 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
2006         struct vm_area_struct *area = vmf->vma;
2007         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
2008         struct drm_device *dev = obj->base.dev;
2009         struct drm_i915_private *dev_priv = to_i915(dev);
2010         struct i915_ggtt *ggtt = &dev_priv->ggtt;
2011         bool write = area->vm_flags & VM_WRITE;
2012         struct i915_vma *vma;
2013         pgoff_t page_offset;
2014         int ret;
2015
2016         /* Sanity check that we allow writing into this object */
2017         if (i915_gem_object_is_readonly(obj) && write)
2018                 return VM_FAULT_SIGBUS;
2019
2020         /* We don't use vmf->pgoff since that has the fake offset */
2021         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
2022
2023         trace_i915_gem_object_fault(obj, page_offset, true, write);
2024
2025         /* Try to flush the object off the GPU first without holding the lock.
2026          * Upon acquiring the lock, we will perform our sanity checks and then
2027          * repeat the flush holding the lock in the normal manner to catch cases
2028          * where we are gazumped.
2029          */
2030         ret = i915_gem_object_wait(obj,
2031                                    I915_WAIT_INTERRUPTIBLE,
2032                                    MAX_SCHEDULE_TIMEOUT,
2033                                    NULL);
2034         if (ret)
2035                 goto err;
2036
2037         ret = i915_gem_object_pin_pages(obj);
2038         if (ret)
2039                 goto err;
2040
2041         intel_runtime_pm_get(dev_priv);
2042
2043         ret = i915_mutex_lock_interruptible(dev);
2044         if (ret)
2045                 goto err_rpm;
2046
2047         /* Access to snoopable pages through the GTT is incoherent. */
2048         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
2049                 ret = -EFAULT;
2050                 goto err_unlock;
2051         }
2052
2054         /* Now pin it into the GTT as needed */
2055         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
2056                                        PIN_MAPPABLE |
2057                                        PIN_NONBLOCK |
2058                                        PIN_NONFAULT);
2059         if (IS_ERR(vma)) {
2060                 /* Use a partial view if it is bigger than available space */
2061                 struct i915_ggtt_view view =
2062                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
2063                 unsigned int flags;
2064
2065                 flags = PIN_MAPPABLE;
2066                 if (view.type == I915_GGTT_VIEW_NORMAL)
2067                         flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
2068
2069                 /*
2070                  * Userspace is now writing through an untracked VMA, abandon
2071                  * all hope that the hardware is able to track future writes.
2072                  */
2073                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
2074
2075                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2076                 if (IS_ERR(vma) && !view.type) {
2077                         flags = PIN_MAPPABLE;
2078                         view.type = I915_GGTT_VIEW_PARTIAL;
2079                         vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2080                 }
2081         }
2082         if (IS_ERR(vma)) {
2083                 ret = PTR_ERR(vma);
2084                 goto err_unlock;
2085         }
2086
2087         ret = i915_gem_object_set_to_gtt_domain(obj, write);
2088         if (ret)
2089                 goto err_unpin;
2090
2091         ret = i915_vma_pin_fence(vma);
2092         if (ret)
2093                 goto err_unpin;
2094
2095         /* Finally, remap it using the new GTT offset */
2096         ret = remap_io_mapping(area,
2097                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
2098                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
2099                                min_t(u64, vma->size, area->vm_end - area->vm_start),
2100                                &ggtt->iomap);
2101         if (ret)
2102                 goto err_fence;
2103
2104         /* Mark as being mmapped into userspace for later revocation */
2105         assert_rpm_wakelock_held(dev_priv);
2106         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
2107                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
2108         GEM_BUG_ON(!obj->userfault_count);
2109
2110         i915_vma_set_ggtt_write(vma);
2111
2112 err_fence:
2113         i915_vma_unpin_fence(vma);
2114 err_unpin:
2115         __i915_vma_unpin(vma);
2116 err_unlock:
2117         mutex_unlock(&dev->struct_mutex);
2118 err_rpm:
2119         intel_runtime_pm_put(dev_priv);
2120         i915_gem_object_unpin_pages(obj);
2121 err:
2122         switch (ret) {
2123         case -EIO:
2124                 /*
2125                  * We eat errors when the gpu is terminally wedged to avoid
2126                  * userspace unduly crashing (gl has no provisions for mmaps to
2127                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
2128                  * and so needs to be reported.
2129                  */
2130                 if (!i915_terminally_wedged(&dev_priv->gpu_error))
2131                         return VM_FAULT_SIGBUS;
2132                 /* else: fall through */
2133         case -EAGAIN:
2134                 /*
2135                  * EAGAIN means the gpu is hung and we'll wait for the error
2136                  * handler to reset everything when re-faulting in
2137                  * i915_mutex_lock_interruptible.
2138                  */
2139         case 0:
2140         case -ERESTARTSYS:
2141         case -EINTR:
2142         case -EBUSY:
2143                 /*
2144                  * EBUSY is ok: this just means that another thread
2145                  * already did the job.
2146                  */
2147                 return VM_FAULT_NOPAGE;
2148         case -ENOMEM:
2149                 return VM_FAULT_OOM;
2150         case -ENOSPC:
2151         case -EFAULT:
2152                 return VM_FAULT_SIGBUS;
2153         default:
2154                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2155                 return VM_FAULT_SIGBUS;
2156         }
2157 }
2158
2159 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2160 {
2161         struct i915_vma *vma;
2162
2163         GEM_BUG_ON(!obj->userfault_count);
2164
2165         obj->userfault_count = 0;
2166         list_del(&obj->userfault_link);
2167         drm_vma_node_unmap(&obj->base.vma_node,
2168                            obj->base.dev->anon_inode->i_mapping);
2169
2170         for_each_ggtt_vma(vma, obj)
2171                 i915_vma_unset_userfault(vma);
2172 }
2173
2174 /**
2175  * i915_gem_release_mmap - remove physical page mappings
2176  * @obj: obj in question
2177  *
2178  * Preserve the reservation of the mmap offset with the DRM core code, but
2179  * relinquish ownership of the pages back to the system.
2180  *
2181  * It is vital that we remove the page mapping if we have mapped a tiled
2182  * object through the GTT and then lose the fence register due to
2183  * resource pressure. Similarly if the object has been moved out of the
2184  * aperture, then pages mapped into userspace must be revoked. Removing the
2185  * mapping will then trigger a page fault on the next user access, allowing
2186  * fixup by i915_gem_fault().
2187  */
2188 void
2189 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2190 {
2191         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2192
2193         /* Serialisation between user GTT access and our code depends upon
2194          * revoking the CPU's PTE whilst the mutex is held. The next user
2195          * pagefault then has to wait until we release the mutex.
2196          *
2197          * Note that RPM complicates somewhat by adding an additional
2198          * requirement that operations to the GGTT be made holding the RPM
2199          * wakeref.
2200          */
2201         lockdep_assert_held(&i915->drm.struct_mutex);
2202         intel_runtime_pm_get(i915);
2203
2204         if (!obj->userfault_count)
2205                 goto out;
2206
2207         __i915_gem_object_release_mmap(obj);
2208
2209         /* Ensure that the CPU's PTEs are revoked and there are no outstanding
2210          * memory transactions from userspace before we return. The TLB
2211          * flushing implied by changing the PTE above *should* be
2212          * sufficient, an extra barrier here just provides us with a bit
2213          * of paranoid documentation about our requirement to serialise
2214          * memory writes before touching registers / GSM.
2215          */
2216         wmb();
2217
2218 out:
2219         intel_runtime_pm_put(i915);
2220 }
2221
2222 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2223 {
2224         struct drm_i915_gem_object *obj, *on;
2225         int i;
2226
2227         /*
2228          * Only called during RPM suspend. All users of the userfault_list
2229          * must be holding an RPM wakeref to ensure that this can not
2230          * run concurrently with themselves (and use the struct_mutex for
2231          * protection between themselves).
2232          */
2233
2234         list_for_each_entry_safe(obj, on,
2235                                  &dev_priv->mm.userfault_list, userfault_link)
2236                 __i915_gem_object_release_mmap(obj);
2237
2238         /* The fence registers will be lost when the device powers down. If any
2239          * were in use by hardware (i.e. they are pinned), we should not be powering
2240          * down! All other fences will be reacquired by the user upon waking.
2241          */
2242         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2243                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2244
2245                 /* Ideally we want to assert that the fence register is not
2246                  * live at this point (i.e. that no piece of code will be
2247          * trying to write through fence + GTT, as that not only violates
2248          * our tracking of activity and the associated locking/barriers,
2249          * but is also illegal given that the hw is powered down).
2250                  *
2251                  * Previously we used reg->pin_count as a "liveness" indicator.
2252                  * That is not sufficient, and we need a more fine-grained
2253                  * tool if we want to have a sanity check here.
2254                  */
2255
2256                 if (!reg->vma)
2257                         continue;
2258
2259                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2260                 reg->dirty = true;
2261         }
2262 }
2263
2264 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2265 {
2266         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2267         int err;
2268
2269         err = drm_gem_create_mmap_offset(&obj->base);
2270         if (likely(!err))
2271                 return 0;
2272
2273         /* Attempt to reap some mmap space from dead objects */
2274         do {
2275                 err = i915_gem_wait_for_idle(dev_priv,
2276                                              I915_WAIT_INTERRUPTIBLE,
2277                                              MAX_SCHEDULE_TIMEOUT);
2278                 if (err)
2279                         break;
2280
2281                 i915_gem_drain_freed_objects(dev_priv);
2282                 err = drm_gem_create_mmap_offset(&obj->base);
2283                 if (!err)
2284                         break;
2285
2286         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2287
2288         return err;
2289 }
2290
2291 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2292 {
2293         drm_gem_free_mmap_offset(&obj->base);
2294 }
2295
2296 int
2297 i915_gem_mmap_gtt(struct drm_file *file,
2298                   struct drm_device *dev,
2299                   uint32_t handle,
2300                   uint64_t *offset)
2301 {
2302         struct drm_i915_gem_object *obj;
2303         int ret;
2304
2305         obj = i915_gem_object_lookup(file, handle);
2306         if (!obj)
2307                 return -ENOENT;
2308
2309         ret = i915_gem_object_create_mmap_offset(obj);
2310         if (ret == 0)
2311                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2312
2313         i915_gem_object_put(obj);
2314         return ret;
2315 }
2316
2317 /**
2318  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2319  * @dev: DRM device
2320  * @data: GTT mapping ioctl data
2321  * @file: GEM object info
2322  *
2323  * Simply returns the fake offset to userspace so it can mmap it.
2324  * The mmap call will end up in drm_gem_mmap(), which will set things
2325  * up so we can get faults in the handler above.
2326  *
2327  * The fault handler will take care of binding the object into the GTT
2328  * (since it may have been evicted to make room for something), allocating
2329  * a fence register, and mapping the appropriate aperture address into
2330  * userspace.
2331  */
2332 int
2333 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2334                         struct drm_file *file)
2335 {
2336         struct drm_i915_gem_mmap_gtt *args = data;
2337
2338         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2339 }
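/*
 * Illustrative userspace sketch (not kernel code, assuming libdrm's drmIoctl()
 * and hypothetical fd/handle/size/ptr variables): the fake offset returned
 * here is then passed to mmap() on the DRM fd itself:
 *
 *	struct drm_i915_gem_mmap_gtt mm = { .handle = handle };
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mm) == 0)
 *		ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED, fd, mm.offset);
 */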
2340
2341 /* Immediately discard the backing storage */
2342 static void
2343 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2344 {
2345         i915_gem_object_free_mmap_offset(obj);
2346
2347         if (obj->base.filp == NULL)
2348                 return;
2349
2350         /* Our goal here is to return as much of the memory as possible
2351          * back to the system, as we may be called from the OOM path.
2352          * To do this we must instruct the shmfs to drop all of its
2353          * backing pages, *now*.
2354          */
2355         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2356         obj->mm.madv = __I915_MADV_PURGED;
2357         obj->mm.pages = ERR_PTR(-EFAULT);
2358 }
2359
2360 /* Try to discard unwanted pages */
2361 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2362 {
2363         struct address_space *mapping;
2364
2365         lockdep_assert_held(&obj->mm.lock);
2366         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2367
2368         switch (obj->mm.madv) {
2369         case I915_MADV_DONTNEED:
2370                 i915_gem_object_truncate(obj);
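                /* fall through */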
2371         case __I915_MADV_PURGED:
2372                 return;
2373         }
2374
2375         if (obj->base.filp == NULL)
2376                 return;
2377
2378         mapping = obj->base.filp->f_mapping;
2379         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2380 }
2381
2382 /*
2383  * Move pages to the appropriate lru and release the pagevec, decrementing the
2384  * ref count of those pages.
2385  */
2386 static void check_release_pagevec(struct pagevec *pvec)
2387 {
2388         check_move_unevictable_pages(pvec);
2389         __pagevec_release(pvec);
2390         cond_resched();
2391 }
2392
2393 static void
2394 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2395                               struct sg_table *pages)
2396 {
2397         struct sgt_iter sgt_iter;
2398         struct pagevec pvec;
2399         struct page *page;
2400
2401         __i915_gem_object_release_shmem(obj, pages, true);
2402
2403         i915_gem_gtt_finish_pages(obj, pages);
2404
2405         if (i915_gem_object_needs_bit17_swizzle(obj))
2406                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2407
2408         mapping_clear_unevictable(file_inode(obj->base.filp)->i_mapping);
2409
2410         pagevec_init(&pvec);
2411         for_each_sgt_page(page, sgt_iter, pages) {
2412                 if (obj->mm.dirty)
2413                         set_page_dirty(page);
2414
2415                 if (obj->mm.madv == I915_MADV_WILLNEED)
2416                         mark_page_accessed(page);
2417
2418                 if (!pagevec_add(&pvec, page))
2419                         check_release_pagevec(&pvec);
2420         }
2421         if (pagevec_count(&pvec))
2422                 check_release_pagevec(&pvec);
2423         obj->mm.dirty = false;
2424
2425         sg_free_table(pages);
2426         kfree(pages);
2427 }
2428
2429 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2430 {
2431         struct radix_tree_iter iter;
2432         void __rcu **slot;
2433
2434         rcu_read_lock();
2435         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2436                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2437         rcu_read_unlock();
2438 }
2439
2440 static struct sg_table *
2441 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2442 {
2443         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2444         struct sg_table *pages;
2445
2446         pages = fetch_and_zero(&obj->mm.pages);
2447         if (!pages)
2448                 return NULL;
2449
2450         spin_lock(&i915->mm.obj_lock);
2451         list_del(&obj->mm.link);
2452         spin_unlock(&i915->mm.obj_lock);
2453
2454         if (obj->mm.mapping) {
2455                 void *ptr;
2456
2457                 ptr = page_mask_bits(obj->mm.mapping);
2458                 if (is_vmalloc_addr(ptr))
2459                         vunmap(ptr);
2460                 else
2461                         kunmap(kmap_to_page(ptr));
2462
2463                 obj->mm.mapping = NULL;
2464         }
2465
2466         __i915_gem_object_reset_page_iter(obj);
2467         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2468
2469         return pages;
2470 }
2471
2472 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2473                                  enum i915_mm_subclass subclass)
2474 {
2475         struct sg_table *pages;
2476
2477         if (i915_gem_object_has_pinned_pages(obj))
2478                 return;
2479
2480         GEM_BUG_ON(obj->bind_count);
2481         if (!i915_gem_object_has_pages(obj))
2482                 return;
2483
2484         /* May be called by shrinker from within get_pages() (on another bo) */
2485         mutex_lock_nested(&obj->mm.lock, subclass);
2486         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2487                 goto unlock;
2488
2489         /*
2490          * ->put_pages might need to allocate memory for the bit17 swizzle
2491          * array, hence protect them from being reaped by removing them from gtt
2492          * lists early.
2493          */
2494         pages = __i915_gem_object_unset_pages(obj);
2495         if (!IS_ERR(pages))
2496                 obj->ops->put_pages(obj, pages);
2497
2498 unlock:
2499         mutex_unlock(&obj->mm.lock);
2500 }
2501
2502 bool i915_sg_trim(struct sg_table *orig_st)
2503 {
2504         struct sg_table new_st;
2505         struct scatterlist *sg, *new_sg;
2506         unsigned int i;
2507
2508         if (orig_st->nents == orig_st->orig_nents)
2509                 return false;
2510
2511         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2512                 return false;
2513
2514         new_sg = new_st.sgl;
2515         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2516                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2517                 sg_dma_address(new_sg) = sg_dma_address(sg);
2518                 sg_dma_len(new_sg) = sg_dma_len(sg);
2519
2520                 new_sg = sg_next(new_sg);
2521         }
2522         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2523
2524         sg_free_table(orig_st);
2525
2526         *orig_st = new_st;
2527         return true;
2528 }
2529
2530 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2531 {
2532         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2533         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2534         unsigned long i;
2535         struct address_space *mapping;
2536         struct sg_table *st;
2537         struct scatterlist *sg;
2538         struct sgt_iter sgt_iter;
2539         struct page *page;
2540         unsigned long last_pfn = 0;     /* suppress gcc warning */
2541         unsigned int max_segment = i915_sg_segment_size();
2542         unsigned int sg_page_sizes;
2543         struct pagevec pvec;
2544         gfp_t noreclaim;
2545         int ret;
2546
2547         /*
2548          * Assert that the object is not currently in any GPU domain. As it
2549          * wasn't in the GTT, there shouldn't be any way it could have been in
2550          * a GPU cache
2551          */
2552         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2553         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2554
2555         /*
2556          * If there's no chance of allocating enough pages for the whole
2557          * object, bail early.
2558          */
2559         if (page_count > totalram_pages())
2560                 return -ENOMEM;
2561
2562         st = kmalloc(sizeof(*st), GFP_KERNEL);
2563         if (st == NULL)
2564                 return -ENOMEM;
2565
2566 rebuild_st:
2567         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2568                 kfree(st);
2569                 return -ENOMEM;
2570         }
2571
2572         /*
2573          * Get the list of pages out of our struct file.  They'll be pinned
2574          * at this point until we release them.
2575          *
2576          * Fail silently without starting the shrinker
2577          */
2578         mapping = obj->base.filp->f_mapping;
2579         mapping_set_unevictable(mapping);
2580         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2581         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2582
2583         sg = st->sgl;
2584         st->nents = 0;
2585         sg_page_sizes = 0;
2586         for (i = 0; i < page_count; i++) {
2587                 const unsigned int shrink[] = {
2588                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2589                         0,
2590                 }, *s = shrink;
2591                 gfp_t gfp = noreclaim;
2592
2593                 do {
2594                         cond_resched();
2595                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2596                         if (likely(!IS_ERR(page)))
2597                                 break;
2598
2599                         if (!*s) {
2600                                 ret = PTR_ERR(page);
2601                                 goto err_sg;
2602                         }
2603
2604                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2605
2606                         /*
2607                          * We've tried hard to allocate the memory by reaping
2608                          * our own buffer, now let the real VM do its job and
2609                          * go down in flames if truly OOM.
2610                          *
2611                          * However, since graphics tend to be disposable,
2612                          * defer the oom here by reporting the ENOMEM back
2613                          * to userspace.
2614                          */
2615                         if (!*s) {
2616                                 /* reclaim and warn, but no oom */
2617                                 gfp = mapping_gfp_mask(mapping);
2618
2619                                 /*
2620                                  * Our bo are always dirty and so we require
2621                                  * kswapd to reclaim our pages (direct reclaim
2622                                  * does not effectively begin pageout of our
2623                                  * buffers on its own). However, direct reclaim
2624                                  * only waits for kswapd when under allocation
2625                                  * congestion. So as a result __GFP_RECLAIM is
2626                                  * unreliable and fails to actually reclaim our
2627                                  * dirty pages -- unless you try over and over
2628                                  * again with !__GFP_NORETRY. However, we still
2629                                  * want to fail this allocation rather than
2630                                  * trigger the out-of-memory killer and for
2631                                  * this we want __GFP_RETRY_MAYFAIL.
2632                                  */
2633                                 gfp |= __GFP_RETRY_MAYFAIL;
2634                         }
2635                 } while (1);
2636
2637                 if (!i ||
2638                     sg->length >= max_segment ||
2639                     page_to_pfn(page) != last_pfn + 1) {
2640                         if (i) {
2641                                 sg_page_sizes |= sg->length;
2642                                 sg = sg_next(sg);
2643                         }
2644                         st->nents++;
2645                         sg_set_page(sg, page, PAGE_SIZE, 0);
2646                 } else {
2647                         sg->length += PAGE_SIZE;
2648                 }
2649                 last_pfn = page_to_pfn(page);
2650
2651                 /* Check that the i965g/gm workaround works. */
2652                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2653         }
2654         if (sg) { /* loop terminated early; short sg table */
2655                 sg_page_sizes |= sg->length;
2656                 sg_mark_end(sg);
2657         }
2658
2659         /* Trim unused sg entries to avoid wasting memory. */
2660         i915_sg_trim(st);
2661
2662         ret = i915_gem_gtt_prepare_pages(obj, st);
2663         if (ret) {
2664                 /*
2665                  * DMA remapping failed? One possible cause is that
2666                  * it could not reserve enough large entries, asking
2667                  * for PAGE_SIZE chunks instead may be helpful.
2668                  */
2669                 if (max_segment > PAGE_SIZE) {
2670                         for_each_sgt_page(page, sgt_iter, st)
2671                                 put_page(page);
2672                         sg_free_table(st);
2673
2674                         max_segment = PAGE_SIZE;
2675                         goto rebuild_st;
2676                 } else {
2677                         dev_warn(&dev_priv->drm.pdev->dev,
2678                                  "Failed to DMA remap %lu pages\n",
2679                                  page_count);
2680                         goto err_pages;
2681                 }
2682         }
2683
2684         if (i915_gem_object_needs_bit17_swizzle(obj))
2685                 i915_gem_object_do_bit_17_swizzle(obj, st);
2686
2687         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2688
2689         return 0;
2690
2691 err_sg:
2692         sg_mark_end(sg);
2693 err_pages:
2694         mapping_clear_unevictable(mapping);
2695         pagevec_init(&pvec);
2696         for_each_sgt_page(page, sgt_iter, st) {
2697                 if (!pagevec_add(&pvec, page))
2698                         check_release_pagevec(&pvec);
2699         }
2700         if (pagevec_count(&pvec))
2701                 check_release_pagevec(&pvec);
2702         sg_free_table(st);
2703         kfree(st);
2704
2705         /*
2706          * shmemfs first checks if there is enough memory to allocate the page
2707          * and reports ENOSPC should there be insufficient, along with the usual
2708          * ENOMEM for a genuine allocation failure.
2709          *
2710          * We use ENOSPC in our driver to mean that we have run out of aperture
2711          * space and so want to translate the error from shmemfs back to our
2712          * usual understanding of ENOMEM.
2713          */
2714         if (ret == -ENOSPC)
2715                 ret = -ENOMEM;
2716
2717         return ret;
2718 }
2719
2720 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2721                                  struct sg_table *pages,
2722                                  unsigned int sg_page_sizes)
2723 {
2724         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2725         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2726         int i;
2727
2728         lockdep_assert_held(&obj->mm.lock);
2729
2730         obj->mm.get_page.sg_pos = pages->sgl;
2731         obj->mm.get_page.sg_idx = 0;
2732
2733         obj->mm.pages = pages;
2734
2735         if (i915_gem_object_is_tiled(obj) &&
2736             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2737                 GEM_BUG_ON(obj->mm.quirked);
2738                 __i915_gem_object_pin_pages(obj);
2739                 obj->mm.quirked = true;
2740         }
2741
2742         GEM_BUG_ON(!sg_page_sizes);
2743         obj->mm.page_sizes.phys = sg_page_sizes;
2744
2745         /*
2746          * Calculate the supported page-sizes which fit into the given
2747          * sg_page_sizes. This will give us the page-sizes which we may be able
2748          * to use opportunistically when later inserting into the GTT. For
2749          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2750          * 64K or 4K pages, although in practice this will depend on a number of
2751          * other factors.
2752          */
2753         obj->mm.page_sizes.sg = 0;
2754         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2755                 if (obj->mm.page_sizes.phys & ~0u << i)
2756                         obj->mm.page_sizes.sg |= BIT(i);
2757         }
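        /*
         * Worked example (illustrative): if supported == 4K | 64K | 2M and
         * phys == 4K | 2M (at least one 2M segment is present), every
         * supported bit has a phys bit at or above it and sg becomes
         * 4K | 64K | 2M; with phys == 4K | 64K only, the 2M bit stays clear.
         */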
2758         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2759
2760         spin_lock(&i915->mm.obj_lock);
2761         list_add(&obj->mm.link, &i915->mm.unbound_list);
2762         spin_unlock(&i915->mm.obj_lock);
2763 }
2764
2765 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2766 {
2767         int err;
2768
2769         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2770                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2771                 return -EFAULT;
2772         }
2773
2774         err = obj->ops->get_pages(obj);
2775         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2776
2777         return err;
2778 }
2779
2780 /* Ensure that the associated pages are gathered from the backing storage
2781  * and pinned into our object. i915_gem_object_pin_pages() may be called
2782  * multiple times and must be balanced by matching calls to
2783  * i915_gem_object_unpin_pages(); the pages are only released once they are
2784  * no longer pinned, either as a result of memory pressure (reaping pages
2785  * under the shrinker) or as the object is itself released.
2786  */
2787 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2788 {
2789         int err;
2790
2791         err = mutex_lock_interruptible(&obj->mm.lock);
2792         if (err)
2793                 return err;
2794
2795         if (unlikely(!i915_gem_object_has_pages(obj))) {
2796                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2797
2798                 err = ____i915_gem_object_get_pages(obj);
2799                 if (err)
2800                         goto unlock;
2801
2802                 smp_mb__before_atomic();
2803         }
2804         atomic_inc(&obj->mm.pages_pin_count);
2805
2806 unlock:
2807         mutex_unlock(&obj->mm.lock);
2808         return err;
2809 }
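/*
 * Illustrative in-kernel sketch (error handling trimmed): callers go through
 * the pin/unpin wrappers so the backing store only stays resident while it is
 * actually needed:
 *
 *	ret = i915_gem_object_pin_pages(obj);
 *	if (ret)
 *		return ret;
 *
 *	... access the backing store via obj->mm.pages ...
 *
 *	i915_gem_object_unpin_pages(obj);
 */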
2810
2811 /* The 'mapping' part of i915_gem_object_pin_map() below */
2812 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2813                                  enum i915_map_type type)
2814 {
2815         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2816         struct sg_table *sgt = obj->mm.pages;
2817         struct sgt_iter sgt_iter;
2818         struct page *page;
2819         struct page *stack_pages[32];
2820         struct page **pages = stack_pages;
2821         unsigned long i = 0;
2822         pgprot_t pgprot;
2823         void *addr;
2824
2825         /* A single page can always be kmapped */
2826         if (n_pages == 1 && type == I915_MAP_WB)
2827                 return kmap(sg_page(sgt->sgl));
2828
2829         if (n_pages > ARRAY_SIZE(stack_pages)) {
2830                 /* Too big for stack -- allocate temporary array instead */
2831                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2832                 if (!pages)
2833                         return NULL;
2834         }
2835
2836         for_each_sgt_page(page, sgt_iter, sgt)
2837                 pages[i++] = page;
2838
2839         /* Check that we have the expected number of pages */
2840         GEM_BUG_ON(i != n_pages);
2841
2842         switch (type) {
2843         default:
2844                 MISSING_CASE(type);
2845                 /* fallthrough to use PAGE_KERNEL anyway */
2846         case I915_MAP_WB:
2847                 pgprot = PAGE_KERNEL;
2848                 break;
2849         case I915_MAP_WC:
2850                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2851                 break;
2852         }
2853         addr = vmap(pages, n_pages, 0, pgprot);
2854
2855         if (pages != stack_pages)
2856                 kvfree(pages);
2857
2858         return addr;
2859 }
2860
2861 /* get, pin, and map the pages of the object into kernel space */
2862 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2863                               enum i915_map_type type)
2864 {
2865         enum i915_map_type has_type;
2866         bool pinned;
2867         void *ptr;
2868         int ret;
2869
2870         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2871                 return ERR_PTR(-ENXIO);
2872
2873         ret = mutex_lock_interruptible(&obj->mm.lock);
2874         if (ret)
2875                 return ERR_PTR(ret);
2876
2877         pinned = !(type & I915_MAP_OVERRIDE);
2878         type &= ~I915_MAP_OVERRIDE;
2879
2880         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2881                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2882                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2883
2884                         ret = ____i915_gem_object_get_pages(obj);
2885                         if (ret)
2886                                 goto err_unlock;
2887
2888                         smp_mb__before_atomic();
2889                 }
2890                 atomic_inc(&obj->mm.pages_pin_count);
2891                 pinned = false;
2892         }
2893         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2894
2895         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2896         if (ptr && has_type != type) {
2897                 if (pinned) {
2898                         ret = -EBUSY;
2899                         goto err_unpin;
2900                 }
2901
2902                 if (is_vmalloc_addr(ptr))
2903                         vunmap(ptr);
2904                 else
2905                         kunmap(kmap_to_page(ptr));
2906
2907                 ptr = obj->mm.mapping = NULL;
2908         }
2909
2910         if (!ptr) {
2911                 ptr = i915_gem_object_map(obj, type);
2912                 if (!ptr) {
2913                         ret = -ENOMEM;
2914                         goto err_unpin;
2915                 }
2916
2917                 obj->mm.mapping = page_pack_bits(ptr, type);
2918         }
2919
2920 out_unlock:
2921         mutex_unlock(&obj->mm.lock);
2922         return ptr;
2923
2924 err_unpin:
2925         atomic_dec(&obj->mm.pages_pin_count);
2926 err_unlock:
2927         ptr = ERR_PTR(ret);
2928         goto out_unlock;
2929 }
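/*
 * Illustrative in-kernel sketch (data and len are hypothetical): a typical
 * caller maps the object, writes through the returned pointer and then drops
 * the pin again:
 *
 *	void *vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
 *	if (IS_ERR(vaddr))
 *		return PTR_ERR(vaddr);
 *
 *	memcpy(vaddr, data, len);
 *	i915_gem_object_unpin_map(obj);
 */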
2930
2931 static int
2932 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2933                            const struct drm_i915_gem_pwrite *arg)
2934 {
2935         struct address_space *mapping = obj->base.filp->f_mapping;
2936         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2937         u64 remain, offset;
2938         unsigned int pg;
2939
2940         /* Before we instantiate/pin the backing store for our use, we
2941          * can prepopulate the shmemfs filp efficiently using a write into
2942          * the pagecache. We avoid the penalty of instantiating all the
2943          * pages, important if the user is just writing to a few and never
2944          * uses the object on the GPU, and using a direct write into shmemfs
2945          * allows it to avoid the cost of retrieving a page (either swapin
2946          * or clearing-before-use) before it is overwritten.
2947          */
2948         if (i915_gem_object_has_pages(obj))
2949                 return -ENODEV;
2950
2951         if (obj->mm.madv != I915_MADV_WILLNEED)
2952                 return -EFAULT;
2953
2954         /* Before the pages are instantiated the object is treated as being
2955          * in the CPU domain. The pages will be clflushed as required before
2956          * use, and we can freely write into the pages directly. If userspace
2957          * races pwrite with any other operation, corruption will ensue -
2958          * that is userspace's prerogative!
2959          */
2960
2961         remain = arg->size;
2962         offset = arg->offset;
2963         pg = offset_in_page(offset);
2964
2965         do {
2966                 unsigned int len, unwritten;
2967                 struct page *page;
2968                 void *data, *vaddr;
2969                 int err;
2970
2971                 len = PAGE_SIZE - pg;
2972                 if (len > remain)
2973                         len = remain;
2974
2975                 err = pagecache_write_begin(obj->base.filp, mapping,
2976                                             offset, len, 0,
2977                                             &page, &data);
2978                 if (err < 0)
2979                         return err;
2980
2981                 vaddr = kmap(page);
2982                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2983                 kunmap(page);
2984
2985                 err = pagecache_write_end(obj->base.filp, mapping,
2986                                           offset, len, len - unwritten,
2987                                           page, data);
2988                 if (err < 0)
2989                         return err;
2990
2991                 if (unwritten)
2992                         return -EFAULT;
2993
2994                 remain -= len;
2995                 user_data += len;
2996                 offset += len;
2997                 pg = 0;
2998         } while (remain);
2999
3000         return 0;
3001 }
3002
3003 static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
3004                                         const struct i915_gem_context *ctx)
3005 {
3006         unsigned int score;
3007         unsigned long prev_hang;
3008
3009         if (i915_gem_context_is_banned(ctx))
3010                 score = I915_CLIENT_SCORE_CONTEXT_BAN;
3011         else
3012                 score = 0;
3013
3014         prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
3015         if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
3016                 score += I915_CLIENT_SCORE_HANG_FAST;
3017
3018         if (score) {
3019                 atomic_add(score, &file_priv->ban_score);
3020
3021                 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
3022                                  ctx->name, score,
3023                                  atomic_read(&file_priv->ban_score));
3024         }
3025 }
3026
3027 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
3028 {
3029         unsigned int score;
3030         bool banned, bannable;
3031
3032         atomic_inc(&ctx->guilty_count);
3033
3034         bannable = i915_gem_context_is_bannable(ctx);
3035         score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
3036         banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
3037
3038         /* Cool contexts don't accumulate client ban score */
3039         if (!bannable)
3040                 return;
3041
3042         if (banned) {
3043                 DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
3044                                  ctx->name, atomic_read(&ctx->guilty_count),
3045                                  score);
3046                 i915_gem_context_set_banned(ctx);
3047         }
3048
3049         if (!IS_ERR_OR_NULL(ctx->file_priv))
3050                 i915_gem_client_mark_guilty(ctx->file_priv, ctx);
3051 }
3052
3053 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
3054 {
3055         atomic_inc(&ctx->active_count);
3056 }
3057
3058 struct i915_request *
3059 i915_gem_find_active_request(struct intel_engine_cs *engine)
3060 {
3061         struct i915_request *request, *active = NULL;
3062         unsigned long flags;
3063
3064         /*
3065          * We are called from error capture, from reset and when dumping engine
3066          * state at random points in time. In particular, note that none of these is
3067          * crucially ordered with an interrupt. After a hang, the GPU is dead
3068          * and we assume that no more writes can happen (we waited long enough
3069          * for all writes that were in flight to be flushed) - adding an
3070          * extra delay for a recent interrupt is pointless. Hence, we do
3071          * not need an engine->irq_seqno_barrier() before the seqno reads.
3072          * At all other times, we must assume the GPU is still running, but
3073          * we only care about the snapshot of this moment.
3074          */
3075         spin_lock_irqsave(&engine->timeline.lock, flags);
3076         list_for_each_entry(request, &engine->timeline.requests, link) {
3077                 if (__i915_request_completed(request, request->global_seqno))
3078                         continue;
3079
3080                 active = request;
3081                 break;
3082         }
3083         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3084
3085         return active;
3086 }
3087
3088 /*
3089  * Ensure the irq handler finishes, and is not run again.
3090  * Also return the active request so that we only search for it once.
3091  */
3092 struct i915_request *
3093 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
3094 {
3095         struct i915_request *request;
3096
3097         /*
3098          * During the reset sequence, we must prevent the engine from
3099          * entering RC6. As the context state is undefined until we restart
3100          * the engine, if it does enter RC6 during the reset, the state
3101          * written to the powercontext is undefined and so we may lose
3102          * GPU state upon resume, i.e. fail to restart after a reset.
3103          */
3104         intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
3105
3106         request = engine->reset.prepare(engine);
3107         if (request && request->fence.error == -EIO)
3108                 request = ERR_PTR(-EIO); /* Previous reset failed! */
3109
3110         return request;
3111 }
3112
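/*
 * Descriptive note on the reset flow implemented below: reset is split into
 * three phases. i915_gem_reset_prepare() takes forcewake, quiesces each
 * engine via engine->reset.prepare() and records the active request;
 * i915_gem_reset() marks requests guilty or innocent and points each engine
 * at the breadcrumb to resume from via engine->reset.reset(); and
 * i915_gem_reset_finish() runs engine->reset.finish() and releases forcewake.
 */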
3113 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
3114 {
3115         struct intel_engine_cs *engine;
3116         struct i915_request *request;
3117         enum intel_engine_id id;
3118         int err = 0;
3119
3120         for_each_engine(engine, dev_priv, id) {
3121                 request = i915_gem_reset_prepare_engine(engine);
3122                 if (IS_ERR(request)) {
3123                         err = PTR_ERR(request);
3124                         continue;
3125                 }
3126
3127                 engine->hangcheck.active_request = request;
3128         }
3129
3130         i915_gem_revoke_fences(dev_priv);
3131         intel_uc_sanitize(dev_priv);
3132
3133         return err;
3134 }
3135
3136 static void engine_skip_context(struct i915_request *request)
3137 {
3138         struct intel_engine_cs *engine = request->engine;
3139         struct i915_gem_context *hung_ctx = request->gem_context;
3140         struct i915_timeline *timeline = request->timeline;
3141         unsigned long flags;
3142
3143         GEM_BUG_ON(timeline == &engine->timeline);
3144
3145         spin_lock_irqsave(&engine->timeline.lock, flags);
3146         spin_lock(&timeline->lock);
3147
3148         list_for_each_entry_continue(request, &engine->timeline.requests, link)
3149                 if (request->gem_context == hung_ctx)
3150                         i915_request_skip(request, -EIO);
3151
3152         list_for_each_entry(request, &timeline->requests, link)
3153                 i915_request_skip(request, -EIO);
3154
3155         spin_unlock(&timeline->lock);
3156         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3157 }
3158
3159 /* Returns the request if it was guilty of the hang */
3160 static struct i915_request *
3161 i915_gem_reset_request(struct intel_engine_cs *engine,
3162                        struct i915_request *request,
3163                        bool stalled)
3164 {
3165         /* The guilty request will get skipped on a hung engine.
3166          *
3167          * Users of client default contexts do not rely on logical
3168          * state preserved between batches so it is safe to execute
3169          * queued requests following the hang. Non-default contexts
3170          * rely on preserved state, so skipping a batch loses the
3171          * evolution of the state and it needs to be considered corrupted.
3172          * Executing more queued batches on top of corrupted state is
3173          * risky. But we take the risk by trying to advance through
3174          * the queued requests in order to make the client behaviour
3175          * more predictable around resets, by not throwing away a random
3176          * number of batches it has prepared for execution. Sophisticated
3177          * clients can use gem_reset_stats_ioctl and dma fence status
3178          * (exported via the sync_file info ioctl on explicit fences) to observe
3179          * when they lose the context state and should rebuild accordingly.
3180          *
3181          * The context ban, and ultimately the client ban, mechanism are safety
3182          * valves if client submission ends up resulting in nothing more than
3183          * subsequent hangs.
3184          */
3185
3186         if (i915_request_completed(request)) {
3187                 GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
3188                           engine->name, request->global_seqno,
3189                           request->fence.context, request->fence.seqno,
3190                           intel_engine_get_seqno(engine));
3191                 stalled = false;
3192         }
3193
3194         if (stalled) {
3195                 i915_gem_context_mark_guilty(request->gem_context);
3196                 i915_request_skip(request, -EIO);
3197
3198                 /* If this context is now banned, skip all pending requests. */
3199                 if (i915_gem_context_is_banned(request->gem_context))
3200                         engine_skip_context(request);
3201         } else {
3202                 /*
3203                  * Since this is not the hung engine, it may have advanced
3204                  * since the hang declaration. Double check by refinding
3205                  * the active request at the time of the reset.
3206                  */
3207                 request = i915_gem_find_active_request(engine);
3208                 if (request) {
3209                         unsigned long flags;
3210
3211                         i915_gem_context_mark_innocent(request->gem_context);
3212                         dma_fence_set_error(&request->fence, -EAGAIN);
3213
3214                         /* Rewind the engine to replay the incomplete rq */
3215                         spin_lock_irqsave(&engine->timeline.lock, flags);
3216                         request = list_prev_entry(request, link);
3217                         if (&request->link == &engine->timeline.requests)
3218                                 request = NULL;
3219                         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3220                 }
3221         }
3222
3223         return request;
3224 }
3225
3226 void i915_gem_reset_engine(struct intel_engine_cs *engine,
3227                            struct i915_request *request,
3228                            bool stalled)
3229 {
3230         /*
3231          * Make sure this write is visible before we re-enable the interrupt
3232          * handlers on another CPU, as tasklet_enable() resolves to just
3233          * a compiler barrier which is insufficient for our purpose here.
3234          */
3235         smp_store_mb(engine->irq_posted, 0);
3236
3237         if (request)
3238                 request = i915_gem_reset_request(engine, request, stalled);
3239
3240         /* Setup the CS to resume from the breadcrumb of the hung request */
3241         engine->reset.reset(engine, request);
3242 }
3243
3244 void i915_gem_reset(struct drm_i915_private *dev_priv,
3245                     unsigned int stalled_mask)
3246 {
3247         struct intel_engine_cs *engine;
3248         enum intel_engine_id id;
3249
3250         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3251
3252         i915_retire_requests(dev_priv);
3253
3254         for_each_engine(engine, dev_priv, id) {
3255                 struct intel_context *ce;
3256
3257                 i915_gem_reset_engine(engine,
3258                                       engine->hangcheck.active_request,
3259                                       stalled_mask & ENGINE_MASK(id));
3260                 ce = fetch_and_zero(&engine->last_retired_context);
3261                 if (ce)
3262                         intel_context_unpin(ce);
3263
3264                 /*
3265                  * Ostensibly, we always want a context loaded for powersaving,
3266                  * so if the engine is idle after the reset, send a request
3267                  * to load our scratch kernel_context.
3268                  *
3269                  * More mysteriously, if we leave the engine idle after a reset,
3270                  * the next userspace batch may hang, with what appears to be
3271                  * an incoherent read by the CS (presumably stale TLB). An
3272                  * empty request appears sufficient to paper over the glitch.
3273                  */
3274                 if (intel_engine_is_idle(engine)) {
3275                         struct i915_request *rq;
3276
3277                         rq = i915_request_alloc(engine,
3278                                                 dev_priv->kernel_context);
3279                         if (!IS_ERR(rq))
3280                                 i915_request_add(rq);
3281                 }
3282         }
3283
3284         i915_gem_restore_fences(dev_priv);
3285 }
3286
3287 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3288 {
3289         engine->reset.finish(engine);
3290
3291         intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3292 }
3293
3294 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3295 {
3296         struct intel_engine_cs *engine;
3297         enum intel_engine_id id;
3298
3299         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3300
3301         for_each_engine(engine, dev_priv, id) {
3302                 engine->hangcheck.active_request = NULL;
3303                 i915_gem_reset_finish_engine(engine);
3304         }
3305 }
3306
3307 static void nop_submit_request(struct i915_request *request)
3308 {
3309         unsigned long flags;
3310
3311         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3312                   request->engine->name,
3313                   request->fence.context, request->fence.seqno);
3314         dma_fence_set_error(&request->fence, -EIO);
3315
3316         spin_lock_irqsave(&request->engine->timeline.lock, flags);
3317         __i915_request_submit(request);
3318         intel_engine_init_global_seqno(request->engine, request->global_seqno);
3319         spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3320 }
3321
3322 void i915_gem_set_wedged(struct drm_i915_private *i915)
3323 {
3324         struct intel_engine_cs *engine;
3325         enum intel_engine_id id;
3326
3327         GEM_TRACE("start\n");
3328
3329         if (GEM_SHOW_DEBUG()) {
3330                 struct drm_printer p = drm_debug_printer(__func__);
3331
3332                 for_each_engine(engine, i915, id)
3333                         intel_engine_dump(engine, &p, "%s\n", engine->name);
3334         }
3335
3336         if (test_and_set_bit(I915_WEDGED, &i915->gpu_error.flags))
3337                 goto out;
3338
3339         /*
3340          * First, stop submission to hw, but do not yet complete requests by
3341          * rolling the global seqno forward (since this would complete requests
3342          * for which we haven't set the fence error to EIO yet).
3343          */
3344         for_each_engine(engine, i915, id)
3345                 i915_gem_reset_prepare_engine(engine);
3346
3347         /* Even if the GPU reset fails, it should still stop the engines */
3348         if (INTEL_GEN(i915) >= 5)
3349                 intel_gpu_reset(i915, ALL_ENGINES);
3350
3351         for_each_engine(engine, i915, id) {
3352                 engine->submit_request = nop_submit_request;
3353                 engine->schedule = NULL;
3354         }
3355         i915->caps.scheduler = 0;
3356
3357         /*
3358          * Make sure no request can slip through without getting completed by
3359          * either this call here to intel_engine_init_global_seqno, or the one
3360          * in nop_submit_request.
3361          */
3362         synchronize_rcu();
3363
3364         /* Mark all executing requests as skipped */
3365         for_each_engine(engine, i915, id)
3366                 engine->cancel_requests(engine);
3367
3368         for_each_engine(engine, i915, id) {
3369                 i915_gem_reset_finish_engine(engine);
3370                 intel_engine_wakeup(engine);
3371         }
3372
3373 out:
3374         GEM_TRACE("end\n");
3375
3376         wake_up_all(&i915->gpu_error.reset_queue);
3377 }
3378
3379 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3380 {
3381         struct i915_timeline *tl;
3382
3383         lockdep_assert_held(&i915->drm.struct_mutex);
3384         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3385                 return true;
3386
3387         GEM_TRACE("start\n");
3388
3389         /*
3390          * Before unwedging, make sure that all pending operations
3391          * are flushed and errored out - we may have requests waiting upon
3392          * third party fences. We marked all inflight requests as EIO, and
3393          * every execbuf since returned EIO, for consistency we want all
3394          * the currently pending requests to also be marked as EIO, which
3395          * is done inside our nop_submit_request - and so we must wait.
3396          *
3397          * No more can be submitted until we reset the wedged bit.
3398          */
3399         list_for_each_entry(tl, &i915->gt.timelines, link) {
3400                 struct i915_request *rq;
3401
3402                 rq = i915_gem_active_peek(&tl->last_request,
3403                                           &i915->drm.struct_mutex);
3404                 if (!rq)
3405                         continue;
3406
3407                 /*
3408                  * We can't use our normal waiter as we want to
3409                  * avoid recursively trying to handle the current
3410                  * reset. The basic dma_fence_default_wait() installs
3411                  * a callback for dma_fence_signal(), which is
3412                  * triggered by our nop handler (indirectly, the
3413                  * callback enables the signaler thread which is
3414                  * woken by the nop_submit_request() advancing the seqno
3415                  * and when the seqno passes the fence, the signaler
3416                  * then signals the fence waking us up).
3417                  */
3418                 if (dma_fence_default_wait(&rq->fence, true,
3419                                            MAX_SCHEDULE_TIMEOUT) < 0)
3420                         return false;
3421         }
3422         i915_retire_requests(i915);
3423         GEM_BUG_ON(i915->gt.active_requests);
3424
3425         if (!intel_gpu_reset(i915, ALL_ENGINES))
3426                 intel_engines_sanitize(i915);
3427
3428         /*
3429          * Undo nop_submit_request. We prevent all new i915 requests from
3430          * being queued (by disallowing execbuf whilst wedged) so having
3431          * waited for all active requests above, we know the system is idle
3432          * and do not have to worry about a thread being inside
3433          * engine->submit_request() as we swap over. So unlike installing
3434          * the nop_submit_request on reset, we can do this from normal
3435          * context and do not require stop_machine().
3436          */
3437         intel_engines_reset_default_submission(i915);
3438         i915_gem_contexts_lost(i915);
3439
3440         GEM_TRACE("end\n");
3441
3442         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3443         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3444
3445         return true;
3446 }
3447
3448 static void
3449 i915_gem_retire_work_handler(struct work_struct *work)
3450 {
3451         struct drm_i915_private *dev_priv =
3452                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3453         struct drm_device *dev = &dev_priv->drm;
3454
3455         /* Come back later if the device is busy... */
3456         if (mutex_trylock(&dev->struct_mutex)) {
3457                 i915_retire_requests(dev_priv);
3458                 mutex_unlock(&dev->struct_mutex);
3459         }
3460
3461         /*
3462          * Keep the retire handler running until we are finally idle.
3463          * We do not need to do this test under locking as in the worst-case
3464          * we queue the retire worker once too often.
3465          */
3466         if (READ_ONCE(dev_priv->gt.awake))
3467                 queue_delayed_work(dev_priv->wq,
3468                                    &dev_priv->gt.retire_work,
3469                                    round_jiffies_up_relative(HZ));
3470 }
3471
3472 static void shrink_caches(struct drm_i915_private *i915)
3473 {
3474         /*
3475          * kmem_cache_shrink() discards empty slabs and reorders partially
3476          * filled slabs to prioritise allocating from the mostly full slabs,
3477          * with the aim of reducing fragmentation.
3478          */
3479         kmem_cache_shrink(i915->priorities);
3480         kmem_cache_shrink(i915->dependencies);
3481         kmem_cache_shrink(i915->requests);
3482         kmem_cache_shrink(i915->luts);
3483         kmem_cache_shrink(i915->vmas);
3484         kmem_cache_shrink(i915->objects);
3485 }
3486
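/*
 * Descriptive note: sleep_rcu_work implements a two-stage deferral using a
 * single allocation. __sleep_rcu() runs after an RCU grace period (so that
 * RCU-deferred frees have had a chance to complete) and, if the device is
 * still idle in the same epoch, re-queues the same allocation as ordinary
 * work; __sleep_work() then shrinks the slab caches. The union lets the
 * rcu_head and work_struct share storage as they are never in use at the
 * same time.
 */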
3487 struct sleep_rcu_work {
3488         union {
3489                 struct rcu_head rcu;
3490                 struct work_struct work;
3491         };
3492         struct drm_i915_private *i915;
3493         unsigned int epoch;
3494 };
3495
3496 static inline bool
3497 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3498 {
3499         /*
3500          * There is a small chance that the epoch wrapped since we started
3501          * sleeping. If we assume that epoch is at least a u32, then it will
3502          * take at least 2^32 * 100ms for it to wrap, or about 13.6 years.
3503          */
3504         return epoch == READ_ONCE(i915->gt.epoch);
3505 }
3506
3507 static void __sleep_work(struct work_struct *work)
3508 {
3509         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3510         struct drm_i915_private *i915 = s->i915;
3511         unsigned int epoch = s->epoch;
3512
3513         kfree(s);
3514         if (same_epoch(i915, epoch))
3515                 shrink_caches(i915);
3516 }
3517
3518 static void __sleep_rcu(struct rcu_head *rcu)
3519 {
3520         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3521         struct drm_i915_private *i915 = s->i915;
3522
3523         destroy_rcu_head(&s->rcu);
3524
3525         if (same_epoch(i915, s->epoch)) {
3526                 INIT_WORK(&s->work, __sleep_work);
3527                 queue_work(i915->wq, &s->work);
3528         } else {
3529                 kfree(s);
3530         }
3531 }
3532
3533 static inline bool
3534 new_requests_since_last_retire(const struct drm_i915_private *i915)
3535 {
3536         return (READ_ONCE(i915->gt.active_requests) ||
3537                 work_pending(&i915->gt.idle_work.work));
3538 }
3539
3540 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3541 {
3542         struct intel_engine_cs *engine;
3543         enum intel_engine_id id;
3544
3545         if (i915_terminally_wedged(&i915->gpu_error))
3546                 return;
3547
3548         GEM_BUG_ON(i915->gt.active_requests);
3549         for_each_engine(engine, i915, id) {
3550                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
3551                 GEM_BUG_ON(engine->last_retired_context !=
3552                            to_intel_context(i915->kernel_context, engine));
3553         }
3554 }
3555
3556 static void
3557 i915_gem_idle_work_handler(struct work_struct *work)
3558 {
3559         struct drm_i915_private *dev_priv =
3560                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3561         unsigned int epoch = I915_EPOCH_INVALID;
3562         bool rearm_hangcheck;
3563
3564         if (!READ_ONCE(dev_priv->gt.awake))
3565                 return;
3566
3567         if (READ_ONCE(dev_priv->gt.active_requests))
3568                 return;
3569
3570         /*
3571          * Flush out the last user context, leaving only the pinned
3572          * kernel context resident. When we are idling on the kernel_context,
3573          * no more new requests (with a context switch) are emitted and we
3574          * can finally rest. A consequence is that the idle work handler is
3575          * always called at least twice before idling (and if the system is
3576          * idle that implies a round trip through the retire worker).
3577          */
3578         mutex_lock(&dev_priv->drm.struct_mutex);
3579         i915_gem_switch_to_kernel_context(dev_priv);
3580         mutex_unlock(&dev_priv->drm.struct_mutex);
3581
3582         GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
3583                   READ_ONCE(dev_priv->gt.active_requests));
3584
3585         /*
3586          * Wait for the last execlists context to complete, but bail out in case a
3587          * new request is submitted. As we don't trust the hardware, we
3588          * continue on if the wait times out. This is necessary to allow
3589          * the machine to suspend even if the hardware dies, and we will
3590          * try to recover in resume (after depriving the hardware of power,
3591          * it may be in a better mood).
3592          */
3593         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3594                    intel_engines_are_idle(dev_priv),
3595                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3596                    10, 500);
3597
3598         rearm_hangcheck =
3599                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3600
3601         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3602                 /* Currently busy, come back later */
3603                 mod_delayed_work(dev_priv->wq,
3604                                  &dev_priv->gt.idle_work,
3605                                  msecs_to_jiffies(50));
3606                 goto out_rearm;
3607         }
3608
3609         /*
3610          * New request retired after this work handler started, extend active
3611          * period until next instance of the work.
3612          */
3613         if (new_requests_since_last_retire(dev_priv))
3614                 goto out_unlock;
3615
3616         epoch = __i915_gem_park(dev_priv);
3617
3618         assert_kernel_context_is_current(dev_priv);
3619
3620         rearm_hangcheck = false;
3621 out_unlock:
3622         mutex_unlock(&dev_priv->drm.struct_mutex);
3623
3624 out_rearm:
3625         if (rearm_hangcheck) {
3626                 GEM_BUG_ON(!dev_priv->gt.awake);
3627                 i915_queue_hangcheck(dev_priv);
3628         }
3629
3630         /*
3631          * When we are idle, it is an opportune time to reap our caches.
3632          * However, we have many objects that utilise RCU and the ordered
3633          * i915->wq that this work is executing on. To try and flush any
3634          * pending frees now we are idle, we first wait for an RCU grace
3635          * period, and then queue a task (that will run last on the wq) to
3636          * shrink and re-optimize the caches.
3637          */
3638         if (same_epoch(dev_priv, epoch)) {
3639                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3640                 if (s) {
3641                         init_rcu_head(&s->rcu);
3642                         s->i915 = dev_priv;
3643                         s->epoch = epoch;
3644                         call_rcu(&s->rcu, __sleep_rcu);
3645                 }
3646         }
3647 }
3648
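/*
 * Descriptive note: invoked when a GEM handle is closed. For each per-context
 * lookup (lut) on this object that belongs to the closing file, remove the
 * handle->vma entry from the context's radix tree, drop the vma open count
 * (closing non-GGTT vmas that become unused), free the lut and release the
 * object reference it held.
 */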
3649 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3650 {
3651         struct drm_i915_private *i915 = to_i915(gem->dev);
3652         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3653         struct drm_i915_file_private *fpriv = file->driver_priv;
3654         struct i915_lut_handle *lut, *ln;
3655
3656         mutex_lock(&i915->drm.struct_mutex);
3657
3658         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3659                 struct i915_gem_context *ctx = lut->ctx;
3660                 struct i915_vma *vma;
3661
3662                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3663                 if (ctx->file_priv != fpriv)
3664                         continue;
3665
3666                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3667                 GEM_BUG_ON(vma->obj != obj);
3668
3669                 /* We allow the process to have multiple handles to the same
3670                  * vma, in the same fd namespace, by virtue of flink/open.
3671                  */
3672                 GEM_BUG_ON(!vma->open_count);
3673                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3674                         i915_vma_close(vma);
3675
3676                 list_del(&lut->obj_link);
3677                 list_del(&lut->ctx_link);
3678
3679                 kmem_cache_free(i915->luts, lut);
3680                 __i915_gem_object_release_unless_active(obj);
3681         }
3682
3683         mutex_unlock(&i915->drm.struct_mutex);
3684 }
3685
3686 static unsigned long to_wait_timeout(s64 timeout_ns)
3687 {
3688         if (timeout_ns < 0)
3689                 return MAX_SCHEDULE_TIMEOUT;
3690
3691         if (timeout_ns == 0)
3692                 return 0;
3693
3694         return nsecs_to_jiffies_timeout(timeout_ns);
3695 }
3696
3697 /**
3698  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3699  * @dev: drm device pointer
3700  * @data: ioctl data blob
3701  * @file: drm file pointer
3702  *
3703  * Returns 0 if successful, else an error is returned with the remaining time in
3704  * the timeout parameter.
3705  *  -ETIME: object is still busy after timeout
3706  *  -ERESTARTSYS: signal interrupted the wait
3707  *  -ENOENT: object doesn't exist
3708  * Also possible, but rare:
3709  *  -EAGAIN: incomplete, restart syscall
3710  *  -ENOMEM: damn
3711  *  -ENODEV: Internal IRQ fail
3712  *  -E?: The add request failed
3713  *
3714  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3715  * non-zero timeout parameter the wait ioctl will wait for the given number of
3716  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3717  * without holding struct_mutex the object may become re-busied before this
3718  * function completes. A similar but shorter * race condition exists in the busy
3719  * function completes. A similar but shorter race condition exists in the busy
3720  * ioctl.
3721 int
3722 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3723 {
3724         struct drm_i915_gem_wait *args = data;
3725         struct drm_i915_gem_object *obj;
3726         ktime_t start;
3727         long ret;
3728
3729         if (args->flags != 0)
3730                 return -EINVAL;
3731
3732         obj = i915_gem_object_lookup(file, args->bo_handle);
3733         if (!obj)
3734                 return -ENOENT;
3735
3736         start = ktime_get();
3737
3738         ret = i915_gem_object_wait(obj,
3739                                    I915_WAIT_INTERRUPTIBLE |
3740                                    I915_WAIT_PRIORITY |
3741                                    I915_WAIT_ALL,
3742                                    to_wait_timeout(args->timeout_ns),
3743                                    to_rps_client(file));
3744
3745         if (args->timeout_ns > 0) {
3746                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3747                 if (args->timeout_ns < 0)
3748                         args->timeout_ns = 0;
3749
3750                 /*
3751                  * Apparently ktime isn't accurate enough and occasionally has a
3752                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3753                  * things up to make the test happy. We allow up to 1 jiffy.
3754                  *
3755                  * This is a regression from the timespec->ktime conversion.
3756                  */
3757                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3758                         args->timeout_ns = 0;
3759
3760                 /* Asked to wait beyond the jiffie/scheduler precision? */
3761                 if (ret == -ETIME && args->timeout_ns)
3762                         ret = -EAGAIN;
3763         }
3764
3765         i915_gem_object_put(obj);
3766         return ret;
3767 }
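
/*
 * A minimal userspace sketch of the wait ioctl above (illustrative only;
 * "fd", "handle" and the libdrm drmIoctl() wrapper are assumed, error
 * handling is elided):
 *
 *      struct drm_i915_gem_wait wait = {
 *              .bo_handle = handle,
 *              .timeout_ns = 1000 * 1000 * 1000,   // 1s; negative waits forever
 *      };
 *      if (drmIoctl(fd, DRM_IOCTL_I915_GEM_WAIT, &wait) && errno == ETIME)
 *              ;   // still busy, wait.timeout_ns holds the remaining time
 */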
3768
3769 static long wait_for_timeline(struct i915_timeline *tl,
3770                               unsigned int flags, long timeout)
3771 {
3772         struct i915_request *rq;
3773
3774         rq = i915_gem_active_get_unlocked(&tl->last_request);
3775         if (!rq)
3776                 return timeout;
3777
3778         /*
3779          * "Race-to-idle".
3780          *
3781          * Switching to the kernel context is often used as a synchronous
3782          * step prior to idling, e.g. in suspend for flushing all
3783          * current operations to memory before sleeping. These we
3784          * want to complete as quickly as possible to avoid prolonged
3785          * stalls, so allow the gpu to boost to maximum clocks.
3786          */
3787         if (flags & I915_WAIT_FOR_IDLE_BOOST)
3788                 gen6_rps_boost(rq, NULL);
3789
3790         timeout = i915_request_wait(rq, flags, timeout);
3791         i915_request_put(rq);
3792
3793         return timeout;
3794 }
3795
3796 static int wait_for_engines(struct drm_i915_private *i915)
3797 {
3798         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3799                 dev_err(i915->drm.dev,
3800                         "Failed to idle engines, declaring wedged!\n");
3801                 GEM_TRACE_DUMP();
3802                 i915_gem_set_wedged(i915);
3803                 return -EIO;
3804         }
3805
3806         return 0;
3807 }
3808
3809 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3810                            unsigned int flags, long timeout)
3811 {
3812         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3813                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3814                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3815
3816         /* If the device is asleep, we have no requests outstanding */
3817         if (!READ_ONCE(i915->gt.awake))
3818                 return 0;
3819
3820         if (flags & I915_WAIT_LOCKED) {
3821                 struct i915_timeline *tl;
3822                 int err;
3823
3824                 lockdep_assert_held(&i915->drm.struct_mutex);
3825
3826                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3827                         timeout = wait_for_timeline(tl, flags, timeout);
3828                         if (timeout < 0)
3829                                 return timeout;
3830                 }
3831                 if (GEM_SHOW_DEBUG() && !timeout) {
3832                         /* Presume that timeout was non-zero to begin with! */
3833                         dev_warn(&i915->drm.pdev->dev,
3834                                  "Missed idle-completion interrupt!\n");
3835                         GEM_TRACE_DUMP();
3836                 }
3837
3838                 err = wait_for_engines(i915);
3839                 if (err)
3840                         return err;
3841
3842                 i915_retire_requests(i915);
3843                 GEM_BUG_ON(i915->gt.active_requests);
3844         } else {
3845                 struct intel_engine_cs *engine;
3846                 enum intel_engine_id id;
3847
3848                 for_each_engine(engine, i915, id) {
3849                         struct i915_timeline *tl = &engine->timeline;
3850
3851                         timeout = wait_for_timeline(tl, flags, timeout);
3852                         if (timeout < 0)
3853                                 return timeout;
3854                 }
3855         }
3856
3857         return 0;
3858 }
3859
3860 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3861 {
3862         /*
3863          * We manually flush the CPU domain so that we can override and
3864          * force the flush for the display, and perform it asynchronously.
3865          */
3866         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3867         if (obj->cache_dirty)
3868                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3869         obj->write_domain = 0;
3870 }
3871
3872 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3873 {
3874         if (!READ_ONCE(obj->pin_global))
3875                 return;
3876
3877         mutex_lock(&obj->base.dev->struct_mutex);
3878         __i915_gem_object_flush_for_display(obj);
3879         mutex_unlock(&obj->base.dev->struct_mutex);
3880 }
3881
3882 /**
3883  * Moves a single object to the WC read, and possibly write domain.
3884  * @obj: object to act on
3885  * @write: ask for write access or read only
3886  *
3887  * This function returns when the move is complete, including waiting on
3888  * flushes to occur.
3889  */
3890 int
3891 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3892 {
3893         int ret;
3894
3895         lockdep_assert_held(&obj->base.dev->struct_mutex);
3896
3897         ret = i915_gem_object_wait(obj,
3898                                    I915_WAIT_INTERRUPTIBLE |
3899                                    I915_WAIT_LOCKED |
3900                                    (write ? I915_WAIT_ALL : 0),
3901                                    MAX_SCHEDULE_TIMEOUT,
3902                                    NULL);
3903         if (ret)
3904                 return ret;
3905
3906         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3907                 return 0;
3908
3909         /* Flush and acquire obj->pages so that we are coherent through
3910          * direct access in memory with previous cached writes through
3911          * shmemfs and that our cache domain tracking remains valid.
3912          * For example, if the obj->filp was moved to swap without us
3913          * being notified and releasing the pages, we would mistakenly
3914          * continue to assume that the obj remained out of the CPU cached
3915          * domain.
3916          */
3917         ret = i915_gem_object_pin_pages(obj);
3918         if (ret)
3919                 return ret;
3920
3921         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3922
3923         /* Serialise direct access to this object with the barriers for
3924          * coherent writes from the GPU, by effectively invalidating the
3925          * WC domain upon first access.
3926          */
3927         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3928                 mb();
3929
3930         /* It should now be out of any other write domains, and we can update
3931          * the domain values for our changes.
3932          */
3933         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3934         obj->read_domains |= I915_GEM_DOMAIN_WC;
3935         if (write) {
3936                 obj->read_domains = I915_GEM_DOMAIN_WC;
3937                 obj->write_domain = I915_GEM_DOMAIN_WC;
3938                 obj->mm.dirty = true;
3939         }
3940
3941         i915_gem_object_unpin_pages(obj);
3942         return 0;
3943 }
3944
3945 /**
3946  * Moves a single object to the GTT read, and possibly write domain.
3947  * @obj: object to act on
3948  * @write: ask for write access or read only
3949  *
3950  * This function returns when the move is complete, including waiting on
3951  * flushes to occur.
3952  */
3953 int
3954 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3955 {
3956         int ret;
3957
3958         lockdep_assert_held(&obj->base.dev->struct_mutex);
3959
3960         ret = i915_gem_object_wait(obj,
3961                                    I915_WAIT_INTERRUPTIBLE |
3962                                    I915_WAIT_LOCKED |
3963                                    (write ? I915_WAIT_ALL : 0),
3964                                    MAX_SCHEDULE_TIMEOUT,
3965                                    NULL);
3966         if (ret)
3967                 return ret;
3968
3969         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3970                 return 0;
3971
3972         /* Flush and acquire obj->pages so that we are coherent through
3973          * direct access in memory with previous cached writes through
3974          * shmemfs and that our cache domain tracking remains valid.
3975          * For example, if the obj->filp was moved to swap without us
3976          * being notified and releasing the pages, we would mistakenly
3977          * continue to assume that the obj remained out of the CPU cached
3978          * domain.
3979          */
3980         ret = i915_gem_object_pin_pages(obj);
3981         if (ret)
3982                 return ret;
3983
3984         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3985
3986         /* Serialise direct access to this object with the barriers for
3987          * coherent writes from the GPU, by effectively invalidating the
3988          * GTT domain upon first access.
3989          */
3990         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3991                 mb();
3992
3993         /* It should now be out of any other write domains, and we can update
3994          * the domain values for our changes.
3995          */
3996         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3997         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3998         if (write) {
3999                 obj->read_domains = I915_GEM_DOMAIN_GTT;
4000                 obj->write_domain = I915_GEM_DOMAIN_GTT;
4001                 obj->mm.dirty = true;
4002         }
4003
4004         i915_gem_object_unpin_pages(obj);
4005         return 0;
4006 }
4007
4008 /**
4009  * Changes the cache-level of an object across all VMA.
4010  * @obj: object to act on
4011  * @cache_level: new cache level to set for the object
4012  *
4013  * After this function returns, the object will be in the new cache-level
4014  * across all GTT and the contents of the backing storage will be coherent,
4015  * with respect to the new cache-level. In order to keep the backing storage
4016  * coherent for all users, we only allow a single cache level to be set
4017  * globally on the object and prevent it from being changed whilst the
4018  * hardware is reading from the object. That is, if the object is currently
4019  * on the scanout, it will be set to uncached (or equivalent display
4020  * cache coherency) and all non-MOCS GPU access will also be uncached so
4021  * that all direct access to the scanout remains coherent.
4022  */
4023 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
4024                                     enum i915_cache_level cache_level)
4025 {
4026         struct i915_vma *vma;
4027         int ret;
4028
4029         lockdep_assert_held(&obj->base.dev->struct_mutex);
4030
4031         if (obj->cache_level == cache_level)
4032                 return 0;
4033
4034         /* Inspect the list of currently bound VMA and unbind any that would
4035          * be invalid given the new cache-level. This is principally to
4036          * catch the issue of the CS prefetch crossing page boundaries and
4037          * reading an invalid PTE on older architectures.
4038          */
4039 restart:
4040         list_for_each_entry(vma, &obj->vma_list, obj_link) {
4041                 if (!drm_mm_node_allocated(&vma->node))
4042                         continue;
4043
4044                 if (i915_vma_is_pinned(vma)) {
4045                         DRM_DEBUG("can not change the cache level of pinned objects\n");
4046                         return -EBUSY;
4047                 }
4048
4049                 if (!i915_vma_is_closed(vma) &&
4050                     i915_gem_valid_gtt_space(vma, cache_level))
4051                         continue;
4052
4053                 ret = i915_vma_unbind(vma);
4054                 if (ret)
4055                         return ret;
4056
4057                 /* As unbinding may affect other elements in the
4058                  * obj->vma_list (due to side-effects from retiring
4059                  * an active vma), play safe and restart the iterator.
4060                  */
4061                 goto restart;
4062         }
4063
4064         /* We can reuse the existing drm_mm nodes but need to change the
4065          * cache-level on the PTE. We could simply unbind them all and
4066          * rebind with the correct cache-level on next use. However since
4067          * we already have a valid slot, dma mapping, pages etc, we may as
4068          * well rewrite the PTE in the belief that doing so tramples upon less
4069          * state and so involves less work.
4070          */
4071         if (obj->bind_count) {
4072                 /* Before we change the PTE, the GPU must not be accessing it.
4073                  * If we wait upon the object, we know that all the bound
4074                  * VMA are no longer active.
4075                  */
4076                 ret = i915_gem_object_wait(obj,
4077                                            I915_WAIT_INTERRUPTIBLE |
4078                                            I915_WAIT_LOCKED |
4079                                            I915_WAIT_ALL,
4080                                            MAX_SCHEDULE_TIMEOUT,
4081                                            NULL);
4082                 if (ret)
4083                         return ret;
4084
4085                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
4086                     cache_level != I915_CACHE_NONE) {
4087                         /* Access to snoopable pages through the GTT is
4088                          * incoherent and on some machines causes a hard
4089                          * lockup. Relinquish the CPU mmapping to force
4090                          * userspace to refault in the pages and we can
4091                          * then double check if the GTT mapping is still
4092                          * valid for that pointer access.
4093                          */
4094                         i915_gem_release_mmap(obj);
4095
4096                         /* As we no longer need a fence for GTT access,
4097                          * we can relinquish it now (and so prevent having
4098                          * to steal a fence from someone else on the next
4099                          * fence request). Note GPU activity would have
4100                          * dropped the fence as all snoopable access is
4101                          * supposed to be linear.
4102                          */
4103                         for_each_ggtt_vma(vma, obj) {
4104                                 ret = i915_vma_put_fence(vma);
4105                                 if (ret)
4106                                         return ret;
4107                         }
4108                 } else {
4109                         /* We either have incoherent backing store and
4110                          * so no GTT access or the architecture is fully
4111                          * coherent. In such cases, existing GTT mmaps
4112                          * ignore the cache bit in the PTE and we can
4113                          * rewrite it without confusing the GPU or having
4114                          * to force userspace to fault back in its mmaps.
4115                          */
4116                 }
4117
4118                 list_for_each_entry(vma, &obj->vma_list, obj_link) {
4119                         if (!drm_mm_node_allocated(&vma->node))
4120                                 continue;
4121
4122                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
4123                         if (ret)
4124                                 return ret;
4125                 }
4126         }
4127
4128         list_for_each_entry(vma, &obj->vma_list, obj_link)
4129                 vma->node.color = cache_level;
4130         i915_gem_object_set_cache_coherency(obj, cache_level);
4131         obj->cache_dirty = true; /* Always invalidate stale cachelines */
4132
4133         return 0;
4134 }
4135
4136 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
4137                                struct drm_file *file)
4138 {
4139         struct drm_i915_gem_caching *args = data;
4140         struct drm_i915_gem_object *obj;
4141         int err = 0;
4142
4143         rcu_read_lock();
4144         obj = i915_gem_object_lookup_rcu(file, args->handle);
4145         if (!obj) {
4146                 err = -ENOENT;
4147                 goto out;
4148         }
4149
4150         switch (obj->cache_level) {
4151         case I915_CACHE_LLC:
4152         case I915_CACHE_L3_LLC:
4153                 args->caching = I915_CACHING_CACHED;
4154                 break;
4155
4156         case I915_CACHE_WT:
4157                 args->caching = I915_CACHING_DISPLAY;
4158                 break;
4159
4160         default:
4161                 args->caching = I915_CACHING_NONE;
4162                 break;
4163         }
4164 out:
4165         rcu_read_unlock();
4166         return err;
4167 }
4168
4169 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4170                                struct drm_file *file)
4171 {
4172         struct drm_i915_private *i915 = to_i915(dev);
4173         struct drm_i915_gem_caching *args = data;
4174         struct drm_i915_gem_object *obj;
4175         enum i915_cache_level level;
4176         int ret = 0;
4177
4178         switch (args->caching) {
4179         case I915_CACHING_NONE:
4180                 level = I915_CACHE_NONE;
4181                 break;
4182         case I915_CACHING_CACHED:
4183                 /*
4184                  * Due to a HW issue on BXT A stepping, GPU stores via a
4185                  * snooped mapping may leave stale data in a corresponding CPU
4186                  * cacheline, whereas normally such cachelines would get
4187                  * invalidated.
4188                  */
4189                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4190                         return -ENODEV;
4191
4192                 level = I915_CACHE_LLC;
4193                 break;
4194         case I915_CACHING_DISPLAY:
4195                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4196                 break;
4197         default:
4198                 return -EINVAL;
4199         }
4200
4201         obj = i915_gem_object_lookup(file, args->handle);
4202         if (!obj)
4203                 return -ENOENT;
4204
4205         /*
4206          * The caching mode of a proxy object is handled by its generator, and
4207          * not allowed to be changed by userspace.
4208          */
4209         if (i915_gem_object_is_proxy(obj)) {
4210                 ret = -ENXIO;
4211                 goto out;
4212         }
4213
4214         if (obj->cache_level == level)
4215                 goto out;
4216
4217         ret = i915_gem_object_wait(obj,
4218                                    I915_WAIT_INTERRUPTIBLE,
4219                                    MAX_SCHEDULE_TIMEOUT,
4220                                    to_rps_client(file));
4221         if (ret)
4222                 goto out;
4223
4224         ret = i915_mutex_lock_interruptible(dev);
4225         if (ret)
4226                 goto out;
4227
4228         ret = i915_gem_object_set_cache_level(obj, level);
4229         mutex_unlock(&dev->struct_mutex);
4230
4231 out:
4232         i915_gem_object_put(obj);
4233         return ret;
4234 }
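
/*
 * Illustrative userspace sketch for the caching ioctls above ("fd", "handle"
 * and drmIoctl() are assumed, error handling elided):
 *
 *      struct drm_i915_gem_caching arg = { .handle = handle };
 *      drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_CACHING, &arg);   // query level
 *      arg.caching = I915_CACHING_CACHED;
 *      drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg);   // -ENODEV without
 *                                                            // LLC or snooping
 */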
4235
4236 /*
4237  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4238  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4239  * (for pageflips). We only flush the caches while preparing the buffer for
4240  * display, the callers are responsible for frontbuffer flush.
4241  */
4242 struct i915_vma *
4243 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4244                                      u32 alignment,
4245                                      const struct i915_ggtt_view *view,
4246                                      unsigned int flags)
4247 {
4248         struct i915_vma *vma;
4249         int ret;
4250
4251         lockdep_assert_held(&obj->base.dev->struct_mutex);
4252
4253         /* Mark the global pin early so that we account for the
4254          * display coherency whilst setting up the cache domains.
4255          */
4256         obj->pin_global++;
4257
4258         /* The display engine is not coherent with the LLC cache on gen6.  As
4259          * a result, we make sure that the pinning that is about to occur is
4260          * done with uncached PTEs. This is the lowest common denominator for all
4261          * chipsets.
4262          *
4263          * However for gen6+, we could do better by using the GFDT bit instead
4264          * of uncaching, which would allow us to flush all the LLC-cached data
4265          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4266          */
4267         ret = i915_gem_object_set_cache_level(obj,
4268                                               HAS_WT(to_i915(obj->base.dev)) ?
4269                                               I915_CACHE_WT : I915_CACHE_NONE);
4270         if (ret) {
4271                 vma = ERR_PTR(ret);
4272                 goto err_unpin_global;
4273         }
4274
4275         /* As the user may map the buffer once pinned in the display plane
4276          * (e.g. libkms for the bootup splash), we have to ensure that we
4277          * always use map_and_fenceable for all scanout buffers. However,
4278          * it may simply be too big to fit into mappable, in which case
4279          * put it anyway and hope that userspace can cope (but always first
4280          * try to preserve the existing ABI).
4281          */
4282         vma = ERR_PTR(-ENOSPC);
4283         if ((flags & PIN_MAPPABLE) == 0 &&
4284             (!view || view->type == I915_GGTT_VIEW_NORMAL))
4285                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4286                                                flags |
4287                                                PIN_MAPPABLE |
4288                                                PIN_NONBLOCK);
4289         if (IS_ERR(vma))
4290                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4291         if (IS_ERR(vma))
4292                 goto err_unpin_global;
4293
4294         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4295
4296         __i915_gem_object_flush_for_display(obj);
4297
4298         /* It should now be out of any other write domains, and we can update
4299          * the domain values for our changes.
4300          */
4301         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4302
4303         return vma;
4304
4305 err_unpin_global:
4306         obj->pin_global--;
4307         return vma;
4308 }
4309
4310 void
4311 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4312 {
4313         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4314
4315         if (WARN_ON(vma->obj->pin_global == 0))
4316                 return;
4317
4318         if (--vma->obj->pin_global == 0)
4319                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4320
4321         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4322         i915_gem_object_bump_inactive_ggtt(vma->obj);
4323
4324         i915_vma_unpin(vma);
4325 }
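
/*
 * Illustrative sketch (not part of the driver): how a caller on the
 * display side might pair the two helpers above around the lifetime of a
 * scanout buffer. The surrounding code and error paths are hypothetical;
 * struct_mutex must be held across both calls.
 *
 *	struct i915_vma *vma;
 *
 *	vma = i915_gem_object_pin_to_display_plane(obj, 0, NULL, PIN_MAPPABLE);
 *	if (IS_ERR(vma))
 *		return PTR_ERR(vma);
 *
 *	... program the plane using i915_ggtt_offset(vma), issue the flip ...
 *
 *	i915_gem_object_unpin_from_display_plane(vma);
 */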
4326
4327 /**
4328  * Moves a single object to the CPU read, and possibly write, domain.
4329  * @obj: object to act on
4330  * @write: requesting write or read-only access
4331  *
4332  * This function returns when the move is complete, including waiting on
4333  * flushes to occur.
4334  */
4335 int
4336 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4337 {
4338         int ret;
4339
4340         lockdep_assert_held(&obj->base.dev->struct_mutex);
4341
4342         ret = i915_gem_object_wait(obj,
4343                                    I915_WAIT_INTERRUPTIBLE |
4344                                    I915_WAIT_LOCKED |
4345                                    (write ? I915_WAIT_ALL : 0),
4346                                    MAX_SCHEDULE_TIMEOUT,
4347                                    NULL);
4348         if (ret)
4349                 return ret;
4350
4351         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4352
4353         /* Flush the CPU cache if it's still invalid. */
4354         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4355                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4356                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4357         }
4358
4359         /* It should now be out of any other write domains, and we can update
4360          * the domain values for our changes.
4361          */
4362         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4363
4364         /* If we're writing through the CPU, then the GPU read domains will
4365          * need to be invalidated at next use.
4366          */
4367         if (write)
4368                 __start_cpu_write(obj);
4369
4370         return 0;
4371 }
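
/*
 * Illustrative sketch (not part of the driver): a caller that wants to
 * write to the object's backing pages with the CPU would first move the
 * object into the CPU write domain under struct_mutex, e.g.
 *
 *	err = i915_gem_object_set_to_cpu_domain(obj, true);
 *	if (err)
 *		return err;
 *
 * after which CPU writes are tracked and flushed again when the object
 * next leaves the CPU write domain.
 */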
4372
4373 /* Throttle our rendering by waiting until the ring has completed our requests
4374  * emitted over 20 msec ago.
4375  *
4376  * Note that if we were to use the current jiffies each time around the loop,
4377  * we wouldn't escape the function with any frames outstanding if the time to
4378  * render a frame was over 20ms.
4379  *
4380  * This should get us reasonable parallelism between CPU and GPU but also
4381  * relatively low latency when blocking on a particular request to finish.
4382  */
4383 static int
4384 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4385 {
4386         struct drm_i915_private *dev_priv = to_i915(dev);
4387         struct drm_i915_file_private *file_priv = file->driver_priv;
4388         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4389         struct i915_request *request, *target = NULL;
4390         long ret;
4391
4392         /* ABI: return -EIO if already wedged */
4393         if (i915_terminally_wedged(&dev_priv->gpu_error))
4394                 return -EIO;
4395
4396         spin_lock(&file_priv->mm.lock);
4397         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4398                 if (time_after_eq(request->emitted_jiffies, recent_enough))
4399                         break;
4400
4401                 if (target) {
4402                         list_del(&target->client_link);
4403                         target->file_priv = NULL;
4404                 }
4405
4406                 target = request;
4407         }
4408         if (target)
4409                 i915_request_get(target);
4410         spin_unlock(&file_priv->mm.lock);
4411
4412         if (target == NULL)
4413                 return 0;
4414
4415         ret = i915_request_wait(target,
4416                                 I915_WAIT_INTERRUPTIBLE,
4417                                 MAX_SCHEDULE_TIMEOUT);
4418         i915_request_put(target);
4419
4420         return ret < 0 ? ret : 0;
4421 }
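
/*
 * Illustrative sketch (not part of the driver): userspace reaches this
 * throttle through DRM_IOCTL_I915_GEM_THROTTLE, which takes no argument.
 * Assuming a libdrm-style drmIoctl() wrapper:
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_THROTTLE, NULL))
 *		return -errno;
 *
 * The call returns once every request this client emitted more than
 * roughly 20 ms ago has completed, or fails with EIO if the GPU is
 * terminally wedged.
 */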
4422
4423 struct i915_vma *
4424 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4425                          const struct i915_ggtt_view *view,
4426                          u64 size,
4427                          u64 alignment,
4428                          u64 flags)
4429 {
4430         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4431         struct i915_address_space *vm = &dev_priv->ggtt.vm;
4432         struct i915_vma *vma;
4433         int ret;
4434
4435         lockdep_assert_held(&obj->base.dev->struct_mutex);
4436
4437         if (flags & PIN_MAPPABLE &&
4438             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4439                 /* If the required space is larger than the available
4440                  * aperture, we will not be able to find a slot for the
4441                  * object and unbinding the object now will be in
4442                  * vain. Worse, doing so may cause us to ping-pong
4443                  * the object in and out of the Global GTT and
4444                  * waste a lot of cycles under the mutex.
4445                  */
4446                 if (obj->base.size > dev_priv->ggtt.mappable_end)
4447                         return ERR_PTR(-E2BIG);
4448
4449                 /* If NONBLOCK is set the caller is optimistically
4450                  * trying to cache the full object within the mappable
4451                  * aperture, and *must* have a fallback in place for
4452                  * situations where we cannot bind the object. We
4453                  * can be a little more lax here and use the fallback
4454                  * more often to avoid costly migrations of ourselves
4455                  * and other objects within the aperture.
4456                  *
4457                  * Half-the-aperture is used as a simple heuristic.
4458                  * More interesting would be to search for a free
4459                  * block prior to making the commitment to unbind.
4460                  * That caters for the self-harm case, and with a
4461                  * little more heuristics (e.g. NOFAULT, NOEVICT)
4462                  * we could try to minimise harm to others.
4463                  */
4464                 if (flags & PIN_NONBLOCK &&
4465                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
4466                         return ERR_PTR(-ENOSPC);
4467         }
4468
4469         vma = i915_vma_instance(obj, vm, view);
4470         if (unlikely(IS_ERR(vma)))
4471                 return vma;
4472
4473         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4474                 if (flags & PIN_NONBLOCK) {
4475                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4476                                 return ERR_PTR(-ENOSPC);
4477
4478                         if (flags & PIN_MAPPABLE &&
4479                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4480                                 return ERR_PTR(-ENOSPC);
4481                 }
4482
4483                 WARN(i915_vma_is_pinned(vma),
4484                      "bo is already pinned in ggtt with incorrect alignment:"
4485                      " offset=%08x, req.alignment=%llx,"
4486                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4487                      i915_ggtt_offset(vma), alignment,
4488                      !!(flags & PIN_MAPPABLE),
4489                      i915_vma_is_map_and_fenceable(vma));
4490                 ret = i915_vma_unbind(vma);
4491                 if (ret)
4492                         return ERR_PTR(ret);
4493         }
4494
4495         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
4496         if (ret)
4497                 return ERR_PTR(ret);
4498
4499         return vma;
4500 }
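
/*
 * Illustrative sketch (not part of the driver): as noted above, a caller
 * passing PIN_NONBLOCK must have a fallback in place for when the
 * optimistic pin is refused, mirroring what
 * i915_gem_object_pin_to_display_plane() does:
 *
 *	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
 *				       PIN_MAPPABLE | PIN_NONBLOCK);
 *	if (IS_ERR(vma))
 *		vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
 *	if (IS_ERR(vma))
 *		return PTR_ERR(vma);
 */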
4501
4502 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4503 {
4504         /* Note that we could alias engines in the execbuf API, but
4505          * that would be very unwise as it prevents userspace from having
4506          * fine control over engine selection. Ahem.
4507          *
4508          * This should be something like EXEC_MAX_ENGINE instead of
4509          * I915_NUM_ENGINES.
4510          */
4511         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4512         return 0x10000 << id;
4513 }
4514
4515 static __always_inline unsigned int __busy_write_id(unsigned int id)
4516 {
4517         /* The uABI guarantees an active writer is also amongst the read
4518          * engines. This would be true if we accessed the activity tracking
4519          * under the lock, but as we perform the lookup of the object and
4520          * its activity locklessly we can not guarantee that the last_write
4521          * being active implies that we have set the same engine flag from
4522          * last_read - hence we always set both read and write busy for
4523          * last_write.
4524          */
4525         return id | __busy_read_flag(id);
4526 }
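
/*
 * Illustrative sketch (not part of the driver): decoding the busy-ioctl
 * value assembled from the helpers above. Bit (16 + id) is set for every
 * engine with an outstanding read, and the low 16 bits carry the uabi id
 * of an outstanding writer, whose read bit is always set as well:
 *
 *	u32 readers = args.busy >> 16;
 *	u32 writer  = args.busy & 0xffff;
 *
 * readers is a bitmask of engines still reading; writer is only
 * meaningful while args.busy is non-zero. A value of zero means the
 * object is idle as far as i915 fences are concerned.
 */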
4527
4528 static __always_inline unsigned int
4529 __busy_set_if_active(const struct dma_fence *fence,
4530                      unsigned int (*flag)(unsigned int id))
4531 {
4532         struct i915_request *rq;
4533
4534         /* We have to check the current hw status of the fence as the uABI
4535          * guarantees forward progress. We could rely on the idle worker
4536          * to eventually flush us, but to minimise latency just ask the
4537          * hardware.
4538          *
4539          * Note we only report on the status of native fences.
4540          */
4541         if (!dma_fence_is_i915(fence))
4542                 return 0;
4543
4544         /* opencode to_request() in order to avoid const warnings */
4545         rq = container_of(fence, struct i915_request, fence);
4546         if (i915_request_completed(rq))
4547                 return 0;
4548
4549         return flag(rq->engine->uabi_id);
4550 }
4551
4552 static __always_inline unsigned int
4553 busy_check_reader(const struct dma_fence *fence)
4554 {
4555         return __busy_set_if_active(fence, __busy_read_flag);
4556 }
4557
4558 static __always_inline unsigned int
4559 busy_check_writer(const struct dma_fence *fence)
4560 {
4561         if (!fence)
4562                 return 0;
4563
4564         return __busy_set_if_active(fence, __busy_write_id);
4565 }
4566
4567 int
4568 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4569                     struct drm_file *file)
4570 {
4571         struct drm_i915_gem_busy *args = data;
4572         struct drm_i915_gem_object *obj;
4573         struct reservation_object_list *list;
4574         unsigned int seq;
4575         int err;
4576
4577         err = -ENOENT;
4578         rcu_read_lock();
4579         obj = i915_gem_object_lookup_rcu(file, args->handle);
4580         if (!obj)
4581                 goto out;
4582
4583         /* A discrepancy here is that we do not report the status of
4584          * non-i915 fences, i.e. even though we may report the object as idle,
4585          * a call to set-domain may still stall waiting for foreign rendering.
4586          * This also means that wait-ioctl may report an object as busy,
4587          * where busy-ioctl considers it idle.
4588          *
4589          * We trade the ability to warn of foreign fences to report on which
4590          * i915 engines are active for the object.
4591          *
4592          * Alternatively, we can trade that extra information on read/write
4593          * activity with
4594          *      args->busy =
4595          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4596          * to report the overall busyness. This is what the wait-ioctl does.
4597          *
4598          */
4599 retry:
4600         seq = raw_read_seqcount(&obj->resv->seq);
4601
4602         /* Translate the exclusive fence to the READ *and* WRITE engine */
4603         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4604
4605         /* Translate shared fences to READ set of engines */
4606         list = rcu_dereference(obj->resv->fence);
4607         if (list) {
4608                 unsigned int shared_count = list->shared_count, i;
4609
4610                 for (i = 0; i < shared_count; ++i) {
4611                         struct dma_fence *fence =
4612                                 rcu_dereference(list->shared[i]);
4613
4614                         args->busy |= busy_check_reader(fence);
4615                 }
4616         }
4617
4618         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4619                 goto retry;
4620
4621         err = 0;
4622 out:
4623         rcu_read_unlock();
4624         return err;
4625 }
4626
4627 int
4628 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4629                         struct drm_file *file_priv)
4630 {
4631         return i915_gem_ring_throttle(dev, file_priv);
4632 }
4633
4634 int
4635 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4636                        struct drm_file *file_priv)
4637 {
4638         struct drm_i915_private *dev_priv = to_i915(dev);
4639         struct drm_i915_gem_madvise *args = data;
4640         struct drm_i915_gem_object *obj;
4641         int err;
4642
4643         switch (args->madv) {
4644         case I915_MADV_DONTNEED:
4645         case I915_MADV_WILLNEED:
4646                 break;
4647         default:
4648                 return -EINVAL;
4649         }
4650
4651         obj = i915_gem_object_lookup(file_priv, args->handle);
4652         if (!obj)
4653                 return -ENOENT;
4654
4655         err = mutex_lock_interruptible(&obj->mm.lock);
4656         if (err)
4657                 goto out;
4658
4659         if (i915_gem_object_has_pages(obj) &&
4660             i915_gem_object_is_tiled(obj) &&
4661             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4662                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4663                         GEM_BUG_ON(!obj->mm.quirked);
4664                         __i915_gem_object_unpin_pages(obj);
4665                         obj->mm.quirked = false;
4666                 }
4667                 if (args->madv == I915_MADV_WILLNEED) {
4668                         GEM_BUG_ON(obj->mm.quirked);
4669                         __i915_gem_object_pin_pages(obj);
4670                         obj->mm.quirked = true;
4671                 }
4672         }
4673
4674         if (obj->mm.madv != __I915_MADV_PURGED)
4675                 obj->mm.madv = args->madv;
4676
4677         /* if the object is no longer attached, discard its backing storage */
4678         if (obj->mm.madv == I915_MADV_DONTNEED &&
4679             !i915_gem_object_has_pages(obj))
4680                 i915_gem_object_truncate(obj);
4681
4682         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4683         mutex_unlock(&obj->mm.lock);
4684
4685 out:
4686         i915_gem_object_put(obj);
4687         return err;
4688 }
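
/*
 * Illustrative sketch (not part of the driver): a userspace buffer cache
 * typically marks idle buffers purgeable and, on reuse, checks whether
 * the kernel purged them in the meantime (drmIoctl() is the libdrm
 * wrapper, bo_handle a hypothetical GEM handle):
 *
 *	struct drm_i915_gem_madvise madv = {
 *		.handle = bo_handle,
 *		.madv = I915_MADV_DONTNEED,
 *	};
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_MADVISE, &madv);
 *
 *	...
 *
 *	madv.madv = I915_MADV_WILLNEED;
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_MADVISE, &madv);
 *	if (!madv.retained)
 *		reupload(bo_handle);
 *
 * where reupload() stands in for a hypothetical routine that regenerates
 * the buffer contents, since !retained means the backing storage was
 * purged.
 */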
4689
4690 static void
4691 frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4692 {
4693         struct drm_i915_gem_object *obj =
4694                 container_of(active, typeof(*obj), frontbuffer_write);
4695
4696         intel_fb_obj_flush(obj, ORIGIN_CS);
4697 }
4698
4699 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4700                           const struct drm_i915_gem_object_ops *ops)
4701 {
4702         mutex_init(&obj->mm.lock);
4703
4704         INIT_LIST_HEAD(&obj->vma_list);
4705         INIT_LIST_HEAD(&obj->lut_list);
4706         INIT_LIST_HEAD(&obj->batch_pool_link);
4707
4708         init_rcu_head(&obj->rcu);
4709
4710         obj->ops = ops;
4711
4712         reservation_object_init(&obj->__builtin_resv);
4713         obj->resv = &obj->__builtin_resv;
4714
4715         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4716         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4717
4718         obj->mm.madv = I915_MADV_WILLNEED;
4719         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4720         mutex_init(&obj->mm.get_page.lock);
4721
4722         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4723 }
4724
4725 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4726         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4727                  I915_GEM_OBJECT_IS_SHRINKABLE,
4728
4729         .get_pages = i915_gem_object_get_pages_gtt,
4730         .put_pages = i915_gem_object_put_pages_gtt,
4731
4732         .pwrite = i915_gem_object_pwrite_gtt,
4733 };
4734
4735 static int i915_gem_object_create_shmem(struct drm_device *dev,
4736                                         struct drm_gem_object *obj,
4737                                         size_t size)
4738 {
4739         struct drm_i915_private *i915 = to_i915(dev);
4740         unsigned long flags = VM_NORESERVE;
4741         struct file *filp;
4742
4743         drm_gem_private_object_init(dev, obj, size);
4744
4745         if (i915->mm.gemfs)
4746                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4747                                                  flags);
4748         else
4749                 filp = shmem_file_setup("i915", size, flags);
4750
4751         if (IS_ERR(filp))
4752                 return PTR_ERR(filp);
4753
4754         obj->filp = filp;
4755
4756         return 0;
4757 }
4758
4759 struct drm_i915_gem_object *
4760 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4761 {
4762         struct drm_i915_gem_object *obj;
4763         struct address_space *mapping;
4764         unsigned int cache_level;
4765         gfp_t mask;
4766         int ret;
4767
4768         /* There is a prevalence of the assumption that we fit the object's
4769          * page count inside a 32bit _signed_ variable. Let's document this and
4770          * catch if we ever need to fix it. In the meantime, if you do spot
4771          * such a local variable, please consider fixing!
4772          */
4773         if (size >> PAGE_SHIFT > INT_MAX)
4774                 return ERR_PTR(-E2BIG);
4775
4776         if (overflows_type(size, obj->base.size))
4777                 return ERR_PTR(-E2BIG);
4778
4779         obj = i915_gem_object_alloc(dev_priv);
4780         if (obj == NULL)
4781                 return ERR_PTR(-ENOMEM);
4782
4783         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4784         if (ret)
4785                 goto fail;
4786
4787         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4788         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4789                 /* 965gm cannot relocate objects above 4GiB. */
4790                 mask &= ~__GFP_HIGHMEM;
4791                 mask |= __GFP_DMA32;
4792         }
4793
4794         mapping = obj->base.filp->f_mapping;
4795         mapping_set_gfp_mask(mapping, mask);
4796         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4797
4798         i915_gem_object_init(obj, &i915_gem_object_ops);
4799
4800         obj->write_domain = I915_GEM_DOMAIN_CPU;
4801         obj->read_domains = I915_GEM_DOMAIN_CPU;
4802
4803         if (HAS_LLC(dev_priv))
4804                 /* On some devices, we can have the GPU use the LLC (the CPU
4805                  * cache) for about a 10% performance improvement
4806                  * compared to uncached.  Graphics requests other than
4807                  * display scanout are coherent with the CPU in
4808                  * accessing this cache.  This means in this mode we
4809                  * don't need to clflush on the CPU side, and on the
4810                  * GPU side we only need to flush internal caches to
4811                  * get data visible to the CPU.
4812                  *
4813                  * However, we maintain the display planes as UC, and so
4814                  * need to rebind when first used as such.
4815                  */
4816                 cache_level = I915_CACHE_LLC;
4817         else
4818                 cache_level = I915_CACHE_NONE;
4819
4820         i915_gem_object_set_cache_coherency(obj, cache_level);
4821
4822         trace_i915_gem_object_create(obj);
4823
4824         return obj;
4825
4826 fail:
4827         i915_gem_object_free(obj);
4828         return ERR_PTR(ret);
4829 }
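
/*
 * Illustrative sketch (not part of the driver): a minimal in-kernel user
 * of the constructor above, with the usual ERR_PTR handling. The final
 * reference is dropped with i915_gem_object_put() when done:
 *
 *	struct drm_i915_gem_object *obj;
 *
 *	obj = i915_gem_object_create(dev_priv, PAGE_SIZE);
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 *
 *	... use the object, e.g. pin it into the GGTT or map its pages ...
 *
 *	i915_gem_object_put(obj);
 */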
4830
4831 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4832 {
4833         /* If we are the last user of the backing storage (be it shmemfs
4834          * pages or stolen etc), we know that the pages are going to be
4835          * immediately released. In this case, we can then skip copying
4836          * back the contents from the GPU.
4837          */
4838
4839         if (obj->mm.madv != I915_MADV_WILLNEED)
4840                 return false;
4841
4842         if (obj->base.filp == NULL)
4843                 return true;
4844
4845         /* At first glance, this looks racy, but then again so would be
4846          * userspace racing mmap against close. However, the first external
4847          * reference to the filp can only be obtained through the
4848          * i915_gem_mmap_ioctl() which safeguards us against the user
4849          * acquiring such a reference whilst we are in the middle of
4850          * freeing the object.
4851          */
4852         return atomic_long_read(&obj->base.filp->f_count) == 1;
4853 }
4854
4855 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4856                                     struct llist_node *freed)
4857 {
4858         struct drm_i915_gem_object *obj, *on;
4859
4860         intel_runtime_pm_get(i915);
4861         llist_for_each_entry_safe(obj, on, freed, freed) {
4862                 struct i915_vma *vma, *vn;
4863
4864                 trace_i915_gem_object_destroy(obj);
4865
4866                 mutex_lock(&i915->drm.struct_mutex);
4867
4868                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4869                 list_for_each_entry_safe(vma, vn,
4870                                          &obj->vma_list, obj_link) {
4871                         GEM_BUG_ON(i915_vma_is_active(vma));
4872                         vma->flags &= ~I915_VMA_PIN_MASK;
4873                         i915_vma_destroy(vma);
4874                 }
4875                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4876                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4877
4878                 /* This serializes freeing with the shrinker. Since the free
4879                  * is delayed, first by RCU then by the workqueue, we want the
4880                  * shrinker to be able to free pages of unreferenced objects,
4881                  * or else we may oom whilst there are plenty of deferred
4882                  * freed objects.
4883                  */
4884                 if (i915_gem_object_has_pages(obj)) {
4885                         spin_lock(&i915->mm.obj_lock);
4886                         list_del_init(&obj->mm.link);
4887                         spin_unlock(&i915->mm.obj_lock);
4888                 }
4889
4890                 mutex_unlock(&i915->drm.struct_mutex);
4891
4892                 GEM_BUG_ON(obj->bind_count);
4893                 GEM_BUG_ON(obj->userfault_count);
4894                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4895                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4896
4897                 if (obj->ops->release)
4898                         obj->ops->release(obj);
4899
4900                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4901                         atomic_set(&obj->mm.pages_pin_count, 0);
4902                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4903                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4904
4905                 if (obj->base.import_attach)
4906                         drm_prime_gem_destroy(&obj->base, NULL);
4907
4908                 reservation_object_fini(&obj->__builtin_resv);
4909                 drm_gem_object_release(&obj->base);
4910                 i915_gem_info_remove_obj(i915, obj->base.size);
4911
4912                 kfree(obj->bit_17);
4913                 i915_gem_object_free(obj);
4914
4915                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4916                 atomic_dec(&i915->mm.free_count);
4917
4918                 if (on)
4919                         cond_resched();
4920         }
4921         intel_runtime_pm_put(i915);
4922 }
4923
4924 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4925 {
4926         struct llist_node *freed;
4927
4928         /* Free the oldest, most stale object to keep the free_list short */
4929         freed = NULL;
4930         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4931                 /* Only one consumer of llist_del_first() allowed */
4932                 spin_lock(&i915->mm.free_lock);
4933                 freed = llist_del_first(&i915->mm.free_list);
4934                 spin_unlock(&i915->mm.free_lock);
4935         }
4936         if (unlikely(freed)) {
4937                 freed->next = NULL;
4938                 __i915_gem_free_objects(i915, freed);
4939         }
4940 }
4941
4942 static void __i915_gem_free_work(struct work_struct *work)
4943 {
4944         struct drm_i915_private *i915 =
4945                 container_of(work, struct drm_i915_private, mm.free_work);
4946         struct llist_node *freed;
4947
4948         /*
4949          * All file-owned VMA should have been released by this point through
4950          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4951          * However, the object may also be bound into the global GTT (e.g.
4952          * older GPUs without per-process support, or for direct access through
4953          * the GTT either for the user or for scanout). Those VMA still need to
4954          * be unbound now.
4955          */
4956
4957         spin_lock(&i915->mm.free_lock);
4958         while ((freed = llist_del_all(&i915->mm.free_list))) {
4959                 spin_unlock(&i915->mm.free_lock);
4960
4961                 __i915_gem_free_objects(i915, freed);
4962                 if (need_resched())
4963                         return;
4964
4965                 spin_lock(&i915->mm.free_lock);
4966         }
4967         spin_unlock(&i915->mm.free_lock);
4968 }
4969
4970 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4971 {
4972         struct drm_i915_gem_object *obj =
4973                 container_of(head, typeof(*obj), rcu);
4974         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4975
4976         /*
4977          * We reuse obj->rcu for the freed list, so we had better not treat
4978          * it like a rcu_head from this point forwards. And we expect all
4979          * objects to be freed via this path.
4980          */
4981         destroy_rcu_head(&obj->rcu);
4982
4983         /*
4984          * Since we require blocking on struct_mutex to unbind the freed
4985          * object from the GPU before releasing resources back to the
4986          * system, we can not do that directly from the RCU callback (which may
4987          * system, we cannot do that directly from the RCU callback (which may
4988          * be a softirq context), but must instead defer that work onto a
4989          * directly onto the work queue so that we can mix between using the
4990          * worker and performing frees directly from subsequent allocations for
4991          * crude but effective memory throttling.
4992          */
4993         if (llist_add(&obj->freed, &i915->mm.free_list))
4994                 queue_work(i915->wq, &i915->mm.free_work);
4995 }
4996
4997 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4998 {
4999         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
5000
5001         if (obj->mm.quirked)
5002                 __i915_gem_object_unpin_pages(obj);
5003
5004         if (discard_backing_storage(obj))
5005                 obj->mm.madv = I915_MADV_DONTNEED;
5006
5007         /*
5008          * Before we free the object, make sure any pure RCU-only
5009          * read-side critical sections are complete, e.g.
5010          * i915_gem_busy_ioctl(). For the corresponding synchronized
5011          * lookup see i915_gem_object_lookup_rcu().
5012          */
5013         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
5014         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
5015 }
5016
5017 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
5018 {
5019         lockdep_assert_held(&obj->base.dev->struct_mutex);
5020
5021         if (!i915_gem_object_has_active_reference(obj) &&
5022             i915_gem_object_is_active(obj))
5023                 i915_gem_object_set_active_reference(obj);
5024         else
5025                 i915_gem_object_put(obj);
5026 }
5027
5028 void i915_gem_sanitize(struct drm_i915_private *i915)
5029 {
5030         int err;
5031
5032         GEM_TRACE("\n");
5033
5034         mutex_lock(&i915->drm.struct_mutex);
5035
5036         intel_runtime_pm_get(i915);
5037         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5038
5039         /*
5040          * As we have just resumed the machine and woken the device up from
5041          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
5042          * back to defaults, recovering from whatever wedged state we left it
5043          * in and so worth trying to use the device once more.
5044          */
5045         if (i915_terminally_wedged(&i915->gpu_error))
5046                 i915_gem_unset_wedged(i915);
5047
5048         /*
5049          * If we inherit context state from the BIOS or earlier occupants
5050          * of the GPU, the GPU may be in an inconsistent state when we
5051          * try to take over. The only way to remove the earlier state
5052          * is by resetting. However, resetting on earlier gen is tricky as
5053          * it may impact the display and we are uncertain about the stability
5054          * of the reset, so for now this is only applied from gen5 onwards.
5055          */
5056         err = -ENODEV;
5057         if (INTEL_GEN(i915) >= 5 && intel_has_gpu_reset(i915))
5058                 err = WARN_ON(intel_gpu_reset(i915, ALL_ENGINES));
5059         if (!err)
5060                 intel_engines_sanitize(i915);
5061
5062         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5063         intel_runtime_pm_put(i915);
5064
5065         i915_gem_contexts_lost(i915);
5066         mutex_unlock(&i915->drm.struct_mutex);
5067 }
5068
5069 int i915_gem_suspend(struct drm_i915_private *i915)
5070 {
5071         int ret;
5072
5073         GEM_TRACE("\n");
5074
5075         intel_runtime_pm_get(i915);
5076         intel_suspend_gt_powersave(i915);
5077
5078         mutex_lock(&i915->drm.struct_mutex);
5079
5080         /*
5081          * We have to flush all the executing contexts to main memory so
5082          * that they can be saved in the hibernation image. To ensure the last
5083          * context image is coherent, we have to switch away from it. That
5084          * leaves the i915->kernel_context still active when
5085          * we actually suspend, and its image in memory may not match the GPU
5086          * state. Fortunately, the kernel_context is disposable and we do
5087          * not rely on its state.
5088          */
5089         if (!i915_terminally_wedged(&i915->gpu_error)) {
5090                 ret = i915_gem_switch_to_kernel_context(i915);
5091                 if (ret)
5092                         goto err_unlock;
5093
5094                 ret = i915_gem_wait_for_idle(i915,
5095                                              I915_WAIT_INTERRUPTIBLE |
5096                                              I915_WAIT_LOCKED |
5097                                              I915_WAIT_FOR_IDLE_BOOST,
5098                                              MAX_SCHEDULE_TIMEOUT);
5099                 if (ret && ret != -EIO)
5100                         goto err_unlock;
5101
5102                 assert_kernel_context_is_current(i915);
5103         }
5104         i915_retire_requests(i915); /* ensure we flush after wedging */
5105
5106         mutex_unlock(&i915->drm.struct_mutex);
5107
5108         intel_uc_suspend(i915);
5109
5110         cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
5111         cancel_delayed_work_sync(&i915->gt.retire_work);
5112
5113         /*
5114          * As the idle_work re-arms itself if it detects a race, play safe and
5115          * repeat the flush until it is definitely idle.
5116          */
5117         drain_delayed_work(&i915->gt.idle_work);
5118
5119         /*
5120          * Assert that we successfully flushed all the work and
5121          * reset the GPU back to its idle, low power state.
5122          */
5123         WARN_ON(i915->gt.awake);
5124         if (WARN_ON(!intel_engines_are_idle(i915)))
5125                 i915_gem_set_wedged(i915); /* no hope, discard everything */
5126
5127         intel_runtime_pm_put(i915);
5128         return 0;
5129
5130 err_unlock:
5131         mutex_unlock(&i915->drm.struct_mutex);
5132         intel_runtime_pm_put(i915);
5133         return ret;
5134 }
5135
5136 void i915_gem_suspend_late(struct drm_i915_private *i915)
5137 {
5138         struct drm_i915_gem_object *obj;
5139         struct list_head *phases[] = {
5140                 &i915->mm.unbound_list,
5141                 &i915->mm.bound_list,
5142                 NULL
5143         }, **phase;
5144
5145         /*
5146          * Neither the BIOS, ourselves nor any other kernel
5147          * expects the system to be in execlists mode on startup,
5148          * so we need to reset the GPU back to legacy mode. And the only
5149          * known way to disable logical contexts is through a GPU reset.
5150          *
5151          * So in order to leave the system in a known default configuration,
5152          * always reset the GPU upon unload and suspend. Afterwards we then
5153          * clean up the GEM state tracking, flushing off the requests and
5154          * leaving the system in a known idle state.
5155          *
5156          * Note that it is of the utmost importance that the GPU is idle and
5157          * all stray writes are flushed *before* we dismantle the backing
5158          * storage for the pinned objects.
5159          *
5160          * However, since we are uncertain that resetting the GPU on older
5161          * machines is a good idea, we don't - just in case it leaves the
5162          * machine in an unusable condition.
5163          */
5164
5165         mutex_lock(&i915->drm.struct_mutex);
5166         for (phase = phases; *phase; phase++) {
5167                 list_for_each_entry(obj, *phase, mm.link)
5168                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
5169         }
5170         mutex_unlock(&i915->drm.struct_mutex);
5171
5172         intel_uc_sanitize(i915);
5173         i915_gem_sanitize(i915);
5174 }
5175
5176 void i915_gem_resume(struct drm_i915_private *i915)
5177 {
5178         GEM_TRACE("\n");
5179
5180         WARN_ON(i915->gt.awake);
5181
5182         mutex_lock(&i915->drm.struct_mutex);
5183         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5184
5185         i915_gem_restore_gtt_mappings(i915);
5186         i915_gem_restore_fences(i915);
5187
5188         /*
5189          * As we didn't flush the kernel context before suspend, we cannot
5190          * guarantee that the context image is complete. So let's just reset
5191          * it and start again.
5192          */
5193         i915->gt.resume(i915);
5194
5195         if (i915_gem_init_hw(i915))
5196                 goto err_wedged;
5197
5198         intel_uc_resume(i915);
5199
5200         /* Always reload a context for powersaving. */
5201         if (i915_gem_switch_to_kernel_context(i915))
5202                 goto err_wedged;
5203
5204 out_unlock:
5205         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5206         mutex_unlock(&i915->drm.struct_mutex);
5207         return;
5208
5209 err_wedged:
5210         if (!i915_terminally_wedged(&i915->gpu_error)) {
5211                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5212                 i915_gem_set_wedged(i915);
5213         }
5214         goto out_unlock;
5215 }
5216
5217 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5218 {
5219         if (INTEL_GEN(dev_priv) < 5 ||
5220             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5221                 return;
5222
5223         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5224                                  DISP_TILE_SURFACE_SWIZZLING);
5225
5226         if (IS_GEN5(dev_priv))
5227                 return;
5228
5229         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5230         if (IS_GEN6(dev_priv))
5231                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5232         else if (IS_GEN7(dev_priv))
5233                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5234         else if (IS_GEN8(dev_priv))
5235                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5236         else
5237                 BUG();
5238 }
5239
5240 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5241 {
5242         I915_WRITE(RING_CTL(base), 0);
5243         I915_WRITE(RING_HEAD(base), 0);
5244         I915_WRITE(RING_TAIL(base), 0);
5245         I915_WRITE(RING_START(base), 0);
5246 }
5247
5248 static void init_unused_rings(struct drm_i915_private *dev_priv)
5249 {
5250         if (IS_I830(dev_priv)) {
5251                 init_unused_ring(dev_priv, PRB1_BASE);
5252                 init_unused_ring(dev_priv, SRB0_BASE);
5253                 init_unused_ring(dev_priv, SRB1_BASE);
5254                 init_unused_ring(dev_priv, SRB2_BASE);
5255                 init_unused_ring(dev_priv, SRB3_BASE);
5256         } else if (IS_GEN2(dev_priv)) {
5257                 init_unused_ring(dev_priv, SRB0_BASE);
5258                 init_unused_ring(dev_priv, SRB1_BASE);
5259         } else if (IS_GEN3(dev_priv)) {
5260                 init_unused_ring(dev_priv, PRB1_BASE);
5261                 init_unused_ring(dev_priv, PRB2_BASE);
5262         }
5263 }
5264
5265 static int __i915_gem_restart_engines(void *data)
5266 {
5267         struct drm_i915_private *i915 = data;
5268         struct intel_engine_cs *engine;
5269         enum intel_engine_id id;
5270         int err;
5271
5272         for_each_engine(engine, i915, id) {
5273                 err = engine->init_hw(engine);
5274                 if (err) {
5275                         DRM_ERROR("Failed to restart %s (%d)\n",
5276                                   engine->name, err);
5277                         return err;
5278                 }
5279         }
5280
5281         return 0;
5282 }
5283
5284 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5285 {
5286         int ret;
5287
5288         dev_priv->gt.last_init_time = ktime_get();
5289
5290         /* Double layer security blanket, see i915_gem_init() */
5291         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5292
5293         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5294                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5295
5296         if (IS_HASWELL(dev_priv))
5297                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5298                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5299
5300         /* Apply the GT workarounds... */
5301         intel_gt_apply_workarounds(dev_priv);
5302         /* ...and determine whether they are sticking. */
5303         intel_gt_verify_workarounds(dev_priv, "init");
5304
5305         i915_gem_init_swizzling(dev_priv);
5306
5307         /*
5308          * At least 830 can leave some of the unused rings
5309          * "active" (i.e. head != tail) after resume, which
5310          * will prevent C3 entry. Make sure all unused rings
5311          * are totally idle.
5312          */
5313         init_unused_rings(dev_priv);
5314
5315         BUG_ON(!dev_priv->kernel_context);
5316         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5317                 ret = -EIO;
5318                 goto out;
5319         }
5320
5321         ret = i915_ppgtt_init_hw(dev_priv);
5322         if (ret) {
5323                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5324                 goto out;
5325         }
5326
5327         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5328         if (ret) {
5329                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5330                 goto out;
5331         }
5332
5333         /* We can't enable contexts until all firmware is loaded */
5334         ret = intel_uc_init_hw(dev_priv);
5335         if (ret) {
5336                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5337                 goto out;
5338         }
5339
5340         intel_mocs_init_l3cc_table(dev_priv);
5341
5342         /* Only when the HW is re-initialised, can we replay the requests */
5343         ret = __i915_gem_restart_engines(dev_priv);
5344         if (ret)
5345                 goto cleanup_uc;
5346
5347         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5348
5349         return 0;
5350
5351 cleanup_uc:
5352         intel_uc_fini_hw(dev_priv);
5353 out:
5354         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5355
5356         return ret;
5357 }
5358
5359 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5360 {
5361         struct i915_gem_context *ctx;
5362         struct intel_engine_cs *engine;
5363         enum intel_engine_id id;
5364         int err;
5365
5366         /*
5367          * As we reset the gpu during very early sanitisation, the current
5368          * register state on the GPU should reflect its default values.
5369          * We load a context onto the hw (with restore-inhibit), then switch
5370          * over to a second context to save that default register state. We
5371          * can then prime every new context with that state so they all start
5372          * from the same default HW values.
5373          */
5374
5375         ctx = i915_gem_context_create_kernel(i915, 0);
5376         if (IS_ERR(ctx))
5377                 return PTR_ERR(ctx);
5378
5379         for_each_engine(engine, i915, id) {
5380                 struct i915_request *rq;
5381
5382                 rq = i915_request_alloc(engine, ctx);
5383                 if (IS_ERR(rq)) {
5384                         err = PTR_ERR(rq);
5385                         goto out_ctx;
5386                 }
5387
5388                 err = 0;
5389                 if (engine->init_context)
5390                         err = engine->init_context(rq);
5391
5392                 i915_request_add(rq);
5393                 if (err)
5394                         goto err_active;
5395         }
5396
5397         err = i915_gem_switch_to_kernel_context(i915);
5398         if (err)
5399                 goto err_active;
5400
5401         if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
5402                 i915_gem_set_wedged(i915);
5403                 err = -EIO; /* Caller will declare us wedged */
5404                 goto err_active;
5405         }
5406
5407         assert_kernel_context_is_current(i915);
5408
5409         /*
5410          * Immediately park the GPU so that we enable powersaving and
5411          * treat it as idle. The next time we issue a request, we will
5412          * unpark and start using the engine->pinned_default_state, otherwise
5413          * it is in limbo and an early reset may fail.
5414          */
5415         __i915_gem_park(i915);
5416
5417         for_each_engine(engine, i915, id) {
5418                 struct i915_vma *state;
5419                 void *vaddr;
5420
5421                 GEM_BUG_ON(to_intel_context(ctx, engine)->pin_count);
5422
5423                 state = to_intel_context(ctx, engine)->state;
5424                 if (!state)
5425                         continue;
5426
5427                 /*
5428                  * As we will hold a reference to the logical state, it will
5429                  * not be torn down with the context, and importantly the
5430                  * object will hold onto its vma (making it possible for a
5431                  * stray GTT write to corrupt our defaults). Unmap the vma
5432                  * from the GTT to prevent such accidents and reclaim the
5433                  * space.
5434                  */
5435                 err = i915_vma_unbind(state);
5436                 if (err)
5437                         goto err_active;
5438
5439                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5440                 if (err)
5441                         goto err_active;
5442
5443                 engine->default_state = i915_gem_object_get(state->obj);
5444
5445                 /* Check we can acquire the image of the context state */
5446                 vaddr = i915_gem_object_pin_map(engine->default_state,
5447                                                 I915_MAP_FORCE_WB);
5448                 if (IS_ERR(vaddr)) {
5449                         err = PTR_ERR(vaddr);
5450                         goto err_active;
5451                 }
5452
5453                 i915_gem_object_unpin_map(engine->default_state);
5454         }
5455
5456         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5457                 unsigned int found = intel_engines_has_context_isolation(i915);
5458
5459                 /*
5460                  * Make sure that classes with multiple engine instances all
5461                  * share the same basic configuration.
5462                  */
5463                 for_each_engine(engine, i915, id) {
5464                         unsigned int bit = BIT(engine->uabi_class);
5465                         unsigned int expected = engine->default_state ? bit : 0;
5466
5467                         if ((found & bit) != expected) {
5468                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5469                                           engine->uabi_class, engine->name);
5470                         }
5471                 }
5472         }
5473
5474 out_ctx:
5475         i915_gem_context_set_closed(ctx);
5476         i915_gem_context_put(ctx);
5477         return err;
5478
5479 err_active:
5480         /*
5481          * If we have to abandon now, we expect the engines to be idle
5482          * and ready to be torn-down. First try to flush any remaining
5483          * request, ensure we are pointing at the kernel context and
5484          * then remove it.
5485          */
5486         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5487                 goto out_ctx;
5488
5489         if (WARN_ON(i915_gem_wait_for_idle(i915,
5490                                            I915_WAIT_LOCKED,
5491                                            MAX_SCHEDULE_TIMEOUT)))
5492                 goto out_ctx;
5493
5494         i915_gem_contexts_lost(i915);
5495         goto out_ctx;
5496 }
5497
5498 static int
5499 i915_gem_init_scratch(struct drm_i915_private *i915, unsigned int size)
5500 {
5501         struct drm_i915_gem_object *obj;
5502         struct i915_vma *vma;
5503         int ret;
5504
5505         obj = i915_gem_object_create_stolen(i915, size);
5506         if (!obj)
5507                 obj = i915_gem_object_create_internal(i915, size);
5508         if (IS_ERR(obj)) {
5509                 DRM_ERROR("Failed to allocate scratch page\n");
5510                 return PTR_ERR(obj);
5511         }
5512
5513         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
5514         if (IS_ERR(vma)) {
5515                 ret = PTR_ERR(vma);
5516                 goto err_unref;
5517         }
5518
5519         ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
5520         if (ret)
5521                 goto err_unref;
5522
5523         i915->gt.scratch = vma;
5524         return 0;
5525
5526 err_unref:
5527         i915_gem_object_put(obj);
5528         return ret;
5529 }
5530
5531 static void i915_gem_fini_scratch(struct drm_i915_private *i915)
5532 {
5533         i915_vma_unpin_and_release(&i915->gt.scratch, 0);
5534 }
5535
5536 int i915_gem_init(struct drm_i915_private *dev_priv)
5537 {
5538         int ret;
5539
5540         /* We need to fall back to 4K pages if the host doesn't support huge gtt. */
5541         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5542                 mkwrite_device_info(dev_priv)->page_sizes =
5543                         I915_GTT_PAGE_SIZE_4K;
5544
5545         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5546
5547         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5548                 dev_priv->gt.resume = intel_lr_context_resume;
5549                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5550         } else {
5551                 dev_priv->gt.resume = intel_legacy_submission_resume;
5552                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5553         }
5554
5555         ret = i915_gem_init_userptr(dev_priv);
5556         if (ret)
5557                 return ret;
5558
5559         ret = intel_uc_init_misc(dev_priv);
5560         if (ret)
5561                 return ret;
5562
5563         ret = intel_wopcm_init(&dev_priv->wopcm);
5564         if (ret)
5565                 goto err_uc_misc;
5566
5567         /* This is just a security blanket to placate dragons.
5568          * On some systems, we very sporadically observe that the first TLBs
5569          * used by the CS may be stale, despite us poking the TLB reset. If
5570          * we hold the forcewake during initialisation these problems
5571          * just magically go away.
5572          */
5573         mutex_lock(&dev_priv->drm.struct_mutex);
5574         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5575
5576         ret = i915_gem_init_ggtt(dev_priv);
5577         if (ret) {
5578                 GEM_BUG_ON(ret == -EIO);
5579                 goto err_unlock;
5580         }
5581
5582         ret = i915_gem_init_scratch(dev_priv,
5583                                     IS_GEN2(dev_priv) ? SZ_256K : PAGE_SIZE);
5584         if (ret) {
5585                 GEM_BUG_ON(ret == -EIO);
5586                 goto err_ggtt;
5587         }
5588
5589         ret = i915_gem_contexts_init(dev_priv);
5590         if (ret) {
5591                 GEM_BUG_ON(ret == -EIO);
5592                 goto err_scratch;
5593         }
5594
5595         ret = intel_engines_init(dev_priv);
5596         if (ret) {
5597                 GEM_BUG_ON(ret == -EIO);
5598                 goto err_context;
5599         }
5600
5601         intel_init_gt_powersave(dev_priv);
5602
5603         ret = intel_uc_init(dev_priv);
5604         if (ret)
5605                 goto err_pm;
5606
5607         ret = i915_gem_init_hw(dev_priv);
5608         if (ret)
5609                 goto err_uc_init;
5610
5611         /*
5612          * Despite its name, intel_init_clock_gating applies display clock
5613          * gating workarounds, GT mmio workarounds and the occasional
5614          * GT power context workaround. Worse, sometimes it includes a context
5615          * register workaround which we need to apply before we record the
5616          * default HW state for all contexts.
5617          *
5618          * FIXME: break up the workarounds and apply them at the right time!
5619          */
5620         intel_init_clock_gating(dev_priv);
5621
5622         ret = __intel_engines_record_defaults(dev_priv);
5623         if (ret)
5624                 goto err_init_hw;
5625
5626         if (i915_inject_load_failure()) {
5627                 ret = -ENODEV;
5628                 goto err_init_hw;
5629         }
5630
5631         if (i915_inject_load_failure()) {
5632                 ret = -EIO;
5633                 goto err_init_hw;
5634         }
5635
5636         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5637         mutex_unlock(&dev_priv->drm.struct_mutex);
5638
5639         return 0;
5640
5641         /*
5642          * Unwinding is complicated by the fact that we want to handle -EIO to
5643          * mean disable GPU submission but keep KMS alive. We want to mark the
5644          * HW as irrevocably wedged, but keep enough state around that the
5645          * driver doesn't explode during runtime.
5646          */
5647 err_init_hw:
5648         mutex_unlock(&dev_priv->drm.struct_mutex);
5649
5650         WARN_ON(i915_gem_suspend(dev_priv));
5651         i915_gem_suspend_late(dev_priv);
5652
5653         i915_gem_drain_workqueue(dev_priv);
5654
5655         mutex_lock(&dev_priv->drm.struct_mutex);
5656         intel_uc_fini_hw(dev_priv);
5657 err_uc_init:
5658         intel_uc_fini(dev_priv);
5659 err_pm:
5660         if (ret != -EIO) {
5661                 intel_cleanup_gt_powersave(dev_priv);
5662                 i915_gem_cleanup_engines(dev_priv);
5663         }
5664 err_context:
5665         if (ret != -EIO)
5666                 i915_gem_contexts_fini(dev_priv);
5667 err_scratch:
5668         i915_gem_fini_scratch(dev_priv);
5669 err_ggtt:
5670 err_unlock:
5671         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5672         mutex_unlock(&dev_priv->drm.struct_mutex);
5673
5674 err_uc_misc:
5675         intel_uc_fini_misc(dev_priv);
5676
5677         if (ret != -EIO)
5678                 i915_gem_cleanup_userptr(dev_priv);
5679
5680         if (ret == -EIO) {
5681                 mutex_lock(&dev_priv->drm.struct_mutex);
5682
5683                 /*
5684                  * Allow engine initialisation to fail by marking the GPU as
5685                  * wedged. But we only want to do this where the GPU is angry,
5686                  * for all other failure, such as an allocation failure, bail.
5687                  */
5688                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5689                         i915_load_error(dev_priv,
5690                                         "Failed to initialize GPU, declaring it wedged!\n");
5691                         i915_gem_set_wedged(dev_priv);
5692                 }
5693
5694                 /* Minimal basic recovery for KMS */
5695                 ret = i915_ggtt_enable_hw(dev_priv);
5696                 i915_gem_restore_gtt_mappings(dev_priv);
5697                 i915_gem_restore_fences(dev_priv);
5698                 intel_init_clock_gating(dev_priv);
5699
5700                 mutex_unlock(&dev_priv->drm.struct_mutex);
5701         }
5702
5703         i915_gem_drain_freed_objects(dev_priv);
5704         return ret;
5705 }
5706
5707 void i915_gem_fini(struct drm_i915_private *dev_priv)
5708 {
5709         i915_gem_suspend_late(dev_priv);
5710         intel_disable_gt_powersave(dev_priv);
5711
5712         /* Flush any outstanding unpin_work. */
5713         i915_gem_drain_workqueue(dev_priv);
5714
5715         mutex_lock(&dev_priv->drm.struct_mutex);
5716         intel_uc_fini_hw(dev_priv);
5717         intel_uc_fini(dev_priv);
5718         i915_gem_cleanup_engines(dev_priv);
5719         i915_gem_contexts_fini(dev_priv);
5720         i915_gem_fini_scratch(dev_priv);
5721         mutex_unlock(&dev_priv->drm.struct_mutex);
5722
5723         intel_wa_list_free(&dev_priv->gt_wa_list);
5724
5725         intel_cleanup_gt_powersave(dev_priv);
5726
5727         intel_uc_fini_misc(dev_priv);
5728         i915_gem_cleanup_userptr(dev_priv);
5729
5730         i915_gem_drain_freed_objects(dev_priv);
5731
5732         WARN_ON(!list_empty(&dev_priv->contexts.list));
5733 }
5734
5735 void i915_gem_init_mmio(struct drm_i915_private *i915)
5736 {
5737         i915_gem_sanitize(i915);
5738 }
5739
5740 void
5741 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5742 {
5743         struct intel_engine_cs *engine;
5744         enum intel_engine_id id;
5745
5746         for_each_engine(engine, dev_priv, id)
5747                 dev_priv->gt.cleanup_engine(engine);
5748 }
5749
5750 void
5751 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5752 {
5753         int i;
5754
5755         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5756             !IS_CHERRYVIEW(dev_priv))
5757                 dev_priv->num_fence_regs = 32;
5758         else if (INTEL_GEN(dev_priv) >= 4 ||
5759                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5760                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5761                 dev_priv->num_fence_regs = 16;
5762         else
5763                 dev_priv->num_fence_regs = 8;
5764
5765         if (intel_vgpu_active(dev_priv))
5766                 dev_priv->num_fence_regs =
5767                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5768
5769         /* Initialize the fence register tracking */
5770         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5771                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5772
5773                 fence->i915 = dev_priv;
5774                 fence->id = i;
5775                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5776         }
5777         i915_gem_restore_fences(dev_priv);
5778
5779         i915_gem_detect_bit_6_swizzle(dev_priv);
5780 }
5781
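/*
 * Initialise the GEM memory-management locks, the object tracking lists
 * and the deferred object-free worker.
 */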
5782 static void i915_gem_init__mm(struct drm_i915_private *i915)
5783 {
5784         spin_lock_init(&i915->mm.object_stat_lock);
5785         spin_lock_init(&i915->mm.obj_lock);
5786         spin_lock_init(&i915->mm.free_lock);
5787
5788         init_llist_head(&i915->mm.free_list);
5789
5790         INIT_LIST_HEAD(&i915->mm.unbound_list);
5791         INIT_LIST_HEAD(&i915->mm.bound_list);
5792         INIT_LIST_HEAD(&i915->mm.fence_list);
5793         INIT_LIST_HEAD(&i915->mm.userfault_list);
5794
5795         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5796 }
5797
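/*
 * i915_gem_init_early - allocate the GEM slab caches and bookkeeping
 *
 * Creates the slab caches backing GEM objects, VMAs, LUT handles, requests
 * (typesafe-by-RCU), dependencies and priority lists, initialises the GT
 * lists, the retire/idle workers and the GPU-error wait queues, and mounts
 * the private gemfs instance. A gemfs failure is not fatal; it merely
 * disables transparent-hugepage support for object backing storage.
 */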
5798 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5799 {
5800         int err = -ENOMEM;
5801
5802         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5803         if (!dev_priv->objects)
5804                 goto err_out;
5805
5806         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5807         if (!dev_priv->vmas)
5808                 goto err_objects;
5809
5810         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5811         if (!dev_priv->luts)
5812                 goto err_vmas;
5813
5814         dev_priv->requests = KMEM_CACHE(i915_request,
5815                                         SLAB_HWCACHE_ALIGN |
5816                                         SLAB_RECLAIM_ACCOUNT |
5817                                         SLAB_TYPESAFE_BY_RCU);
5818         if (!dev_priv->requests)
5819                 goto err_luts;
5820
5821         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5822                                             SLAB_HWCACHE_ALIGN |
5823                                             SLAB_RECLAIM_ACCOUNT);
5824         if (!dev_priv->dependencies)
5825                 goto err_requests;
5826
5827         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5828         if (!dev_priv->priorities)
5829                 goto err_dependencies;
5830
5831         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5832         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5833         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5834
5835         i915_gem_init__mm(dev_priv);
5836
5837         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5838                           i915_gem_retire_work_handler);
5839         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5840                           i915_gem_idle_work_handler);
5841         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5842         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5843
5844         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5845
5846         spin_lock_init(&dev_priv->fb_tracking.lock);
5847
5848         err = i915_gemfs_init(dev_priv);
5849         if (err)
5850                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled (%d).\n", err);
5851
5852         return 0;
5853
5854 err_dependencies:
5855         kmem_cache_destroy(dev_priv->dependencies);
5856 err_requests:
5857         kmem_cache_destroy(dev_priv->requests);
5858 err_luts:
5859         kmem_cache_destroy(dev_priv->luts);
5860 err_vmas:
5861         kmem_cache_destroy(dev_priv->vmas);
5862 err_objects:
5863         kmem_cache_destroy(dev_priv->objects);
5864 err_out:
5865         return err;
5866 }
5867
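/*
 * i915_gem_cleanup_early - release what i915_gem_init_early() allocated
 *
 * Drains any objects still pending free, destroys the slab caches and,
 * after an rcu_barrier() to let RCU-deferred frees complete, unmounts the
 * private gemfs instance.
 */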
5868 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5869 {
5870         i915_gem_drain_freed_objects(dev_priv);
5871         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5872         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5873         WARN_ON(dev_priv->mm.object_count);
5874         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5875
5876         kmem_cache_destroy(dev_priv->priorities);
5877         kmem_cache_destroy(dev_priv->dependencies);
5878         kmem_cache_destroy(dev_priv->requests);
5879         kmem_cache_destroy(dev_priv->luts);
5880         kmem_cache_destroy(dev_priv->vmas);
5881         kmem_cache_destroy(dev_priv->objects);
5882
5883         /* And ensure that our SLAB_TYPESAFE_BY_RCU slabs are truly destroyed */
5884         rcu_barrier();
5885
5886         i915_gemfs_fini(dev_priv);
5887 }
5888
5889 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5890 {
5891         /* Discard all purgeable objects, let userspace recover those as
5892          * required after resuming.
5893          */
5894         i915_gem_shrink_all(dev_priv);
5895
5896         return 0;
5897 }
5898
5899 int i915_gem_freeze_late(struct drm_i915_private *i915)
5900 {
5901         struct drm_i915_gem_object *obj;
5902         struct list_head *phases[] = {
5903                 &i915->mm.unbound_list,
5904                 &i915->mm.bound_list,
5905                 NULL
5906         }, **phase;
5907
5908         /*
5909          * Called just before we write the hibernation image.
5910          *
5911          * We need to update the domain tracking to reflect that the CPU
5912          * will be accessing all the pages when creating, and later restoring
5913          * from, the hibernation image, and so upon restoration those pages
5914          * will be in the CPU domain.
5915          *
5916          * To make sure the hibernation image contains the latest state,
5917          * we update that state just before writing out the image.
5918          *
5919          * To try to reduce the hibernation image, we manually shrink
5920          * the objects as well; see i915_gem_freeze().
5921          */
5922
5923         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5924         i915_gem_drain_freed_objects(i915);
5925
5926         mutex_lock(&i915->drm.struct_mutex);
5927         for (phase = phases; *phase; phase++) {
5928                 list_for_each_entry(obj, *phase, mm.link)
5929                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5930         }
5931         mutex_unlock(&i915->drm.struct_mutex);
5932
5933         return 0;
5934 }
5935
5936 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5937 {
5938         struct drm_i915_file_private *file_priv = file->driver_priv;
5939         struct i915_request *request;
5940
5941         /* Clean up our request list when the client is going away, so that
5942          * later retire_requests won't dereference our soon-to-be-gone
5943          * file_priv.
5944          */
5945         spin_lock(&file_priv->mm.lock);
5946         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5947                 request->file_priv = NULL;
5948         spin_unlock(&file_priv->mm.lock);
5949 }
5950
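/*
 * i915_gem_open - set up the GEM state for a new drm_file
 *
 * Allocates the per-client drm_i915_file_private, initialises its request
 * list, BSD engine selection and hang timestamp, and opens the client's
 * default GEM context; on failure the per-client state is freed again.
 */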
5951 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5952 {
5953         struct drm_i915_file_private *file_priv;
5954         int ret;
5955
5956         DRM_DEBUG("\n");
5957
5958         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5959         if (!file_priv)
5960                 return -ENOMEM;
5961
5962         file->driver_priv = file_priv;
5963         file_priv->dev_priv = i915;
5964         file_priv->file = file;
5965
5966         spin_lock_init(&file_priv->mm.lock);
5967         INIT_LIST_HEAD(&file_priv->mm.request_list);
5968
5969         file_priv->bsd_engine = -1;
5970         file_priv->hang_timestamp = jiffies;
5971
5972         ret = i915_gem_context_open(i915, file);
5973         if (ret)
5974                 kfree(file_priv);
5975
5976         return ret;
5977 }
5978
5979 /**
5980  * i915_gem_track_fb - update frontbuffer tracking
5981  * @old: current GEM buffer for the frontbuffer slots
5982  * @new: new GEM buffer for the frontbuffer slots
5983  * @frontbuffer_bits: bitmask of frontbuffer slots
5984  *
5985  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5986  * from @old and setting them in @new. Both @old and @new can be NULL.
5987  */
5988 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5989                        struct drm_i915_gem_object *new,
5990                        unsigned frontbuffer_bits)
5991 {
5992         /* Control of individual bits within the mask is guarded by
5993          * the owning plane->mutex, i.e. we can never see concurrent
5994          * manipulation of individual bits. But since the bitfield as a whole
5995          * is updated using RMW, we need to use atomics in order to update
5996          * the bits.
5997          */
5998         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5999                      BITS_PER_TYPE(atomic_t));
6000
6001         if (old) {
6002                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
6003                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
6004         }
6005
6006         if (new) {
6007                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
6008                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
6009         }
6010 }
6011
6012 /* Allocate a new GEM object and fill it with the supplied data */
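/*
 * The payload is copied into the object's shmemfs backing store page by
 * page via pagecache_write_begin()/pagecache_write_end(); the freshly
 * created object is expected to still be in the CPU write domain (see the
 * GEM_BUG_ON below).
 *
 * A hypothetical caller (the names here are illustrative only) might do:
 *
 *	obj = i915_gem_object_create_from_data(i915, blob, blob_size);
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 */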
6013 struct drm_i915_gem_object *
6014 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
6015                                  const void *data, size_t size)
6016 {
6017         struct drm_i915_gem_object *obj;
6018         struct file *file;
6019         size_t offset;
6020         int err;
6021
6022         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
6023         if (IS_ERR(obj))
6024                 return obj;
6025
6026         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
6027
6028         file = obj->base.filp;
6029         offset = 0;
6030         do {
6031                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
6032                 struct page *page;
6033                 void *pgdata, *vaddr;
6034
6035                 err = pagecache_write_begin(file, file->f_mapping,
6036                                             offset, len, 0,
6037                                             &page, &pgdata);
6038                 if (err < 0)
6039                         goto fail;
6040
6041                 vaddr = kmap(page);
6042                 memcpy(vaddr, data, len);
6043                 kunmap(page);
6044
6045                 err = pagecache_write_end(file, file->f_mapping,
6046                                           offset, len, len,
6047                                           page, pgdata);
6048                 if (err < 0)
6049                         goto fail;
6050
6051                 size -= len;
6052                 data += len;
6053                 offset += len;
6054         } while (size);
6055
6056         return obj;
6057
6058 fail:
6059         i915_gem_object_put(obj);
6060         return ERR_PTR(err);
6061 }
6062
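/*
 * i915_gem_object_get_sg - find the scatterlist entry covering page @n
 *
 * Returns the sg entry that contains the @n'th page of the object's backing
 * store and writes the page offset within that entry to @offset. The
 * object's pages must already be pinned by the caller. A typical use, as in
 * i915_gem_object_get_page() below, is:
 *
 *	sg = i915_gem_object_get_sg(obj, n, &offset);
 *	page = nth_page(sg_page(sg), offset);
 */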
6063 struct scatterlist *
6064 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
6065                        unsigned int n,
6066                        unsigned int *offset)
6067 {
6068         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
6069         struct scatterlist *sg;
6070         unsigned int idx, count;
6071
6072         might_sleep();
6073         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
6074         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
6075
6076         /* As we iterate forward through the sg, we record each entry in a
6077          * radixtree for quick repeated (backwards) lookups. If we have seen
6078          * this index previously, we will have an entry for it.
6079          *
6080          * Initial lookup is O(N), but this is amortized to O(1) for
6081          * sequential page access (where each new request is consecutive
6082          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
6083          * i.e. O(1) with a large constant!
6084          */
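        /*
         * For illustration (this mirrors the insertions below, it is not
         * extra state): an sg entry covering pages 8..11 is cached as
         * radix[8] = sg and radix[9..11] = xa_mk_value(8), so a later
         * lookup of page 10 hits the value entry, chases it back to
         * radix[8] and reports an offset of 2.
         */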
6085         if (n < READ_ONCE(iter->sg_idx))
6086                 goto lookup;
6087
6088         mutex_lock(&iter->lock);
6089
6090         /* We prefer to reuse the last sg so that repeated lookups of this
6091          * (or the subsequent) sg are fast - comparing against the last
6092          * sg is faster than going through the radixtree.
6093          */
6094
6095         sg = iter->sg_pos;
6096         idx = iter->sg_idx;
6097         count = __sg_page_count(sg);
6098
6099         while (idx + count <= n) {
6100                 void *entry;
6101                 unsigned long i;
6102                 int ret;
6103
6104                 /* If we cannot allocate and insert this entry, or the
6105                  * individual pages from this range, cancel updating the
6106                  * sg_idx so that on this lookup we are forced to linearly
6107                  * scan onwards. On future lookups we will try the
6108                  * insertion again (in which case we need to be careful of
6109                  * the error return reporting that we have already inserted
6110                  * this index).
6111                  */
6112                 ret = radix_tree_insert(&iter->radix, idx, sg);
6113                 if (ret && ret != -EEXIST)
6114                         goto scan;
6115
6116                 entry = xa_mk_value(idx);
6117                 for (i = 1; i < count; i++) {
6118                         ret = radix_tree_insert(&iter->radix, idx + i, entry);
6119                         if (ret && ret != -EEXIST)
6120                                 goto scan;
6121                 }
6122
6123                 idx += count;
6124                 sg = ____sg_next(sg);
6125                 count = __sg_page_count(sg);
6126         }
6127
6128 scan:
6129         iter->sg_pos = sg;
6130         iter->sg_idx = idx;
6131
6132         mutex_unlock(&iter->lock);
6133
6134         if (unlikely(n < idx)) /* insertion completed by another thread */
6135                 goto lookup;
6136
6137         /* In case we failed to insert the entry into the radixtree, we need
6138          * to look beyond the current sg.
6139          */
6140         while (idx + count <= n) {
6141                 idx += count;
6142                 sg = ____sg_next(sg);
6143                 count = __sg_page_count(sg);
6144         }
6145
6146         *offset = n - idx;
6147         return sg;
6148
6149 lookup:
6150         rcu_read_lock();
6151
6152         sg = radix_tree_lookup(&iter->radix, n);
6153         GEM_BUG_ON(!sg);
6154
6155         /* If this index is in the middle of a multi-page sg entry,
6156          * the radix tree will contain a value entry that points
6157          * to the start of that range. We will return the pointer to
6158          * the base page and the offset of this page within the
6159          * sg entry's range.
6160          */
6161         *offset = 0;
6162         if (unlikely(xa_is_value(sg))) {
6163                 unsigned long base = xa_to_value(sg);
6164
6165                 sg = radix_tree_lookup(&iter->radix, base);
6166                 GEM_BUG_ON(!sg);
6167
6168                 *offset = n - base;
6169         }
6170
6171         rcu_read_unlock();
6172
6173         return sg;
6174 }
6175
6176 struct page *
6177 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
6178 {
6179         struct scatterlist *sg;
6180         unsigned int offset;
6181
6182         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
6183
6184         sg = i915_gem_object_get_sg(obj, n, &offset);
6185         return nth_page(sg_page(sg), offset);
6186 }
6187
6188 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
6189 struct page *
6190 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
6191                                unsigned int n)
6192 {
6193         struct page *page;
6194
6195         page = i915_gem_object_get_page(obj, n);
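        /*
         * If the object as a whole is already tracked as dirty, every page
         * is written back when its pages are released, so we only need to
         * dirty this page individually while that flag is clear.
         */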
6196         if (!obj->mm.dirty)
6197                 set_page_dirty(page);
6198
6199         return page;
6200 }
6201
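/* Return the DMA address of the @n'th page of the object's backing store. */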
6202 dma_addr_t
6203 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
6204                                 unsigned long n)
6205 {
6206         struct scatterlist *sg;
6207         unsigned int offset;
6208
6209         sg = i915_gem_object_get_sg(obj, n, &offset);
6210         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
6211 }
6212
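/*
 * i915_gem_object_attach_phys - swap the object over to contiguous
 * physical memory backing (i915_gem_phys_ops)
 *
 * Only a plain shmemfs-backed object that is unbound, marked WILLNEED,
 * unquirked and not currently mapped can be converted. The replacement
 * pages are perma-pinned until release; on failure the original ops and
 * pages are restored.
 */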
6213 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
6214 {
6215         struct sg_table *pages;
6216         int err;
6217
6218         if (align > obj->base.size)
6219                 return -EINVAL;
6220
6221         if (obj->ops == &i915_gem_phys_ops)
6222                 return 0;
6223
6224         if (obj->ops != &i915_gem_object_ops)
6225                 return -EINVAL;
6226
6227         err = i915_gem_object_unbind(obj);
6228         if (err)
6229                 return err;
6230
6231         mutex_lock(&obj->mm.lock);
6232
6233         if (obj->mm.madv != I915_MADV_WILLNEED) {
6234                 err = -EFAULT;
6235                 goto err_unlock;
6236         }
6237
6238         if (obj->mm.quirked) {
6239                 err = -EFAULT;
6240                 goto err_unlock;
6241         }
6242
6243         if (obj->mm.mapping) {
6244                 err = -EBUSY;
6245                 goto err_unlock;
6246         }
6247
6248         pages = __i915_gem_object_unset_pages(obj);
6249
6250         obj->ops = &i915_gem_phys_ops;
6251
6252         err = ____i915_gem_object_get_pages(obj);
6253         if (err)
6254                 goto err_xfer;
6255
6256         /* Perma-pin (until release) the physical set of pages */
6257         __i915_gem_object_pin_pages(obj);
6258
6259         if (!IS_ERR_OR_NULL(pages))
6260                 i915_gem_object_ops.put_pages(obj, pages);
6261         mutex_unlock(&obj->mm.lock);
6262         return 0;
6263
6264 err_xfer:
6265         obj->ops = &i915_gem_object_ops;
6266         if (!IS_ERR_OR_NULL(pages)) {
6267                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
6268
6269                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
6270         }
6271 err_unlock:
6272         mutex_unlock(&obj->mm.lock);
6273         return err;
6274 }
6275
6276 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6277 #include "selftests/scatterlist.c"
6278 #include "selftests/mock_gem_device.c"
6279 #include "selftests/huge_gem_object.c"
6280 #include "selftests/huge_pages.c"
6281 #include "selftests/i915_gem_object.c"
6282 #include "selftests/i915_gem_coherency.c"
6283 #include "selftests/i915_gem.c"
6284 #endif