/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2014-2016 Intel Corporation
 */

#include "i915_gem_clflush.h"
#include "i915_gem_gtt.h"
#include "i915_gem_ioctls.h"
#include "i915_gem_object.h"
#include "intel_frontbuffer.h"

static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
	/*
	 * We manually flush the CPU domain so that we can override and
	 * force the flush for the display, and perform it asynchronously.
	 */
	i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
	i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
	obj->write_domain = 0;

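/*
 * Flush any pending CPU writes for an object that is currently pinned for
 * display (pin_global held), so that the scanout sees coherent data; objects
 * not pinned for display are left untouched.
 */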
void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
	if (!READ_ONCE(obj->pin_global))
		return;

	i915_gem_object_lock(obj);
	__i915_gem_object_flush_for_display(obj);
	i915_gem_object_unlock(obj);

/**
 * Moves a single object to the WC read, and possibly write, domain.
 * @obj: object to act on
 * @write: ask for write access or read only
 *
 * This function returns when the move is complete, including waiting on
 * rendering.
 */
i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
	assert_object_held(obj);

	ret = i915_gem_object_wait(obj,
			I915_WAIT_INTERRUPTIBLE |
			(write ? I915_WAIT_ALL : 0),
			MAX_SCHEDULE_TIMEOUT);
	if (ret)
		return ret;

	if (obj->write_domain == I915_GEM_DOMAIN_WC)
		return 0;

	/* Flush and acquire obj->pages so that we are coherent through
	 * direct access in memory with previous cached writes through
	 * shmemfs and that our cache domain tracking remains valid.
	 * For example, if the obj->filp was moved to swap without us
	 * being notified and releasing the pages, we would mistakenly
	 * continue to assume that the obj remained out of the CPU cached
	 * domain.
	 */
	ret = i915_gem_object_pin_pages(obj);
	if (ret)
		return ret;

	i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);

	/* Serialise direct access to this object with the barriers for
	 * coherent writes from the GPU, by effectively invalidating the
	 * WC domain upon first access.
	 */
	if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
		mb();

	/* It should now be out of any other write domains, and we can update
	 * the domain values for our changes.
	 */
	GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
	obj->read_domains |= I915_GEM_DOMAIN_WC;
	if (write) {
		obj->read_domains = I915_GEM_DOMAIN_WC;
		obj->write_domain = I915_GEM_DOMAIN_WC;
	}

	i915_gem_object_unpin_pages(obj);

/**
 * Moves a single object to the GTT read, and possibly write, domain.
 * @obj: object to act on
 * @write: ask for write access or read only
 *
 * This function returns when the move is complete, including waiting on
 * rendering.
 */
i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
	assert_object_held(obj);

	ret = i915_gem_object_wait(obj,
			I915_WAIT_INTERRUPTIBLE |
			(write ? I915_WAIT_ALL : 0),
			MAX_SCHEDULE_TIMEOUT);
	if (ret)
		return ret;

	if (obj->write_domain == I915_GEM_DOMAIN_GTT)
		return 0;

	/* Flush and acquire obj->pages so that we are coherent through
	 * direct access in memory with previous cached writes through
	 * shmemfs and that our cache domain tracking remains valid.
	 * For example, if the obj->filp was moved to swap without us
	 * being notified and releasing the pages, we would mistakenly
	 * continue to assume that the obj remained out of the CPU cached
	 * domain.
	 */
	ret = i915_gem_object_pin_pages(obj);
	if (ret)
		return ret;

	i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);

	/* Serialise direct access to this object with the barriers for
	 * coherent writes from the GPU, by effectively invalidating the
	 * GTT domain upon first access.
	 */
	if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
		mb();

	/* It should now be out of any other write domains, and we can update
	 * the domain values for our changes.
	 */
	GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
	obj->read_domains |= I915_GEM_DOMAIN_GTT;
	if (write) {
		obj->read_domains = I915_GEM_DOMAIN_GTT;
		obj->write_domain = I915_GEM_DOMAIN_GTT;
		obj->mm.dirty = true;
	}

	i915_gem_object_unpin_pages(obj);

/**
 * Changes the cache-level of an object across all VMA.
 * @obj: object to act on
 * @cache_level: new cache level to set for the object
 *
 * After this function returns, the object will be in the new cache-level
 * across all GTT and the contents of the backing storage will be coherent
 * with respect to the new cache-level. In order to keep the backing storage
 * coherent for all users, we only allow a single cache level to be set
 * globally on the object and prevent it from being changed whilst the
 * hardware is reading from the object. That is, if the object is currently
 * on the scanout it will be set to uncached (or equivalent display
 * cache coherency) and all non-MOCS GPU access will also be uncached so
 * that all direct access to the scanout remains coherent.
 */
int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
		enum i915_cache_level cache_level)
	struct i915_vma *vma;

	assert_object_held(obj);

	if (obj->cache_level == cache_level)
		return 0;

	/* Inspect the list of currently bound VMA and unbind any that would
	 * be invalid given the new cache-level. This is principally to
	 * catch the issue of the CS prefetch crossing page boundaries and
	 * reading an invalid PTE on older architectures.
	 */
restart:
	list_for_each_entry(vma, &obj->vma.list, obj_link) {
		if (!drm_mm_node_allocated(&vma->node))
			continue;

		if (i915_vma_is_pinned(vma)) {
			DRM_DEBUG("cannot change the cache level of pinned objects\n");
			return -EBUSY;
		}

		if (!i915_vma_is_closed(vma) &&
		    i915_gem_valid_gtt_space(vma, cache_level))
			continue;

		ret = i915_vma_unbind(vma);
		if (ret)
			return ret;

		/* As unbinding may affect other elements in the
		 * obj->vma_list (due to side-effects from retiring
		 * an active vma), play safe and restart the iterator.
		 */
		goto restart;
	}

	/* We can reuse the existing drm_mm nodes but need to change the
	 * cache-level on the PTE. We could simply unbind them all and
	 * rebind with the correct cache-level on next use. However since
	 * we already have a valid slot, dma mapping, pages etc, we may as
	 * well rewrite the PTE in the belief that doing so tramples upon
	 * less state and so involves less work.
	 */
	if (obj->bind_count) {
		/* Before we change the PTE, the GPU must not be accessing it.
		 * If we wait upon the object, we know that all the bound
		 * VMA are no longer active.
		 */
		ret = i915_gem_object_wait(obj,
				I915_WAIT_INTERRUPTIBLE |
				I915_WAIT_ALL,
				MAX_SCHEDULE_TIMEOUT);
		if (ret)
			return ret;

		if (!HAS_LLC(to_i915(obj->base.dev)) &&
		    cache_level != I915_CACHE_NONE) {
			/* Access to snoopable pages through the GTT is
			 * incoherent and on some machines causes a hard
			 * lockup. Relinquish the CPU mmapping to force
			 * userspace to refault in the pages and we can
			 * then double check if the GTT mapping is still
			 * valid for that pointer access.
			 */
			i915_gem_object_release_mmap(obj);

			/* As we no longer need a fence for GTT access,
			 * we can relinquish it now (and so prevent having
			 * to steal a fence from someone else on the next
			 * fence request). Note GPU activity would have
			 * dropped the fence as all snoopable access is
			 * supposed to be linear.
			 */
			for_each_ggtt_vma(vma, obj) {
				ret = i915_vma_put_fence(vma);
				if (ret)
					return ret;
			}
		} else {
			/* We either have incoherent backing store and
			 * so no GTT access or the architecture is fully
			 * coherent. In such cases, existing GTT mmaps
			 * ignore the cache bit in the PTE and we can
			 * rewrite it without confusing the GPU or having
			 * to force userspace to fault back in its mmaps.
			 */
		}

		list_for_each_entry(vma, &obj->vma.list, obj_link) {
			if (!drm_mm_node_allocated(&vma->node))
				continue;

			ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
			if (ret)
				return ret;
		}
	}

	list_for_each_entry(vma, &obj->vma.list, obj_link)
		vma->node.color = cache_level;
	i915_gem_object_set_cache_coherency(obj, cache_level);
	obj->cache_dirty = true; /* Always invalidate stale cachelines */

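/*
 * Report the object's current cache level back to userspace as one of the
 * I915_CACHING_* values used by the get/set caching ioctls.
 */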
int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
		struct drm_file *file)
	struct drm_i915_gem_caching *args = data;
	struct drm_i915_gem_object *obj;

	obj = i915_gem_object_lookup_rcu(file, args->handle);

	switch (obj->cache_level) {
	case I915_CACHE_L3_LLC:
		args->caching = I915_CACHING_CACHED;
		break;

	case I915_CACHE_WT:
		args->caching = I915_CACHING_DISPLAY;
		break;

	default:
		args->caching = I915_CACHING_NONE;
		break;
	}

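/*
 * Change the object's cache level at the request of userspace, translating
 * the I915_CACHING_* argument into an i915_cache_level and rejecting modes
 * the hardware cannot support.
 */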
int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
		struct drm_file *file)
	struct drm_i915_private *i915 = to_i915(dev);
	struct drm_i915_gem_caching *args = data;
	struct drm_i915_gem_object *obj;
	enum i915_cache_level level;

	switch (args->caching) {
	case I915_CACHING_NONE:
		level = I915_CACHE_NONE;
		break;
	case I915_CACHING_CACHED:
		/*
		 * Due to a HW issue on BXT A stepping, GPU stores via a
		 * snooped mapping may leave stale data in a corresponding CPU
		 * cacheline, whereas normally such cachelines would get
		 * invalidated.
		 */
		if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
			return -ENODEV;

		level = I915_CACHE_LLC;
		break;
	case I915_CACHING_DISPLAY:
		level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
		break;
	}

	obj = i915_gem_object_lookup(file, args->handle);

	/*
	 * The caching mode of a proxy object is handled by its generator and
	 * is not allowed to be changed by userspace.
	 */
	if (i915_gem_object_is_proxy(obj)) {
		ret = -ENXIO;
		goto out;
	}

	if (obj->cache_level == level)
		goto out;

	ret = i915_gem_object_wait(obj,
			I915_WAIT_INTERRUPTIBLE,
			MAX_SCHEDULE_TIMEOUT);
	if (ret)
		goto out;

	ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
	if (ret)
		goto out;

	ret = i915_gem_object_lock_interruptible(obj);
	if (ret == 0) {
		ret = i915_gem_object_set_cache_level(obj, level);
		i915_gem_object_unlock(obj);
	}
	mutex_unlock(&i915->drm.struct_mutex);

out:
	i915_gem_object_put(obj);
	return ret;

/*
 * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
 * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
 * (for pageflips). We only flush the caches while preparing the buffer for
 * display; the callers are responsible for the frontbuffer flush.
 */
i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
		u32 alignment,
		const struct i915_ggtt_view *view,
		unsigned int flags)
	struct i915_vma *vma;

	assert_object_held(obj);

	/* Mark the global pin early so that we account for the
	 * display coherency whilst setting up the cache domains.
	 */
	obj->pin_global++;

	/* The display engine is not coherent with the LLC cache on gen6. As
	 * a result, we make sure that the pinning that is about to occur is
	 * done with uncached PTEs. This is lowest common denominator for all
	 * chipsets.
	 *
	 * However for gen6+, we could do better by using the GFDT bit instead
	 * of uncaching, which would allow us to flush all the LLC-cached data
	 * with that bit in the PTE to main memory with just one PIPE_CONTROL.
	 */
	ret = i915_gem_object_set_cache_level(obj,
			HAS_WT(to_i915(obj->base.dev)) ?
			I915_CACHE_WT : I915_CACHE_NONE);
	if (ret) {
		vma = ERR_PTR(ret);
		goto err_unpin_global;
	}

	/* As the user may map the buffer once pinned in the display plane
	 * (e.g. libkms for the bootup splash), we have to ensure that we
	 * always use map_and_fenceable for all scanout buffers. However,
	 * it may simply be too big to fit into mappable, in which case
	 * put it anyway and hope that userspace can cope (but always first
	 * try to preserve the existing ABI).
	 */
	vma = ERR_PTR(-ENOSPC);
	if ((flags & PIN_MAPPABLE) == 0 &&
	    (!view || view->type == I915_GGTT_VIEW_NORMAL))
		vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
				flags | PIN_MAPPABLE | PIN_NONBLOCK);
	if (IS_ERR(vma))
		vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
	if (IS_ERR(vma))
		goto err_unpin_global;

	vma->display_alignment = max_t(u64, vma->display_alignment, alignment);

	__i915_gem_object_flush_for_display(obj);

	/* It should now be out of any other write domains, and we can update
	 * the domain values for our changes.
	 */
	obj->read_domains |= I915_GEM_DOMAIN_GTT;

	return vma;

err_unpin_global:
	obj->pin_global--;
	return vma;

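/*
 * Move the object's GGTT VMAs to the tail of the bound list (and the object
 * itself to the tail of its WILLNEED list) so that they become the least
 * preferred candidates for eviction.
 */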
static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
	struct drm_i915_private *i915 = to_i915(obj->base.dev);
	struct i915_vma *vma;

	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));

	mutex_lock(&i915->ggtt.vm.mutex);
	for_each_ggtt_vma(vma, obj) {
		if (!drm_mm_node_allocated(&vma->node))
			continue;

		list_move_tail(&vma->vm_link, &vma->vm->bound_list);
	}
	mutex_unlock(&i915->ggtt.vm.mutex);

	if (obj->mm.madv == I915_MADV_WILLNEED) {
		struct list_head *list;

		spin_lock(&i915->mm.obj_lock);
		list = obj->bind_count ?
		       &i915->mm.bound_list : &i915->mm.unbound_list;
		list_move_tail(&obj->mm.link, list);
		spin_unlock(&i915->mm.obj_lock);
	}

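/*
 * Drop the display pin taken by i915_gem_object_pin_to_display_plane(). Once
 * the last display pin is released, the special display alignment is
 * forgotten and the VMA is bumped on the inactive lists to delay eviction
 * while any outstanding flips complete.
 */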
i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
	struct drm_i915_gem_object *obj = vma->obj;

	assert_object_held(obj);

	if (WARN_ON(obj->pin_global == 0))
		return;

	if (--obj->pin_global == 0)
		vma->display_alignment = I915_GTT_MIN_ALIGNMENT;

	/* Bump the LRU to try and avoid premature eviction whilst flipping */
	i915_gem_object_bump_inactive_ggtt(obj);

/**
 * Moves a single object to the CPU read, and possibly write, domain.
 * @obj: object to act on
 * @write: requesting write or read-only access
 *
 * This function returns when the move is complete, including waiting on
 * rendering.
 */
i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
	assert_object_held(obj);

	ret = i915_gem_object_wait(obj,
			I915_WAIT_INTERRUPTIBLE |
			(write ? I915_WAIT_ALL : 0),
			MAX_SCHEDULE_TIMEOUT);
	if (ret)
		return ret;

	i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

	/* Flush the CPU cache if it's still invalid. */
	if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
		obj->read_domains |= I915_GEM_DOMAIN_CPU;
	}

	/* It should now be out of any other write domains, and we can update
	 * the domain values for our changes.
	 */
	GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);

	/* If we're writing through the CPU, then the GPU read domains will
	 * need to be invalidated at next use.
	 */
	if (write)
		__start_cpu_write(obj);

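/*
 * Writes via a GTT mapping are attributed to the object's tracked GTT origin
 * for frontbuffer tracking; writes through any other domain are treated as
 * CPU writes.
 */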
static inline enum fb_op_origin
fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
	return (domain == I915_GEM_DOMAIN_GTT ?
		obj->frontbuffer_ggtt_origin : ORIGIN_CPU);

/**
 * Called when user space prepares to use an object with the CPU, either
 * through the mmap ioctl's mapping or a GTT mapping.
 * @data: ioctl data blob
 */
i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
		struct drm_file *file)
	struct drm_i915_gem_set_domain *args = data;
	struct drm_i915_gem_object *obj;
	u32 read_domains = args->read_domains;
	u32 write_domain = args->write_domain;

	/* Only handle setting domains to types used by the CPU. */
	if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
		return -EINVAL;

	/*
	 * Having something in the write domain implies it's in the read
	 * domain, and only that read domain. Enforce that in the request.
	 */
	if (write_domain && read_domains != write_domain)
		return -EINVAL;

	obj = i915_gem_object_lookup(file, args->handle);
	if (!obj)
		return -ENOENT;

	/*
	 * Already in the desired write domain? Nothing for us to do!
	 *
	 * We apply a little bit of cunning here to catch a broader set of
	 * no-ops. If obj->write_domain is set, we must be in the same
	 * obj->read_domains, and only that domain. Therefore, if that
	 * obj->write_domain matches the request read_domains, we are
	 * already in the same read/write domain and can skip the operation,
	 * without having to further check the requested write_domain.
	 */
	if (READ_ONCE(obj->write_domain) == read_domains) {
		err = 0;
		goto out;
	}

	/*
	 * Try to flush the object off the GPU without holding the lock.
	 * We will repeat the flush holding the lock in the normal manner
	 * to catch cases where we are gazumped.
	 */
	err = i915_gem_object_wait(obj,
			I915_WAIT_INTERRUPTIBLE |
			(write_domain ? I915_WAIT_ALL : 0),
			MAX_SCHEDULE_TIMEOUT);
	if (err)
		goto out;

	/*
	 * Proxy objects do not control access to the backing storage, ergo
	 * they cannot be used as a means to manipulate the cache domain
	 * tracking for that backing storage. The proxy object is always
	 * considered to be outside of any cache domain.
	 */
	if (i915_gem_object_is_proxy(obj)) {
		err = -ENXIO;
		goto out;
	}

	/*
	 * Flush and acquire obj->pages so that we are coherent through
	 * direct access in memory with previous cached writes through
	 * shmemfs and that our cache domain tracking remains valid.
	 * For example, if the obj->filp was moved to swap without us
	 * being notified and releasing the pages, we would mistakenly
	 * continue to assume that the obj remained out of the CPU cached
	 * domain.
	 */
	err = i915_gem_object_pin_pages(obj);
	if (err)
		goto out;

	err = i915_gem_object_lock_interruptible(obj);
	if (err)
		goto out_unpin;

	if (read_domains & I915_GEM_DOMAIN_WC)
		err = i915_gem_object_set_to_wc_domain(obj, write_domain);
	else if (read_domains & I915_GEM_DOMAIN_GTT)
		err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
	else
		err = i915_gem_object_set_to_cpu_domain(obj, write_domain);

	/* And bump the LRU for this access */
	i915_gem_object_bump_inactive_ggtt(obj);

	i915_gem_object_unlock(obj);

	if (write_domain != 0)
		intel_fb_obj_invalidate(obj,
				fb_write_origin(obj, write_domain));

out_unpin:
	i915_gem_object_unpin_pages(obj);
out:
	i915_gem_object_put(obj);
	return err;

/*
 * Pins the specified object's pages and synchronizes the object with
 * GPU accesses. Sets needs_clflush to non-zero if the caller should
 * flush the object from the CPU cache.
 */
int i915_gem_object_prepare_read(struct drm_i915_gem_object *obj,
		unsigned int *needs_clflush)
	if (!i915_gem_object_has_struct_page(obj))
		return -ENODEV;

	ret = i915_gem_object_lock_interruptible(obj);
	if (ret)
		return ret;

	ret = i915_gem_object_wait(obj,
			I915_WAIT_INTERRUPTIBLE,
			MAX_SCHEDULE_TIMEOUT);
	if (ret)
		goto err_unlock;

	ret = i915_gem_object_pin_pages(obj);
	if (ret)
		goto err_unlock;

	if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
	    !static_cpu_has(X86_FEATURE_CLFLUSH)) {
		ret = i915_gem_object_set_to_cpu_domain(obj, false);
		if (ret)
			goto err_unpin;
		else
			goto out;
	}

	i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

	/* If we're not in the cpu read domain, set ourselves into the gtt
	 * read domain and manually flush cachelines (if required). This
	 * optimizes for the case when the gpu will dirty the data
	 * anyway again before the next pread happens.
	 */
	if (!obj->cache_dirty &&
	    !(obj->read_domains & I915_GEM_DOMAIN_CPU))
		*needs_clflush = CLFLUSH_BEFORE;

out:
	/* return with the pages pinned */
	return 0;

err_unpin:
	i915_gem_object_unpin_pages(obj);
err_unlock:
	i915_gem_object_unlock(obj);
	return ret;

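/*
 * As i915_gem_object_prepare_read(), but for CPU writes: pin the pages, wait
 * for the GPU and report in *needs_clflush which cache flushes (before
 * and/or after the write) the caller must perform. On success the object is
 * returned locked with its pages pinned.
 */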
int i915_gem_object_prepare_write(struct drm_i915_gem_object *obj,
		unsigned int *needs_clflush)
	if (!i915_gem_object_has_struct_page(obj))
		return -ENODEV;

	ret = i915_gem_object_lock_interruptible(obj);
	if (ret)
		return ret;

	ret = i915_gem_object_wait(obj,
			I915_WAIT_INTERRUPTIBLE |
			I915_WAIT_ALL,
			MAX_SCHEDULE_TIMEOUT);
	if (ret)
		goto err_unlock;

	ret = i915_gem_object_pin_pages(obj);
	if (ret)
		goto err_unlock;

	if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
	    !static_cpu_has(X86_FEATURE_CLFLUSH)) {
		ret = i915_gem_object_set_to_cpu_domain(obj, true);
		if (ret)
			goto err_unpin;
		else
			goto out;
	}

	i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

	/* If we're not in the cpu write domain, set ourselves into the
	 * gtt write domain and manually flush cachelines (as required).
	 * This optimizes for the case when the gpu will use the data
	 * right away and we therefore have to clflush anyway.
	 */
	if (!obj->cache_dirty) {
		*needs_clflush |= CLFLUSH_AFTER;

		/*
		 * Same trick applies to invalidate partially written
		 * cachelines read before writing.
		 */
		if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
			*needs_clflush |= CLFLUSH_BEFORE;
	}

out:
	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
	obj->mm.dirty = true;
	/* return with the pages pinned */
	return 0;

err_unpin:
	i915_gem_object_unpin_pages(obj);
err_unlock:
	i915_gem_object_unlock(obj);
	return ret;