/*
 * KVMGT - the implementation of Intel mediated pass-through framework for KVM
 *
 * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Kevin Tian <kevin.tian@intel.com>
 *    Jike Song <jike.song@intel.com>
 *    Xiaoguang Chen <xiaoguang.chen@intel.com>
 */
#include <linux/init.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/sched/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/uuid.h>
#include <linux/kvm_host.h>
#include <linux/vfio.h>
#include <linux/mdev.h>
#include <linux/debugfs.h>

#include <linux/nospec.h>

#include "i915_drv.h"
#include "gvt.h"
static const struct intel_gvt_ops *intel_gvt_ops;

/* helper macros copied from vfio-pci */
#define VFIO_PCI_OFFSET_SHIFT   40
#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
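
/*
 * Editorial sketch (not part of the original driver): the helpers above
 * pack a VFIO region index into bits 63:40 of the device file offset and
 * keep the offset within the region in bits 39:0. With a hypothetical
 * access at offset 0x104 of BAR0:
 *
 *	loff_t off = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX)
 *			+ 0x104;
 *
 *	VFIO_PCI_OFFSET_TO_INDEX(off);	// == VFIO_PCI_BAR0_REGION_INDEX
 *	off & VFIO_PCI_OFFSET_MASK;	// == 0x104
 */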
#define EDID_BLOB_OFFSET (PAGE_SIZE/2)

#define OPREGION_SIGNATURE "IntelGraphicsMem"

struct vfio_region;
struct intel_vgpu_regops {
	size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
			size_t count, loff_t *ppos, bool iswrite);
	void (*release)(struct intel_vgpu *vgpu,
			struct vfio_region *region);
};

struct vfio_region {
	u32				type;
	u32				subtype;
	size_t				size;
	u32				flags;
	const struct intel_vgpu_regops	*ops;
	void				*data;
};

struct vfio_edid_region {
	struct vfio_region_gfx_edid vfio_edid_regs;
	void *edid_blob;
};

struct kvmgt_pgfn {
	gfn_t gfn;
	struct hlist_node hnode;
};

struct kvmgt_guest_info {
	struct kvm *kvm;
	struct intel_vgpu *vgpu;
	struct kvm_page_track_notifier_node track_node;
#define NR_BKT (1 << 18)
	struct hlist_head ptable[NR_BKT];
#undef NR_BKT
	struct dentry *debugfs_cache_entries;
};

struct gvt_dma {
	struct intel_vgpu *vgpu;
	struct rb_node gfn_node;
	struct rb_node dma_addr_node;
	gfn_t gfn;
	dma_addr_t dma_addr;
	unsigned long size;
	struct kref ref;
};

struct kvmgt_vdev {
	struct intel_vgpu *vgpu;
	struct mdev_device *mdev;
	struct vfio_region *region;
	int num_regions;
	struct eventfd_ctx *intx_trigger;
	struct eventfd_ctx *msi_trigger;

	/*
	 * Two caches are used to avoid mapping duplicated pages (e.g.
	 * scratch pages). This helps to reduce DMA setup overhead.
	 */
	struct rb_root gfn_cache;
	struct rb_root dma_addr_cache;
	unsigned long nr_cache_entries;
	struct mutex cache_lock;

	struct notifier_block iommu_notifier;
	struct notifier_block group_notifier;
	struct kvm *kvm;
	struct work_struct release_work;
	atomic_t released;
	struct vfio_device *vfio_device;
	struct vfio_group *vfio_group;
};

static inline struct kvmgt_vdev *kvmgt_vdev(struct intel_vgpu *vgpu)
{
	return intel_vgpu_vdev(vgpu);
}

static inline bool handle_valid(unsigned long handle)
{
	return !!(handle & ~0xff);
}
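
/*
 * Editorial note: vgpu->handle holds either a pointer to the
 * kvmgt_guest_info allocated in kvmgt_guest_init() or a small magic
 * value while no guest is attached, so anything that fits in the low
 * byte is treated as invalid here. For example, handle_valid(0) is
 * false, while a real kernel pointer cast to unsigned long passes.
 */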
static ssize_t available_instances_show(struct mdev_type *mtype,
					struct mdev_type_attribute *attr,
					char *buf)
{
	struct intel_vgpu_type *type;
	unsigned int num = 0;
	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;

	type = &gvt->types[mtype_get_type_group_id(mtype)];
	if (!type)
		num = 0;
	else
		num = type->avail_instance;

	return sprintf(buf, "%u\n", num);
}

static ssize_t device_api_show(struct mdev_type *mtype,
			       struct mdev_type_attribute *attr, char *buf)
{
	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
}

static ssize_t description_show(struct mdev_type *mtype,
				struct mdev_type_attribute *attr, char *buf)
{
	struct intel_vgpu_type *type;
	struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;

	type = &gvt->types[mtype_get_type_group_id(mtype)];
	if (!type)
		return 0;

	return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n"
		       "fence: %d\nresolution: %s\n"
		       "weight: %d\n",
		       BYTES_TO_MB(type->low_gm_size),
		       BYTES_TO_MB(type->high_gm_size),
		       type->fence, vgpu_edid_str(type->resolution),
		       type->weight);
}

static MDEV_TYPE_ATTR_RO(available_instances);
static MDEV_TYPE_ATTR_RO(device_api);
static MDEV_TYPE_ATTR_RO(description);

static struct attribute *gvt_type_attrs[] = {
	&mdev_type_attr_available_instances.attr,
	&mdev_type_attr_device_api.attr,
	&mdev_type_attr_description.attr,
	NULL,
};

static struct attribute_group *gvt_vgpu_type_groups[] = {
	[0 ... NR_MAX_INTEL_VGPU_TYPES - 1] = NULL,
};

static int intel_gvt_init_vgpu_type_groups(struct intel_gvt *gvt)
{
	int i, j;
	struct intel_vgpu_type *type;
	struct attribute_group *group;

	for (i = 0; i < gvt->num_types; i++) {
		type = &gvt->types[i];

		group = kzalloc(sizeof(struct attribute_group), GFP_KERNEL);
		if (!group)
			goto unwind;

		group->name = type->name;
		group->attrs = gvt_type_attrs;
		gvt_vgpu_type_groups[i] = group;
	}

	return 0;

unwind:
	for (j = 0; j < i; j++) {
		group = gvt_vgpu_type_groups[j];
		kfree(group);
	}

	return -ENOMEM;
}

static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt)
{
	int i;
	struct attribute_group *group;

	for (i = 0; i < gvt->num_types; i++) {
		group = gvt_vgpu_type_groups[i];
		gvt_vgpu_type_groups[i] = NULL;
		kfree(group);
	}
}

static int kvmgt_guest_init(struct mdev_device *mdev);
static void intel_vgpu_release_work(struct work_struct *work);
static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);

static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
		unsigned long size)
{
	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
	int total_pages;
	int npage;
	int ret;

	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;

	for (npage = 0; npage < total_pages; npage++) {
		unsigned long cur_gfn = gfn + npage;

		ret = vfio_group_unpin_pages(vdev->vfio_group, &cur_gfn, 1);
		drm_WARN_ON(&i915->drm, ret != 1);
	}
}

/* Pin a normal or compound guest page for DMA. */
static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
		unsigned long size, struct page **page)
{
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
	unsigned long base_pfn = 0;
	int total_pages;
	int npage;
	int ret;

	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
	/*
	 * We pin the pages one-by-one to avoid allocating a big array
	 * on stack to hold pfns.
	 */
	for (npage = 0; npage < total_pages; npage++) {
		unsigned long cur_gfn = gfn + npage;
		unsigned long pfn;

		ret = vfio_group_pin_pages(vdev->vfio_group, &cur_gfn, 1,
					   IOMMU_READ | IOMMU_WRITE, &pfn);
		if (ret != 1) {
			gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
				     cur_gfn, ret);
			goto err;
		}

		if (!pfn_valid(pfn)) {
			gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
			npage++;
			ret = -EFAULT;
			goto err;
		}

		if (npage == 0)
			base_pfn = pfn;
		else if (base_pfn + npage != pfn) {
			gvt_vgpu_err("The pages are not contiguous.\n");
			ret = -EINVAL;
			npage++;
			goto err;
		}
	}

	*page = pfn_to_page(base_pfn);
	return 0;
err:
	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
	return ret;
}

static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t *dma_addr, unsigned long size)
{
	struct device *dev = vgpu->gvt->gt->i915->drm.dev;
	struct page *page = NULL;
	int ret;

	ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
	if (ret)
		return ret;

	/* Setup DMA mapping. */
	*dma_addr = dma_map_page(dev, page, 0, size, PCI_DMA_BIDIRECTIONAL);
	if (dma_mapping_error(dev, *dma_addr)) {
		gvt_vgpu_err("DMA mapping failed for pfn 0x%lx, ret %d\n",
			     page_to_pfn(page), ret);
		gvt_unpin_guest_page(vgpu, gfn, size);
		return -ENOMEM;
	}

	return 0;
}

static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t dma_addr, unsigned long size)
{
	struct device *dev = vgpu->gvt->gt->i915->drm.dev;

	dma_unmap_page(dev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
	gvt_unpin_guest_page(vgpu, gfn, size);
}
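
/*
 * Editorial sketch: gvt_dma_map_page() and gvt_dma_unmap_page() are
 * meant to be used as a symmetric pair with the same gfn and size. A
 * hypothetical 2MB mapping (the size constant below is an assumption,
 * borrowed from i915's GTT page sizes) would look like:
 *
 *	dma_addr_t dma;
 *
 *	if (!gvt_dma_map_page(vgpu, gfn, &dma, I915_GTT_PAGE_SIZE_2M)) {
 *		// ... program dma into a shadow GTT entry ...
 *		gvt_dma_unmap_page(vgpu, gfn, dma, I915_GTT_PAGE_SIZE_2M);
 *	}
 *
 * The pin helper above rejects such a request unless all 512 backing
 * pfns are contiguous.
 */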
static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
		dma_addr_t dma_addr)
{
	struct rb_node *node = kvmgt_vdev(vgpu)->dma_addr_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			node = node->rb_left;
		else if (dma_addr > itr->dma_addr)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}

static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
{
	struct rb_node *node = kvmgt_vdev(vgpu)->gfn_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			node = node->rb_left;
		else if (gfn > itr->gfn)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}

static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
		dma_addr_t dma_addr, unsigned long size)
{
	struct gvt_dma *new, *itr;
	struct rb_node **link, *parent = NULL;
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);

	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->vgpu = vgpu;
	new->gfn = gfn;
	new->dma_addr = dma_addr;
	new->size = size;
	kref_init(&new->ref);

	/* gfn_cache maps gfn to struct gvt_dma. */
	link = &vdev->gfn_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->gfn_node, parent, link);
	rb_insert_color(&new->gfn_node, &vdev->gfn_cache);

	/* dma_addr_cache maps dma addr to struct gvt_dma. */
	parent = NULL;
	link = &vdev->dma_addr_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->dma_addr_node, parent, link);
	rb_insert_color(&new->dma_addr_node, &vdev->dma_addr_cache);

	vdev->nr_cache_entries++;
	return 0;
}

static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
				struct gvt_dma *entry)
{
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);

	rb_erase(&entry->gfn_node, &vdev->gfn_cache);
	rb_erase(&entry->dma_addr_node, &vdev->dma_addr_cache);
	kfree(entry);
	vdev->nr_cache_entries--;
}

static void gvt_cache_destroy(struct intel_vgpu *vgpu)
{
	struct gvt_dma *dma;
	struct rb_node *node = NULL;
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);

	for (;;) {
		mutex_lock(&vdev->cache_lock);
		node = rb_first(&vdev->gfn_cache);
		if (!node) {
			mutex_unlock(&vdev->cache_lock);
			break;
		}
		dma = rb_entry(node, struct gvt_dma, gfn_node);
		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
		__gvt_cache_remove_entry(vgpu, dma);
		mutex_unlock(&vdev->cache_lock);
	}
}

static void gvt_cache_init(struct intel_vgpu *vgpu)
{
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);

	vdev->gfn_cache = RB_ROOT;
	vdev->dma_addr_cache = RB_ROOT;
	vdev->nr_cache_entries = 0;
	mutex_init(&vdev->cache_lock);
}
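
/*
 * Editorial sketch of how the two caches are consulted (values
 * hypothetical): the map path looks the gfn up under cache_lock and
 * only maps and inserts on a miss, while the unmap path finds the
 * entry through the dma_addr index instead:
 *
 *	mutex_lock(&vdev->cache_lock);
 *	entry = __gvt_cache_find_gfn(vgpu, gfn);
 *	if (!entry && !gvt_dma_map_page(vgpu, gfn, &dma_addr, PAGE_SIZE))
 *		__gvt_cache_add(vgpu, gfn, dma_addr, PAGE_SIZE);
 *	mutex_unlock(&vdev->cache_lock);
 *
 * kvmgt_dma_map_guest_page() later in this file is the real,
 * error-checked version of this pattern.
 */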
static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
{
	hash_init(info->ptable);
}

static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
{
	struct kvmgt_pgfn *p;
	struct hlist_node *tmp;
	int i;

	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static struct kvmgt_pgfn *
__kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p, *res = NULL;

	hash_for_each_possible(info->ptable, p, hnode, gfn) {
		if (gfn == p->gfn) {
			res = p;
			break;
		}
	}

	return res;
}

static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
				gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	return !!p;
}

static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	if (kvmgt_gfn_is_write_protected(info, gfn))
		return;

	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
	if (WARN(!p, "gfn: 0x%llx\n", gfn))
		return;

	p->gfn = gfn;
	hash_add(info->ptable, &p->hnode, gfn);
}

static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
				gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	if (p) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
		size_t count, loff_t *ppos, bool iswrite)
{
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
			VFIO_PCI_NUM_REGIONS;
	void *base = vdev->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vdev->region[i].size || iswrite) {
		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
		return -EINVAL;
	}
	count = min(count, (size_t)(vdev->region[i].size - pos));
	memcpy(buf, base + pos, count);

	return count;
}

static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
		struct vfio_region *region)
{
}

static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
	.rw = intel_vgpu_reg_rw_opregion,
	.release = intel_vgpu_reg_release_opregion,
};

static int handle_edid_regs(struct intel_vgpu *vgpu,
			struct vfio_edid_region *region, char *buf,
			size_t count, u16 offset, bool is_write)
{
	struct vfio_region_gfx_edid *regs = &region->vfio_edid_regs;
	unsigned int data;

	if (offset + count > sizeof(*regs))
		return -EINVAL;

	if (count != 4)
		return -EINVAL;

	if (is_write) {
		data = *((unsigned int *)buf);
		switch (offset) {
		case offsetof(struct vfio_region_gfx_edid, link_state):
			if (data == VFIO_DEVICE_GFX_LINK_STATE_UP) {
				if (!drm_edid_block_valid(
					(u8 *)region->edid_blob,
					0,
					true,
					NULL)) {
					gvt_vgpu_err("invalid EDID blob\n");
					return -EINVAL;
				}
				intel_gvt_ops->emulate_hotplug(vgpu, true);
			} else if (data == VFIO_DEVICE_GFX_LINK_STATE_DOWN)
				intel_gvt_ops->emulate_hotplug(vgpu, false);
			else {
				gvt_vgpu_err("invalid EDID link state %d\n",
					regs->link_state);
				return -EINVAL;
			}
			regs->link_state = data;
			break;
		case offsetof(struct vfio_region_gfx_edid, edid_size):
			if (data > regs->edid_max_size) {
				gvt_vgpu_err("EDID size is bigger than %d!\n",
					regs->edid_max_size);
				return -EINVAL;
			}
			regs->edid_size = data;
			break;
		default:
			/* read-only regs */
			gvt_vgpu_err("write read-only EDID region at offset %d\n",
				offset);
			return -EPERM;
		}
	} else {
		memcpy(buf, (char *)regs + offset, count);
	}

	return count;
}

static int handle_edid_blob(struct vfio_edid_region *region, char *buf,
			size_t count, u16 offset, bool is_write)
{
	if (offset + count > region->vfio_edid_regs.edid_size)
		return -EINVAL;

	if (is_write)
		memcpy(region->edid_blob + offset, buf, count);
	else
		memcpy(buf, region->edid_blob + offset, count);

	return count;
}
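
/*
 * Editorial note on the EDID region layout: the vfio_region_gfx_edid
 * control registers live at offset 0 and the EDID blob itself starts
 * at EDID_BLOB_OFFSET (PAGE_SIZE/2). A 4-byte read at offset 0 is
 * therefore served by handle_edid_regs(), while a read at
 * EDID_BLOB_OFFSET + 0x10, for example, returns byte 0x10 of the blob
 * via handle_edid_blob().
 */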
static size_t intel_vgpu_reg_rw_edid(struct intel_vgpu *vgpu, char *buf,
		size_t count, loff_t *ppos, bool iswrite)
{
	int ret;
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
			VFIO_PCI_NUM_REGIONS;
	struct vfio_edid_region *region =
		(struct vfio_edid_region *)kvmgt_vdev(vgpu)->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos < region->vfio_edid_regs.edid_offset) {
		ret = handle_edid_regs(vgpu, region, buf, count, pos, iswrite);
	} else {
		pos -= EDID_BLOB_OFFSET;
		ret = handle_edid_blob(region, buf, count, pos, iswrite);
	}

	if (ret < 0)
		gvt_vgpu_err("failed to access EDID region\n");

	return ret;
}

static void intel_vgpu_reg_release_edid(struct intel_vgpu *vgpu,
					struct vfio_region *region)
{
	kfree(region->data);
}

static const struct intel_vgpu_regops intel_vgpu_regops_edid = {
	.rw = intel_vgpu_reg_rw_edid,
	.release = intel_vgpu_reg_release_edid,
};

static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
		unsigned int type, unsigned int subtype,
		const struct intel_vgpu_regops *ops,
		size_t size, u32 flags, void *data)
{
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
	struct vfio_region *region;

	region = krealloc(vdev->region,
			(vdev->num_regions + 1) * sizeof(*region),
			GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	vdev->region = region;
	vdev->region[vdev->num_regions].type = type;
	vdev->region[vdev->num_regions].subtype = subtype;
	vdev->region[vdev->num_regions].ops = ops;
	vdev->region[vdev->num_regions].size = size;
	vdev->region[vdev->num_regions].flags = flags;
	vdev->region[vdev->num_regions].data = data;
	vdev->num_regions++;
	return 0;
}

static int kvmgt_get_vfio_device(void *p_vgpu)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);

	vdev->vfio_device = vfio_device_get_from_dev(
		mdev_dev(vdev->mdev));
	if (!vdev->vfio_device) {
		gvt_vgpu_err("failed to get vfio device\n");
		return -ENODEV;
	}
	return 0;
}

static int kvmgt_set_opregion(void *p_vgpu)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
	void *base;
	int ret;

	/* Each vgpu has its own opregion, although VFIO would create another
	 * one later. This one is used to expose the opregion to VFIO, while
	 * the one VFIO creates later is what the guest actually uses.
	 */
	base = vgpu_opregion(vgpu)->va;
	if (!base)
		return -ENOMEM;

	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
		memunmap(base);
		return -EINVAL;
	}

	ret = intel_vgpu_register_reg(vgpu,
			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
			&intel_vgpu_regops_opregion, OPREGION_SIZE,
			VFIO_REGION_INFO_FLAG_READ, base);

	return ret;
}

static int kvmgt_set_edid(void *p_vgpu, int port_num)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
	struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
	struct vfio_edid_region *base;
	int ret;

	base = kzalloc(sizeof(*base), GFP_KERNEL);
	if (!base)
		return -ENOMEM;

	/* TODO: Add multi-port and EDID extension block support */
	base->vfio_edid_regs.edid_offset = EDID_BLOB_OFFSET;
	base->vfio_edid_regs.edid_max_size = EDID_SIZE;
	base->vfio_edid_regs.edid_size = EDID_SIZE;
	base->vfio_edid_regs.max_xres = vgpu_edid_xres(port->id);
	base->vfio_edid_regs.max_yres = vgpu_edid_yres(port->id);
	base->edid_blob = port->edid->edid_block;

	ret = intel_vgpu_register_reg(vgpu,
			VFIO_REGION_TYPE_GFX,
			VFIO_REGION_SUBTYPE_GFX_EDID,
			&intel_vgpu_regops_edid, EDID_SIZE,
			VFIO_REGION_INFO_FLAG_READ |
			VFIO_REGION_INFO_FLAG_WRITE |
			VFIO_REGION_INFO_FLAG_CAPS, base);

	return ret;
}

static void kvmgt_put_vfio_device(void *vgpu)
{
	struct kvmgt_vdev *vdev = kvmgt_vdev((struct intel_vgpu *)vgpu);

	if (WARN_ON(!vdev->vfio_device))
		return;

	vfio_device_put(vdev->vfio_device);
}

static int intel_vgpu_create(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = NULL;
	struct intel_vgpu_type *type;
	struct device *pdev;
	struct intel_gvt *gvt;
	int ret;

	pdev = mdev_parent_dev(mdev);
	gvt = kdev_to_i915(pdev)->gvt;

	type = &gvt->types[mdev_get_type_group_id(mdev)];
	if (!type) {
		ret = -EINVAL;
		goto out;
	}

	vgpu = intel_gvt_ops->vgpu_create(gvt, type);
	if (IS_ERR_OR_NULL(vgpu)) {
		ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
		gvt_err("failed to create intel vgpu: %d\n", ret);
		goto out;
	}

	INIT_WORK(&kvmgt_vdev(vgpu)->release_work, intel_vgpu_release_work);

	kvmgt_vdev(vgpu)->mdev = mdev;
	mdev_set_drvdata(mdev, vgpu);

	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
		     dev_name(mdev_dev(mdev)));
	ret = 0;

out:
	return ret;
}

static int intel_vgpu_remove(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	if (handle_valid(vgpu->handle))
		return -EBUSY;

	intel_gvt_ops->vgpu_destroy(vgpu);
	return 0;
}

static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct kvmgt_vdev *vdev = container_of(nb,
					       struct kvmgt_vdev,
					       iommu_notifier);
	struct intel_vgpu *vgpu = vdev->vgpu;

	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;
		struct gvt_dma *entry;
		unsigned long iov_pfn, end_iov_pfn;

		iov_pfn = unmap->iova >> PAGE_SHIFT;
		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;

		mutex_lock(&vdev->cache_lock);
		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
			if (!entry)
				continue;

			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
					   entry->size);
			__gvt_cache_remove_entry(vgpu, entry);
		}
		mutex_unlock(&vdev->cache_lock);
	}

	return NOTIFY_OK;
}

static int intel_vgpu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct kvmgt_vdev *vdev = container_of(nb,
					       struct kvmgt_vdev,
					       group_notifier);

	/* the only action we care about */
	if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
		vdev->kvm = data;

		if (!data)
			schedule_work(&vdev->release_work);
	}

	return NOTIFY_OK;
}

static int intel_vgpu_open(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
	unsigned long events;
	int ret;
	struct vfio_group *vfio_group;

	vdev->iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
	vdev->group_notifier.notifier_call = intel_vgpu_group_notifier;

	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
				&vdev->iommu_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
			ret);
		goto out;
	}

	events = VFIO_GROUP_NOTIFY_SET_KVM;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
				&vdev->group_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
			ret);
		goto undo_iommu;
	}

	vfio_group = vfio_group_get_external_user_from_dev(mdev_dev(mdev));
	if (IS_ERR_OR_NULL(vfio_group)) {
		ret = !vfio_group ? -EFAULT : PTR_ERR(vfio_group);
		gvt_vgpu_err("vfio_group_get_external_user_from_dev failed\n");
		goto undo_register;
	}
	vdev->vfio_group = vfio_group;

	/* Take a module reference as the mdev core doesn't take
	 * one for the vendor driver.
	 */
	if (!try_module_get(THIS_MODULE)) {
		ret = -ENODEV;
		goto undo_group;
	}

	ret = kvmgt_guest_init(mdev);
	if (ret)
		goto undo_group;

	intel_gvt_ops->vgpu_activate(vgpu);

	atomic_set(&vdev->released, 0);
	return ret;

undo_group:
	vfio_group_put_external_user(vdev->vfio_group);
	vdev->vfio_group = NULL;

undo_register:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
					&vdev->group_notifier);

undo_iommu:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
					&vdev->iommu_notifier);
out:
	return ret;
}

static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
{
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
	struct eventfd_ctx *trigger;

	trigger = vdev->msi_trigger;
	if (trigger) {
		eventfd_ctx_put(trigger);
		vdev->msi_trigger = NULL;
	}
}

static void __intel_vgpu_release(struct intel_vgpu *vgpu)
{
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
	struct kvmgt_guest_info *info;
	int ret;

	if (!handle_valid(vgpu->handle))
		return;

	if (atomic_cmpxchg(&vdev->released, 0, 1))
		return;

	intel_gvt_ops->vgpu_release(vgpu);

	ret = vfio_unregister_notifier(mdev_dev(vdev->mdev), VFIO_IOMMU_NOTIFY,
					&vdev->iommu_notifier);
	drm_WARN(&i915->drm, ret,
		 "vfio_unregister_notifier for iommu failed: %d\n", ret);

	ret = vfio_unregister_notifier(mdev_dev(vdev->mdev), VFIO_GROUP_NOTIFY,
					&vdev->group_notifier);
	drm_WARN(&i915->drm, ret,
		 "vfio_unregister_notifier for group failed: %d\n", ret);

	/* dereference module reference taken at open */
	module_put(THIS_MODULE);

	info = (struct kvmgt_guest_info *)vgpu->handle;
	kvmgt_guest_exit(info);

	intel_vgpu_release_msi_eventfd_ctx(vgpu);
	vfio_group_put_external_user(vdev->vfio_group);

	vdev->kvm = NULL;
	vgpu->handle = 0;
}

static void intel_vgpu_release(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	__intel_vgpu_release(vgpu);
}

static void intel_vgpu_release_work(struct work_struct *work)
{
	struct kvmgt_vdev *vdev = container_of(work, struct kvmgt_vdev,
					       release_work);

	__intel_vgpu_release(vdev->vgpu);
}

static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
{
	u32 start_lo, start_hi;
	u32 mem_type;

	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_MASK;
	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_TYPE_MASK;

	switch (mem_type) {
	case PCI_BASE_ADDRESS_MEM_TYPE_64:
		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
						+ bar + 4));
		break;
	case PCI_BASE_ADDRESS_MEM_TYPE_32:
	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
		/* 1M mem BAR treated as 32-bit BAR */
	default:
		/* mem unknown type treated as 32-bit BAR */
		start_hi = 0;
		break;
	}

	return ((u64)start_hi << 32) | start_lo;
}
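
/*
 * Editorial example with hypothetical config space contents: if the
 * 64-bit BAR0 pair reads 0xde000004 (low dword: memory BAR, 64-bit
 * type) and 0x00000001 (high dword), the function above masks off the
 * flag bits and returns 0x1de000000 as the guest physical BAR base.
 */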
static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, u64 off,
			     void *buf, unsigned int count, bool is_write)
{
	u64 bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
	int ret;

	if (is_write)
		ret = intel_gvt_ops->emulate_mmio_write(vgpu,
					bar_start + off, buf, count);
	else
		ret = intel_gvt_ops->emulate_mmio_read(vgpu,
					bar_start + off, buf, count);
	return ret;
}

static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, u64 off)
{
	return off >= vgpu_aperture_offset(vgpu) &&
	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
}

static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, u64 off,
		void *buf, unsigned long count, bool is_write)
{
	void __iomem *aperture_va;

	if (!intel_vgpu_in_aperture(vgpu, off) ||
	    !intel_vgpu_in_aperture(vgpu, off + count)) {
		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
		return -EINVAL;
	}

	aperture_va = io_mapping_map_wc(&vgpu->gvt->gt->ggtt->iomap,
					ALIGN_DOWN(off, PAGE_SIZE),
					count + offset_in_page(off));
	if (!aperture_va)
		return -EIO;

	if (is_write)
		memcpy_toio(aperture_va + offset_in_page(off), buf, count);
	else
		memcpy_fromio(buf, aperture_va + offset_in_page(off), count);

	io_mapping_unmap(aperture_va);

	return 0;
}

static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
			size_t count, loff_t *ppos, bool is_write)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret = -EINVAL;

	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) {
		gvt_vgpu_err("invalid index: %u\n", index);
		return -EINVAL;
	}

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		if (is_write)
			ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
						buf, count);
		else
			ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
						buf, count);
		break;
	case VFIO_PCI_BAR0_REGION_INDEX:
		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
					buf, count, is_write);
		break;
	case VFIO_PCI_BAR2_REGION_INDEX:
		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
		break;
	case VFIO_PCI_BAR1_REGION_INDEX:
	case VFIO_PCI_BAR3_REGION_INDEX:
	case VFIO_PCI_BAR4_REGION_INDEX:
	case VFIO_PCI_BAR5_REGION_INDEX:
	case VFIO_PCI_VGA_REGION_INDEX:
	case VFIO_PCI_ROM_REGION_INDEX:
		break;
	default:
		if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
			return -EINVAL;

		index -= VFIO_PCI_NUM_REGIONS;
		return vdev->region[index].ops->rw(vgpu, buf, count,
				ppos, is_write);
	}

	return ret == 0 ? count : ret;
}

static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct intel_gvt *gvt = vgpu->gvt;
	int offset;

	/* Only allow MMIO GGTT entry access */
	if (index != PCI_BASE_ADDRESS_0)
		return false;

	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);

	return (offset >= gvt->device_info.gtt_start_offset &&
		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
			true : false;
}

static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
			size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support 8-byte GGTT entry reads */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 2;
		} else {
			u8 val;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
					false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;

read_err:
	return -EFAULT;
}

static ssize_t intel_vgpu_write(struct mdev_device *mdev,
				const char __user *buf,
				size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support 8-byte GGTT entry writes */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val,
					sizeof(val), ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 2;
		} else {
			u8 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;
write_err:
	return -EFAULT;
}

static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
{
	unsigned int index;
	u64 virtaddr;
	unsigned long req_size, pgoff, req_start;
	pgprot_t pg_prot;
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index != VFIO_PCI_BAR2_REGION_INDEX)
		return -EINVAL;

	pg_prot = vma->vm_page_prot;
	virtaddr = vma->vm_start;
	req_size = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (!intel_vgpu_in_aperture(vgpu, req_start))
		return -EINVAL;
	if (req_start + req_size >
	    vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu))
		return -EINVAL;

	pgoff = (gvt_aperture_pa_base(vgpu->gvt) >> PAGE_SHIFT) + pgoff;

	return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
}
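
/*
 * Editorial example of the mmap offset math (values hypothetical):
 * with PAGE_SHIFT == 12, a userspace mmap of BAR2 at region offset 0
 * uses vm_pgoff = VFIO_PCI_BAR2_REGION_INDEX << (40 - 12). The shift
 * above recovers the region index from the top bits, the mask keeps
 * the page offset within the region, and the final pgoff rebases it
 * onto the host aperture physical address before remap_pfn_range().
 */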
static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
{
	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
		return 1;

	return 0;
}

static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, u32 flags,
			void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, u32 flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		u32 flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		u32 flags, void *data)
{
	struct eventfd_ctx *trigger;

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int fd = *(int *)data;

		trigger = eventfd_ctx_fdget(fd);
		if (IS_ERR(trigger)) {
			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
			return PTR_ERR(trigger);
		}
		kvmgt_vdev(vgpu)->msi_trigger = trigger;
	} else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
		intel_vgpu_release_msi_eventfd_ctx(vgpu);

	return 0;
}

static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
		unsigned int index, unsigned int start, unsigned int count,
		void *data)
{
	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
			unsigned int start, unsigned int count, u32 flags,
			void *data) = NULL;

	switch (index) {
	case VFIO_PCI_INTX_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
			func = intel_vgpu_set_intx_mask;
			break;
		case VFIO_IRQ_SET_ACTION_UNMASK:
			func = intel_vgpu_set_intx_unmask;
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_intx_trigger;
			break;
		}
		break;
	case VFIO_PCI_MSI_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
		case VFIO_IRQ_SET_ACTION_UNMASK:
			/* XXX Need masking support exported */
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_msi_trigger;
			break;
		}
		break;
	}

	if (!func)
		return -ENOTTY;

	return func(vgpu, index, start, count, flags, data);
}

static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
			     unsigned long arg)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);
	unsigned long minsz;

	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;
		info.flags |= VFIO_DEVICE_FLAGS_RESET;
		info.num_regions = VFIO_PCI_NUM_REGIONS +
				vdev->num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct vfio_region_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		unsigned int i;
		int ret;
		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
		int nr_areas = 1;
		int cap_type_id;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->gvt->device_info.cfg_space_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->cfg_space.bar[info.index].size;
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR1_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;
			break;
		case VFIO_PCI_BAR2_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
					VFIO_REGION_INFO_FLAG_MMAP |
					VFIO_REGION_INFO_FLAG_READ |
					VFIO_REGION_INFO_FLAG_WRITE;
			info.size = gvt_aperture_sz(vgpu->gvt);

			sparse = kzalloc(struct_size(sparse, areas, nr_areas),
					 GFP_KERNEL);
			if (!sparse)
				return -ENOMEM;

			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->header.version = 1;
			sparse->nr_areas = nr_areas;
			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->areas[0].offset =
					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
			break;

		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info bar:%d\n", info.index);
			break;

		case VFIO_PCI_ROM_REGION_INDEX:
		case VFIO_PCI_VGA_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info index:%d\n", info.index);
			break;
		default:
			{
				struct vfio_region_info_cap_type cap_type = {
					.header.id = VFIO_REGION_INFO_CAP_TYPE,
					.header.version = 1 };

				if (info.index >= VFIO_PCI_NUM_REGIONS +
						vdev->num_regions)
					return -EINVAL;
				info.index =
					array_index_nospec(info.index,
							VFIO_PCI_NUM_REGIONS +
							vdev->num_regions);

				i = info.index - VFIO_PCI_NUM_REGIONS;

				info.offset =
					VFIO_PCI_INDEX_TO_OFFSET(info.index);
				info.size = vdev->region[i].size;
				info.flags = vdev->region[i].flags;

				cap_type.type = vdev->region[i].type;
				cap_type.subtype = vdev->region[i].subtype;

				ret = vfio_info_add_capability(&caps,
							&cap_type.header,
							sizeof(cap_type));
				if (ret)
					return ret;
			}
		}

		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
			switch (cap_type_id) {
			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
				ret = vfio_info_add_capability(&caps,
					&sparse->header,
					struct_size(sparse, areas,
						    sparse->nr_areas));
				if (ret) {
					kfree(sparse);
					return ret;
				}
				break;
			default:
				kfree(sparse);
				return -EINVAL;
			}
		}

		if (caps.size) {
			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
				info.cap_offset = 0;
			} else {
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						  sizeof(info), caps.buf,
						  caps.size)) {
					kfree(caps.buf);
					kfree(sparse);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		kfree(sparse);
		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX:
		case VFIO_PCI_MSI_IRQ_INDEX:
			break;
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = intel_vgpu_get_irq_count(vgpu, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int ret = 0;
		size_t data_size = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);

			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
						VFIO_PCI_NUM_IRQS, &data_size);
			if (ret) {
				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
				return -EINVAL;
			}
			if (data_size) {
				data = memdup_user((void __user *)(arg + minsz),
						   data_size);
				if (IS_ERR(data))
					return PTR_ERR(data);
			}
		}

		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
					hdr.start, hdr.count, data);
		kfree(data);

		return ret;
	} else if (cmd == VFIO_DEVICE_RESET) {
		intel_gvt_ops->vgpu_reset(vgpu);
		return 0;
	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
		struct vfio_device_gfx_plane_info dmabuf;
		int ret = 0;

		minsz = offsetofend(struct vfio_device_gfx_plane_info,
				    dmabuf_id);
		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
			return -EFAULT;
		if (dmabuf.argsz < minsz)
			return -EINVAL;

		ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
		if (ret != 0)
			return ret;

		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
								-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
		__u32 dmabuf_id;
		__s32 dmabuf_fd;

		if (get_user(dmabuf_id, (__u32 __user *)arg))
			return -EFAULT;

		dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
		return dmabuf_fd;

	}

	return -ENOTTY;
}

static ssize_t
vgpu_id_show(struct device *dev, struct device_attribute *attr,
	     char *buf)
{
	struct mdev_device *mdev = mdev_from_dev(dev);

	if (mdev) {
		struct intel_vgpu *vgpu = (struct intel_vgpu *)
			mdev_get_drvdata(mdev);
		return sprintf(buf, "%d\n", vgpu->id);
	}
	return sprintf(buf, "\n");
}

static DEVICE_ATTR_RO(vgpu_id);

static struct attribute *intel_vgpu_attrs[] = {
	&dev_attr_vgpu_id.attr,
	NULL
};

static const struct attribute_group intel_vgpu_group = {
	.name = "intel_vgpu",
	.attrs = intel_vgpu_attrs,
};

static const struct attribute_group *intel_vgpu_groups[] = {
	&intel_vgpu_group,
	NULL,
};

static struct mdev_parent_ops intel_vgpu_ops = {
	.mdev_attr_groups	= intel_vgpu_groups,
	.create			= intel_vgpu_create,
	.remove			= intel_vgpu_remove,

	.open			= intel_vgpu_open,
	.release		= intel_vgpu_release,

	.read			= intel_vgpu_read,
	.write			= intel_vgpu_write,
	.mmap			= intel_vgpu_mmap,
	.ioctl			= intel_vgpu_ioctl,
};

static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
{
	int ret;

	ret = intel_gvt_init_vgpu_type_groups((struct intel_gvt *)gvt);
	if (ret)
		return ret;

	intel_gvt_ops = ops;
	intel_vgpu_ops.supported_type_groups = gvt_vgpu_type_groups;

	ret = mdev_register_device(dev, &intel_vgpu_ops);
	if (ret)
		intel_gvt_cleanup_vgpu_type_groups((struct intel_gvt *)gvt);

	return ret;
}

static void kvmgt_host_exit(struct device *dev, void *gvt)
{
	mdev_unregister_device(dev);
	intel_gvt_cleanup_vgpu_type_groups((struct intel_gvt *)gvt);
}

static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	write_lock(&kvm->mmu_lock);

	if (kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_add(info, gfn);

out:
	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}

static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	write_lock(&kvm->mmu_lock);

	if (!kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_del(info, gfn);

out:
	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}

static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		const u8 *val, int len,
		struct kvm_page_track_notifier_node *node)
{
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
		intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
						     (void *)val, len);
}

static void kvmgt_page_track_flush_slot(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		struct kvm_page_track_notifier_node *node)
{
	int i;
	gfn_t gfn;
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	write_lock(&kvm->mmu_lock);
	for (i = 0; i < slot->npages; i++) {
		gfn = slot->base_gfn + i;
		if (kvmgt_gfn_is_write_protected(info, gfn)) {
			kvm_slot_page_track_remove_page(kvm, slot, gfn,
						KVM_PAGE_TRACK_WRITE);
			kvmgt_protect_table_del(info, gfn);
		}
	}
	write_unlock(&kvm->mmu_lock);
}

static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
{
	struct intel_vgpu *itr;
	struct kvmgt_guest_info *info;
	int id;
	bool ret = false;

	mutex_lock(&vgpu->gvt->lock);
	for_each_active_vgpu(vgpu->gvt, itr, id) {
		if (!handle_valid(itr->handle))
			continue;

		info = (struct kvmgt_guest_info *)itr->handle;
		if (kvm && kvm == info->kvm) {
			ret = true;
			goto out;
		}
	}
out:
	mutex_unlock(&vgpu->gvt->lock);
	return ret;
}

static int kvmgt_guest_init(struct mdev_device *mdev)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct kvmgt_vdev *vdev;
	struct kvm *kvm;

	vgpu = mdev_get_drvdata(mdev);
	if (handle_valid(vgpu->handle))
		return -EEXIST;

	vdev = kvmgt_vdev(vgpu);
	kvm = vdev->kvm;
	if (!kvm || kvm->mm != current->mm) {
		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
		return -ESRCH;
	}

	if (__kvmgt_vgpu_exist(vgpu, kvm))
		return -EEXIST;

	info = vzalloc(sizeof(struct kvmgt_guest_info));
	if (!info)
		return -ENOMEM;

	vgpu->handle = (unsigned long)info;
	info->vgpu = vgpu;
	info->kvm = kvm;
	kvm_get_kvm(info->kvm);

	kvmgt_protect_table_init(info);
	gvt_cache_init(vgpu);

	info->track_node.track_write = kvmgt_page_track_write;
	info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
	kvm_page_track_register_notifier(kvm, &info->track_node);

	info->debugfs_cache_entries = debugfs_create_ulong(
						"kvmgt_nr_cache_entries",
						0444, vgpu->debugfs,
						&vdev->nr_cache_entries);
	return 0;
}

static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
{
	debugfs_remove(info->debugfs_cache_entries);

	kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
	kvm_put_kvm(info->kvm);
	kvmgt_protect_table_destroy(info);
	gvt_cache_destroy(info->vgpu);
	vfree(info);

	return true;
}

static int kvmgt_attach_vgpu(void *p_vgpu, unsigned long *handle)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;

	vgpu->vdev = kzalloc(sizeof(struct kvmgt_vdev), GFP_KERNEL);
	if (!vgpu->vdev)
		return -ENOMEM;

	kvmgt_vdev(vgpu)->vgpu = vgpu;

	return 0;
}

static void kvmgt_detach_vgpu(void *p_vgpu)
{
	int i;
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
	struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu);

	if (!vdev->region)
		return;

	for (i = 0; i < vdev->num_regions; i++)
		if (vdev->region[i].ops->release)
			vdev->region[i].ops->release(vgpu,
					&vdev->region[i]);
	vdev->num_regions = 0;
	kfree(vdev->region);
	vdev->region = NULL;

	kfree(vdev);
}

static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct kvmgt_vdev *vdev;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	vgpu = info->vgpu;
	vdev = kvmgt_vdev(vgpu);

	/*
	 * When the guest powers off, msi_trigger is set to NULL, but the
	 * vgpu's config space and mmio registers are not restored to their
	 * defaults during guest poweroff. If this vgpu is reused by the
	 * next VM, its pipes may still be enabled, so once the vgpu becomes
	 * active it will receive vblank interrupt injection requests. Since
	 * msi_trigger stays NULL until the guest enables MSI, return
	 * success without injecting an interrupt in that case.
	 */
	if (vdev->msi_trigger == NULL)
		return 0;

	if (eventfd_signal(vdev->msi_trigger, 1) == 1)
		return 0;

	return -EFAULT;
}

static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
{
	struct kvmgt_guest_info *info;
	kvm_pfn_t pfn;

	if (!handle_valid(handle))
		return INTEL_GVT_INVALID_ADDR;

	info = (struct kvmgt_guest_info *)handle;

	pfn = gfn_to_pfn(info->kvm, gfn);
	if (is_error_noslot_pfn(pfn))
		return INTEL_GVT_INVALID_ADDR;

	return pfn;
}

static int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
		unsigned long size, dma_addr_t *dma_addr)
{
	struct intel_vgpu *vgpu;
	struct kvmgt_vdev *vdev;
	struct gvt_dma *entry;
	int ret;

	if (!handle_valid(handle))
		return -EINVAL;

	vgpu = ((struct kvmgt_guest_info *)handle)->vgpu;
	vdev = kvmgt_vdev(vgpu);

	mutex_lock(&vdev->cache_lock);

	entry = __gvt_cache_find_gfn(vgpu, gfn);
	if (!entry) {
		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
		if (ret)
			goto err_unlock;

		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
		if (ret)
			goto err_unmap;
	} else if (entry->size != size) {
		/* the same gfn with a different size: unmap and re-map */
		gvt_dma_unmap_page(vgpu, gfn, entry->dma_addr, entry->size);
		__gvt_cache_remove_entry(vgpu, entry);

		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
		if (ret)
			goto err_unlock;

		ret = __gvt_cache_add(vgpu, gfn, *dma_addr, size);
		if (ret)
			goto err_unmap;
	} else {
		kref_get(&entry->ref);
		*dma_addr = entry->dma_addr;
	}

	mutex_unlock(&vdev->cache_lock);
	return 0;

err_unmap:
	gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
err_unlock:
	mutex_unlock(&vdev->cache_lock);
	return ret;
}

static int kvmgt_dma_pin_guest_page(unsigned long handle, dma_addr_t dma_addr)
{
	struct kvmgt_guest_info *info;
	struct kvmgt_vdev *vdev;
	struct gvt_dma *entry;
	int ret = 0;

	if (!handle_valid(handle))
		return -ENODEV;

	info = (struct kvmgt_guest_info *)handle;
	vdev = kvmgt_vdev(info->vgpu);

	mutex_lock(&vdev->cache_lock);
	entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
	if (entry)
		kref_get(&entry->ref);
	else
		ret = -ENOMEM;
	mutex_unlock(&vdev->cache_lock);

	return ret;
}

static void __gvt_dma_release(struct kref *ref)
{
	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);

	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
			   entry->size);
	__gvt_cache_remove_entry(entry->vgpu, entry);
}

static void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
{
	struct intel_vgpu *vgpu;
	struct kvmgt_vdev *vdev;
	struct gvt_dma *entry;

	if (!handle_valid(handle))
		return;

	vgpu = ((struct kvmgt_guest_info *)handle)->vgpu;
	vdev = kvmgt_vdev(vgpu);

	mutex_lock(&vdev->cache_lock);
	entry = __gvt_cache_find_dma_addr(vgpu, dma_addr);
	if (entry)
		kref_put(&entry->ref, __gvt_dma_release);
	mutex_unlock(&vdev->cache_lock);
}

static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len, bool write)
{
	struct kvmgt_guest_info *info;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;

	return vfio_dma_rw(kvmgt_vdev(info->vgpu)->vfio_group,
			   gpa, buf, len, write);
}

static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, false);
}

static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, true);
}

static unsigned long kvmgt_virt_to_pfn(void *addr)
{
	return PFN_DOWN(__pa(addr));
}

static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	int idx;
	bool ret;

	if (!handle_valid(handle))
		return false;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	ret = kvm_is_visible_gfn(kvm, gfn);
	srcu_read_unlock(&kvm->srcu, idx);

	return ret;
}

static const struct intel_gvt_mpt kvmgt_mpt = {
	.type = INTEL_GVT_HYPERVISOR_KVM,
	.host_init = kvmgt_host_init,
	.host_exit = kvmgt_host_exit,
	.attach_vgpu = kvmgt_attach_vgpu,
	.detach_vgpu = kvmgt_detach_vgpu,
	.inject_msi = kvmgt_inject_msi,
	.from_virt_to_mfn = kvmgt_virt_to_pfn,
	.enable_page_track = kvmgt_page_track_add,
	.disable_page_track = kvmgt_page_track_remove,
	.read_gpa = kvmgt_read_gpa,
	.write_gpa = kvmgt_write_gpa,
	.gfn_to_mfn = kvmgt_gfn_to_pfn,
	.dma_map_guest_page = kvmgt_dma_map_guest_page,
	.dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
	.dma_pin_guest_page = kvmgt_dma_pin_guest_page,
	.set_opregion = kvmgt_set_opregion,
	.set_edid = kvmgt_set_edid,
	.get_vfio_device = kvmgt_get_vfio_device,
	.put_vfio_device = kvmgt_put_vfio_device,
	.is_valid_gfn = kvmgt_is_valid_gfn,
};

static int __init kvmgt_init(void)
{
	if (intel_gvt_register_hypervisor(&kvmgt_mpt) < 0)
		return -ENODEV;
	return 0;
}

static void __exit kvmgt_exit(void)
{
	intel_gvt_unregister_hypervisor();
}

module_init(kvmgt_init);
module_exit(kvmgt_exit);

MODULE_LICENSE("GPL and additional rights");
MODULE_AUTHOR("Intel Corporation");