a45f46d8537f15bd187fd4195d7de2c6b6dffd2a
[sfrench/cifs-2.6.git] / drivers / gpu / drm / i915 / gvt / kvmgt.c
1 /*
2  * KVMGT - the implementation of Intel mediated pass-through framework for KVM
3  *
4  * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  *
25  * Authors:
26  *    Kevin Tian <kevin.tian@intel.com>
27  *    Jike Song <jike.song@intel.com>
28  *    Xiaoguang Chen <xiaoguang.chen@intel.com>
29  */
30
31 #include <linux/init.h>
32 #include <linux/device.h>
33 #include <linux/mm.h>
34 #include <linux/mmu_context.h>
35 #include <linux/types.h>
36 #include <linux/list.h>
37 #include <linux/rbtree.h>
38 #include <linux/spinlock.h>
39 #include <linux/eventfd.h>
40 #include <linux/uuid.h>
41 #include <linux/kvm_host.h>
42 #include <linux/vfio.h>
43 #include <linux/mdev.h>
44 #include <linux/debugfs.h>
45
46 #include <linux/nospec.h>
47
48 #include "i915_drv.h"
49 #include "gvt.h"
50
51 static const struct intel_gvt_ops *intel_gvt_ops;
52
/*
 * Helper macros copied from vfio-pci.
 *
 * A device file offset carries the region index in the bits at and above
 * VFIO_PCI_OFFSET_SHIFT and the byte offset inside that region in the bits
 * below it.  Macro arguments are parenthesized so that compound expressions
 * (e.g. "a | b") are shifted as a whole.
 */
#define VFIO_PCI_OFFSET_SHIFT   40
#define VFIO_PCI_OFFSET_TO_INDEX(off)   ((off) >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)

/* Expected first 16 bytes of an OpRegion; checked in kvmgt_set_opregion(). */
#define OPREGION_SIGNATURE "IntelGraphicsMem"
60
61 struct vfio_region;
62 struct intel_vgpu_regops {
63         size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
64                         size_t count, loff_t *ppos, bool iswrite);
65         void (*release)(struct intel_vgpu *vgpu,
66                         struct vfio_region *region);
67 };
68
69 struct vfio_region {
70         u32                             type;
71         u32                             subtype;
72         size_t                          size;
73         u32                             flags;
74         const struct intel_vgpu_regops  *ops;
75         void                            *data;
76 };
77
78 struct kvmgt_pgfn {
79         gfn_t gfn;
80         struct hlist_node hnode;
81 };
82
83 struct kvmgt_guest_info {
84         struct kvm *kvm;
85         struct intel_vgpu *vgpu;
86         struct kvm_page_track_notifier_node track_node;
87 #define NR_BKT (1 << 18)
88         struct hlist_head ptable[NR_BKT];
89 #undef NR_BKT
90         struct dentry *debugfs_cache_entries;
91 };
92
93 struct gvt_dma {
94         struct intel_vgpu *vgpu;
95         struct rb_node gfn_node;
96         struct rb_node dma_addr_node;
97         gfn_t gfn;
98         dma_addr_t dma_addr;
99         unsigned long size;
100         struct kref ref;
101 };
102
/*
 * A handle is considered valid when it has any bit set above the low
 * eight bits; values 0..0xff are treated as "no handle".
 */
static inline bool handle_valid(unsigned long handle)
{
	return (handle >> 8) != 0;
}
107
108 static int kvmgt_guest_init(struct mdev_device *mdev);
109 static void intel_vgpu_release_work(struct work_struct *work);
110 static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);
111
112 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
113                 unsigned long size)
114 {
115         int total_pages;
116         int npage;
117         int ret;
118
119         total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
120
121         for (npage = 0; npage < total_pages; npage++) {
122                 unsigned long cur_gfn = gfn + npage;
123
124                 ret = vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1);
125                 WARN_ON(ret != 1);
126         }
127 }
128
129 /* Pin a normal or compound guest page for dma. */
130 static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
131                 unsigned long size, struct page **page)
132 {
133         unsigned long base_pfn = 0;
134         int total_pages;
135         int npage;
136         int ret;
137
138         total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
139         /*
140          * We pin the pages one-by-one to avoid allocating a big arrary
141          * on stack to hold pfns.
142          */
143         for (npage = 0; npage < total_pages; npage++) {
144                 unsigned long cur_gfn = gfn + npage;
145                 unsigned long pfn;
146
147                 ret = vfio_pin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1,
148                                      IOMMU_READ | IOMMU_WRITE, &pfn);
149                 if (ret != 1) {
150                         gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
151                                      cur_gfn, ret);
152                         goto err;
153                 }
154
155                 if (!pfn_valid(pfn)) {
156                         gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
157                         npage++;
158                         ret = -EFAULT;
159                         goto err;
160                 }
161
162                 if (npage == 0)
163                         base_pfn = pfn;
164                 else if (base_pfn + npage != pfn) {
165                         gvt_vgpu_err("The pages are not continuous\n");
166                         ret = -EINVAL;
167                         npage++;
168                         goto err;
169                 }
170         }
171
172         *page = pfn_to_page(base_pfn);
173         return 0;
174 err:
175         gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
176         return ret;
177 }
178
179 static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
180                 dma_addr_t *dma_addr, unsigned long size)
181 {
182         struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
183         struct page *page = NULL;
184         int ret;
185
186         ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
187         if (ret)
188                 return ret;
189
190         /* Setup DMA mapping. */
191         *dma_addr = dma_map_page(dev, page, 0, size, PCI_DMA_BIDIRECTIONAL);
192         if (dma_mapping_error(dev, *dma_addr)) {
193                 gvt_vgpu_err("DMA mapping failed for pfn 0x%lx, ret %d\n",
194                              page_to_pfn(page), ret);
195                 gvt_unpin_guest_page(vgpu, gfn, size);
196                 return -ENOMEM;
197         }
198
199         return 0;
200 }
201
202 static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
203                 dma_addr_t dma_addr, unsigned long size)
204 {
205         struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
206
207         dma_unmap_page(dev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
208         gvt_unpin_guest_page(vgpu, gfn, size);
209 }
210
211 static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
212                 dma_addr_t dma_addr)
213 {
214         struct rb_node *node = vgpu->vdev.dma_addr_cache.rb_node;
215         struct gvt_dma *itr;
216
217         while (node) {
218                 itr = rb_entry(node, struct gvt_dma, dma_addr_node);
219
220                 if (dma_addr < itr->dma_addr)
221                         node = node->rb_left;
222                 else if (dma_addr > itr->dma_addr)
223                         node = node->rb_right;
224                 else
225                         return itr;
226         }
227         return NULL;
228 }
229
230 static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
231 {
232         struct rb_node *node = vgpu->vdev.gfn_cache.rb_node;
233         struct gvt_dma *itr;
234
235         while (node) {
236                 itr = rb_entry(node, struct gvt_dma, gfn_node);
237
238                 if (gfn < itr->gfn)
239                         node = node->rb_left;
240                 else if (gfn > itr->gfn)
241                         node = node->rb_right;
242                 else
243                         return itr;
244         }
245         return NULL;
246 }
247
248 static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
249                 dma_addr_t dma_addr, unsigned long size)
250 {
251         struct gvt_dma *new, *itr;
252         struct rb_node **link, *parent = NULL;
253
254         new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
255         if (!new)
256                 return -ENOMEM;
257
258         new->vgpu = vgpu;
259         new->gfn = gfn;
260         new->dma_addr = dma_addr;
261         new->size = size;
262         kref_init(&new->ref);
263
264         /* gfn_cache maps gfn to struct gvt_dma. */
265         link = &vgpu->vdev.gfn_cache.rb_node;
266         while (*link) {
267                 parent = *link;
268                 itr = rb_entry(parent, struct gvt_dma, gfn_node);
269
270                 if (gfn < itr->gfn)
271                         link = &parent->rb_left;
272                 else
273                         link = &parent->rb_right;
274         }
275         rb_link_node(&new->gfn_node, parent, link);
276         rb_insert_color(&new->gfn_node, &vgpu->vdev.gfn_cache);
277
278         /* dma_addr_cache maps dma addr to struct gvt_dma. */
279         parent = NULL;
280         link = &vgpu->vdev.dma_addr_cache.rb_node;
281         while (*link) {
282                 parent = *link;
283                 itr = rb_entry(parent, struct gvt_dma, dma_addr_node);
284
285                 if (dma_addr < itr->dma_addr)
286                         link = &parent->rb_left;
287                 else
288                         link = &parent->rb_right;
289         }
290         rb_link_node(&new->dma_addr_node, parent, link);
291         rb_insert_color(&new->dma_addr_node, &vgpu->vdev.dma_addr_cache);
292
293         vgpu->vdev.nr_cache_entries++;
294         return 0;
295 }
296
297 static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
298                                 struct gvt_dma *entry)
299 {
300         rb_erase(&entry->gfn_node, &vgpu->vdev.gfn_cache);
301         rb_erase(&entry->dma_addr_node, &vgpu->vdev.dma_addr_cache);
302         kfree(entry);
303         vgpu->vdev.nr_cache_entries--;
304 }
305
306 static void gvt_cache_destroy(struct intel_vgpu *vgpu)
307 {
308         struct gvt_dma *dma;
309         struct rb_node *node = NULL;
310
311         for (;;) {
312                 mutex_lock(&vgpu->vdev.cache_lock);
313                 node = rb_first(&vgpu->vdev.gfn_cache);
314                 if (!node) {
315                         mutex_unlock(&vgpu->vdev.cache_lock);
316                         break;
317                 }
318                 dma = rb_entry(node, struct gvt_dma, gfn_node);
319                 gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
320                 __gvt_cache_remove_entry(vgpu, dma);
321                 mutex_unlock(&vgpu->vdev.cache_lock);
322         }
323 }
324
325 static void gvt_cache_init(struct intel_vgpu *vgpu)
326 {
327         vgpu->vdev.gfn_cache = RB_ROOT;
328         vgpu->vdev.dma_addr_cache = RB_ROOT;
329         vgpu->vdev.nr_cache_entries = 0;
330         mutex_init(&vgpu->vdev.cache_lock);
331 }
332
333 static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
334 {
335         hash_init(info->ptable);
336 }
337
338 static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
339 {
340         struct kvmgt_pgfn *p;
341         struct hlist_node *tmp;
342         int i;
343
344         hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
345                 hash_del(&p->hnode);
346                 kfree(p);
347         }
348 }
349
350 static struct kvmgt_pgfn *
351 __kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
352 {
353         struct kvmgt_pgfn *p, *res = NULL;
354
355         hash_for_each_possible(info->ptable, p, hnode, gfn) {
356                 if (gfn == p->gfn) {
357                         res = p;
358                         break;
359                 }
360         }
361
362         return res;
363 }
364
365 static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
366                                 gfn_t gfn)
367 {
368         struct kvmgt_pgfn *p;
369
370         p = __kvmgt_protect_table_find(info, gfn);
371         return !!p;
372 }
373
374 static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
375 {
376         struct kvmgt_pgfn *p;
377
378         if (kvmgt_gfn_is_write_protected(info, gfn))
379                 return;
380
381         p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
382         if (WARN(!p, "gfn: 0x%llx\n", gfn))
383                 return;
384
385         p->gfn = gfn;
386         hash_add(info->ptable, &p->hnode, gfn);
387 }
388
389 static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
390                                 gfn_t gfn)
391 {
392         struct kvmgt_pgfn *p;
393
394         p = __kvmgt_protect_table_find(info, gfn);
395         if (p) {
396                 hash_del(&p->hnode);
397                 kfree(p);
398         }
399 }
400
401 static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
402                 size_t count, loff_t *ppos, bool iswrite)
403 {
404         unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
405                         VFIO_PCI_NUM_REGIONS;
406         void *base = vgpu->vdev.region[i].data;
407         loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
408
409         if (pos >= vgpu->vdev.region[i].size || iswrite) {
410                 gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
411                 return -EINVAL;
412         }
413         count = min(count, (size_t)(vgpu->vdev.region[i].size - pos));
414         memcpy(buf, base + pos, count);
415
416         return count;
417 }
418
/* release callback of the OpRegion region; intentionally does nothing. */
static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
		struct vfio_region *region)
{
}
423
424 static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
425         .rw = intel_vgpu_reg_rw_opregion,
426         .release = intel_vgpu_reg_release_opregion,
427 };
428
429 static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
430                 unsigned int type, unsigned int subtype,
431                 const struct intel_vgpu_regops *ops,
432                 size_t size, u32 flags, void *data)
433 {
434         struct vfio_region *region;
435
436         region = krealloc(vgpu->vdev.region,
437                         (vgpu->vdev.num_regions + 1) * sizeof(*region),
438                         GFP_KERNEL);
439         if (!region)
440                 return -ENOMEM;
441
442         vgpu->vdev.region = region;
443         vgpu->vdev.region[vgpu->vdev.num_regions].type = type;
444         vgpu->vdev.region[vgpu->vdev.num_regions].subtype = subtype;
445         vgpu->vdev.region[vgpu->vdev.num_regions].ops = ops;
446         vgpu->vdev.region[vgpu->vdev.num_regions].size = size;
447         vgpu->vdev.region[vgpu->vdev.num_regions].flags = flags;
448         vgpu->vdev.region[vgpu->vdev.num_regions].data = data;
449         vgpu->vdev.num_regions++;
450         return 0;
451 }
452
453 static int kvmgt_get_vfio_device(void *p_vgpu)
454 {
455         struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
456
457         vgpu->vdev.vfio_device = vfio_device_get_from_dev(
458                 mdev_dev(vgpu->vdev.mdev));
459         if (!vgpu->vdev.vfio_device) {
460                 gvt_vgpu_err("failed to get vfio device\n");
461                 return -ENODEV;
462         }
463         return 0;
464 }
465
466
467 static int kvmgt_set_opregion(void *p_vgpu)
468 {
469         struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
470         void *base;
471         int ret;
472
473         /* Each vgpu has its own opregion, although VFIO would create another
474          * one later. This one is used to expose opregion to VFIO. And the
475          * other one created by VFIO later, is used by guest actually.
476          */
477         base = vgpu_opregion(vgpu)->va;
478         if (!base)
479                 return -ENOMEM;
480
481         if (memcmp(base, OPREGION_SIGNATURE, 16)) {
482                 memunmap(base);
483                 return -EINVAL;
484         }
485
486         ret = intel_vgpu_register_reg(vgpu,
487                         PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
488                         VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
489                         &intel_vgpu_regops_opregion, OPREGION_SIZE,
490                         VFIO_REGION_INFO_FLAG_READ, base);
491
492         return ret;
493 }
494
495 static void kvmgt_put_vfio_device(void *vgpu)
496 {
497         if (WARN_ON(!((struct intel_vgpu *)vgpu)->vdev.vfio_device))
498                 return;
499
500         vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device);
501 }
502
503 static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
504 {
505         struct intel_vgpu *vgpu = NULL;
506         struct intel_vgpu_type *type;
507         struct device *pdev;
508         void *gvt;
509         int ret;
510
511         pdev = mdev_parent_dev(mdev);
512         gvt = kdev_to_i915(pdev)->gvt;
513
514         type = intel_gvt_ops->gvt_find_vgpu_type(gvt, kobject_name(kobj));
515         if (!type) {
516                 gvt_vgpu_err("failed to find type %s to create\n",
517                                                 kobject_name(kobj));
518                 ret = -EINVAL;
519                 goto out;
520         }
521
522         vgpu = intel_gvt_ops->vgpu_create(gvt, type);
523         if (IS_ERR_OR_NULL(vgpu)) {
524                 ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
525                 gvt_err("failed to create intel vgpu: %d\n", ret);
526                 goto out;
527         }
528
529         INIT_WORK(&vgpu->vdev.release_work, intel_vgpu_release_work);
530
531         vgpu->vdev.mdev = mdev;
532         mdev_set_drvdata(mdev, vgpu);
533
534         gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
535                      dev_name(mdev_dev(mdev)));
536         ret = 0;
537
538 out:
539         return ret;
540 }
541
542 static int intel_vgpu_remove(struct mdev_device *mdev)
543 {
544         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
545
546         if (handle_valid(vgpu->handle))
547                 return -EBUSY;
548
549         intel_gvt_ops->vgpu_destroy(vgpu);
550         return 0;
551 }
552
553 static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
554                                      unsigned long action, void *data)
555 {
556         struct intel_vgpu *vgpu = container_of(nb,
557                                         struct intel_vgpu,
558                                         vdev.iommu_notifier);
559
560         if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
561                 struct vfio_iommu_type1_dma_unmap *unmap = data;
562                 struct gvt_dma *entry;
563                 unsigned long iov_pfn, end_iov_pfn;
564
565                 iov_pfn = unmap->iova >> PAGE_SHIFT;
566                 end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;
567
568                 mutex_lock(&vgpu->vdev.cache_lock);
569                 for (; iov_pfn < end_iov_pfn; iov_pfn++) {
570                         entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
571                         if (!entry)
572                                 continue;
573
574                         gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
575                                            entry->size);
576                         __gvt_cache_remove_entry(vgpu, entry);
577                 }
578                 mutex_unlock(&vgpu->vdev.cache_lock);
579         }
580
581         return NOTIFY_OK;
582 }
583
584 static int intel_vgpu_group_notifier(struct notifier_block *nb,
585                                      unsigned long action, void *data)
586 {
587         struct intel_vgpu *vgpu = container_of(nb,
588                                         struct intel_vgpu,
589                                         vdev.group_notifier);
590
591         /* the only action we care about */
592         if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
593                 vgpu->vdev.kvm = data;
594
595                 if (!data)
596                         schedule_work(&vgpu->vdev.release_work);
597         }
598
599         return NOTIFY_OK;
600 }
601
602 static int intel_vgpu_open(struct mdev_device *mdev)
603 {
604         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
605         unsigned long events;
606         int ret;
607
608         vgpu->vdev.iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
609         vgpu->vdev.group_notifier.notifier_call = intel_vgpu_group_notifier;
610
611         events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
612         ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
613                                 &vgpu->vdev.iommu_notifier);
614         if (ret != 0) {
615                 gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
616                         ret);
617                 goto out;
618         }
619
620         events = VFIO_GROUP_NOTIFY_SET_KVM;
621         ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
622                                 &vgpu->vdev.group_notifier);
623         if (ret != 0) {
624                 gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
625                         ret);
626                 goto undo_iommu;
627         }
628
629         ret = kvmgt_guest_init(mdev);
630         if (ret)
631                 goto undo_group;
632
633         intel_gvt_ops->vgpu_activate(vgpu);
634
635         atomic_set(&vgpu->vdev.released, 0);
636         return ret;
637
638 undo_group:
639         vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
640                                         &vgpu->vdev.group_notifier);
641
642 undo_iommu:
643         vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
644                                         &vgpu->vdev.iommu_notifier);
645 out:
646         return ret;
647 }
648
649 static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
650 {
651         struct eventfd_ctx *trigger;
652
653         trigger = vgpu->vdev.msi_trigger;
654         if (trigger) {
655                 eventfd_ctx_put(trigger);
656                 vgpu->vdev.msi_trigger = NULL;
657         }
658 }
659
660 static void __intel_vgpu_release(struct intel_vgpu *vgpu)
661 {
662         struct kvmgt_guest_info *info;
663         int ret;
664
665         if (!handle_valid(vgpu->handle))
666                 return;
667
668         if (atomic_cmpxchg(&vgpu->vdev.released, 0, 1))
669                 return;
670
671         intel_gvt_ops->vgpu_release(vgpu);
672
673         ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_IOMMU_NOTIFY,
674                                         &vgpu->vdev.iommu_notifier);
675         WARN(ret, "vfio_unregister_notifier for iommu failed: %d\n", ret);
676
677         ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_GROUP_NOTIFY,
678                                         &vgpu->vdev.group_notifier);
679         WARN(ret, "vfio_unregister_notifier for group failed: %d\n", ret);
680
681         info = (struct kvmgt_guest_info *)vgpu->handle;
682         kvmgt_guest_exit(info);
683
684         intel_vgpu_release_msi_eventfd_ctx(vgpu);
685
686         vgpu->vdev.kvm = NULL;
687         vgpu->handle = 0;
688 }
689
/* mdev "release" hook: run the common release path for the bound vGPU. */
static void intel_vgpu_release(struct mdev_device *mdev)
{
	__intel_vgpu_release(mdev_get_drvdata(mdev));
}
696
697 static void intel_vgpu_release_work(struct work_struct *work)
698 {
699         struct intel_vgpu *vgpu = container_of(work, struct intel_vgpu,
700                                         vdev.release_work);
701
702         __intel_vgpu_release(vgpu);
703 }
704
705 static uint64_t intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
706 {
707         u32 start_lo, start_hi;
708         u32 mem_type;
709
710         start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
711                         PCI_BASE_ADDRESS_MEM_MASK;
712         mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
713                         PCI_BASE_ADDRESS_MEM_TYPE_MASK;
714
715         switch (mem_type) {
716         case PCI_BASE_ADDRESS_MEM_TYPE_64:
717                 start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
718                                                 + bar + 4));
719                 break;
720         case PCI_BASE_ADDRESS_MEM_TYPE_32:
721         case PCI_BASE_ADDRESS_MEM_TYPE_1M:
722                 /* 1M mem BAR treated as 32-bit BAR */
723         default:
724                 /* mem unknown type treated as 32-bit BAR */
725                 start_hi = 0;
726                 break;
727         }
728
729         return ((u64)start_hi << 32) | start_lo;
730 }
731
732 static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, uint64_t off,
733                              void *buf, unsigned int count, bool is_write)
734 {
735         uint64_t bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
736         int ret;
737
738         if (is_write)
739                 ret = intel_gvt_ops->emulate_mmio_write(vgpu,
740                                         bar_start + off, buf, count);
741         else
742                 ret = intel_gvt_ops->emulate_mmio_read(vgpu,
743                                         bar_start + off, buf, count);
744         return ret;
745 }
746
747 static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, uint64_t off)
748 {
749         return off >= vgpu_aperture_offset(vgpu) &&
750                off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
751 }
752
753 static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, uint64_t off,
754                 void *buf, unsigned long count, bool is_write)
755 {
756         void *aperture_va;
757
758         if (!intel_vgpu_in_aperture(vgpu, off) ||
759             !intel_vgpu_in_aperture(vgpu, off + count)) {
760                 gvt_vgpu_err("Invalid aperture offset %llu\n", off);
761                 return -EINVAL;
762         }
763
764         aperture_va = io_mapping_map_wc(&vgpu->gvt->dev_priv->ggtt.iomap,
765                                         ALIGN_DOWN(off, PAGE_SIZE),
766                                         count + offset_in_page(off));
767         if (!aperture_va)
768                 return -EIO;
769
770         if (is_write)
771                 memcpy(aperture_va + offset_in_page(off), buf, count);
772         else
773                 memcpy(buf, aperture_va + offset_in_page(off), count);
774
775         io_mapping_unmap(aperture_va);
776
777         return 0;
778 }
779
780 static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
781                         size_t count, loff_t *ppos, bool is_write)
782 {
783         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
784         unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
785         uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
786         int ret = -EINVAL;
787
788
789         if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions) {
790                 gvt_vgpu_err("invalid index: %u\n", index);
791                 return -EINVAL;
792         }
793
794         switch (index) {
795         case VFIO_PCI_CONFIG_REGION_INDEX:
796                 if (is_write)
797                         ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
798                                                 buf, count);
799                 else
800                         ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
801                                                 buf, count);
802                 break;
803         case VFIO_PCI_BAR0_REGION_INDEX:
804                 ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
805                                         buf, count, is_write);
806                 break;
807         case VFIO_PCI_BAR2_REGION_INDEX:
808                 ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
809                 break;
810         case VFIO_PCI_BAR1_REGION_INDEX:
811         case VFIO_PCI_BAR3_REGION_INDEX:
812         case VFIO_PCI_BAR4_REGION_INDEX:
813         case VFIO_PCI_BAR5_REGION_INDEX:
814         case VFIO_PCI_VGA_REGION_INDEX:
815         case VFIO_PCI_ROM_REGION_INDEX:
816                 break;
817         default:
818                 if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions)
819                         return -EINVAL;
820
821                 index -= VFIO_PCI_NUM_REGIONS;
822                 return vgpu->vdev.region[index].ops->rw(vgpu, buf, count,
823                                 ppos, is_write);
824         }
825
826         return ret == 0 ? count : ret;
827 }
828
829 static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
830 {
831         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
832         unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
833         struct intel_gvt *gvt = vgpu->gvt;
834         int offset;
835
836         /* Only allow MMIO GGTT entry access */
837         if (index != PCI_BASE_ADDRESS_0)
838                 return false;
839
840         offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
841                 intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);
842
843         return (offset >= gvt->device_info.gtt_start_offset &&
844                 offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
845                         true : false;
846 }
847
848 static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
849                         size_t count, loff_t *ppos)
850 {
851         unsigned int done = 0;
852         int ret;
853
854         while (count) {
855                 size_t filled;
856
857                 /* Only support GGTT entry 8 bytes read */
858                 if (count >= 8 && !(*ppos % 8) &&
859                         gtt_entry(mdev, ppos)) {
860                         u64 val;
861
862                         ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
863                                         ppos, false);
864                         if (ret <= 0)
865                                 goto read_err;
866
867                         if (copy_to_user(buf, &val, sizeof(val)))
868                                 goto read_err;
869
870                         filled = 8;
871                 } else if (count >= 4 && !(*ppos % 4)) {
872                         u32 val;
873
874                         ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
875                                         ppos, false);
876                         if (ret <= 0)
877                                 goto read_err;
878
879                         if (copy_to_user(buf, &val, sizeof(val)))
880                                 goto read_err;
881
882                         filled = 4;
883                 } else if (count >= 2 && !(*ppos % 2)) {
884                         u16 val;
885
886                         ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
887                                         ppos, false);
888                         if (ret <= 0)
889                                 goto read_err;
890
891                         if (copy_to_user(buf, &val, sizeof(val)))
892                                 goto read_err;
893
894                         filled = 2;
895                 } else {
896                         u8 val;
897
898                         ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
899                                         false);
900                         if (ret <= 0)
901                                 goto read_err;
902
903                         if (copy_to_user(buf, &val, sizeof(val)))
904                                 goto read_err;
905
906                         filled = 1;
907                 }
908
909                 count -= filled;
910                 done += filled;
911                 *ppos += filled;
912                 buf += filled;
913         }
914
915         return done;
916
917 read_err:
918         return -EFAULT;
919 }
920
921 static ssize_t intel_vgpu_write(struct mdev_device *mdev,
922                                 const char __user *buf,
923                                 size_t count, loff_t *ppos)
924 {
925         unsigned int done = 0;
926         int ret;
927
928         while (count) {
929                 size_t filled;
930
931                 /* Only support GGTT entry 8 bytes write */
932                 if (count >= 8 && !(*ppos % 8) &&
933                         gtt_entry(mdev, ppos)) {
934                         u64 val;
935
936                         if (copy_from_user(&val, buf, sizeof(val)))
937                                 goto write_err;
938
939                         ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
940                                         ppos, true);
941                         if (ret <= 0)
942                                 goto write_err;
943
944                         filled = 8;
945                 } else if (count >= 4 && !(*ppos % 4)) {
946                         u32 val;
947
948                         if (copy_from_user(&val, buf, sizeof(val)))
949                                 goto write_err;
950
951                         ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
952                                         ppos, true);
953                         if (ret <= 0)
954                                 goto write_err;
955
956                         filled = 4;
957                 } else if (count >= 2 && !(*ppos % 2)) {
958                         u16 val;
959
960                         if (copy_from_user(&val, buf, sizeof(val)))
961                                 goto write_err;
962
963                         ret = intel_vgpu_rw(mdev, (char *)&val,
964                                         sizeof(val), ppos, true);
965                         if (ret <= 0)
966                                 goto write_err;
967
968                         filled = 2;
969                 } else {
970                         u8 val;
971
972                         if (copy_from_user(&val, buf, sizeof(val)))
973                                 goto write_err;
974
975                         ret = intel_vgpu_rw(mdev, &val, sizeof(val),
976                                         ppos, true);
977                         if (ret <= 0)
978                                 goto write_err;
979
980                         filled = 1;
981                 }
982
983                 count -= filled;
984                 done += filled;
985                 *ppos += filled;
986                 buf += filled;
987         }
988
989         return done;
990 write_err:
991         return -EFAULT;
992 }
993
994 static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
995 {
996         unsigned int index;
997         u64 virtaddr;
998         unsigned long req_size, pgoff = 0;
999         pgprot_t pg_prot;
1000         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1001
1002         index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1003         if (index >= VFIO_PCI_ROM_REGION_INDEX)
1004                 return -EINVAL;
1005
1006         if (vma->vm_end < vma->vm_start)
1007                 return -EINVAL;
1008         if ((vma->vm_flags & VM_SHARED) == 0)
1009                 return -EINVAL;
1010         if (index != VFIO_PCI_BAR2_REGION_INDEX)
1011                 return -EINVAL;
1012
1013         pg_prot = vma->vm_page_prot;
1014         virtaddr = vma->vm_start;
1015         req_size = vma->vm_end - vma->vm_start;
1016         pgoff = vgpu_aperture_pa_base(vgpu) >> PAGE_SHIFT;
1017
1018         return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
1019 }
1020
1021 static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
1022 {
1023         if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
1024                 return 1;
1025
1026         return 0;
1027 }
1028
1029 static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
1030                         unsigned int index, unsigned int start,
1031                         unsigned int count, uint32_t flags,
1032                         void *data)
1033 {
1034         return 0;
1035 }
1036
1037 static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
1038                         unsigned int index, unsigned int start,
1039                         unsigned int count, uint32_t flags, void *data)
1040 {
1041         return 0;
1042 }
1043
1044 static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
1045                 unsigned int index, unsigned int start, unsigned int count,
1046                 uint32_t flags, void *data)
1047 {
1048         return 0;
1049 }
1050
1051 static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
1052                 unsigned int index, unsigned int start, unsigned int count,
1053                 uint32_t flags, void *data)
1054 {
1055         struct eventfd_ctx *trigger;
1056
1057         if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
1058                 int fd = *(int *)data;
1059
1060                 trigger = eventfd_ctx_fdget(fd);
1061                 if (IS_ERR(trigger)) {
1062                         gvt_vgpu_err("eventfd_ctx_fdget failed\n");
1063                         return PTR_ERR(trigger);
1064                 }
1065                 vgpu->vdev.msi_trigger = trigger;
1066         } else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
1067                 intel_vgpu_release_msi_eventfd_ctx(vgpu);
1068
1069         return 0;
1070 }
1071
1072 static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, uint32_t flags,
1073                 unsigned int index, unsigned int start, unsigned int count,
1074                 void *data)
1075 {
1076         int (*func)(struct intel_vgpu *vgpu, unsigned int index,
1077                         unsigned int start, unsigned int count, uint32_t flags,
1078                         void *data) = NULL;
1079
1080         switch (index) {
1081         case VFIO_PCI_INTX_IRQ_INDEX:
1082                 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1083                 case VFIO_IRQ_SET_ACTION_MASK:
1084                         func = intel_vgpu_set_intx_mask;
1085                         break;
1086                 case VFIO_IRQ_SET_ACTION_UNMASK:
1087                         func = intel_vgpu_set_intx_unmask;
1088                         break;
1089                 case VFIO_IRQ_SET_ACTION_TRIGGER:
1090                         func = intel_vgpu_set_intx_trigger;
1091                         break;
1092                 }
1093                 break;
1094         case VFIO_PCI_MSI_IRQ_INDEX:
1095                 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1096                 case VFIO_IRQ_SET_ACTION_MASK:
1097                 case VFIO_IRQ_SET_ACTION_UNMASK:
1098                         /* XXX Need masking support exported */
1099                         break;
1100                 case VFIO_IRQ_SET_ACTION_TRIGGER:
1101                         func = intel_vgpu_set_msi_trigger;
1102                         break;
1103                 }
1104                 break;
1105         }
1106
1107         if (!func)
1108                 return -ENOTTY;
1109
1110         return func(vgpu, index, start, count, flags, data);
1111 }
1112
/*
 * intel_vgpu_ioctl - VFIO device ioctl entry point for an Intel vGPU.
 * @mdev: mediated device; its driver data is the struct intel_vgpu
 * @cmd:  VFIO_DEVICE_* command
 * @arg:  user-space pointer to the command's argument structure
 *
 * Handles GET_INFO, GET_REGION_INFO, GET_IRQ_INFO, SET_IRQS, RESET,
 * QUERY_GFX_PLANE and GET_GFX_DMABUF. Any other command yields -ENOTTY.
 * Argument structures are copied in only up to the last field this code
 * uses (minsz) and copied back with the same length.
 */
1113 static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
1114                              unsigned long arg)
1115 {
1116         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1117         unsigned long minsz;
1118
1119         gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);
1120
1121         if (cmd == VFIO_DEVICE_GET_INFO) {
             /* Report device flags plus the region and interrupt counts. */
1122                 struct vfio_device_info info;
1123
1124                 minsz = offsetofend(struct vfio_device_info, num_irqs);
1125
1126                 if (copy_from_user(&info, (void __user *)arg, minsz))
1127                         return -EFAULT;
1128
1129                 if (info.argsz < minsz)
1130                         return -EINVAL;
1131
1132                 info.flags = VFIO_DEVICE_FLAGS_PCI;
1133                 info.flags |= VFIO_DEVICE_FLAGS_RESET;
                     /* Fixed PCI regions plus the vGPU's additional regions. */
1134                 info.num_regions = VFIO_PCI_NUM_REGIONS +
1135                                 vgpu->vdev.num_regions;
1136                 info.num_irqs = VFIO_PCI_NUM_IRQS;
1137
1138                 return copy_to_user((void __user *)arg, &info, minsz) ?
1139                         -EFAULT : 0;
1140
1141         } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
             /*
              * Describe one region. "sparse" is allocated only for BAR2 and
              * "caps" collects capability data; both are freed on every
              * path that leaves this branch after they have been set up.
              */
1142                 struct vfio_region_info info;
1143                 struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
1144                 unsigned int i;
1145                 int ret;
1146                 struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
1147                 size_t size;
1148                 int nr_areas = 1;
1149                 int cap_type_id;
1150
1151                 minsz = offsetofend(struct vfio_region_info, offset);
1152
1153                 if (copy_from_user(&info, (void __user *)arg, minsz))
1154                         return -EFAULT;
1155
1156                 if (info.argsz < minsz)
1157                         return -EINVAL;
1158
1159                 switch (info.index) {
1160                 case VFIO_PCI_CONFIG_REGION_INDEX:
1161                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1162                         info.size = vgpu->gvt->device_info.cfg_space_size;
1163                         info.flags = VFIO_REGION_INFO_FLAG_READ |
1164                                      VFIO_REGION_INFO_FLAG_WRITE;
1165                         break;
1166                 case VFIO_PCI_BAR0_REGION_INDEX:
1167                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1168                         info.size = vgpu->cfg_space.bar[info.index].size;
                             /* A zero-sized BAR0 is reported with no access flags. */
1169                         if (!info.size) {
1170                                 info.flags = 0;
1171                                 break;
1172                         }
1173
1174                         info.flags = VFIO_REGION_INFO_FLAG_READ |
1175                                      VFIO_REGION_INFO_FLAG_WRITE;
1176                         break;
1177                 case VFIO_PCI_BAR1_REGION_INDEX:
1178                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1179                         info.size = 0;
1180                         info.flags = 0;
1181                         break;
1182                 case VFIO_PCI_BAR2_REGION_INDEX:
                             /*
                              * BAR2 is readable, writable and mmap-able. A
                              * sparse-mmap capability with a single area
                              * (this vGPU's aperture offset and size) is
                              * built here and attached after the switch.
                              */
1183                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1184                         info.flags = VFIO_REGION_INFO_FLAG_CAPS |
1185                                         VFIO_REGION_INFO_FLAG_MMAP |
1186                                         VFIO_REGION_INFO_FLAG_READ |
1187                                         VFIO_REGION_INFO_FLAG_WRITE;
1188                         info.size = gvt_aperture_sz(vgpu->gvt);
1189
1190                         size = sizeof(*sparse) +
1191                                         (nr_areas * sizeof(*sparse->areas));
1192                         sparse = kzalloc(size, GFP_KERNEL);
1193                         if (!sparse)
1194                                 return -ENOMEM;
1195
1196                         sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1197                         sparse->header.version = 1;
1198                         sparse->nr_areas = nr_areas;
1199                         cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1200                         sparse->areas[0].offset =
1201                                         PAGE_ALIGN(vgpu_aperture_offset(vgpu));
1202                         sparse->areas[0].size = vgpu_aperture_sz(vgpu);
1203                         break;
1204
1205                 case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1206                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1207                         info.size = 0;
1208                         info.flags = 0;
1209
1210                         gvt_dbg_core("get region info bar:%d\n", info.index);
1211                         break;
1212
1213                 case VFIO_PCI_ROM_REGION_INDEX:
1214                 case VFIO_PCI_VGA_REGION_INDEX:
1215                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1216                         info.size = 0;
1217                         info.flags = 0;
1218
1219                         gvt_dbg_core("get region info index:%d\n", info.index);
1220                         break;
1221                 default:
                             /*
                              * Any other index refers to one of the vGPU's
                              * additional regions, stored in
                              * vgpu->vdev.region[] at (index -
                              * VFIO_PCI_NUM_REGIONS). The index is range
                              * checked and then clamped with
                              * array_index_nospec() before it is used.
                              */
1222                         {
1223                                 struct vfio_region_info_cap_type cap_type = {
1224                                         .header.id = VFIO_REGION_INFO_CAP_TYPE,
1225                                         .header.version = 1 };
1226
1227                                 if (info.index >= VFIO_PCI_NUM_REGIONS +
1228                                                 vgpu->vdev.num_regions)
1229                                         return -EINVAL;
1230                                 info.index =
1231                                         array_index_nospec(info.index,
1232                                                         VFIO_PCI_NUM_REGIONS +
1233                                                         vgpu->vdev.num_regions);
1234
1235                                 i = info.index - VFIO_PCI_NUM_REGIONS;
1236
1237                                 info.offset =
1238                                         VFIO_PCI_INDEX_TO_OFFSET(info.index);
1239                                 info.size = vgpu->vdev.region[i].size;
1240                                 info.flags = vgpu->vdev.region[i].flags;
1241
1242                                 cap_type.type = vgpu->vdev.region[i].type;
1243                                 cap_type.subtype = vgpu->vdev.region[i].subtype;
1244
1245                                 ret = vfio_info_add_capability(&caps,
1246                                                         &cap_type.header,
1247                                                         sizeof(cap_type));
1248                                 if (ret)
1249                                         return ret;
1250                         }
1251                 }
1252
                     /* cap_type_id is only read when sparse was set up (BAR2). */
1253                 if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
1254                         switch (cap_type_id) {
1255                         case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1256                                 ret = vfio_info_add_capability(&caps,
1257                                         &sparse->header, sizeof(*sparse) +
1258                                         (sparse->nr_areas *
1259                                                 sizeof(*sparse->areas)));
1260                                 if (ret) {
1261                                         kfree(sparse);
1262                                         return ret;
1263                                 }
1264                                 break;
1265                         default:
1266                                 kfree(sparse);
1267                                 return -EINVAL;
1268                         }
1269                 }
1270
                     /*
                      * If the caller's buffer cannot hold the capability data,
                      * report the required size in argsz and a zero cap_offset;
                      * otherwise copy the data right after the info structure.
                      */
1271                 if (caps.size) {
1272                         info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1273                         if (info.argsz < sizeof(info) + caps.size) {
1274                                 info.argsz = sizeof(info) + caps.size;
1275                                 info.cap_offset = 0;
1276                         } else {
1277                                 vfio_info_cap_shift(&caps, sizeof(info));
1278                                 if (copy_to_user((void __user *)arg +
1279                                                   sizeof(info), caps.buf,
1280                                                   caps.size)) {
1281                                         kfree(caps.buf);
1282                                         kfree(sparse);
1283                                         return -EFAULT;
1284                                 }
1285                                 info.cap_offset = sizeof(info);
1286                         }
1287
1288                         kfree(caps.buf);
1289                 }
1290
1291                 kfree(sparse);
1292                 return copy_to_user((void __user *)arg, &info, minsz) ?
1293                         -EFAULT : 0;
1294         } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
             /* Only the INTx and MSI indexes are described; others get -EINVAL. */
1295                 struct vfio_irq_info info;
1296
1297                 minsz = offsetofend(struct vfio_irq_info, count);
1298
1299                 if (copy_from_user(&info, (void __user *)arg, minsz))
1300                         return -EFAULT;
1301
1302                 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
1303                         return -EINVAL;
1304
1305                 switch (info.index) {
1306                 case VFIO_PCI_INTX_IRQ_INDEX:
1307                 case VFIO_PCI_MSI_IRQ_INDEX:
1308                         break;
1309                 default:
1310                         return -EINVAL;
1311                 }
1312
1313                 info.flags = VFIO_IRQ_INFO_EVENTFD;
1314
1315                 info.count = intel_vgpu_get_irq_count(vgpu, info.index);
1316
1317                 if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
1318                         info.flags |= (VFIO_IRQ_INFO_MASKABLE |
1319                                        VFIO_IRQ_INFO_AUTOMASKED);
1320                 else
1321                         info.flags |= VFIO_IRQ_INFO_NORESIZE;
1322
1323                 return copy_to_user((void __user *)arg, &info, minsz) ?
1324                         -EFAULT : 0;
1325         } else if (cmd == VFIO_DEVICE_SET_IRQS) {
1326                 struct vfio_irq_set hdr;
1327                 u8 *data = NULL;
1328                 int ret = 0;
1329                 size_t data_size = 0;
1330
1331                 minsz = offsetofend(struct vfio_irq_set, count);
1332
1333                 if (copy_from_user(&hdr, (void __user *)arg, minsz))
1334                         return -EFAULT;
1335
                     /*
                      * The header is validated and the payload copied in only
                      * when the DATA_NONE flag is clear. Otherwise "data"
                      * stays NULL and the header goes to the handler as is.
                      */
1336                 if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
1337                         int max = intel_vgpu_get_irq_count(vgpu, hdr.index);
1338
1339                         ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
1340                                                 VFIO_PCI_NUM_IRQS, &data_size);
1341                         if (ret) {
1342                                 gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
1343                                 return -EINVAL;
1344                         }
1345                         if (data_size) {
1346                                 data = memdup_user((void __user *)(arg + minsz),
1347                                                    data_size);
1348                                 if (IS_ERR(data))
1349                                         return PTR_ERR(data);
1350                         }
1351                 }
1352
1353                 ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
1354                                         hdr.start, hdr.count, data);
1355                 kfree(data);
1356
1357                 return ret;
1358         } else if (cmd == VFIO_DEVICE_RESET) {
1359                 intel_gvt_ops->vgpu_reset(vgpu);
1360                 return 0;
1361         } else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
             /* Let vgpu_query_plane() fill the plane info, then copy it back. */
1362                 struct vfio_device_gfx_plane_info dmabuf;
1363                 int ret = 0;
1364
1365                 minsz = offsetofend(struct vfio_device_gfx_plane_info,
1366                                     dmabuf_id);
1367                 if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
1368                         return -EFAULT;
1369                 if (dmabuf.argsz < minsz)
1370                         return -EINVAL;
1371
1372                 ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
1373                 if (ret != 0)
1374                         return ret;
1375
1376                 return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
1377                                                                 -EFAULT : 0;
1378         } else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
             /* The ioctl result is whatever vgpu_get_dmabuf() returns. */
1379                 __u32 dmabuf_id;
1380                 __s32 dmabuf_fd;
1381
1382                 if (get_user(dmabuf_id, (__u32 __user *)arg))
1383                         return -EFAULT;
1384
1385                 dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
1386                 return dmabuf_fd;
1387
1388         }
1389
1390         return -ENOTTY;
1391 }
1392
1393 static ssize_t
1394 vgpu_id_show(struct device *dev, struct device_attribute *attr,
1395              char *buf)
1396 {
1397         struct mdev_device *mdev = mdev_from_dev(dev);
1398
1399         if (mdev) {
1400                 struct intel_vgpu *vgpu = (struct intel_vgpu *)
1401                         mdev_get_drvdata(mdev);
1402                 return sprintf(buf, "%d\n", vgpu->id);
1403         }
1404         return sprintf(buf, "\n");
1405 }
1406
1407 static ssize_t
1408 hw_id_show(struct device *dev, struct device_attribute *attr,
1409            char *buf)
1410 {
1411         struct mdev_device *mdev = mdev_from_dev(dev);
1412
1413         if (mdev) {
1414                 struct intel_vgpu *vgpu = (struct intel_vgpu *)
1415                         mdev_get_drvdata(mdev);
1416                 return sprintf(buf, "%u\n",
1417                                vgpu->submission.shadow_ctx->hw_id);
1418         }
1419         return sprintf(buf, "\n");
1420 }
1421
1422 static DEVICE_ATTR_RO(vgpu_id);
1423 static DEVICE_ATTR_RO(hw_id);
1424
1425 static struct attribute *intel_vgpu_attrs[] = {
1426         &dev_attr_vgpu_id.attr,
1427         &dev_attr_hw_id.attr,
1428         NULL
1429 };
1430
1431 static const struct attribute_group intel_vgpu_group = {
1432         .name = "intel_vgpu",
1433         .attrs = intel_vgpu_attrs,
1434 };
1435
1436 static const struct attribute_group *intel_vgpu_groups[] = {
1437         &intel_vgpu_group,
1438         NULL,
1439 };
1440
1441 static struct mdev_parent_ops intel_vgpu_ops = {
1442         .mdev_attr_groups       = intel_vgpu_groups,
1443         .create                 = intel_vgpu_create,
1444         .remove                 = intel_vgpu_remove,
1445
1446         .open                   = intel_vgpu_open,
1447         .release                = intel_vgpu_release,
1448
1449         .read                   = intel_vgpu_read,
1450         .write                  = intel_vgpu_write,
1451         .mmap                   = intel_vgpu_mmap,
1452         .ioctl                  = intel_vgpu_ioctl,
1453 };
1454
1455 static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
1456 {
1457         struct attribute **kvm_type_attrs;
1458         struct attribute_group **kvm_vgpu_type_groups;
1459
1460         intel_gvt_ops = ops;
1461         if (!intel_gvt_ops->get_gvt_attrs(&kvm_type_attrs,
1462                         &kvm_vgpu_type_groups))
1463                 return -EFAULT;
1464         intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups;
1465
1466         return mdev_register_device(dev, &intel_vgpu_ops);
1467 }
1468
/*
 * kvmgt_host_exit - counterpart of kvmgt_host_init().
 *
 * Unregisters @dev as an mdev parent; @gvt is unused.
 */
static void kvmgt_host_exit(struct device *dev, void *gvt)
{
	mdev_unregister_device(dev);
}
1473
1474 static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
1475 {
1476         struct kvmgt_guest_info *info;
1477         struct kvm *kvm;
1478         struct kvm_memory_slot *slot;
1479         int idx;
1480
1481         if (!handle_valid(handle))
1482                 return -ESRCH;
1483
1484         info = (struct kvmgt_guest_info *)handle;
1485         kvm = info->kvm;
1486
1487         idx = srcu_read_lock(&kvm->srcu);
1488         slot = gfn_to_memslot(kvm, gfn);
1489         if (!slot) {
1490                 srcu_read_unlock(&kvm->srcu, idx);
1491                 return -EINVAL;
1492         }
1493
1494         spin_lock(&kvm->mmu_lock);
1495
1496         if (kvmgt_gfn_is_write_protected(info, gfn))
1497                 goto out;
1498
1499         kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1500         kvmgt_protect_table_add(info, gfn);
1501
1502 out:
1503         spin_unlock(&kvm->mmu_lock);
1504         srcu_read_unlock(&kvm->srcu, idx);
1505         return 0;
1506 }
1507
1508 static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
1509 {
1510         struct kvmgt_guest_info *info;
1511         struct kvm *kvm;
1512         struct kvm_memory_slot *slot;
1513         int idx;
1514
1515         if (!handle_valid(handle))
1516                 return 0;
1517
1518         info = (struct kvmgt_guest_info *)handle;
1519         kvm = info->kvm;
1520
1521         idx = srcu_read_lock(&kvm->srcu);
1522         slot = gfn_to_memslot(kvm, gfn);
1523         if (!slot) {
1524                 srcu_read_unlock(&kvm->srcu, idx);
1525                 return -EINVAL;
1526         }
1527
1528         spin_lock(&kvm->mmu_lock);
1529
1530         if (!kvmgt_gfn_is_write_protected(info, gfn))
1531                 goto out;
1532
1533         kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1534         kvmgt_protect_table_del(info, gfn);
1535
1536 out:
1537         spin_unlock(&kvm->mmu_lock);
1538         srcu_read_unlock(&kvm->srcu, idx);
1539         return 0;
1540 }
1541
1542 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1543                 const u8 *val, int len,
1544                 struct kvm_page_track_notifier_node *node)
1545 {
1546         struct kvmgt_guest_info *info = container_of(node,
1547                                         struct kvmgt_guest_info, track_node);
1548
1549         if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
1550                 intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
1551                                                      (void *)val, len);
1552 }
1553
1554 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
1555                 struct kvm_memory_slot *slot,
1556                 struct kvm_page_track_notifier_node *node)
1557 {
1558         int i;
1559         gfn_t gfn;
1560         struct kvmgt_guest_info *info = container_of(node,
1561                                         struct kvmgt_guest_info, track_node);
1562
1563         spin_lock(&kvm->mmu_lock);
1564         for (i = 0; i < slot->npages; i++) {
1565                 gfn = slot->base_gfn + i;
1566                 if (kvmgt_gfn_is_write_protected(info, gfn)) {
1567                         kvm_slot_page_track_remove_page(kvm, slot, gfn,
1568                                                 KVM_PAGE_TRACK_WRITE);
1569                         kvmgt_protect_table_del(info, gfn);
1570                 }
1571         }
1572         spin_unlock(&kvm->mmu_lock);
1573 }
1574
1575 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
1576 {
1577         struct intel_vgpu *itr;
1578         struct kvmgt_guest_info *info;
1579         int id;
1580         bool ret = false;
1581
1582         mutex_lock(&vgpu->gvt->lock);
1583         for_each_active_vgpu(vgpu->gvt, itr, id) {
1584                 if (!handle_valid(itr->handle))
1585                         continue;
1586
1587                 info = (struct kvmgt_guest_info *)itr->handle;
1588                 if (kvm && kvm == info->kvm) {
1589                         ret = true;
1590                         goto out;
1591                 }
1592         }
1593 out:
1594         mutex_unlock(&vgpu->gvt->lock);
1595         return ret;
1596 }
1597
1598 static int kvmgt_guest_init(struct mdev_device *mdev)
1599 {
1600         struct kvmgt_guest_info *info;
1601         struct intel_vgpu *vgpu;
1602         struct kvm *kvm;
1603
1604         vgpu = mdev_get_drvdata(mdev);
1605         if (handle_valid(vgpu->handle))
1606                 return -EEXIST;
1607
1608         kvm = vgpu->vdev.kvm;
1609         if (!kvm || kvm->mm != current->mm) {
1610                 gvt_vgpu_err("KVM is required to use Intel vGPU\n");
1611                 return -ESRCH;
1612         }
1613
1614         if (__kvmgt_vgpu_exist(vgpu, kvm))
1615                 return -EEXIST;
1616
1617         info = vzalloc(sizeof(struct kvmgt_guest_info));
1618         if (!info)
1619                 return -ENOMEM;
1620
1621         vgpu->handle = (unsigned long)info;
1622         info->vgpu = vgpu;
1623         info->kvm = kvm;
1624         kvm_get_kvm(info->kvm);
1625
1626         kvmgt_protect_table_init(info);
1627         gvt_cache_init(vgpu);
1628
1629         init_completion(&vgpu->vblank_done);
1630
1631         info->track_node.track_write = kvmgt_page_track_write;
1632         info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
1633         kvm_page_track_register_notifier(kvm, &info->track_node);
1634
1635         info->debugfs_cache_entries = debugfs_create_ulong(
1636                                                 "kvmgt_nr_cache_entries",
1637                                                 0444, vgpu->debugfs,
1638                                                 &vgpu->vdev.nr_cache_entries);
1639         if (!info->debugfs_cache_entries)
1640                 gvt_vgpu_err("Cannot create kvmgt debugfs entry\n");
1641
1642         return 0;
1643 }
1644
1645 static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
1646 {
1647         debugfs_remove(info->debugfs_cache_entries);
1648
1649         kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
1650         kvm_put_kvm(info->kvm);
1651         kvmgt_protect_table_destroy(info);
1652         gvt_cache_destroy(info->vgpu);
1653         vfree(info);
1654
1655         return true;
1656 }
1657
/*
 * kvmgt_attach_vgpu - attach hook; intentionally empty.
 *
 * Neither @vgpu nor @handle is touched. Always returns 0.
 */
static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle)
{
	return 0;	/* nothing to do here */
}
1663
/* kvmgt_detach_vgpu - detach hook; intentionally empty. */
static void kvmgt_detach_vgpu(unsigned long handle)
{
	/* nothing to do here */
}
1668
1669 static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
1670 {
1671         struct kvmgt_guest_info *info;
1672         struct intel_vgpu *vgpu;
1673
1674         if (!handle_valid(handle))
1675                 return -ESRCH;
1676
1677         info = (struct kvmgt_guest_info *)handle;
1678         vgpu = info->vgpu;
1679
1680         /*
1681          * When guest is poweroff, msi_trigger is set to NULL, but vgpu's
1682          * config and mmio register isn't restored to default during guest
1683          * poweroff. If this vgpu is still used in next vm, this vgpu's pipe
1684          * may be enabled, then once this vgpu is active, it will get inject
1685          * vblank interrupt request. But msi_trigger is null until msi is
1686          * enabled by guest. so if msi_trigger is null, success is still
1687          * returned and don't inject interrupt into guest.
1688          */
1689         if (vgpu->vdev.msi_trigger == NULL)
1690                 return 0;
1691
1692         if (eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1)
1693                 return 0;
1694
1695         return -EFAULT;
1696 }
1697
1698 static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
1699 {
1700         struct kvmgt_guest_info *info;
1701         kvm_pfn_t pfn;
1702
1703         if (!handle_valid(handle))
1704                 return INTEL_GVT_INVALID_ADDR;
1705
1706         info = (struct kvmgt_guest_info *)handle;
1707
1708         pfn = gfn_to_pfn(info->kvm, gfn);
1709         if (is_error_noslot_pfn(pfn))
1710                 return INTEL_GVT_INVALID_ADDR;
1711
1712         return pfn;
1713 }
1714
1715 int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
1716                 unsigned long size, dma_addr_t *dma_addr)
1717 {
1718         struct kvmgt_guest_info *info;
1719         struct intel_vgpu *vgpu;
1720         struct gvt_dma *entry;
1721         int ret;
1722
1723         if (!handle_valid(handle))
1724                 return -EINVAL;
1725
1726         info = (struct kvmgt_guest_info *)handle;
1727         vgpu = info->vgpu;
1728
1729         mutex_lock(&info->vgpu->vdev.cache_lock);
1730
1731         entry = __gvt_cache_find_gfn(info->vgpu, gfn);
1732         if (!entry) {
1733                 ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1734                 if (ret)
1735                         goto err_unlock;
1736
1737                 ret = __gvt_cache_add(info->vgpu, gfn, *dma_addr, size);
1738                 if (ret)
1739                         goto err_unmap;
1740         } else {
1741                 kref_get(&entry->ref);
1742                 *dma_addr = entry->dma_addr;
1743         }
1744
1745         mutex_unlock(&info->vgpu->vdev.cache_lock);
1746         return 0;
1747
1748 err_unmap:
1749         gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
1750 err_unlock:
1751         mutex_unlock(&info->vgpu->vdev.cache_lock);
1752         return ret;
1753 }
1754
1755 static void __gvt_dma_release(struct kref *ref)
1756 {
1757         struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);
1758
1759         gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
1760                            entry->size);
1761         __gvt_cache_remove_entry(entry->vgpu, entry);
1762 }
1763
1764 void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
1765 {
1766         struct kvmgt_guest_info *info;
1767         struct gvt_dma *entry;
1768
1769         if (!handle_valid(handle))
1770                 return;
1771
1772         info = (struct kvmgt_guest_info *)handle;
1773
1774         mutex_lock(&info->vgpu->vdev.cache_lock);
1775         entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
1776         if (entry)
1777                 kref_put(&entry->ref, __gvt_dma_release);
1778         mutex_unlock(&info->vgpu->vdev.cache_lock);
1779 }
1780
1781 static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
1782                         void *buf, unsigned long len, bool write)
1783 {
1784         struct kvmgt_guest_info *info;
1785         struct kvm *kvm;
1786         int idx, ret;
1787         bool kthread = current->mm == NULL;
1788
1789         if (!handle_valid(handle))
1790                 return -ESRCH;
1791
1792         info = (struct kvmgt_guest_info *)handle;
1793         kvm = info->kvm;
1794
1795         if (kthread)
1796                 use_mm(kvm->mm);
1797
1798         idx = srcu_read_lock(&kvm->srcu);
1799         ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
1800                       kvm_read_guest(kvm, gpa, buf, len);
1801         srcu_read_unlock(&kvm->srcu, idx);
1802
1803         if (kthread)
1804                 unuse_mm(kvm->mm);
1805
1806         return ret;
1807 }
1808
/*
 * kvmgt_read_gpa - read @len bytes of guest physical memory into @buf
 *
 * Thin wrapper around kvmgt_rw_gpa() with the direction fixed to "read";
 * the return value is passed through unchanged.
 */
static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	const bool is_write = false;

	return kvmgt_rw_gpa(handle, gpa, buf, len, is_write);
}
1814
/*
 * kvmgt_write_gpa - write @len bytes from @buf into guest physical memory
 *
 * Thin wrapper around kvmgt_rw_gpa() with the direction fixed to "write";
 * the return value is passed through unchanged.
 */
static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	const bool is_write = true;

	return kvmgt_rw_gpa(handle, gpa, buf, len, is_write);
}
1820
/*
 * kvmgt_virt_to_pfn - page frame number of a kernel virtual address
 * @addr: address to convert; it is passed to __pa()
 *
 * Return: PFN_DOWN() of the physical address of @addr.
 */
static unsigned long kvmgt_virt_to_pfn(void *addr)
{
	const unsigned long pfn = PFN_DOWN(__pa(addr));

	return pfn;
}
1825
1826 static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
1827 {
1828         struct kvmgt_guest_info *info;
1829         struct kvm *kvm;
1830
1831         if (!handle_valid(handle))
1832                 return false;
1833
1834         info = (struct kvmgt_guest_info *)handle;
1835         kvm = info->kvm;
1836
1837         return kvm_is_visible_gfn(kvm, gfn);
1838
1839 }
1840
1841 struct intel_gvt_mpt kvmgt_mpt = {
1842         .host_init = kvmgt_host_init,
1843         .host_exit = kvmgt_host_exit,
1844         .attach_vgpu = kvmgt_attach_vgpu,
1845         .detach_vgpu = kvmgt_detach_vgpu,
1846         .inject_msi = kvmgt_inject_msi,
1847         .from_virt_to_mfn = kvmgt_virt_to_pfn,
1848         .enable_page_track = kvmgt_page_track_add,
1849         .disable_page_track = kvmgt_page_track_remove,
1850         .read_gpa = kvmgt_read_gpa,
1851         .write_gpa = kvmgt_write_gpa,
1852         .gfn_to_mfn = kvmgt_gfn_to_pfn,
1853         .dma_map_guest_page = kvmgt_dma_map_guest_page,
1854         .dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
1855         .set_opregion = kvmgt_set_opregion,
1856         .get_vfio_device = kvmgt_get_vfio_device,
1857         .put_vfio_device = kvmgt_put_vfio_device,
1858         .is_valid_gfn = kvmgt_is_valid_gfn,
1859 };
1860 EXPORT_SYMBOL_GPL(kvmgt_mpt);
1861
1862 static int __init kvmgt_init(void)
1863 {
1864         return 0;
1865 }
1866
1867 static void __exit kvmgt_exit(void)
1868 {
1869 }
1870
1871 module_init(kvmgt_init);
1872 module_exit(kvmgt_exit);
1873
1874 MODULE_LICENSE("GPL and additional rights");
1875 MODULE_AUTHOR("Intel Corporation");