drm/i915/gvt: switch to kernel types
drivers/gpu/drm/i915/gvt/kvmgt.c
1 /*
2  * KVMGT - the implementation of Intel mediated pass-through framework for KVM
3  *
4  * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  *
25  * Authors:
26  *    Kevin Tian <kevin.tian@intel.com>
27  *    Jike Song <jike.song@intel.com>
28  *    Xiaoguang Chen <xiaoguang.chen@intel.com>
29  */
30
31 #include <linux/init.h>
32 #include <linux/device.h>
33 #include <linux/mm.h>
34 #include <linux/mmu_context.h>
35 #include <linux/sched/mm.h>
36 #include <linux/types.h>
37 #include <linux/list.h>
38 #include <linux/rbtree.h>
39 #include <linux/spinlock.h>
40 #include <linux/eventfd.h>
41 #include <linux/uuid.h>
42 #include <linux/kvm_host.h>
43 #include <linux/vfio.h>
44 #include <linux/mdev.h>
45 #include <linux/debugfs.h>
46
47 #include <linux/nospec.h>
48
49 #include "i915_drv.h"
50 #include "gvt.h"
51
52 static const struct intel_gvt_ops *intel_gvt_ops;
53
54 /* helper macros copied from vfio-pci */
55 #define VFIO_PCI_OFFSET_SHIFT   40
56 #define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
57 #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
58 #define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
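/*
 * For illustration: with the helpers above, the region index lives in
 * bits 63:40 of the file offset, so region n owns the offset range
 * [VFIO_PCI_INDEX_TO_OFFSET(n), VFIO_PCI_INDEX_TO_OFFSET(n + 1)), and
 * VFIO_PCI_OFFSET_MASK recovers the offset within that region.
 */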
59
60 #define OPREGION_SIGNATURE "IntelGraphicsMem"
61
62 struct vfio_region;
63 struct intel_vgpu_regops {
64         ssize_t (*rw)(struct intel_vgpu *vgpu, char *buf,
65                         size_t count, loff_t *ppos, bool iswrite);
66         void (*release)(struct intel_vgpu *vgpu,
67                         struct vfio_region *region);
68 };
69
70 struct vfio_region {
71         u32                             type;
72         u32                             subtype;
73         size_t                          size;
74         u32                             flags;
75         const struct intel_vgpu_regops  *ops;
76         void                            *data;
77 };
78
79 struct kvmgt_pgfn {
80         gfn_t gfn;
81         struct hlist_node hnode;
82 };
83
84 struct kvmgt_guest_info {
85         struct kvm *kvm;
86         struct intel_vgpu *vgpu;
87         struct kvm_page_track_notifier_node track_node;
88 #define NR_BKT (1 << 18)
89         struct hlist_head ptable[NR_BKT];
90 #undef NR_BKT
91         struct dentry *debugfs_cache_entries;
92 };
93
94 struct gvt_dma {
95         struct intel_vgpu *vgpu;
96         struct rb_node gfn_node;
97         struct rb_node dma_addr_node;
98         gfn_t gfn;
99         dma_addr_t dma_addr;
100         unsigned long size;
101         struct kref ref;
102 };
103
104 static inline bool handle_valid(unsigned long handle)
105 {
106         return !!(handle & ~0xff);
107 }
108
109 static int kvmgt_guest_init(struct mdev_device *mdev);
110 static void intel_vgpu_release_work(struct work_struct *work);
111 static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);
112
113 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
114                 unsigned long size)
115 {
116         int total_pages;
117         int npage;
118         int ret;
119
120         total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
121
122         for (npage = 0; npage < total_pages; npage++) {
123                 unsigned long cur_gfn = gfn + npage;
124
125                 ret = vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1);
126                 WARN_ON(ret != 1);
127         }
128 }
129
130 /* Pin a normal or compound guest page for dma. */
131 static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
132                 unsigned long size, struct page **page)
133 {
134         unsigned long base_pfn = 0;
135         int total_pages;
136         int npage;
137         int ret;
138
139         total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
140         /*
141          * We pin the pages one-by-one to avoid allocating a big array
142          * on stack to hold pfns.
143          */
144         for (npage = 0; npage < total_pages; npage++) {
145                 unsigned long cur_gfn = gfn + npage;
146                 unsigned long pfn;
147
148                 ret = vfio_pin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1,
149                                      IOMMU_READ | IOMMU_WRITE, &pfn);
150                 if (ret != 1) {
151                         gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
152                                      cur_gfn, ret);
153                         goto err;
154                 }
155
156                 if (!pfn_valid(pfn)) {
157                         gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
158                         npage++;
159                         ret = -EFAULT;
160                         goto err;
161                 }
162
163                 if (npage == 0)
164                         base_pfn = pfn;
165                 else if (base_pfn + npage != pfn) {
166                         gvt_vgpu_err("The pages are not contiguous\n");
167                         ret = -EINVAL;
168                         npage++;
169                         goto err;
170                 }
171         }
172
173         *page = pfn_to_page(base_pfn);
174         return 0;
175 err:
176         gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
177         return ret;
178 }
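/*
 * Note: on success gvt_pin_guest_page() leaves *page pointing at the first
 * page of a physically contiguous run covering the requested size, which is
 * what lets gvt_dma_map_page() below map the whole range with a single
 * dma_map_page() call.
 */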
179
180 static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
181                 dma_addr_t *dma_addr, unsigned long size)
182 {
183         struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
184         struct page *page = NULL;
185         int ret;
186
187         ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
188         if (ret)
189                 return ret;
190
191         /* Setup DMA mapping. */
192         *dma_addr = dma_map_page(dev, page, 0, size, PCI_DMA_BIDIRECTIONAL);
193         if (dma_mapping_error(dev, *dma_addr)) {
194                 gvt_vgpu_err("DMA mapping failed for pfn 0x%lx\n",
195                              page_to_pfn(page));
196                 gvt_unpin_guest_page(vgpu, gfn, size);
197                 return -ENOMEM;
198         }
199
200         return 0;
201 }
202
203 static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
204                 dma_addr_t dma_addr, unsigned long size)
205 {
206         struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
207
208         dma_unmap_page(dev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
209         gvt_unpin_guest_page(vgpu, gfn, size);
210 }
211
212 static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
213                 dma_addr_t dma_addr)
214 {
215         struct rb_node *node = vgpu->vdev.dma_addr_cache.rb_node;
216         struct gvt_dma *itr;
217
218         while (node) {
219                 itr = rb_entry(node, struct gvt_dma, dma_addr_node);
220
221                 if (dma_addr < itr->dma_addr)
222                         node = node->rb_left;
223                 else if (dma_addr > itr->dma_addr)
224                         node = node->rb_right;
225                 else
226                         return itr;
227         }
228         return NULL;
229 }
230
231 static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
232 {
233         struct rb_node *node = vgpu->vdev.gfn_cache.rb_node;
234         struct gvt_dma *itr;
235
236         while (node) {
237                 itr = rb_entry(node, struct gvt_dma, gfn_node);
238
239                 if (gfn < itr->gfn)
240                         node = node->rb_left;
241                 else if (gfn > itr->gfn)
242                         node = node->rb_right;
243                 else
244                         return itr;
245         }
246         return NULL;
247 }
248
249 static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
250                 dma_addr_t dma_addr, unsigned long size)
251 {
252         struct gvt_dma *new, *itr;
253         struct rb_node **link, *parent = NULL;
254
255         new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
256         if (!new)
257                 return -ENOMEM;
258
259         new->vgpu = vgpu;
260         new->gfn = gfn;
261         new->dma_addr = dma_addr;
262         new->size = size;
263         kref_init(&new->ref);
264
265         /* gfn_cache maps gfn to struct gvt_dma. */
266         link = &vgpu->vdev.gfn_cache.rb_node;
267         while (*link) {
268                 parent = *link;
269                 itr = rb_entry(parent, struct gvt_dma, gfn_node);
270
271                 if (gfn < itr->gfn)
272                         link = &parent->rb_left;
273                 else
274                         link = &parent->rb_right;
275         }
276         rb_link_node(&new->gfn_node, parent, link);
277         rb_insert_color(&new->gfn_node, &vgpu->vdev.gfn_cache);
278
279         /* dma_addr_cache maps dma addr to struct gvt_dma. */
280         parent = NULL;
281         link = &vgpu->vdev.dma_addr_cache.rb_node;
282         while (*link) {
283                 parent = *link;
284                 itr = rb_entry(parent, struct gvt_dma, dma_addr_node);
285
286                 if (dma_addr < itr->dma_addr)
287                         link = &parent->rb_left;
288                 else
289                         link = &parent->rb_right;
290         }
291         rb_link_node(&new->dma_addr_node, parent, link);
292         rb_insert_color(&new->dma_addr_node, &vgpu->vdev.dma_addr_cache);
293
294         vgpu->vdev.nr_cache_entries++;
295         return 0;
296 }
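/*
 * Each gvt_dma entry added above sits in both trees, so a mapping can be
 * looked up either by guest frame number (__gvt_cache_find_gfn) or by DMA
 * address (__gvt_cache_find_dma_addr); removal below erases it from both.
 */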
297
298 static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
299                                 struct gvt_dma *entry)
300 {
301         rb_erase(&entry->gfn_node, &vgpu->vdev.gfn_cache);
302         rb_erase(&entry->dma_addr_node, &vgpu->vdev.dma_addr_cache);
303         kfree(entry);
304         vgpu->vdev.nr_cache_entries--;
305 }
306
307 static void gvt_cache_destroy(struct intel_vgpu *vgpu)
308 {
309         struct gvt_dma *dma;
310         struct rb_node *node = NULL;
311
312         for (;;) {
313                 mutex_lock(&vgpu->vdev.cache_lock);
314                 node = rb_first(&vgpu->vdev.gfn_cache);
315                 if (!node) {
316                         mutex_unlock(&vgpu->vdev.cache_lock);
317                         break;
318                 }
319                 dma = rb_entry(node, struct gvt_dma, gfn_node);
320                 gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
321                 __gvt_cache_remove_entry(vgpu, dma);
322                 mutex_unlock(&vgpu->vdev.cache_lock);
323         }
324 }
325
326 static void gvt_cache_init(struct intel_vgpu *vgpu)
327 {
328         vgpu->vdev.gfn_cache = RB_ROOT;
329         vgpu->vdev.dma_addr_cache = RB_ROOT;
330         vgpu->vdev.nr_cache_entries = 0;
331         mutex_init(&vgpu->vdev.cache_lock);
332 }
333
334 static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
335 {
336         hash_init(info->ptable);
337 }
338
339 static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
340 {
341         struct kvmgt_pgfn *p;
342         struct hlist_node *tmp;
343         int i;
344
345         hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
346                 hash_del(&p->hnode);
347                 kfree(p);
348         }
349 }
350
351 static struct kvmgt_pgfn *
352 __kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
353 {
354         struct kvmgt_pgfn *p, *res = NULL;
355
356         hash_for_each_possible(info->ptable, p, hnode, gfn) {
357                 if (gfn == p->gfn) {
358                         res = p;
359                         break;
360                 }
361         }
362
363         return res;
364 }
365
366 static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
367                                 gfn_t gfn)
368 {
369         struct kvmgt_pgfn *p;
370
371         p = __kvmgt_protect_table_find(info, gfn);
372         return !!p;
373 }
374
375 static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
376 {
377         struct kvmgt_pgfn *p;
378
379         if (kvmgt_gfn_is_write_protected(info, gfn))
380                 return;
381
382         p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
383         if (WARN(!p, "gfn: 0x%llx\n", gfn))
384                 return;
385
386         p->gfn = gfn;
387         hash_add(info->ptable, &p->hnode, gfn);
388 }
389
390 static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
391                                 gfn_t gfn)
392 {
393         struct kvmgt_pgfn *p;
394
395         p = __kvmgt_protect_table_find(info, gfn);
396         if (p) {
397                 hash_del(&p->hnode);
398                 kfree(p);
399         }
400 }
401
402 static ssize_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
403                 size_t count, loff_t *ppos, bool iswrite)
404 {
405         unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
406                         VFIO_PCI_NUM_REGIONS;
407         void *base = vgpu->vdev.region[i].data;
408         loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
409
410         if (pos >= vgpu->vdev.region[i].size || iswrite) {
411                 gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
412                 return -EINVAL;
413         }
414         count = min(count, (size_t)(vgpu->vdev.region[i].size - pos));
415         memcpy(buf, base + pos, count);
416
417         return count;
418 }
419
420 static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
421                 struct vfio_region *region)
422 {
423 }
424
425 static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
426         .rw = intel_vgpu_reg_rw_opregion,
427         .release = intel_vgpu_reg_release_opregion,
428 };
429
430 static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
431                 unsigned int type, unsigned int subtype,
432                 const struct intel_vgpu_regops *ops,
433                 size_t size, u32 flags, void *data)
434 {
435         struct vfio_region *region;
436
437         region = krealloc(vgpu->vdev.region,
438                         (vgpu->vdev.num_regions + 1) * sizeof(*region),
439                         GFP_KERNEL);
440         if (!region)
441                 return -ENOMEM;
442
443         vgpu->vdev.region = region;
444         vgpu->vdev.region[vgpu->vdev.num_regions].type = type;
445         vgpu->vdev.region[vgpu->vdev.num_regions].subtype = subtype;
446         vgpu->vdev.region[vgpu->vdev.num_regions].ops = ops;
447         vgpu->vdev.region[vgpu->vdev.num_regions].size = size;
448         vgpu->vdev.region[vgpu->vdev.num_regions].flags = flags;
449         vgpu->vdev.region[vgpu->vdev.num_regions].data = data;
450         vgpu->vdev.num_regions++;
451         return 0;
452 }
453
454 static int kvmgt_get_vfio_device(void *p_vgpu)
455 {
456         struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
457
458         vgpu->vdev.vfio_device = vfio_device_get_from_dev(
459                 mdev_dev(vgpu->vdev.mdev));
460         if (!vgpu->vdev.vfio_device) {
461                 gvt_vgpu_err("failed to get vfio device\n");
462                 return -ENODEV;
463         }
464         return 0;
465 }
466
467
468 static int kvmgt_set_opregion(void *p_vgpu)
469 {
470         struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
471         void *base;
472         int ret;
473
474         /* Each vgpu has its own opregion, although VFIO will create another
475          * one later. This one is used to expose the opregion to VFIO, while
476          * the one VFIO creates later is the one the guest actually uses.
477          */
478         base = vgpu_opregion(vgpu)->va;
479         if (!base)
480                 return -ENOMEM;
481
482         if (memcmp(base, OPREGION_SIGNATURE, 16)) {
483                 memunmap(base);
484                 return -EINVAL;
485         }
486
487         ret = intel_vgpu_register_reg(vgpu,
488                         PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
489                         VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
490                         &intel_vgpu_regops_opregion, OPREGION_SIZE,
491                         VFIO_REGION_INFO_FLAG_READ, base);
492
493         return ret;
494 }
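/*
 * The region registered above appears to userspace as a read-only,
 * vendor-specific VFIO device region (Intel IGD OpRegion subtype) and is
 * served by intel_vgpu_reg_rw_opregion().
 */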
495
496 static void kvmgt_put_vfio_device(void *vgpu)
497 {
498         if (WARN_ON(!((struct intel_vgpu *)vgpu)->vdev.vfio_device))
499                 return;
500
501         vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device);
502 }
503
504 static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
505 {
506         struct intel_vgpu *vgpu = NULL;
507         struct intel_vgpu_type *type;
508         struct device *pdev;
509         void *gvt;
510         int ret;
511
512         pdev = mdev_parent_dev(mdev);
513         gvt = kdev_to_i915(pdev)->gvt;
514
515         type = intel_gvt_ops->gvt_find_vgpu_type(gvt, kobject_name(kobj));
516         if (!type) {
517                 gvt_vgpu_err("failed to find type %s to create\n",
518                                                 kobject_name(kobj));
519                 ret = -EINVAL;
520                 goto out;
521         }
522
523         vgpu = intel_gvt_ops->vgpu_create(gvt, type);
524         if (IS_ERR_OR_NULL(vgpu)) {
525                 ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
526                 gvt_err("failed to create intel vgpu: %d\n", ret);
527                 goto out;
528         }
529
530         INIT_WORK(&vgpu->vdev.release_work, intel_vgpu_release_work);
531
532         vgpu->vdev.mdev = mdev;
533         mdev_set_drvdata(mdev, vgpu);
534
535         gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
536                      dev_name(mdev_dev(mdev)));
537         ret = 0;
538
539 out:
540         return ret;
541 }
542
543 static int intel_vgpu_remove(struct mdev_device *mdev)
544 {
545         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
546
547         if (handle_valid(vgpu->handle))
548                 return -EBUSY;
549
550         intel_gvt_ops->vgpu_destroy(vgpu);
551         return 0;
552 }
553
554 static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
555                                      unsigned long action, void *data)
556 {
557         struct intel_vgpu *vgpu = container_of(nb,
558                                         struct intel_vgpu,
559                                         vdev.iommu_notifier);
560
561         if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
562                 struct vfio_iommu_type1_dma_unmap *unmap = data;
563                 struct gvt_dma *entry;
564                 unsigned long iov_pfn, end_iov_pfn;
565
566                 iov_pfn = unmap->iova >> PAGE_SHIFT;
567                 end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;
568
569                 mutex_lock(&vgpu->vdev.cache_lock);
570                 for (; iov_pfn < end_iov_pfn; iov_pfn++) {
571                         entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
572                         if (!entry)
573                                 continue;
574
575                         gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
576                                            entry->size);
577                         __gvt_cache_remove_entry(vgpu, entry);
578                 }
579                 mutex_unlock(&vgpu->vdev.cache_lock);
580         }
581
582         return NOTIFY_OK;
583 }
584
585 static int intel_vgpu_group_notifier(struct notifier_block *nb,
586                                      unsigned long action, void *data)
587 {
588         struct intel_vgpu *vgpu = container_of(nb,
589                                         struct intel_vgpu,
590                                         vdev.group_notifier);
591
592         /* the only action we care about */
593         if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
594                 vgpu->vdev.kvm = data;
595
596                 if (!data)
597                         schedule_work(&vgpu->vdev.release_work);
598         }
599
600         return NOTIFY_OK;
601 }
602
603 static int intel_vgpu_open(struct mdev_device *mdev)
604 {
605         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
606         unsigned long events;
607         int ret;
608
609         vgpu->vdev.iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
610         vgpu->vdev.group_notifier.notifier_call = intel_vgpu_group_notifier;
611
612         events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
613         ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
614                                 &vgpu->vdev.iommu_notifier);
615         if (ret != 0) {
616                 gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
617                         ret);
618                 goto out;
619         }
620
621         events = VFIO_GROUP_NOTIFY_SET_KVM;
622         ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
623                                 &vgpu->vdev.group_notifier);
624         if (ret != 0) {
625                 gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
626                         ret);
627                 goto undo_iommu;
628         }
629
630         /* Take a module reference, as the mdev core doesn't take
631          * one on behalf of the vendor driver.
632          */
633         if (!try_module_get(THIS_MODULE))
634                 goto undo_group;
635
636         ret = kvmgt_guest_init(mdev);
637         if (ret)
638                 goto undo_group;
639
640         intel_gvt_ops->vgpu_activate(vgpu);
641
642         atomic_set(&vgpu->vdev.released, 0);
643         return ret;
644
645 undo_group:
646         vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
647                                         &vgpu->vdev.group_notifier);
648
649 undo_iommu:
650         vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
651                                         &vgpu->vdev.iommu_notifier);
652 out:
653         return ret;
654 }
655
656 static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
657 {
658         struct eventfd_ctx *trigger;
659
660         trigger = vgpu->vdev.msi_trigger;
661         if (trigger) {
662                 eventfd_ctx_put(trigger);
663                 vgpu->vdev.msi_trigger = NULL;
664         }
665 }
666
667 static void __intel_vgpu_release(struct intel_vgpu *vgpu)
668 {
669         struct kvmgt_guest_info *info;
670         int ret;
671
672         if (!handle_valid(vgpu->handle))
673                 return;
674
675         if (atomic_cmpxchg(&vgpu->vdev.released, 0, 1))
676                 return;
677
678         intel_gvt_ops->vgpu_release(vgpu);
679
680         ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_IOMMU_NOTIFY,
681                                         &vgpu->vdev.iommu_notifier);
682         WARN(ret, "vfio_unregister_notifier for iommu failed: %d\n", ret);
683
684         ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_GROUP_NOTIFY,
685                                         &vgpu->vdev.group_notifier);
686         WARN(ret, "vfio_unregister_notifier for group failed: %d\n", ret);
687
688         /* drop the module reference taken at open */
689         module_put(THIS_MODULE);
690
691         info = (struct kvmgt_guest_info *)vgpu->handle;
692         kvmgt_guest_exit(info);
693
694         intel_vgpu_release_msi_eventfd_ctx(vgpu);
695
696         vgpu->vdev.kvm = NULL;
697         vgpu->handle = 0;
698 }
699
700 static void intel_vgpu_release(struct mdev_device *mdev)
701 {
702         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
703
704         __intel_vgpu_release(vgpu);
705 }
706
707 static void intel_vgpu_release_work(struct work_struct *work)
708 {
709         struct intel_vgpu *vgpu = container_of(work, struct intel_vgpu,
710                                         vdev.release_work);
711
712         __intel_vgpu_release(vgpu);
713 }
714
715 static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
716 {
717         u32 start_lo, start_hi;
718         u32 mem_type;
719
720         start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
721                         PCI_BASE_ADDRESS_MEM_MASK;
722         mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
723                         PCI_BASE_ADDRESS_MEM_TYPE_MASK;
724
725         switch (mem_type) {
726         case PCI_BASE_ADDRESS_MEM_TYPE_64:
727                 start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
728                                                 + bar + 4));
729                 break;
730         case PCI_BASE_ADDRESS_MEM_TYPE_32:
731         case PCI_BASE_ADDRESS_MEM_TYPE_1M:
732                 /* 1M mem BAR treated as 32-bit BAR */
733         default:
734                 /* unknown mem type treated as 32-bit BAR */
735                 start_hi = 0;
736                 break;
737         }
738
739         return ((u64)start_hi << 32) | start_lo;
740 }
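/*
 * Example: a 64-bit memory BAR whose low dword reads 0xf0000004 and whose
 * high dword reads 0x00000001 decodes to 0x1f0000000 above; the low four
 * bits of the low dword are flag bits, not address bits.
 */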
741
742 static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, u64 off,
743                              void *buf, unsigned int count, bool is_write)
744 {
745         u64 bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
746         int ret;
747
748         if (is_write)
749                 ret = intel_gvt_ops->emulate_mmio_write(vgpu,
750                                         bar_start + off, buf, count);
751         else
752                 ret = intel_gvt_ops->emulate_mmio_read(vgpu,
753                                         bar_start + off, buf, count);
754         return ret;
755 }
756
757 static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, u64 off)
758 {
759         return off >= vgpu_aperture_offset(vgpu) &&
760                off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
761 }
762
763 static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, u64 off,
764                 void *buf, unsigned long count, bool is_write)
765 {
766         void *aperture_va;
767
768         if (!intel_vgpu_in_aperture(vgpu, off) ||
769             !intel_vgpu_in_aperture(vgpu, off + count)) {
770                 gvt_vgpu_err("Invalid aperture offset %llu\n", off);
771                 return -EINVAL;
772         }
773
774         aperture_va = io_mapping_map_wc(&vgpu->gvt->dev_priv->ggtt.iomap,
775                                         ALIGN_DOWN(off, PAGE_SIZE),
776                                         count + offset_in_page(off));
777         if (!aperture_va)
778                 return -EIO;
779
780         if (is_write)
781                 memcpy(aperture_va + offset_in_page(off), buf, count);
782         else
783                 memcpy(buf, aperture_va + offset_in_page(off), count);
784
785         io_mapping_unmap(aperture_va);
786
787         return 0;
788 }
789
790 static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
791                         size_t count, loff_t *ppos, bool is_write)
792 {
793         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
794         unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
795         u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
796         int ret = -EINVAL;
797
798
799         if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions) {
800                 gvt_vgpu_err("invalid index: %u\n", index);
801                 return -EINVAL;
802         }
803
804         switch (index) {
805         case VFIO_PCI_CONFIG_REGION_INDEX:
806                 if (is_write)
807                         ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
808                                                 buf, count);
809                 else
810                         ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
811                                                 buf, count);
812                 break;
813         case VFIO_PCI_BAR0_REGION_INDEX:
814                 ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
815                                         buf, count, is_write);
816                 break;
817         case VFIO_PCI_BAR2_REGION_INDEX:
818                 ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
819                 break;
820         case VFIO_PCI_BAR1_REGION_INDEX:
821         case VFIO_PCI_BAR3_REGION_INDEX:
822         case VFIO_PCI_BAR4_REGION_INDEX:
823         case VFIO_PCI_BAR5_REGION_INDEX:
824         case VFIO_PCI_VGA_REGION_INDEX:
825         case VFIO_PCI_ROM_REGION_INDEX:
826                 break;
827         default:
828                 if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions)
829                         return -EINVAL;
830
831                 index -= VFIO_PCI_NUM_REGIONS;
832                 return vgpu->vdev.region[index].ops->rw(vgpu, buf, count,
833                                 ppos, is_write);
834         }
835
836         return ret == 0 ? count : ret;
837 }
838
839 static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
840 {
841         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
842         unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
843         struct intel_gvt *gvt = vgpu->gvt;
844         int offset;
845
846         /* Only allow MMIO GGTT entry access */
847         if (index != PCI_BASE_ADDRESS_0)
848                 return false;
849
850         offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
851                 intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);
852
853         return (offset >= gvt->device_info.gtt_start_offset &&
854                 offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
855                         true : false;
856 }
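/*
 * gtt_entry() is used by the read/write paths below to spot accesses that
 * land in the GGTT range of BAR0, the only accesses allowed to go through
 * as full 8-byte transfers.
 */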
857
858 static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
859                         size_t count, loff_t *ppos)
860 {
861         unsigned int done = 0;
862         int ret;
863
864         while (count) {
865                 size_t filled;
866
867                 /* Only support 8-byte reads of GGTT entries */
868                 if (count >= 8 && !(*ppos % 8) &&
869                         gtt_entry(mdev, ppos)) {
870                         u64 val;
871
872                         ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
873                                         ppos, false);
874                         if (ret <= 0)
875                                 goto read_err;
876
877                         if (copy_to_user(buf, &val, sizeof(val)))
878                                 goto read_err;
879
880                         filled = 8;
881                 } else if (count >= 4 && !(*ppos % 4)) {
882                         u32 val;
883
884                         ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
885                                         ppos, false);
886                         if (ret <= 0)
887                                 goto read_err;
888
889                         if (copy_to_user(buf, &val, sizeof(val)))
890                                 goto read_err;
891
892                         filled = 4;
893                 } else if (count >= 2 && !(*ppos % 2)) {
894                         u16 val;
895
896                         ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
897                                         ppos, false);
898                         if (ret <= 0)
899                                 goto read_err;
900
901                         if (copy_to_user(buf, &val, sizeof(val)))
902                                 goto read_err;
903
904                         filled = 2;
905                 } else {
906                         u8 val;
907
908                         ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
909                                         false);
910                         if (ret <= 0)
911                                 goto read_err;
912
913                         if (copy_to_user(buf, &val, sizeof(val)))
914                                 goto read_err;
915
916                         filled = 1;
917                 }
918
919                 count -= filled;
920                 done += filled;
921                 *ppos += filled;
922                 buf += filled;
923         }
924
925         return done;
926
927 read_err:
928         return -EFAULT;
929 }
930
931 static ssize_t intel_vgpu_write(struct mdev_device *mdev,
932                                 const char __user *buf,
933                                 size_t count, loff_t *ppos)
934 {
935         unsigned int done = 0;
936         int ret;
937
938         while (count) {
939                 size_t filled;
940
941                 /* Only support 8-byte writes of GGTT entries */
942                 if (count >= 8 && !(*ppos % 8) &&
943                         gtt_entry(mdev, ppos)) {
944                         u64 val;
945
946                         if (copy_from_user(&val, buf, sizeof(val)))
947                                 goto write_err;
948
949                         ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
950                                         ppos, true);
951                         if (ret <= 0)
952                                 goto write_err;
953
954                         filled = 8;
955                 } else if (count >= 4 && !(*ppos % 4)) {
956                         u32 val;
957
958                         if (copy_from_user(&val, buf, sizeof(val)))
959                                 goto write_err;
960
961                         ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
962                                         ppos, true);
963                         if (ret <= 0)
964                                 goto write_err;
965
966                         filled = 4;
967                 } else if (count >= 2 && !(*ppos % 2)) {
968                         u16 val;
969
970                         if (copy_from_user(&val, buf, sizeof(val)))
971                                 goto write_err;
972
973                         ret = intel_vgpu_rw(mdev, (char *)&val,
974                                         sizeof(val), ppos, true);
975                         if (ret <= 0)
976                                 goto write_err;
977
978                         filled = 2;
979                 } else {
980                         u8 val;
981
982                         if (copy_from_user(&val, buf, sizeof(val)))
983                                 goto write_err;
984
985                         ret = intel_vgpu_rw(mdev, &val, sizeof(val),
986                                         ppos, true);
987                         if (ret <= 0)
988                                 goto write_err;
989
990                         filled = 1;
991                 }
992
993                 count -= filled;
994                 done += filled;
995                 *ppos += filled;
996                 buf += filled;
997         }
998
999         return done;
1000 write_err:
1001         return -EFAULT;
1002 }
1003
1004 static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
1005 {
1006         unsigned int index;
1007         u64 virtaddr;
1008         unsigned long req_size, pgoff = 0;
1009         pgprot_t pg_prot;
1010         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1011
1012         index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1013         if (index >= VFIO_PCI_ROM_REGION_INDEX)
1014                 return -EINVAL;
1015
1016         if (vma->vm_end < vma->vm_start)
1017                 return -EINVAL;
1018         if ((vma->vm_flags & VM_SHARED) == 0)
1019                 return -EINVAL;
1020         if (index != VFIO_PCI_BAR2_REGION_INDEX)
1021                 return -EINVAL;
1022
1023         pg_prot = vma->vm_page_prot;
1024         virtaddr = vma->vm_start;
1025         req_size = vma->vm_end - vma->vm_start;
1026         pgoff = vgpu_aperture_pa_base(vgpu) >> PAGE_SHIFT;
1027
1028         return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
1029 }
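/*
 * Only BAR2 (the aperture) may be mmapped; the vma is backed directly by
 * the vgpu's slice of the aperture through remap_pfn_range() above.
 */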
1030
1031 static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
1032 {
1033         if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
1034                 return 1;
1035
1036         return 0;
1037 }
1038
1039 static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
1040                         unsigned int index, unsigned int start,
1041                         unsigned int count, u32 flags,
1042                         void *data)
1043 {
1044         return 0;
1045 }
1046
1047 static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
1048                         unsigned int index, unsigned int start,
1049                         unsigned int count, u32 flags, void *data)
1050 {
1051         return 0;
1052 }
1053
1054 static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
1055                 unsigned int index, unsigned int start, unsigned int count,
1056                 u32 flags, void *data)
1057 {
1058         return 0;
1059 }
1060
1061 static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
1062                 unsigned int index, unsigned int start, unsigned int count,
1063                 u32 flags, void *data)
1064 {
1065         struct eventfd_ctx *trigger;
1066
1067         if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
1068                 int fd = *(int *)data;
1069
1070                 trigger = eventfd_ctx_fdget(fd);
1071                 if (IS_ERR(trigger)) {
1072                         gvt_vgpu_err("eventfd_ctx_fdget failed\n");
1073                         return PTR_ERR(trigger);
1074                 }
1075                 vgpu->vdev.msi_trigger = trigger;
1076         } else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
1077                 intel_vgpu_release_msi_eventfd_ctx(vgpu);
1078
1079         return 0;
1080 }
1081
1082 static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
1083                 unsigned int index, unsigned int start, unsigned int count,
1084                 void *data)
1085 {
1086         int (*func)(struct intel_vgpu *vgpu, unsigned int index,
1087                         unsigned int start, unsigned int count, u32 flags,
1088                         void *data) = NULL;
1089
1090         switch (index) {
1091         case VFIO_PCI_INTX_IRQ_INDEX:
1092                 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1093                 case VFIO_IRQ_SET_ACTION_MASK:
1094                         func = intel_vgpu_set_intx_mask;
1095                         break;
1096                 case VFIO_IRQ_SET_ACTION_UNMASK:
1097                         func = intel_vgpu_set_intx_unmask;
1098                         break;
1099                 case VFIO_IRQ_SET_ACTION_TRIGGER:
1100                         func = intel_vgpu_set_intx_trigger;
1101                         break;
1102                 }
1103                 break;
1104         case VFIO_PCI_MSI_IRQ_INDEX:
1105                 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
1106                 case VFIO_IRQ_SET_ACTION_MASK:
1107                 case VFIO_IRQ_SET_ACTION_UNMASK:
1108                         /* XXX Need masking support exported */
1109                         break;
1110                 case VFIO_IRQ_SET_ACTION_TRIGGER:
1111                         func = intel_vgpu_set_msi_trigger;
1112                         break;
1113                 }
1114                 break;
1115         }
1116
1117         if (!func)
1118                 return -ENOTTY;
1119
1120         return func(vgpu, index, start, count, flags, data);
1121 }
1122
1123 static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
1124                              unsigned long arg)
1125 {
1126         struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
1127         unsigned long minsz;
1128
1129         gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);
1130
1131         if (cmd == VFIO_DEVICE_GET_INFO) {
1132                 struct vfio_device_info info;
1133
1134                 minsz = offsetofend(struct vfio_device_info, num_irqs);
1135
1136                 if (copy_from_user(&info, (void __user *)arg, minsz))
1137                         return -EFAULT;
1138
1139                 if (info.argsz < minsz)
1140                         return -EINVAL;
1141
1142                 info.flags = VFIO_DEVICE_FLAGS_PCI;
1143                 info.flags |= VFIO_DEVICE_FLAGS_RESET;
1144                 info.num_regions = VFIO_PCI_NUM_REGIONS +
1145                                 vgpu->vdev.num_regions;
1146                 info.num_irqs = VFIO_PCI_NUM_IRQS;
1147
1148                 return copy_to_user((void __user *)arg, &info, minsz) ?
1149                         -EFAULT : 0;
1150
1151         } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
1152                 struct vfio_region_info info;
1153                 struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
1154                 unsigned int i;
1155                 int ret;
1156                 struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
1157                 size_t size;
1158                 int nr_areas = 1;
1159                 int cap_type_id;
1160
1161                 minsz = offsetofend(struct vfio_region_info, offset);
1162
1163                 if (copy_from_user(&info, (void __user *)arg, minsz))
1164                         return -EFAULT;
1165
1166                 if (info.argsz < minsz)
1167                         return -EINVAL;
1168
1169                 switch (info.index) {
1170                 case VFIO_PCI_CONFIG_REGION_INDEX:
1171                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1172                         info.size = vgpu->gvt->device_info.cfg_space_size;
1173                         info.flags = VFIO_REGION_INFO_FLAG_READ |
1174                                      VFIO_REGION_INFO_FLAG_WRITE;
1175                         break;
1176                 case VFIO_PCI_BAR0_REGION_INDEX:
1177                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1178                         info.size = vgpu->cfg_space.bar[info.index].size;
1179                         if (!info.size) {
1180                                 info.flags = 0;
1181                                 break;
1182                         }
1183
1184                         info.flags = VFIO_REGION_INFO_FLAG_READ |
1185                                      VFIO_REGION_INFO_FLAG_WRITE;
1186                         break;
1187                 case VFIO_PCI_BAR1_REGION_INDEX:
1188                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1189                         info.size = 0;
1190                         info.flags = 0;
1191                         break;
1192                 case VFIO_PCI_BAR2_REGION_INDEX:
1193                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1194                         info.flags = VFIO_REGION_INFO_FLAG_CAPS |
1195                                         VFIO_REGION_INFO_FLAG_MMAP |
1196                                         VFIO_REGION_INFO_FLAG_READ |
1197                                         VFIO_REGION_INFO_FLAG_WRITE;
1198                         info.size = gvt_aperture_sz(vgpu->gvt);
1199
1200                         size = sizeof(*sparse) +
1201                                         (nr_areas * sizeof(*sparse->areas));
1202                         sparse = kzalloc(size, GFP_KERNEL);
1203                         if (!sparse)
1204                                 return -ENOMEM;
1205
1206                         sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1207                         sparse->header.version = 1;
1208                         sparse->nr_areas = nr_areas;
1209                         cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
1210                         sparse->areas[0].offset =
1211                                         PAGE_ALIGN(vgpu_aperture_offset(vgpu));
1212                         sparse->areas[0].size = vgpu_aperture_sz(vgpu);
1213                         break;
1214
1215                 case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1216                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1217                         info.size = 0;
1218                         info.flags = 0;
1219
1220                         gvt_dbg_core("get region info bar:%d\n", info.index);
1221                         break;
1222
1223                 case VFIO_PCI_ROM_REGION_INDEX:
1224                 case VFIO_PCI_VGA_REGION_INDEX:
1225                         info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1226                         info.size = 0;
1227                         info.flags = 0;
1228
1229                         gvt_dbg_core("get region info index:%d\n", info.index);
1230                         break;
1231                 default:
1232                         {
1233                                 struct vfio_region_info_cap_type cap_type = {
1234                                         .header.id = VFIO_REGION_INFO_CAP_TYPE,
1235                                         .header.version = 1 };
1236
1237                                 if (info.index >= VFIO_PCI_NUM_REGIONS +
1238                                                 vgpu->vdev.num_regions)
1239                                         return -EINVAL;
1240                                 info.index =
1241                                         array_index_nospec(info.index,
1242                                                         VFIO_PCI_NUM_REGIONS +
1243                                                         vgpu->vdev.num_regions);
1244
1245                                 i = info.index - VFIO_PCI_NUM_REGIONS;
1246
1247                                 info.offset =
1248                                         VFIO_PCI_INDEX_TO_OFFSET(info.index);
1249                                 info.size = vgpu->vdev.region[i].size;
1250                                 info.flags = vgpu->vdev.region[i].flags;
1251
1252                                 cap_type.type = vgpu->vdev.region[i].type;
1253                                 cap_type.subtype = vgpu->vdev.region[i].subtype;
1254
1255                                 ret = vfio_info_add_capability(&caps,
1256                                                         &cap_type.header,
1257                                                         sizeof(cap_type));
1258                                 if (ret)
1259                                         return ret;
1260                         }
1261                 }
1262
1263                 if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
1264                         switch (cap_type_id) {
1265                         case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1266                                 ret = vfio_info_add_capability(&caps,
1267                                         &sparse->header, sizeof(*sparse) +
1268                                         (sparse->nr_areas *
1269                                                 sizeof(*sparse->areas)));
1270                                 if (ret) {
1271                                         kfree(sparse);
1272                                         return ret;
1273                                 }
1274                                 break;
1275                         default:
1276                                 kfree(sparse);
1277                                 return -EINVAL;
1278                         }
1279                 }
1280
1281                 if (caps.size) {
1282                         info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1283                         if (info.argsz < sizeof(info) + caps.size) {
1284                                 info.argsz = sizeof(info) + caps.size;
1285                                 info.cap_offset = 0;
1286                         } else {
1287                                 vfio_info_cap_shift(&caps, sizeof(info));
1288                                 if (copy_to_user((void __user *)arg +
1289                                                   sizeof(info), caps.buf,
1290                                                   caps.size)) {
1291                                         kfree(caps.buf);
1292                                         kfree(sparse);
1293                                         return -EFAULT;
1294                                 }
1295                                 info.cap_offset = sizeof(info);
1296                         }
1297
1298                         kfree(caps.buf);
1299                 }
1300
1301                 kfree(sparse);
1302                 return copy_to_user((void __user *)arg, &info, minsz) ?
1303                         -EFAULT : 0;
1304         } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
1305                 struct vfio_irq_info info;
1306
1307                 minsz = offsetofend(struct vfio_irq_info, count);
1308
1309                 if (copy_from_user(&info, (void __user *)arg, minsz))
1310                         return -EFAULT;
1311
1312                 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
1313                         return -EINVAL;
1314
1315                 switch (info.index) {
1316                 case VFIO_PCI_INTX_IRQ_INDEX:
1317                 case VFIO_PCI_MSI_IRQ_INDEX:
1318                         break;
1319                 default:
1320                         return -EINVAL;
1321                 }
1322
1323                 info.flags = VFIO_IRQ_INFO_EVENTFD;
1324
1325                 info.count = intel_vgpu_get_irq_count(vgpu, info.index);
1326
1327                 if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
1328                         info.flags |= (VFIO_IRQ_INFO_MASKABLE |
1329                                        VFIO_IRQ_INFO_AUTOMASKED);
1330                 else
1331                         info.flags |= VFIO_IRQ_INFO_NORESIZE;
1332
1333                 return copy_to_user((void __user *)arg, &info, minsz) ?
1334                         -EFAULT : 0;
1335         } else if (cmd == VFIO_DEVICE_SET_IRQS) {
1336                 struct vfio_irq_set hdr;
1337                 u8 *data = NULL;
1338                 int ret = 0;
1339                 size_t data_size = 0;
1340
1341                 minsz = offsetofend(struct vfio_irq_set, count);
1342
1343                 if (copy_from_user(&hdr, (void __user *)arg, minsz))
1344                         return -EFAULT;
1345
1346                 if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
1347                         int max = intel_vgpu_get_irq_count(vgpu, hdr.index);
1348
1349                         ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
1350                                                 VFIO_PCI_NUM_IRQS, &data_size);
1351                         if (ret) {
1352                                 gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
1353                                 return -EINVAL;
1354                         }
1355                         if (data_size) {
1356                                 data = memdup_user((void __user *)(arg + minsz),
1357                                                    data_size);
1358                                 if (IS_ERR(data))
1359                                         return PTR_ERR(data);
1360                         }
1361                 }
1362
1363                 ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
1364                                         hdr.start, hdr.count, data);
1365                 kfree(data);
1366
1367                 return ret;
1368         } else if (cmd == VFIO_DEVICE_RESET) {
1369                 intel_gvt_ops->vgpu_reset(vgpu);
1370                 return 0;
1371         } else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
1372                 struct vfio_device_gfx_plane_info dmabuf;
1373                 int ret = 0;
1374
1375                 minsz = offsetofend(struct vfio_device_gfx_plane_info,
1376                                     dmabuf_id);
1377                 if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
1378                         return -EFAULT;
1379                 if (dmabuf.argsz < minsz)
1380                         return -EINVAL;
1381
1382                 ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
1383                 if (ret != 0)
1384                         return ret;
1385
1386                 return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
1387                                                                 -EFAULT : 0;
1388         } else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
1389                 __u32 dmabuf_id;
1390                 __s32 dmabuf_fd;
1391
1392                 if (get_user(dmabuf_id, (__u32 __user *)arg))
1393                         return -EFAULT;
1394
1395                 dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
1396                 return dmabuf_fd;
1397
1398         }
1399
1400         return -ENOTTY;
1401 }
1402
1403 static ssize_t
1404 vgpu_id_show(struct device *dev, struct device_attribute *attr,
1405              char *buf)
1406 {
1407         struct mdev_device *mdev = mdev_from_dev(dev);
1408
1409         if (mdev) {
1410                 struct intel_vgpu *vgpu = (struct intel_vgpu *)
1411                         mdev_get_drvdata(mdev);
1412                 return sprintf(buf, "%d\n", vgpu->id);
1413         }
1414         return sprintf(buf, "\n");
1415 }
1416
1417 static ssize_t
1418 hw_id_show(struct device *dev, struct device_attribute *attr,
1419            char *buf)
1420 {
1421         struct mdev_device *mdev = mdev_from_dev(dev);
1422
1423         if (mdev) {
1424                 struct intel_vgpu *vgpu = (struct intel_vgpu *)
1425                         mdev_get_drvdata(mdev);
1426                 return sprintf(buf, "%u\n",
1427                                vgpu->submission.shadow_ctx->hw_id);
1428         }
1429         return sprintf(buf, "\n");
1430 }
1431
1432 static DEVICE_ATTR_RO(vgpu_id);
1433 static DEVICE_ATTR_RO(hw_id);
1434
1435 static struct attribute *intel_vgpu_attrs[] = {
1436         &dev_attr_vgpu_id.attr,
1437         &dev_attr_hw_id.attr,
1438         NULL
1439 };
1440
1441 static const struct attribute_group intel_vgpu_group = {
1442         .name = "intel_vgpu",
1443         .attrs = intel_vgpu_attrs,
1444 };
1445
1446 static const struct attribute_group *intel_vgpu_groups[] = {
1447         &intel_vgpu_group,
1448         NULL,
1449 };
1450
1451 static struct mdev_parent_ops intel_vgpu_ops = {
1452         .mdev_attr_groups       = intel_vgpu_groups,
1453         .create                 = intel_vgpu_create,
1454         .remove                 = intel_vgpu_remove,
1455
1456         .open                   = intel_vgpu_open,
1457         .release                = intel_vgpu_release,
1458
1459         .read                   = intel_vgpu_read,
1460         .write                  = intel_vgpu_write,
1461         .mmap                   = intel_vgpu_mmap,
1462         .ioctl                  = intel_vgpu_ioctl,
1463 };
1464
1465 static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
1466 {
1467         struct attribute **kvm_type_attrs;
1468         struct attribute_group **kvm_vgpu_type_groups;
1469
1470         intel_gvt_ops = ops;
1471         if (!intel_gvt_ops->get_gvt_attrs(&kvm_type_attrs,
1472                         &kvm_vgpu_type_groups))
1473                 return -EFAULT;
1474         intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups;
1475
1476         return mdev_register_device(dev, &intel_vgpu_ops);
1477 }
1478
1479 static void kvmgt_host_exit(struct device *dev)
1480 {
1481         mdev_unregister_device(dev);
1482 }
1483
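/*
 * Start write-protecting @gfn through KVM's page-track framework so that
 * guest writes to it are reported via kvmgt_page_track_write(). gfns that
 * are already tracked (per the protect table) are left untouched.
 */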
1484 static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
1485 {
1486         struct kvmgt_guest_info *info;
1487         struct kvm *kvm;
1488         struct kvm_memory_slot *slot;
1489         int idx;
1490
1491         if (!handle_valid(handle))
1492                 return -ESRCH;
1493
1494         info = (struct kvmgt_guest_info *)handle;
1495         kvm = info->kvm;
1496
1497         idx = srcu_read_lock(&kvm->srcu);
1498         slot = gfn_to_memslot(kvm, gfn);
1499         if (!slot) {
1500                 srcu_read_unlock(&kvm->srcu, idx);
1501                 return -EINVAL;
1502         }
1503
1504         spin_lock(&kvm->mmu_lock);
1505
1506         if (kvmgt_gfn_is_write_protected(info, gfn))
1507                 goto out;
1508
1509         kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1510         kvmgt_protect_table_add(info, gfn);
1511
1512 out:
1513         spin_unlock(&kvm->mmu_lock);
1514         srcu_read_unlock(&kvm->srcu, idx);
1515         return 0;
1516 }
1517
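/*
 * Stop write-protecting @gfn and drop it from the protect table. A stale
 * handle is treated as success since there is nothing left to untrack.
 */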
1518 static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
1519 {
1520         struct kvmgt_guest_info *info;
1521         struct kvm *kvm;
1522         struct kvm_memory_slot *slot;
1523         int idx;
1524
1525         if (!handle_valid(handle))
1526                 return 0;
1527
1528         info = (struct kvmgt_guest_info *)handle;
1529         kvm = info->kvm;
1530
1531         idx = srcu_read_lock(&kvm->srcu);
1532         slot = gfn_to_memslot(kvm, gfn);
1533         if (!slot) {
1534                 srcu_read_unlock(&kvm->srcu, idx);
1535                 return -EINVAL;
1536         }
1537
1538         spin_lock(&kvm->mmu_lock);
1539
1540         if (!kvmgt_gfn_is_write_protected(info, gfn))
1541                 goto out;
1542
1543         kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
1544         kvmgt_protect_table_del(info, gfn);
1545
1546 out:
1547         spin_unlock(&kvm->mmu_lock);
1548         srcu_read_unlock(&kvm->srcu, idx);
1549         return 0;
1550 }
1551
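/*
 * KVM page-track notifier: called when the guest writes to a tracked GPA;
 * forwards the written bytes to the GVT write-protect handler.
 */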
1552 static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1553                 const u8 *val, int len,
1554                 struct kvm_page_track_notifier_node *node)
1555 {
1556         struct kvmgt_guest_info *info = container_of(node,
1557                                         struct kvmgt_guest_info, track_node);
1558
1559         if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
1560                 intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
1561                                                      (void *)val, len);
1562 }
1563
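/*
 * KVM page-track notifier: a memslot is being removed or moved, so drop
 * write-protection for every gfn of that slot that we are tracking.
 */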
1564 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
1565                 struct kvm_memory_slot *slot,
1566                 struct kvm_page_track_notifier_node *node)
1567 {
1568         int i;
1569         gfn_t gfn;
1570         struct kvmgt_guest_info *info = container_of(node,
1571                                         struct kvmgt_guest_info, track_node);
1572
1573         spin_lock(&kvm->mmu_lock);
1574         for (i = 0; i < slot->npages; i++) {
1575                 gfn = slot->base_gfn + i;
1576                 if (kvmgt_gfn_is_write_protected(info, gfn)) {
1577                         kvm_slot_page_track_remove_page(kvm, slot, gfn,
1578                                                 KVM_PAGE_TRACK_WRITE);
1579                         kvmgt_protect_table_del(info, gfn);
1580                 }
1581         }
1582         spin_unlock(&kvm->mmu_lock);
1583 }
1584
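/*
 * Return true if an active vGPU with a valid handle is already bound
 * to @kvm.
 */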
1585 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
1586 {
1587         struct intel_vgpu *itr;
1588         struct kvmgt_guest_info *info;
1589         int id;
1590         bool ret = false;
1591
1592         mutex_lock(&vgpu->gvt->lock);
1593         for_each_active_vgpu(vgpu->gvt, itr, id) {
1594                 if (!handle_valid(itr->handle))
1595                         continue;
1596
1597                 info = (struct kvmgt_guest_info *)itr->handle;
1598                 if (kvm && kvm == info->kvm) {
1599                         ret = true;
1600                         goto out;
1601                 }
1602         }
1603 out:
1604         mutex_unlock(&vgpu->gvt->lock);
1605         return ret;
1606 }
1607
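/*
 * Bind the vGPU to its KVM instance on first open: take a reference on
 * the kvm, set up the protect table and the DMA cache, register the
 * page-track notifier and expose the cache entry count in debugfs.
 */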
1608 static int kvmgt_guest_init(struct mdev_device *mdev)
1609 {
1610         struct kvmgt_guest_info *info;
1611         struct intel_vgpu *vgpu;
1612         struct kvm *kvm;
1613
1614         vgpu = mdev_get_drvdata(mdev);
1615         if (handle_valid(vgpu->handle))
1616                 return -EEXIST;
1617
1618         kvm = vgpu->vdev.kvm;
1619         if (!kvm || kvm->mm != current->mm) {
1620                 gvt_vgpu_err("KVM is required to use Intel vGPU\n");
1621                 return -ESRCH;
1622         }
1623
1624         if (__kvmgt_vgpu_exist(vgpu, kvm))
1625                 return -EEXIST;
1626
1627         info = vzalloc(sizeof(struct kvmgt_guest_info));
1628         if (!info)
1629                 return -ENOMEM;
1630
1631         vgpu->handle = (unsigned long)info;
1632         info->vgpu = vgpu;
1633         info->kvm = kvm;
1634         kvm_get_kvm(info->kvm);
1635
1636         kvmgt_protect_table_init(info);
1637         gvt_cache_init(vgpu);
1638
1639         init_completion(&vgpu->vblank_done);
1640
1641         info->track_node.track_write = kvmgt_page_track_write;
1642         info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
1643         kvm_page_track_register_notifier(kvm, &info->track_node);
1644
1645         info->debugfs_cache_entries = debugfs_create_ulong(
1646                                                 "kvmgt_nr_cache_entries",
1647                                                 0444, vgpu->debugfs,
1648                                                 &vgpu->vdev.nr_cache_entries);
1649         if (!info->debugfs_cache_entries)
1650                 gvt_vgpu_err("Cannot create kvmgt debugfs entry\n");
1651
1652         return 0;
1653 }
1654
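/*
 * Undo kvmgt_guest_init(): unregister the page-track notifier, drop the
 * kvm reference and free all per-guest state.
 */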
1655 static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
1656 {
1657         debugfs_remove(info->debugfs_cache_entries);
1658
1659         kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
1660         kvm_put_kvm(info->kvm);
1661         kvmgt_protect_table_destroy(info);
1662         gvt_cache_destroy(info->vgpu);
1663         vfree(info);
1664
1665         return true;
1666 }
1667
1668 static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle)
1669 {
1670         /* nothing to do here */
1671         return 0;
1672 }
1673
1674 static void kvmgt_detach_vgpu(unsigned long handle)
1675 {
1676         /* nothing to do here */
1677 }
1678
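/*
 * Inject an MSI into the guest by signalling the vGPU's MSI trigger
 * eventfd; see the comment below for why a missing trigger is not an
 * error.
 */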
1679 static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
1680 {
1681         struct kvmgt_guest_info *info;
1682         struct intel_vgpu *vgpu;
1683
1684         if (!handle_valid(handle))
1685                 return -ESRCH;
1686
1687         info = (struct kvmgt_guest_info *)handle;
1688         vgpu = info->vgpu;
1689
1690         /*
1691          * When the guest powers off, msi_trigger is set to NULL, but the
1692          * vGPU's config space and MMIO registers are not restored to their
1693          * defaults. If this vGPU is reused by the next VM, its pipes may
1694          * still be enabled, so once the vGPU becomes active it can receive
1695          * vblank interrupt injection requests. msi_trigger stays NULL until
1696          * the guest enables MSI, so if msi_trigger is NULL, return success
1697          * without injecting an interrupt into the guest.
1698          */
1699         if (vgpu->vdev.msi_trigger == NULL)
1700                 return 0;
1701
1702         if (eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1)
1703                 return 0;
1704
1705         return -EFAULT;
1706 }
1707
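/* Translate a guest frame number to a host pfn through KVM. */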
1708 static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
1709 {
1710         struct kvmgt_guest_info *info;
1711         kvm_pfn_t pfn;
1712
1713         if (!handle_valid(handle))
1714                 return INTEL_GVT_INVALID_ADDR;
1715
1716         info = (struct kvmgt_guest_info *)handle;
1717
1718         pfn = gfn_to_pfn(info->kvm, gfn);
1719         if (is_error_noslot_pfn(pfn))
1720                 return INTEL_GVT_INVALID_ADDR;
1721
1722         return pfn;
1723 }
1724
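/*
 * Map a guest page for device DMA: reuse (and ref-count) an existing
 * entry in the per-vGPU cache if there is one, otherwise create a new
 * mapping and add it to the cache.
 */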
1725 static int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
1726                 unsigned long size, dma_addr_t *dma_addr)
1727 {
1728         struct kvmgt_guest_info *info;
1729         struct intel_vgpu *vgpu;
1730         struct gvt_dma *entry;
1731         int ret;
1732
1733         if (!handle_valid(handle))
1734                 return -EINVAL;
1735
1736         info = (struct kvmgt_guest_info *)handle;
1737         vgpu = info->vgpu;
1738
1739         mutex_lock(&info->vgpu->vdev.cache_lock);
1740
1741         entry = __gvt_cache_find_gfn(info->vgpu, gfn);
1742         if (!entry) {
1743                 ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
1744                 if (ret)
1745                         goto err_unlock;
1746
1747                 ret = __gvt_cache_add(info->vgpu, gfn, *dma_addr, size);
1748                 if (ret)
1749                         goto err_unmap;
1750         } else {
1751                 kref_get(&entry->ref);
1752                 *dma_addr = entry->dma_addr;
1753         }
1754
1755         mutex_unlock(&info->vgpu->vdev.cache_lock);
1756         return 0;
1757
1758 err_unmap:
1759         gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
1760 err_unlock:
1761         mutex_unlock(&info->vgpu->vdev.cache_lock);
1762         return ret;
1763 }
1764
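/* kref release callback: unmap the page and drop the cache entry. */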
1765 static void __gvt_dma_release(struct kref *ref)
1766 {
1767         struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);
1768
1769         gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
1770                            entry->size);
1771         __gvt_cache_remove_entry(entry->vgpu, entry);
1772 }
1773
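/*
 * Drop one reference on the cache entry for @dma_addr; the mapping is
 * torn down by __gvt_dma_release() once the last user is gone.
 */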
1774 static void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
1775 {
1776         struct kvmgt_guest_info *info;
1777         struct gvt_dma *entry;
1778
1779         if (!handle_valid(handle))
1780                 return;
1781
1782         info = (struct kvmgt_guest_info *)handle;
1783
1784         mutex_lock(&info->vgpu->vdev.cache_lock);
1785         entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
1786         if (entry)
1787                 kref_put(&entry->ref, __gvt_dma_release);
1788         mutex_unlock(&info->vgpu->vdev.cache_lock);
1789 }
1790
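/*
 * Read or write guest physical memory through KVM. When called from a
 * kernel thread (current->mm == NULL), temporarily adopt the VM's mm so
 * that kvm_read_guest()/kvm_write_guest() can reach the guest memory.
 */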
1791 static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
1792                         void *buf, unsigned long len, bool write)
1793 {
1794         struct kvmgt_guest_info *info;
1795         struct kvm *kvm;
1796         int idx, ret;
1797         bool kthread = current->mm == NULL;
1798
1799         if (!handle_valid(handle))
1800                 return -ESRCH;
1801
1802         info = (struct kvmgt_guest_info *)handle;
1803         kvm = info->kvm;
1804
1805         if (kthread) {
1806                 if (!mmget_not_zero(kvm->mm))
1807                         return -EFAULT;
1808                 use_mm(kvm->mm);
1809         }
1810
1811         idx = srcu_read_lock(&kvm->srcu);
1812         ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
1813                       kvm_read_guest(kvm, gpa, buf, len);
1814         srcu_read_unlock(&kvm->srcu, idx);
1815
1816         if (kthread) {
1817                 unuse_mm(kvm->mm);
1818                 mmput(kvm->mm);
1819         }
1820
1821         return ret;
1822 }
1823
1824 static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
1825                         void *buf, unsigned long len)
1826 {
1827         return kvmgt_rw_gpa(handle, gpa, buf, len, false);
1828 }
1829
1830 static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
1831                         void *buf, unsigned long len)
1832 {
1833         return kvmgt_rw_gpa(handle, gpa, buf, len, true);
1834 }
1835
1836 static unsigned long kvmgt_virt_to_pfn(void *addr)
1837 {
1838         return PFN_DOWN(__pa(addr));
1839 }
1840
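/* Check whether @gfn is backed by a visible KVM memslot. */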
1841 static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
1842 {
1843         struct kvmgt_guest_info *info;
1844         struct kvm *kvm;
1845         int idx;
1846         bool ret;
1847
1848         if (!handle_valid(handle))
1849                 return false;
1850
1851         info = (struct kvmgt_guest_info *)handle;
1852         kvm = info->kvm;
1853
1854         idx = srcu_read_lock(&kvm->srcu);
1855         ret = kvm_is_visible_gfn(kvm, gfn);
1856         srcu_read_unlock(&kvm->srcu, idx);
1857
1858         return ret;
1859 }
1860
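/*
 * Mediated pass-through callbacks registered with the GVT core in
 * kvmgt_init(); this table makes KVM the hypervisor backend for GVT-g.
 */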
1861 static struct intel_gvt_mpt kvmgt_mpt = {
1862         .type = INTEL_GVT_HYPERVISOR_KVM,
1863         .host_init = kvmgt_host_init,
1864         .host_exit = kvmgt_host_exit,
1865         .attach_vgpu = kvmgt_attach_vgpu,
1866         .detach_vgpu = kvmgt_detach_vgpu,
1867         .inject_msi = kvmgt_inject_msi,
1868         .from_virt_to_mfn = kvmgt_virt_to_pfn,
1869         .enable_page_track = kvmgt_page_track_add,
1870         .disable_page_track = kvmgt_page_track_remove,
1871         .read_gpa = kvmgt_read_gpa,
1872         .write_gpa = kvmgt_write_gpa,
1873         .gfn_to_mfn = kvmgt_gfn_to_pfn,
1874         .dma_map_guest_page = kvmgt_dma_map_guest_page,
1875         .dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
1876         .set_opregion = kvmgt_set_opregion,
1877         .get_vfio_device = kvmgt_get_vfio_device,
1878         .put_vfio_device = kvmgt_put_vfio_device,
1879         .is_valid_gfn = kvmgt_is_valid_gfn,
1880 };
1881
1882 static int __init kvmgt_init(void)
1883 {
1884         if (intel_gvt_register_hypervisor(&kvmgt_mpt) < 0)
1885                 return -ENODEV;
1886         return 0;
1887 }
1888
1889 static void __exit kvmgt_exit(void)
1890 {
1891         intel_gvt_unregister_hypervisor();
1892 }
1893
1894 module_init(kvmgt_init);
1895 module_exit(kvmgt_exit);
1896
1897 MODULE_LICENSE("GPL and additional rights");
1898 MODULE_AUTHOR("Intel Corporation");