// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#if IS_ENABLED(CONFIG_KVM)
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/pseudo_fs.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION  "0.3"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "VFIO - User Level meta-driver"

#define VFIO_MAGIC 0x5646494f /* "VFIO" */

static struct vfio {
        struct class                    *device_class;
        struct ida                      device_ida;
        struct vfsmount                 *vfs_mount;
        int                             fs_count;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
                   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
        unsigned long idx = (unsigned long)set_id;
        struct vfio_device_set *new_dev_set;
        struct vfio_device_set *dev_set;

        if (WARN_ON(!set_id))
                return -EINVAL;

        /*
         * Atomically acquire a singleton object in the xarray for this set_id
         */
        xa_lock(&vfio_device_set_xa);
        dev_set = xa_load(&vfio_device_set_xa, idx);
        if (dev_set)
                goto found_get_ref;
        xa_unlock(&vfio_device_set_xa);

        new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
        if (!new_dev_set)
                return -ENOMEM;
        mutex_init(&new_dev_set->lock);
        INIT_LIST_HEAD(&new_dev_set->device_list);
        new_dev_set->set_id = set_id;

        xa_lock(&vfio_device_set_xa);
        dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
                               GFP_KERNEL);
        if (!dev_set) {
                dev_set = new_dev_set;
                goto found_get_ref;
        }

        kfree(new_dev_set);
        if (xa_is_err(dev_set)) {
                xa_unlock(&vfio_device_set_xa);
                return xa_err(dev_set);
        }

found_get_ref:
        dev_set->device_count++;
        xa_unlock(&vfio_device_set_xa);
        mutex_lock(&dev_set->lock);
        device->dev_set = dev_set;
        list_add_tail(&device->dev_set_list, &dev_set->device_list);
        mutex_unlock(&dev_set->lock);
        return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
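
/*
 * Illustrative, not enforced here: a driver picks a set_id that is stable
 * and shared by every device that must be handled as a unit.  A
 * multi-function PCI driver might pass its struct pci_slot pointer, for
 * example, so that all functions in the slot land in one set:
 *
 *        vfio_assign_device_set(&vdev->vdev, pdev->slot);
 *
 * Devices with no shared state skip this and receive a singleton set at
 * registration time (see __vfio_register_dev()).
 */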

static void vfio_release_device_set(struct vfio_device *device)
{
        struct vfio_device_set *dev_set = device->dev_set;

        if (!dev_set)
                return;

        mutex_lock(&dev_set->lock);
        list_del(&device->dev_set_list);
        mutex_unlock(&dev_set->lock);

        xa_lock(&vfio_device_set_xa);
        if (!--dev_set->device_count) {
                __xa_erase(&vfio_device_set_xa,
                           (unsigned long)dev_set->set_id);
                mutex_destroy(&dev_set->lock);
                kfree(dev_set);
        }
        xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
        struct vfio_device *cur;
        unsigned int open_count = 0;

        lockdep_assert_held(&dev_set->lock);

        list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
                open_count += cur->open_count;
        return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
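
/*
 * Typically called from a driver's reset path, under dev_set->lock, to
 * decide whether every device in the set is quiesced before acting on
 * the set as a whole (vfio-pci's hot reset handling is one such caller).
 */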

struct vfio_device *
vfio_find_device_in_devset(struct vfio_device_set *dev_set,
                           struct device *dev)
{
        struct vfio_device *cur;

        lockdep_assert_held(&dev_set->lock);

        list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
                if (cur->dev == dev)
                        return cur;
        return NULL;
}
EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
        if (refcount_dec_and_test(&device->refcount))
                complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
        return refcount_inc_not_zero(&device->refcount);
}

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
        struct vfio_device *device =
                        container_of(dev, struct vfio_device, device);

        vfio_release_device_set(device);
        ida_free(&vfio.device_ida, device->index);

        if (device->ops->release)
                device->ops->release(device);

        iput(device->inode);
        simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
        kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
                            const struct vfio_device_ops *ops);

/*
 * Allocate and initialize a vfio_device so it can be registered with the
 * vfio core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * The driver may provide an @init callback to initialize its device
 * private data.
 *
 * Use vfio_put_device() to release the structure after a successful
 * return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
                                       const struct vfio_device_ops *ops)
{
        struct vfio_device *device;
        int ret;

        if (WARN_ON(size < sizeof(struct vfio_device)))
                return ERR_PTR(-EINVAL);

        device = kvzalloc(size, GFP_KERNEL);
        if (!device)
                return ERR_PTR(-ENOMEM);

        ret = vfio_init_device(device, dev, ops);
        if (ret)
                goto out_free;
        return device;

out_free:
        kvfree(device);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
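
/*
 * Illustrative caller pattern; struct my_vfio_device and my_ops are
 * hypothetical.  A driver embeds struct vfio_device in its own state and
 * allocates through the vfio_alloc_device() wrapper in <linux/vfio.h>,
 * which derives @size from the containing structure:
 *
 *        struct my_vfio_device {
 *                struct vfio_device vdev;
 *                ... driver private data ...
 *        };
 *
 *        my = vfio_alloc_device(my_vfio_device, vdev, dev, &my_ops);
 *        if (IS_ERR(my))
 *                return PTR_ERR(my);
 *        ...
 *        vfio_put_device(&my->vdev);        (on a later error path)
 */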

static int vfio_fs_init_fs_context(struct fs_context *fc)
{
        return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
}

static struct file_system_type vfio_fs_type = {
        .name = "vfio",
        .owner = THIS_MODULE,
        .init_fs_context = vfio_fs_init_fs_context,
        .kill_sb = kill_anon_super,
};

static struct inode *vfio_fs_inode_new(void)
{
        struct inode *inode;
        int ret;

        ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
        if (ret)
                return ERR_PTR(ret);

        inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
        if (IS_ERR(inode))
                simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);

        return inode;
}

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
                            const struct vfio_device_ops *ops)
{
        int ret;

        ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
        if (ret < 0) {
                dev_dbg(dev, "Failed to allocate index\n");
                return ret;
        }

        device->index = ret;
        init_completion(&device->comp);
        device->dev = dev;
        device->ops = ops;
        device->inode = vfio_fs_inode_new();
        if (IS_ERR(device->inode)) {
                ret = PTR_ERR(device->inode);
                goto out_inode;
        }

        if (ops->init) {
                ret = ops->init(device);
                if (ret)
                        goto out_uninit;
        }

        device_initialize(&device->device);
        device->device.release = vfio_device_release;
        device->device.class = vfio.device_class;
        device->device.parent = device->dev;
        return 0;

out_uninit:
        iput(device->inode);
        simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
out_inode:
        vfio_release_device_set(device);
        ida_free(&vfio.device_ida, device->index);
        return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
                               enum vfio_group_type type)
{
        int ret;

        if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
                    (!device->ops->bind_iommufd ||
                     !device->ops->unbind_iommufd ||
                     !device->ops->attach_ioas ||
                     !device->ops->detach_ioas)))
                return -EINVAL;

        /*
         * If the driver doesn't specify a set then the device is added to a
         * singleton set just for itself.
         */
        if (!device->dev_set)
                vfio_assign_device_set(device, device);

        ret = dev_set_name(&device->device, "vfio%d", device->index);
        if (ret)
                return ret;

        ret = vfio_device_set_group(device, type);
        if (ret)
                return ret;

        /*
         * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
         * restore cache coherency. It has to be checked here because it is only
         * valid for cases where we are using iommu groups.
         */
        if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
            !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
                ret = -EINVAL;
                goto err_out;
        }

        ret = vfio_device_add(device);
        if (ret)
                goto err_out;

        /* Refcounting can't start until the driver calls register */
        refcount_set(&device->refcount, 1);

        vfio_device_group_register(device);
        vfio_device_debugfs_init(device);

        return 0;
err_out:
        vfio_device_remove_group(device);
        return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
        return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
        return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
        unsigned int i = 0;
        bool interrupted = false;
        long rc;

        /*
         * Prevent the device from being newly opened by userspace via
         * VFIO_GROUP_GET_DEVICE_FD in the group path.
         */
        vfio_device_group_unregister(device);

        /*
         * Balances vfio_device_add() in the register path, and also prevents
         * the device from being newly opened by userspace via the cdev path.
         */
        vfio_device_del(device);

        vfio_device_put_registration(device);
        rc = try_wait_for_completion(&device->comp);
        while (rc <= 0) {
                if (device->ops->request)
                        device->ops->request(device, i++);

                if (interrupted) {
                        rc = wait_for_completion_timeout(&device->comp,
                                                         HZ * 10);
                } else {
                        rc = wait_for_completion_interruptible_timeout(
                                &device->comp, HZ * 10);
                        if (rc < 0) {
                                interrupted = true;
                                dev_warn(device->dev,
                                         "Device is currently in use, task"
                                         " \"%s\" (%d) "
                                         "blocked until device is released",
                                         current->comm, task_pid_nr(current));
                        }
                }
        }

        vfio_device_debugfs_exit(device);
        /* Balances vfio_device_set_group in register path */
        vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

#if IS_ENABLED(CONFIG_KVM)
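/*
 * KVM symbols are resolved at runtime with symbol_get() rather than by
 * linking against kvm directly, so vfio carries no hard module dependency
 * on kvm: kvm.ko is pinned only while a device actually holds a kvm
 * reference, and kvm_get_kvm_safe() refuses a kvm whose refcount has
 * already dropped to zero.
 */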
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
        void (*pfn)(struct kvm *kvm);
        bool (*fn)(struct kvm *kvm);
        bool ret;

        lockdep_assert_held(&device->dev_set->lock);

        if (!kvm)
                return;

        pfn = symbol_get(kvm_put_kvm);
        if (WARN_ON(!pfn))
                return;

        fn = symbol_get(kvm_get_kvm_safe);
        if (WARN_ON(!fn)) {
                symbol_put(kvm_put_kvm);
                return;
        }

        ret = fn(kvm);
        symbol_put(kvm_get_kvm_safe);
        if (!ret) {
                symbol_put(kvm_put_kvm);
                return;
        }

        device->put_kvm = pfn;
        device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
        lockdep_assert_held(&device->dev_set->lock);

        if (!device->kvm)
                return;

        if (WARN_ON(!device->put_kvm))
                goto clear;

        device->put_kvm(device->kvm);
        device->put_kvm = NULL;
        symbol_put(kvm_put_kvm);

clear:
        device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
        return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device *device)
{
        struct vfio_device_file *df;

        df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
        if (!df)
                return ERR_PTR(-ENOMEM);

        df->device = device;
        spin_lock_init(&df->kvm_ref_lock);

        return df;
}

static int vfio_df_device_first_open(struct vfio_device_file *df)
{
        struct vfio_device *device = df->device;
        struct iommufd_ctx *iommufd = df->iommufd;
        int ret;

        lockdep_assert_held(&device->dev_set->lock);

        if (!try_module_get(device->dev->driver->owner))
                return -ENODEV;

        if (iommufd)
                ret = vfio_df_iommufd_bind(df);
        else
                ret = vfio_device_group_use_iommu(device);
        if (ret)
                goto err_module_put;

        if (device->ops->open_device) {
                ret = device->ops->open_device(device);
                if (ret)
                        goto err_unuse_iommu;
        }
        return 0;

err_unuse_iommu:
        if (iommufd)
                vfio_df_iommufd_unbind(df);
        else
                vfio_device_group_unuse_iommu(device);
err_module_put:
        module_put(device->dev->driver->owner);
        return ret;
}

static void vfio_df_device_last_close(struct vfio_device_file *df)
{
        struct vfio_device *device = df->device;
        struct iommufd_ctx *iommufd = df->iommufd;

        lockdep_assert_held(&device->dev_set->lock);

        if (device->ops->close_device)
                device->ops->close_device(device);
        if (iommufd)
                vfio_df_iommufd_unbind(df);
        else
                vfio_device_group_unuse_iommu(device);
        module_put(device->dev->driver->owner);
}

int vfio_df_open(struct vfio_device_file *df)
{
        struct vfio_device *device = df->device;
        int ret = 0;

        lockdep_assert_held(&device->dev_set->lock);

        /*
         * Only the group path allows the device to be opened multiple
         * times.  The device cdev path doesn't have a secure way to
         * support that.
         */
        if (device->open_count != 0 && !df->group)
                return -EINVAL;

        device->open_count++;
        if (device->open_count == 1) {
                ret = vfio_df_device_first_open(df);
                if (ret)
                        device->open_count--;
        }

        return ret;
}

void vfio_df_close(struct vfio_device_file *df)
{
        struct vfio_device *device = df->device;

        lockdep_assert_held(&device->dev_set->lock);

        vfio_assert_device_open(device);
        if (device->open_count == 1)
                vfio_df_device_last_close(df);
        device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
        struct device *dev = device->dev;

        if (dev->driver && dev->driver->pm) {
                int ret;

                ret = pm_runtime_resume_and_get(dev);
                if (ret) {
                        dev_info_ratelimited(dev,
                                "vfio: runtime resume failed %d\n", ret);
                        return -EIO;
                }
        }

        return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
        struct device *dev = device->dev;

        if (dev->driver && dev->driver->pm)
                pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
        struct vfio_device_file *df = filep->private_data;
        struct vfio_device *device = df->device;

        if (df->group)
                vfio_df_group_close(df);
        else
                vfio_df_unbind_iommufd(df);

        vfio_device_put_registration(device);

        kfree(df);

        return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
                            enum vfio_device_mig_state cur_fsm,
                            enum vfio_device_mig_state new_fsm,
                            enum vfio_device_mig_state *next_fsm)
{
        enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
        /*
         * The coding in this table requires the driver to implement the
         * following FSM arcs:
         *         RESUMING -> STOP
         *         STOP -> RESUMING
         *         STOP -> STOP_COPY
         *         STOP_COPY -> STOP
         *
         * If P2P is supported then the driver must also implement these FSM
         * arcs:
         *         RUNNING -> RUNNING_P2P
         *         RUNNING_P2P -> RUNNING
         *         RUNNING_P2P -> STOP
         *         STOP -> RUNNING_P2P
         *
         * If precopy is supported then the driver must support these additional
         * FSM arcs:
         *         RUNNING -> PRE_COPY
         *         PRE_COPY -> RUNNING
         *         PRE_COPY -> STOP_COPY
         * However, if precopy and P2P are supported together then the driver
         * must support these additional arcs beyond the P2P arcs above:
         *         PRE_COPY -> RUNNING
         *         PRE_COPY -> PRE_COPY_P2P
         *         PRE_COPY_P2P -> PRE_COPY
         *         PRE_COPY_P2P -> RUNNING_P2P
         *         PRE_COPY_P2P -> STOP_COPY
         *         RUNNING -> PRE_COPY
         *         RUNNING_P2P -> PRE_COPY_P2P
         *
         * Without P2P and precopy the driver must implement:
         *         RUNNING -> STOP
         *         STOP -> RUNNING
         *
         * The coding will step through multiple states for some combination
         * transitions; if all optional features are supported, this means the
         * following ones:
         *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
         *         PRE_COPY -> RUNNING -> RUNNING_P2P
         *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
         *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
         *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
         *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
         *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
         *         RESUMING -> STOP -> RUNNING_P2P
         *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
         *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
         *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
         *         RESUMING -> STOP -> STOP_COPY
         *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
         *         RUNNING -> RUNNING_P2P -> STOP
         *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
         *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
         *         RUNNING_P2P -> RUNNING -> PRE_COPY
         *         RUNNING_P2P -> STOP -> RESUMING
         *         RUNNING_P2P -> STOP -> STOP_COPY
         *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
         *         STOP -> RUNNING_P2P -> RUNNING
         *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
         *         STOP_COPY -> STOP -> RESUMING
         *         STOP_COPY -> STOP -> RUNNING_P2P
         *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
         *
         *  The following transitions are blocked:
         *         STOP_COPY -> PRE_COPY
         *         STOP_COPY -> PRE_COPY_P2P
         */
        static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
                [VFIO_DEVICE_STATE_STOP] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_RUNNING] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_PRE_COPY] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_STOP_COPY] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_RESUMING] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_RUNNING_P2P] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_ERROR] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
        };

        static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
                [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
                [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
                [VFIO_DEVICE_STATE_PRE_COPY] =
                        VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
                [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
                                                   VFIO_MIGRATION_P2P |
                                                   VFIO_MIGRATION_PRE_COPY,
                [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
                [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
                [VFIO_DEVICE_STATE_RUNNING_P2P] =
                        VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
                [VFIO_DEVICE_STATE_ERROR] = ~0U,
        };

        if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
                    (state_flags_table[cur_fsm] & device->migration_flags) !=
                        state_flags_table[cur_fsm]))
                return -EINVAL;

        if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
           (state_flags_table[new_fsm] & device->migration_flags) !=
                        state_flags_table[new_fsm])
                return -EINVAL;

        /*
         * Arcs touching optional and unsupported states are skipped over. The
         * driver will instead see an arc from the original state to the next
         * logical state, as per the above comment.
         */
        *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
        while ((state_flags_table[*next_fsm] & device->migration_flags) !=
                        state_flags_table[*next_fsm])
                *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

        return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
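
/*
 * Illustrative driver-side loop; my_hw_step() is a hypothetical helper.
 * A driver's migration_set_state() op typically walks the FSM one
 * reachable arc at a time until the requested state is reached:
 *
 *        while (cur != new) {
 *                ret = vfio_mig_get_next_state(vdev, cur, new, &next);
 *                if (ret)
 *                        return ERR_PTR(ret);
 *                filp = my_hw_step(vdev, next);
 *                if (IS_ERR(filp))
 *                        return filp;
 *                cur = next;
 *        }
 */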

/*
 * Convert the driver's struct file into an FD number and return it to
 * userspace.
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
                                   struct vfio_device_feature_mig_state *mig)
{
        int ret;
        int fd;

        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0) {
                ret = fd;
                goto out_fput;
        }

        mig->data_fd = fd;
        if (copy_to_user(arg, mig, sizeof(*mig))) {
                ret = -EFAULT;
                goto out_put_unused;
        }
        fd_install(fd, filp);
        return 0;

out_put_unused:
        put_unused_fd(fd);
out_fput:
        fput(filp);
        return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
                                           u32 flags, void __user *arg,
                                           size_t argsz)
{
        size_t minsz =
                offsetofend(struct vfio_device_feature_mig_state, data_fd);
        struct vfio_device_feature_mig_state mig;
        struct file *filp = NULL;
        int ret;

        if (!device->mig_ops)
                return -ENOTTY;

        ret = vfio_check_feature(flags, argsz,
                                 VFIO_DEVICE_FEATURE_SET |
                                 VFIO_DEVICE_FEATURE_GET,
                                 sizeof(mig));
        if (ret != 1)
                return ret;

        if (copy_from_user(&mig, arg, minsz))
                return -EFAULT;

        if (flags & VFIO_DEVICE_FEATURE_GET) {
                enum vfio_device_mig_state curr_state;

                ret = device->mig_ops->migration_get_state(device,
                                                           &curr_state);
                if (ret)
                        return ret;
                mig.device_state = curr_state;
                goto out_copy;
        }

        /* Handle the VFIO_DEVICE_FEATURE_SET */
        filp = device->mig_ops->migration_set_state(device, mig.device_state);
        if (IS_ERR(filp) || !filp)
                goto out_copy;

        return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
        mig.data_fd = -1;
        if (copy_to_user(arg, &mig, sizeof(mig)))
                return -EFAULT;
        if (IS_ERR(filp))
                return PTR_ERR(filp);
        return 0;
}

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
                                              u32 flags, void __user *arg,
                                              size_t argsz)
{
        struct vfio_device_feature_mig_data_size data_size = {};
        unsigned long stop_copy_length;
        int ret;

        if (!device->mig_ops)
                return -ENOTTY;

        ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
                                 sizeof(data_size));
        if (ret != 1)
                return ret;

        ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
        if (ret)
                return ret;

        data_size.stop_copy_length = stop_copy_length;
        if (copy_to_user(arg, &data_size, sizeof(data_size)))
                return -EFAULT;

        return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
                                               u32 flags, void __user *arg,
                                               size_t argsz)
{
        struct vfio_device_feature_migration mig = {
                .flags = device->migration_flags,
        };
        int ret;

        if (!device->mig_ops)
                return -ENOTTY;

        ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
                                 sizeof(mig));
        if (ret != 1)
                return ret;
        if (copy_to_user(arg, &mig, sizeof(mig)))
                return -EFAULT;
        return 0;
}

void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
                              u32 req_nodes)
{
        struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
        unsigned long min_gap, curr_gap;

        /* Special shortcut when a single range is required */
        if (req_nodes == 1) {
                unsigned long last;

                comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);

                /* Empty list */
                if (WARN_ON_ONCE(!comb_start))
                        return;

                curr = comb_start;
                while (curr) {
                        last = curr->last;
                        prev = curr;
                        curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
                        if (prev != comb_start)
                                interval_tree_remove(prev, root);
                }
                comb_start->last = last;
                return;
        }

        /* Combine ranges which have the smallest gap */
        while (cur_nodes > req_nodes) {
                prev = NULL;
                min_gap = ULONG_MAX;
                curr = interval_tree_iter_first(root, 0, ULONG_MAX);
                while (curr) {
                        if (prev) {
                                curr_gap = curr->start - prev->last;
                                if (curr_gap < min_gap) {
                                        min_gap = curr_gap;
                                        comb_start = prev;
                                        comb_end = curr;
                                }
                        }
                        prev = curr;
                        curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
                }

                /* Empty list or no nodes to combine */
                if (WARN_ON_ONCE(min_gap == ULONG_MAX))
                        break;

                comb_start->last = comb_end->last;
                interval_tree_remove(comb_end, root);
                cur_nodes--;
        }
}
EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
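
/*
 * Worked example: with cur_nodes = 3 covering [0x0, 0xFFF],
 * [0x3000, 0x3FFF] and [0x4000, 0x4FFF], and req_nodes = 2, the gap
 * between the second and third ranges (1) is smaller than the gap
 * between the first and second (0x2001), so the last two merge into
 * [0x3000, 0x4FFF], leaving the requested two nodes.
 */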

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
        (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
                                        u32 flags, void __user *arg,
                                        size_t argsz)
{
        size_t minsz =
                offsetofend(struct vfio_device_feature_dma_logging_control,
                            ranges);
        struct vfio_device_feature_dma_logging_range __user *ranges;
        struct vfio_device_feature_dma_logging_control control;
        struct vfio_device_feature_dma_logging_range range;
        struct rb_root_cached root = RB_ROOT_CACHED;
        struct interval_tree_node *nodes;
        u64 iova_end;
        u32 nnodes;
        int i, ret;

        if (!device->log_ops)
                return -ENOTTY;

        ret = vfio_check_feature(flags, argsz,
                                 VFIO_DEVICE_FEATURE_SET,
                                 sizeof(control));
        if (ret != 1)
                return ret;

        if (copy_from_user(&control, arg, minsz))
                return -EFAULT;

        nnodes = control.num_ranges;
        if (!nnodes)
                return -EINVAL;

        if (nnodes > LOG_MAX_RANGES)
                return -E2BIG;

        ranges = u64_to_user_ptr(control.ranges);
        nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
                              GFP_KERNEL);
        if (!nodes)
                return -ENOMEM;

        for (i = 0; i < nnodes; i++) {
                if (copy_from_user(&range, &ranges[i], sizeof(range))) {
                        ret = -EFAULT;
                        goto end;
                }
                if (!IS_ALIGNED(range.iova, control.page_size) ||
                    !IS_ALIGNED(range.length, control.page_size)) {
                        ret = -EINVAL;
                        goto end;
                }

                if (check_add_overflow(range.iova, range.length, &iova_end) ||
                    iova_end > ULONG_MAX) {
                        ret = -EOVERFLOW;
                        goto end;
                }

                nodes[i].start = range.iova;
                nodes[i].last = range.iova + range.length - 1;
                if (interval_tree_iter_first(&root, nodes[i].start,
                                             nodes[i].last)) {
                        /* Range overlapping */
                        ret = -EINVAL;
                        goto end;
                }
                interval_tree_insert(nodes + i, &root);
        }

        ret = device->log_ops->log_start(device, &root, nnodes,
                                         &control.page_size);
        if (ret)
                goto end;

        if (copy_to_user(arg, &control, sizeof(control))) {
                ret = -EFAULT;
                device->log_ops->log_stop(device);
        }

end:
        kfree(nodes);
        return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
                                       u32 flags, void __user *arg,
                                       size_t argsz)
{
        int ret;

        if (!device->log_ops)
                return -ENOTTY;

        ret = vfio_check_feature(flags, argsz,
                                 VFIO_DEVICE_FEATURE_SET, 0);
        if (ret != 1)
                return ret;

        return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
                                          unsigned long iova, size_t length,
                                          void *opaque)
{
        struct vfio_device *device = opaque;

        return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
                                         u32 flags, void __user *arg,
                                         size_t argsz)
{
        size_t minsz =
                offsetofend(struct vfio_device_feature_dma_logging_report,
                            bitmap);
        struct vfio_device_feature_dma_logging_report report;
        struct iova_bitmap *iter;
        u64 iova_end;
        int ret;

        if (!device->log_ops)
                return -ENOTTY;

        ret = vfio_check_feature(flags, argsz,
                                 VFIO_DEVICE_FEATURE_GET,
                                 sizeof(report));
        if (ret != 1)
                return ret;

        if (copy_from_user(&report, arg, minsz))
                return -EFAULT;

        if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
                return -EINVAL;

        if (check_add_overflow(report.iova, report.length, &iova_end) ||
            iova_end > ULONG_MAX)
                return -EOVERFLOW;

        iter = iova_bitmap_alloc(report.iova, report.length,
                                 report.page_size,
                                 u64_to_user_ptr(report.bitmap));
        if (IS_ERR(iter))
                return PTR_ERR(iter);

        ret = iova_bitmap_for_each(iter, device,
                                   vfio_device_log_read_and_clear);

        iova_bitmap_free(iter);
        return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
                                     struct vfio_device_feature __user *arg)
{
        size_t minsz = offsetofend(struct vfio_device_feature, flags);
        struct vfio_device_feature feature;

        if (copy_from_user(&feature, arg, minsz))
                return -EFAULT;

        if (feature.argsz < minsz)
                return -EINVAL;

        /* Check unknown flags */
        if (feature.flags &
            ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
              VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
                return -EINVAL;

        /* GET & SET are mutually exclusive except with PROBE */
        if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
            (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
            (feature.flags & VFIO_DEVICE_FEATURE_GET))
                return -EINVAL;

        switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
        case VFIO_DEVICE_FEATURE_MIGRATION:
                return vfio_ioctl_device_feature_migration(
                        device, feature.flags, arg->data,
                        feature.argsz - minsz);
        case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
                return vfio_ioctl_device_feature_mig_device_state(
                        device, feature.flags, arg->data,
                        feature.argsz - minsz);
        case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
                return vfio_ioctl_device_feature_logging_start(
                        device, feature.flags, arg->data,
                        feature.argsz - minsz);
        case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
                return vfio_ioctl_device_feature_logging_stop(
                        device, feature.flags, arg->data,
                        feature.argsz - minsz);
        case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
                return vfio_ioctl_device_feature_logging_report(
                        device, feature.flags, arg->data,
                        feature.argsz - minsz);
        case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
                return vfio_ioctl_device_feature_migration_data_size(
                        device, feature.flags, arg->data,
                        feature.argsz - minsz);
        default:
                if (unlikely(!device->ops->device_feature))
                        return -EINVAL;
                return device->ops->device_feature(device, feature.flags,
                                                   arg->data,
                                                   feature.argsz - minsz);
        }
}
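
/*
 * For reference, the uapi argument this parses is laid out in
 * include/uapi/linux/vfio.h as:
 *
 *        struct vfio_device_feature {
 *                __u32 argsz;
 *                __u32 flags;        - feature selector plus GET/SET/PROBE
 *                __u8  data[];       - feature-specific payload
 *        };
 *
 * A PROBE of a supported feature makes vfio_check_feature() return 0
 * before data[] is touched, which is how userspace discovers support
 * without committing to a GET or SET.
 */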

static long vfio_device_fops_unl_ioctl(struct file *filep,
                                       unsigned int cmd, unsigned long arg)
{
        struct vfio_device_file *df = filep->private_data;
        struct vfio_device *device = df->device;
        void __user *uptr = (void __user *)arg;
        int ret;

        if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
                return vfio_df_ioctl_bind_iommufd(df, uptr);

        /* Paired with smp_store_release() following vfio_df_open() */
        if (!smp_load_acquire(&df->access_granted))
                return -EINVAL;

        ret = vfio_device_pm_runtime_get(device);
        if (ret)
                return ret;

        /* cdev only ioctls */
        if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
                switch (cmd) {
                case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
                        ret = vfio_df_ioctl_attach_pt(df, uptr);
                        goto out;

                case VFIO_DEVICE_DETACH_IOMMUFD_PT:
                        ret = vfio_df_ioctl_detach_pt(df, uptr);
                        goto out;
                }
        }

        switch (cmd) {
        case VFIO_DEVICE_FEATURE:
                ret = vfio_ioctl_device_feature(device, uptr);
                break;

        default:
                if (unlikely(!device->ops->ioctl))
                        ret = -EINVAL;
                else
                        ret = device->ops->ioctl(device, cmd, arg);
                break;
        }
out:
        vfio_device_pm_runtime_put(device);
        return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
                                     size_t count, loff_t *ppos)
{
        struct vfio_device_file *df = filep->private_data;
        struct vfio_device *device = df->device;

        /* Paired with smp_store_release() following vfio_df_open() */
        if (!smp_load_acquire(&df->access_granted))
                return -EINVAL;

        if (unlikely(!device->ops->read))
                return -EINVAL;

        return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
                                      const char __user *buf,
                                      size_t count, loff_t *ppos)
{
        struct vfio_device_file *df = filep->private_data;
        struct vfio_device *device = df->device;

        /* Paired with smp_store_release() following vfio_df_open() */
        if (!smp_load_acquire(&df->access_granted))
                return -EINVAL;

        if (unlikely(!device->ops->write))
                return -EINVAL;

        return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
        struct vfio_device_file *df = filep->private_data;
        struct vfio_device *device = df->device;

        /* Paired with smp_store_release() following vfio_df_open() */
        if (!smp_load_acquire(&df->access_granted))
                return -EINVAL;

        if (unlikely(!device->ops->mmap))
                return -EINVAL;

        return device->ops->mmap(device, vma);
}

const struct file_operations vfio_device_fops = {
        .owner          = THIS_MODULE,
        .open           = vfio_device_fops_cdev_open,
        .release        = vfio_device_fops_release,
        .read           = vfio_device_fops_read,
        .write          = vfio_device_fops_write,
        .unlocked_ioctl = vfio_device_fops_unl_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
        .mmap           = vfio_device_fops_mmap,
};

static struct vfio_device *vfio_device_from_file(struct file *file)
{
        struct vfio_device_file *df = file->private_data;

        if (file->f_op != &vfio_device_fops)
                return NULL;
        return df->device;
}

/**
 * vfio_file_is_valid - True if the file is a valid vfio file
 * @file: VFIO group file or VFIO device file
 */
bool vfio_file_is_valid(struct file *file)
{
        return vfio_group_from_file(file) ||
               vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file or VFIO device file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
        struct vfio_device *device;
        struct vfio_group *group;

        group = vfio_group_from_file(file);
        if (group)
                return vfio_group_enforced_coherent(group);

        device = vfio_device_from_file(file);
        if (device)
                return device_iommu_capable(device->dev,
                                            IOMMU_CAP_ENFORCE_CACHE_COHERENCY);

        return true;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
        struct vfio_device_file *df = file->private_data;

        /*
         * The kvm is first recorded in the vfio_device_file, and will
         * be propagated to vfio_device::kvm when the file is bound to
         * iommufd successfully in the vfio device cdev path.
         */
        spin_lock(&df->kvm_ref_lock);
        df->kvm = kvm;
        spin_unlock(&df->kvm_ref_lock);
}

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
        struct vfio_group *group;

        group = vfio_group_from_file(file);
        if (group)
                vfio_group_set_kvm(group, kvm);

        if (vfio_device_from_file(file))
                vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities: allocate or
 * reallocate the buffer with additional @size, filling in @id and @version
 * of the new capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail; vfio_info_cap_shift() should be called to fix up the
 * next offsets prior to copying to the user buffer.
 */
1462 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1463                                                size_t size, u16 id, u16 version)
1464 {
1465         void *buf;
1466         struct vfio_info_cap_header *header, *tmp;
1467
1468         /* Ensure that the next capability struct will be aligned */
1469         size = ALIGN(size, sizeof(u64));
1470
1471         buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1472         if (!buf) {
1473                 kfree(caps->buf);
1474                 caps->buf = NULL;
1475                 caps->size = 0;
1476                 return ERR_PTR(-ENOMEM);
1477         }
1478
1479         caps->buf = buf;
1480         header = buf + caps->size;
1481
1482         /* Eventually copied to user buffer, zero */
1483         memset(header, 0, size);
1484
1485         header->id = id;
1486         header->version = version;
1487
1488         /* Add to the end of the capability chain */
1489         for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1490                 ; /* nothing */
1491
1492         tmp->next = caps->size;
1493         caps->size += size;
1494
1495         return header;
1496 }
1497 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1498
1499 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1500 {
1501         struct vfio_info_cap_header *tmp;
1502         void *buf = (void *)caps->buf;
1503
1504         /* Capability structs should start with proper alignment */
1505         WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1506
1507         for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1508                 tmp->next += offset;
1509 }
1510 EXPORT_SYMBOL(vfio_info_cap_shift);
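
/*
 * Hedged usage sketch (illustrative only, not built): how a driver's
 * VFIO_DEVICE_GET_REGION_INFO handler might build a capability chain and
 * copy it back to userspace.  MY_CAP_ID and example_fill_region_caps()
 * are hypothetical; the argsz/cap_offset handling follows the pattern
 * used by vfio-pci.
 */
#if 0
static int example_fill_region_caps(struct vfio_region_info *info,
				    void __user *arg)
{
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	struct vfio_info_cap_header *header;
	int ret = 0;

	header = vfio_info_cap_add(&caps, sizeof(*header), MY_CAP_ID, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	if (caps.size) {
		info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info->argsz < sizeof(*info) + caps.size) {
			/* User buffer too small: report the required size */
			info->argsz = sizeof(*info) + caps.size;
			info->cap_offset = 0;
		} else {
			/* Chain offsets are buffer-relative; rebase them */
			vfio_info_cap_shift(&caps, sizeof(*info));
			if (copy_to_user(arg + sizeof(*info),
					 caps.buf, caps.size))
				ret = -EFAULT;
			else
				info->cap_offset = sizeof(*info);
		}
	}
	kfree(caps.buf);
	return ret;
}
#endif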
1511
1512 int vfio_info_add_capability(struct vfio_info_cap *caps,
1513                              struct vfio_info_cap_header *cap, size_t size)
1514 {
1515         struct vfio_info_cap_header *header;
1516
1517         header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1518         if (IS_ERR(header))
1519                 return PTR_ERR(header);
1520
1521         memcpy(header + 1, cap + 1, size - sizeof(*header));
1522
1523         return 0;
1524 }
1525 EXPORT_SYMBOL(vfio_info_add_capability);
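
/*
 * Hedged sketch (illustrative only, not built): vfio_info_add_capability()
 * copies an already-built capability into the chain.  struct example_cap
 * and EXAMPLE_CAP_ID are hypothetical; the header must be the first member
 * so the payload follows it contiguously.
 */
#if 0
struct example_cap {
	struct vfio_info_cap_header header;	/* must come first */
	__u32 flags;
	__u32 data;
};

static int example_add_cap(struct vfio_info_cap *caps)
{
	struct example_cap cap = {
		.header.id = EXAMPLE_CAP_ID,
		.header.version = 1,
		.data = 42,
	};

	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
}
#endif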
1526
1527 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1528                                        int max_irq_type, size_t *data_size)
1529 {
1530         unsigned long minsz;
1531         size_t size;
1532
1533         minsz = offsetofend(struct vfio_irq_set, count);
1534
1535         if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1536             (hdr->count >= (U32_MAX - hdr->start)) ||
1537             (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1538                                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1539                 return -EINVAL;
1540
1541         if (data_size)
1542                 *data_size = 0;
1543
1544         if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1545                 return -EINVAL;
1546
1547         switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1548         case VFIO_IRQ_SET_DATA_NONE:
1549                 size = 0;
1550                 break;
1551         case VFIO_IRQ_SET_DATA_BOOL:
1552                 size = sizeof(uint8_t);
1553                 break;
1554         case VFIO_IRQ_SET_DATA_EVENTFD:
1555                 size = sizeof(int32_t);
1556                 break;
1557         default:
1558                 return -EINVAL;
1559         }
1560
1561         if (size) {
1562                 if (hdr->argsz - minsz < hdr->count * size)
1563                         return -EINVAL;
1564
1565                 if (!data_size)
1566                         return -EINVAL;
1567
1568                 *data_size = hdr->count * size;
1569         }
1570
1571         return 0;
1572 }
1573 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
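
/*
 * Hedged sketch of a driver's VFIO_DEVICE_SET_IRQS path (illustrative
 * only, not built), loosely following vfio-pci: validate the header, then
 * pull in the variable-sized payload.  EXAMPLE_NUM_IRQS,
 * EXAMPLE_NUM_IRQ_TYPES and example_set_irqs() are hypothetical.
 */
#if 0
static long example_set_irqs(struct vfio_device *vdev, unsigned long arg)
{
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	struct vfio_irq_set hdr;
	size_t data_size = 0;
	u8 *data = NULL;
	int ret;

	if (copy_from_user(&hdr, (void __user *)arg, minsz))
		return -EFAULT;

	ret = vfio_set_irqs_validate_and_prepare(&hdr, EXAMPLE_NUM_IRQS,
						 EXAMPLE_NUM_IRQ_TYPES,
						 &data_size);
	if (ret)
		return ret;

	if (data_size) {
		data = memdup_user((void __user *)(arg + minsz), data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	/* ... program IRQs from hdr.index/start/count/flags and data ... */

	kfree(data);
	return ret;
}
#endif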
1574
1575 /*
1576  * Pin contiguous user pages and return their associated host pages for local
1577  * domain only.
1578  * @device [in]  : device
1579  * @iova [in]    : starting IOVA of user pages to be pinned.
1580  * @npage [in]   : count of pages to be pinned.  This count should not
1581  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1582  * @prot [in]    : protection flags
1583  * @pages[out]   : array of host pages
1584  * Return the number of pages pinned on success or a negative error code.
1585  *
1586  * A driver may only call this function if the vfio_device was created
1587  * by vfio_register_emulated_iommu_dev(); see vfio_device_container_pin_pages().
1588  */
1589 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1590                    int npage, int prot, struct page **pages)
1591 {
1592         /* group->container cannot change while a vfio device is open */
1593         if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1594                 return -EINVAL;
1595         if (!device->ops->dma_unmap)
1596                 return -EINVAL;
1597         if (vfio_device_has_container(device))
1598                 return vfio_device_container_pin_pages(device, iova,
1599                                                        npage, prot, pages);
1600         if (device->iommufd_access) {
1601                 int ret;
1602
1603                 if (iova > ULONG_MAX)
1604                         return -EINVAL;
1605                 /*
1606                  * VFIO ignores the sub-page offset; npage counts PAGE_SIZE
1607                  * chunks starting from the page containing @iova. The caller
1608                  * is expected to recover the sub-page offset by doing:
1609                  *     pages[0] + (iova % PAGE_SIZE)
1610                  */
1611                 ret = iommufd_access_pin_pages(
1612                         device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1613                         npage * PAGE_SIZE, pages,
1614                         (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1615                 if (ret)
1616                         return ret;
1617                 return npage;
1618         }
1619         return -EINVAL;
1620 }
1621 EXPORT_SYMBOL(vfio_pin_pages);
1622
1623 /*
1624  * Unpin contiguous host pages for local domain only.
1625  * @device [in]  : device
1626  * @iova [in]    : starting IOVA of user pages to be unpinned.
1627  * @npage [in]   : count of pages to be unpinned.  This count should not
1628  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1629  */
1630 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1631 {
1632         if (WARN_ON(!vfio_assert_device_open(device)))
1633                 return;
1634         if (WARN_ON(!device->ops->dma_unmap))
1635                 return;
1636
1637         if (vfio_device_has_container(device)) {
1638                 vfio_device_container_unpin_pages(device, iova, npage);
1639                 return;
1640         }
1641         if (device->iommufd_access) {
1642                 if (WARN_ON(iova > ULONG_MAX))
1643                         return;
1644                 iommufd_access_unpin_pages(device->iommufd_access,
1645                                            ALIGN_DOWN(iova, PAGE_SIZE),
1646                                            npage * PAGE_SIZE);
1647                 return;
1648         }
1649 }
1650 EXPORT_SYMBOL(vfio_unpin_pages);
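
/*
 * Hedged sketch (illustrative only, not built): pin a single page, recover
 * the sub-page offset as the comment in vfio_pin_pages() describes, and
 * unpin when done.  Every successful vfio_pin_pages() must be balanced by
 * vfio_unpin_pages() with the same iova/npage.  example_peek_byte() is
 * hypothetical; kmap_local_page() would additionally require
 * <linux/highmem.h>.
 */
#if 0
static int example_peek_byte(struct vfio_device *device, dma_addr_t iova,
			     u8 *val)
{
	struct page *page;
	void *vaddr;
	int ret;

	ret = vfio_pin_pages(device, iova, 1, IOMMU_READ, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* The pinned page covers the PAGE_SIZE chunk containing @iova */
	vaddr = kmap_local_page(page);
	*val = *((u8 *)vaddr + (iova % PAGE_SIZE));
	kunmap_local(vaddr);

	vfio_unpin_pages(device, iova, 1);
	return 0;
}
#endif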
1651
1652 /*
1653  * This interface allows the CPU to perform virtual DMA on behalf of the
1654  * device.
1655  *
1656  * The CPU reads from or writes to a range of IOVAs backed by user space
1657  * memory, copying out of or into a kernel buffer.
1658  *
1659  * As the access to user space memory is performed by the CPU and is not
1660  * real device DMA, it is not necessary to pin the user space memory.
1661  *
1662  * @device [in]         : VFIO device
1663  * @iova [in]           : base IOVA of a user space buffer
1664  * @data [in]           : pointer to kernel buffer
1665  * @len [in]            : kernel buffer length
1666  * @write [in]          : true for a write, false for a read
1667  * Return error code on failure or 0 on success.
1668  */
1669 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1670                 size_t len, bool write)
1671 {
1672         if (!data || len <= 0 || !vfio_assert_device_open(device))
1673                 return -EINVAL;
1674
1675         if (vfio_device_has_container(device))
1676                 return vfio_device_container_dma_rw(device, iova,
1677                                                     data, len, write);
1678
1679         if (device->iommufd_access) {
1680                 unsigned int flags = 0;
1681
1682                 if (iova > ULONG_MAX)
1683                         return -EINVAL;
1684
1685                 /* VFIO historically tries to auto-detect a kthread */
1686                 if (!current->mm)
1687                         flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1688                 if (write)
1689                         flags |= IOMMUFD_ACCESS_RW_WRITE;
1690                 return iommufd_access_rw(device->iommufd_access, iova, data,
1691                                          len, flags);
1692         }
1693         return -EINVAL;
1694 }
1695 EXPORT_SYMBOL(vfio_dma_rw);
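
/*
 * Hedged sketch (illustrative only, not built): use vfio_dma_rw() to read
 * a device descriptor out of user IOVA space and write a completion status
 * back.  struct example_desc and example_handle_desc() are hypothetical.
 */
#if 0
struct example_desc {
	__le64 addr;
	__le32 len;
	__le32 status;
};

static int example_handle_desc(struct vfio_device *device, dma_addr_t iova)
{
	struct example_desc desc;
	int ret;

	/* Read the descriptor from the IOVA (write=false) */
	ret = vfio_dma_rw(device, iova, &desc, sizeof(desc), false);
	if (ret)
		return ret;

	/* ... process the descriptor ... */

	/* Write back just the status field (write=true) */
	desc.status = cpu_to_le32(1);
	return vfio_dma_rw(device,
			   iova + offsetof(struct example_desc, status),
			   &desc.status, sizeof(desc.status), true);
}
#endif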
1696
1697 /*
1698  * Module/class support
1699  */
1700 static int __init vfio_init(void)
1701 {
1702         int ret;
1703
1704         ida_init(&vfio.device_ida);
1705
1706         ret = vfio_group_init();
1707         if (ret)
1708                 return ret;
1709
1710         ret = vfio_virqfd_init();
1711         if (ret)
1712                 goto err_virqfd;
1713
1714         /* /sys/class/vfio-dev/vfioX */
1715         vfio.device_class = class_create("vfio-dev");
1716         if (IS_ERR(vfio.device_class)) {
1717                 ret = PTR_ERR(vfio.device_class);
1718                 goto err_dev_class;
1719         }
1720
1721         ret = vfio_cdev_init(vfio.device_class);
1722         if (ret)
1723                 goto err_alloc_dev_chrdev;
1724
1725         vfio_debugfs_create_root();
1726         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1727         return 0;
1728
1729 err_alloc_dev_chrdev:
1730         class_destroy(vfio.device_class);
1731         vfio.device_class = NULL;
1732 err_dev_class:
1733         vfio_virqfd_exit();
1734 err_virqfd:
1735         vfio_group_cleanup();
1736         return ret;
1737 }
1738
1739 static void __exit vfio_cleanup(void)
1740 {
1741         vfio_debugfs_remove_root();
1742         ida_destroy(&vfio.device_ida);
1743         vfio_cdev_cleanup();
1744         class_destroy(vfio.device_class);
1745         vfio.device_class = NULL;
1746         vfio_virqfd_exit();
1747         vfio_group_cleanup();
1748         xa_destroy(&vfio_device_set_xa);
1749 }
1750
1751 module_init(vfio_init);
1752 module_exit(vfio_cleanup);
1753
1754 MODULE_IMPORT_NS(IOMMUFD);
1755 MODULE_VERSION(DRIVER_VERSION);
1756 MODULE_LICENSE("GPL v2");
1757 MODULE_AUTHOR(DRIVER_AUTHOR);
1758 MODULE_DESCRIPTION(DRIVER_DESC);
1759 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");