Merge branch 'for-6.9/amd-sfh' into for-linus
[sfrench/cifs-2.6.git] / drivers / vfio / vfio_main.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #ifdef CONFIG_HAVE_KVM
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mutex.h>
26 #include <linux/pci.h>
27 #include <linux/rwsem.h>
28 #include <linux/sched.h>
29 #include <linux/slab.h>
30 #include <linux/stat.h>
31 #include <linux/string.h>
32 #include <linux/uaccess.h>
33 #include <linux/vfio.h>
34 #include <linux/wait.h>
35 #include <linux/sched/signal.h>
36 #include <linux/pm_runtime.h>
37 #include <linux/interval_tree.h>
38 #include <linux/iova_bitmap.h>
39 #include <linux/iommufd.h>
40 #include "vfio.h"
41
42 #define DRIVER_VERSION  "0.3"
43 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC     "VFIO - User Level meta-driver"
45
46 static struct vfio {
47         struct class                    *device_class;
48         struct ida                      device_ida;
49 } vfio;
50
51 #ifdef CONFIG_VFIO_NOIOMMU
52 bool vfio_noiommu __read_mostly;
53 module_param_named(enable_unsafe_noiommu_mode,
54                    vfio_noiommu, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
56 #endif
57
58 static DEFINE_XARRAY(vfio_device_set_xa);
59
60 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
61 {
62         unsigned long idx = (unsigned long)set_id;
63         struct vfio_device_set *new_dev_set;
64         struct vfio_device_set *dev_set;
65
66         if (WARN_ON(!set_id))
67                 return -EINVAL;
68
69         /*
70          * Atomically acquire a singleton object in the xarray for this set_id
71          */
72         xa_lock(&vfio_device_set_xa);
73         dev_set = xa_load(&vfio_device_set_xa, idx);
74         if (dev_set)
75                 goto found_get_ref;
76         xa_unlock(&vfio_device_set_xa);
77
78         new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
79         if (!new_dev_set)
80                 return -ENOMEM;
81         mutex_init(&new_dev_set->lock);
82         INIT_LIST_HEAD(&new_dev_set->device_list);
83         new_dev_set->set_id = set_id;
84
85         xa_lock(&vfio_device_set_xa);
86         dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
87                                GFP_KERNEL);
88         if (!dev_set) {
89                 dev_set = new_dev_set;
90                 goto found_get_ref;
91         }
92
93         kfree(new_dev_set);
94         if (xa_is_err(dev_set)) {
95                 xa_unlock(&vfio_device_set_xa);
96                 return xa_err(dev_set);
97         }
98
99 found_get_ref:
100         dev_set->device_count++;
101         xa_unlock(&vfio_device_set_xa);
102         mutex_lock(&dev_set->lock);
103         device->dev_set = dev_set;
104         list_add_tail(&device->dev_set_list, &dev_set->device_list);
105         mutex_unlock(&dev_set->lock);
106         return 0;
107 }
108 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
109
110 static void vfio_release_device_set(struct vfio_device *device)
111 {
112         struct vfio_device_set *dev_set = device->dev_set;
113
114         if (!dev_set)
115                 return;
116
117         mutex_lock(&dev_set->lock);
118         list_del(&device->dev_set_list);
119         mutex_unlock(&dev_set->lock);
120
121         xa_lock(&vfio_device_set_xa);
122         if (!--dev_set->device_count) {
123                 __xa_erase(&vfio_device_set_xa,
124                            (unsigned long)dev_set->set_id);
125                 mutex_destroy(&dev_set->lock);
126                 kfree(dev_set);
127         }
128         xa_unlock(&vfio_device_set_xa);
129 }
130
131 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
132 {
133         struct vfio_device *cur;
134         unsigned int open_count = 0;
135
136         lockdep_assert_held(&dev_set->lock);
137
138         list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
139                 open_count += cur->open_count;
140         return open_count;
141 }
142 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
143
144 struct vfio_device *
145 vfio_find_device_in_devset(struct vfio_device_set *dev_set,
146                            struct device *dev)
147 {
148         struct vfio_device *cur;
149
150         lockdep_assert_held(&dev_set->lock);
151
152         list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
153                 if (cur->dev == dev)
154                         return cur;
155         return NULL;
156 }
157 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
158
159 /*
160  * Device objects - create, release, get, put, search
161  */
162 /* Device reference always implies a group reference */
163 void vfio_device_put_registration(struct vfio_device *device)
164 {
165         if (refcount_dec_and_test(&device->refcount))
166                 complete(&device->comp);
167 }
168
169 bool vfio_device_try_get_registration(struct vfio_device *device)
170 {
171         return refcount_inc_not_zero(&device->refcount);
172 }
173
174 /*
175  * VFIO driver API
176  */
177 /* Release helper called by vfio_put_device() */
178 static void vfio_device_release(struct device *dev)
179 {
180         struct vfio_device *device =
181                         container_of(dev, struct vfio_device, device);
182
183         vfio_release_device_set(device);
184         ida_free(&vfio.device_ida, device->index);
185
186         if (device->ops->release)
187                 device->ops->release(device);
188
189         kvfree(device);
190 }
191
192 static int vfio_init_device(struct vfio_device *device, struct device *dev,
193                             const struct vfio_device_ops *ops);
194
195 /*
196  * Allocate and initialize vfio_device so it can be registered to vfio
197  * core.
198  *
199  * Drivers should use the wrapper vfio_alloc_device() for allocation.
200  * @size is the size of the structure to be allocated, including any
201  * private data used by the driver.
202  *
203  * Driver may provide an @init callback to cover device private data.
204  *
205  * Use vfio_put_device() to release the structure after success return.
206  */
207 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
208                                        const struct vfio_device_ops *ops)
209 {
210         struct vfio_device *device;
211         int ret;
212
213         if (WARN_ON(size < sizeof(struct vfio_device)))
214                 return ERR_PTR(-EINVAL);
215
216         device = kvzalloc(size, GFP_KERNEL);
217         if (!device)
218                 return ERR_PTR(-ENOMEM);
219
220         ret = vfio_init_device(device, dev, ops);
221         if (ret)
222                 goto out_free;
223         return device;
224
225 out_free:
226         kvfree(device);
227         return ERR_PTR(ret);
228 }
229 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
230
231 /*
232  * Initialize a vfio_device so it can be registered to vfio core.
233  */
234 static int vfio_init_device(struct vfio_device *device, struct device *dev,
235                             const struct vfio_device_ops *ops)
236 {
237         int ret;
238
239         ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
240         if (ret < 0) {
241                 dev_dbg(dev, "Error to alloc index\n");
242                 return ret;
243         }
244
245         device->index = ret;
246         init_completion(&device->comp);
247         device->dev = dev;
248         device->ops = ops;
249
250         if (ops->init) {
251                 ret = ops->init(device);
252                 if (ret)
253                         goto out_uninit;
254         }
255
256         device_initialize(&device->device);
257         device->device.release = vfio_device_release;
258         device->device.class = vfio.device_class;
259         device->device.parent = device->dev;
260         return 0;
261
262 out_uninit:
263         vfio_release_device_set(device);
264         ida_free(&vfio.device_ida, device->index);
265         return ret;
266 }
267
268 static int __vfio_register_dev(struct vfio_device *device,
269                                enum vfio_group_type type)
270 {
271         int ret;
272
273         if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
274                     (!device->ops->bind_iommufd ||
275                      !device->ops->unbind_iommufd ||
276                      !device->ops->attach_ioas ||
277                      !device->ops->detach_ioas)))
278                 return -EINVAL;
279
280         /*
281          * If the driver doesn't specify a set then the device is added to a
282          * singleton set just for itself.
283          */
284         if (!device->dev_set)
285                 vfio_assign_device_set(device, device);
286
287         ret = dev_set_name(&device->device, "vfio%d", device->index);
288         if (ret)
289                 return ret;
290
291         ret = vfio_device_set_group(device, type);
292         if (ret)
293                 return ret;
294
295         /*
296          * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
297          * restore cache coherency. It has to be checked here because it is only
298          * valid for cases where we are using iommu groups.
299          */
300         if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
301             !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
302                 ret = -EINVAL;
303                 goto err_out;
304         }
305
306         ret = vfio_device_add(device);
307         if (ret)
308                 goto err_out;
309
310         /* Refcounting can't start until the driver calls register */
311         refcount_set(&device->refcount, 1);
312
313         vfio_device_group_register(device);
314         vfio_device_debugfs_init(device);
315
316         return 0;
317 err_out:
318         vfio_device_remove_group(device);
319         return ret;
320 }
321
322 int vfio_register_group_dev(struct vfio_device *device)
323 {
324         return __vfio_register_dev(device, VFIO_IOMMU);
325 }
326 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
327
328 /*
329  * Register a virtual device without IOMMU backing.  The user of this
330  * device must not be able to directly trigger unmediated DMA.
331  */
332 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
333 {
334         return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
335 }
336 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
337
338 /*
339  * Decrement the device reference count and wait for the device to be
340  * removed.  Open file descriptors for the device... */
341 void vfio_unregister_group_dev(struct vfio_device *device)
342 {
343         unsigned int i = 0;
344         bool interrupted = false;
345         long rc;
346
347         /*
348          * Prevent new device opened by userspace via the
349          * VFIO_GROUP_GET_DEVICE_FD in the group path.
350          */
351         vfio_device_group_unregister(device);
352
353         /*
354          * Balances vfio_device_add() in register path, also prevents
355          * new device opened by userspace in the cdev path.
356          */
357         vfio_device_del(device);
358
359         vfio_device_put_registration(device);
360         rc = try_wait_for_completion(&device->comp);
361         while (rc <= 0) {
362                 if (device->ops->request)
363                         device->ops->request(device, i++);
364
365                 if (interrupted) {
366                         rc = wait_for_completion_timeout(&device->comp,
367                                                          HZ * 10);
368                 } else {
369                         rc = wait_for_completion_interruptible_timeout(
370                                 &device->comp, HZ * 10);
371                         if (rc < 0) {
372                                 interrupted = true;
373                                 dev_warn(device->dev,
374                                          "Device is currently in use, task"
375                                          " \"%s\" (%d) "
376                                          "blocked until device is released",
377                                          current->comm, task_pid_nr(current));
378                         }
379                 }
380         }
381
382         vfio_device_debugfs_exit(device);
383         /* Balances vfio_device_set_group in register path */
384         vfio_device_remove_group(device);
385 }
386 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
387
388 #ifdef CONFIG_HAVE_KVM
389 void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
390 {
391         void (*pfn)(struct kvm *kvm);
392         bool (*fn)(struct kvm *kvm);
393         bool ret;
394
395         lockdep_assert_held(&device->dev_set->lock);
396
397         if (!kvm)
398                 return;
399
400         pfn = symbol_get(kvm_put_kvm);
401         if (WARN_ON(!pfn))
402                 return;
403
404         fn = symbol_get(kvm_get_kvm_safe);
405         if (WARN_ON(!fn)) {
406                 symbol_put(kvm_put_kvm);
407                 return;
408         }
409
410         ret = fn(kvm);
411         symbol_put(kvm_get_kvm_safe);
412         if (!ret) {
413                 symbol_put(kvm_put_kvm);
414                 return;
415         }
416
417         device->put_kvm = pfn;
418         device->kvm = kvm;
419 }
420
421 void vfio_device_put_kvm(struct vfio_device *device)
422 {
423         lockdep_assert_held(&device->dev_set->lock);
424
425         if (!device->kvm)
426                 return;
427
428         if (WARN_ON(!device->put_kvm))
429                 goto clear;
430
431         device->put_kvm(device->kvm);
432         device->put_kvm = NULL;
433         symbol_put(kvm_put_kvm);
434
435 clear:
436         device->kvm = NULL;
437 }
438 #endif
439
440 /* true if the vfio_device has open_device() called but not close_device() */
441 static bool vfio_assert_device_open(struct vfio_device *device)
442 {
443         return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
444 }
445
446 struct vfio_device_file *
447 vfio_allocate_device_file(struct vfio_device *device)
448 {
449         struct vfio_device_file *df;
450
451         df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
452         if (!df)
453                 return ERR_PTR(-ENOMEM);
454
455         df->device = device;
456         spin_lock_init(&df->kvm_ref_lock);
457
458         return df;
459 }
460
461 static int vfio_df_device_first_open(struct vfio_device_file *df)
462 {
463         struct vfio_device *device = df->device;
464         struct iommufd_ctx *iommufd = df->iommufd;
465         int ret;
466
467         lockdep_assert_held(&device->dev_set->lock);
468
469         if (!try_module_get(device->dev->driver->owner))
470                 return -ENODEV;
471
472         if (iommufd)
473                 ret = vfio_df_iommufd_bind(df);
474         else
475                 ret = vfio_device_group_use_iommu(device);
476         if (ret)
477                 goto err_module_put;
478
479         if (device->ops->open_device) {
480                 ret = device->ops->open_device(device);
481                 if (ret)
482                         goto err_unuse_iommu;
483         }
484         return 0;
485
486 err_unuse_iommu:
487         if (iommufd)
488                 vfio_df_iommufd_unbind(df);
489         else
490                 vfio_device_group_unuse_iommu(device);
491 err_module_put:
492         module_put(device->dev->driver->owner);
493         return ret;
494 }
495
496 static void vfio_df_device_last_close(struct vfio_device_file *df)
497 {
498         struct vfio_device *device = df->device;
499         struct iommufd_ctx *iommufd = df->iommufd;
500
501         lockdep_assert_held(&device->dev_set->lock);
502
503         if (device->ops->close_device)
504                 device->ops->close_device(device);
505         if (iommufd)
506                 vfio_df_iommufd_unbind(df);
507         else
508                 vfio_device_group_unuse_iommu(device);
509         module_put(device->dev->driver->owner);
510 }
511
512 int vfio_df_open(struct vfio_device_file *df)
513 {
514         struct vfio_device *device = df->device;
515         int ret = 0;
516
517         lockdep_assert_held(&device->dev_set->lock);
518
519         /*
520          * Only the group path allows the device to be opened multiple
521          * times.  The device cdev path doesn't have a secure way for it.
522          */
523         if (device->open_count != 0 && !df->group)
524                 return -EINVAL;
525
526         device->open_count++;
527         if (device->open_count == 1) {
528                 ret = vfio_df_device_first_open(df);
529                 if (ret)
530                         device->open_count--;
531         }
532
533         return ret;
534 }
535
536 void vfio_df_close(struct vfio_device_file *df)
537 {
538         struct vfio_device *device = df->device;
539
540         lockdep_assert_held(&device->dev_set->lock);
541
542         vfio_assert_device_open(device);
543         if (device->open_count == 1)
544                 vfio_df_device_last_close(df);
545         device->open_count--;
546 }
547
548 /*
549  * Wrapper around pm_runtime_resume_and_get().
550  * Return error code on failure or 0 on success.
551  */
552 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
553 {
554         struct device *dev = device->dev;
555
556         if (dev->driver && dev->driver->pm) {
557                 int ret;
558
559                 ret = pm_runtime_resume_and_get(dev);
560                 if (ret) {
561                         dev_info_ratelimited(dev,
562                                 "vfio: runtime resume failed %d\n", ret);
563                         return -EIO;
564                 }
565         }
566
567         return 0;
568 }
569
570 /*
571  * Wrapper around pm_runtime_put().
572  */
573 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
574 {
575         struct device *dev = device->dev;
576
577         if (dev->driver && dev->driver->pm)
578                 pm_runtime_put(dev);
579 }
580
581 /*
582  * VFIO Device fd
583  */
584 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
585 {
586         struct vfio_device_file *df = filep->private_data;
587         struct vfio_device *device = df->device;
588
589         if (df->group)
590                 vfio_df_group_close(df);
591         else
592                 vfio_df_unbind_iommufd(df);
593
594         vfio_device_put_registration(device);
595
596         kfree(df);
597
598         return 0;
599 }
600
601 /*
602  * vfio_mig_get_next_state - Compute the next step in the FSM
603  * @cur_fsm - The current state the device is in
604  * @new_fsm - The target state to reach
605  * @next_fsm - Pointer to the next step to get to new_fsm
606  *
607  * Return 0 upon success, otherwise -errno
608  * Upon success the next step in the state progression between cur_fsm and
609  * new_fsm will be set in next_fsm.
610  *
611  * This breaks down requests for combination transitions into smaller steps and
612  * returns the next step to get to new_fsm. The function may need to be called
613  * multiple times before reaching new_fsm.
614  *
615  */
616 int vfio_mig_get_next_state(struct vfio_device *device,
617                             enum vfio_device_mig_state cur_fsm,
618                             enum vfio_device_mig_state new_fsm,
619                             enum vfio_device_mig_state *next_fsm)
620 {
621         enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
622         /*
623          * The coding in this table requires the driver to implement the
624          * following FSM arcs:
625          *         RESUMING -> STOP
626          *         STOP -> RESUMING
627          *         STOP -> STOP_COPY
628          *         STOP_COPY -> STOP
629          *
630          * If P2P is supported then the driver must also implement these FSM
631          * arcs:
632          *         RUNNING -> RUNNING_P2P
633          *         RUNNING_P2P -> RUNNING
634          *         RUNNING_P2P -> STOP
635          *         STOP -> RUNNING_P2P
636          *
637          * If precopy is supported then the driver must support these additional
638          * FSM arcs:
639          *         RUNNING -> PRE_COPY
640          *         PRE_COPY -> RUNNING
641          *         PRE_COPY -> STOP_COPY
642          * However, if precopy and P2P are supported together then the driver
643          * must support these additional arcs beyond the P2P arcs above:
644          *         PRE_COPY -> RUNNING
645          *         PRE_COPY -> PRE_COPY_P2P
646          *         PRE_COPY_P2P -> PRE_COPY
647          *         PRE_COPY_P2P -> RUNNING_P2P
648          *         PRE_COPY_P2P -> STOP_COPY
649          *         RUNNING -> PRE_COPY
650          *         RUNNING_P2P -> PRE_COPY_P2P
651          *
652          * Without P2P and precopy the driver must implement:
653          *         RUNNING -> STOP
654          *         STOP -> RUNNING
655          *
656          * The coding will step through multiple states for some combination
657          * transitions; if all optional features are supported, this means the
658          * following ones:
659          *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
660          *         PRE_COPY -> RUNNING -> RUNNING_P2P
661          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
662          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
663          *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
664          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
665          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
666          *         RESUMING -> STOP -> RUNNING_P2P
667          *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
668          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
669          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
670          *         RESUMING -> STOP -> STOP_COPY
671          *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
672          *         RUNNING -> RUNNING_P2P -> STOP
673          *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
674          *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
675          *         RUNNING_P2P -> RUNNING -> PRE_COPY
676          *         RUNNING_P2P -> STOP -> RESUMING
677          *         RUNNING_P2P -> STOP -> STOP_COPY
678          *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
679          *         STOP -> RUNNING_P2P -> RUNNING
680          *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
681          *         STOP_COPY -> STOP -> RESUMING
682          *         STOP_COPY -> STOP -> RUNNING_P2P
683          *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
684          *
685          *  The following transitions are blocked:
686          *         STOP_COPY -> PRE_COPY
687          *         STOP_COPY -> PRE_COPY_P2P
688          */
689         static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
690                 [VFIO_DEVICE_STATE_STOP] = {
691                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
692                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
693                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
694                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
695                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
696                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
697                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
698                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
699                 },
700                 [VFIO_DEVICE_STATE_RUNNING] = {
701                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
702                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
703                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
704                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
705                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
706                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
707                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
708                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
709                 },
710                 [VFIO_DEVICE_STATE_PRE_COPY] = {
711                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
712                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
713                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
714                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
715                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
716                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
717                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
718                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
719                 },
720                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
721                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
722                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
723                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
724                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
725                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
726                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
727                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
728                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
729                 },
730                 [VFIO_DEVICE_STATE_STOP_COPY] = {
731                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
732                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
733                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
734                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
735                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
736                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
737                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
738                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
739                 },
740                 [VFIO_DEVICE_STATE_RESUMING] = {
741                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
742                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
743                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
744                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
745                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
746                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
747                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
748                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
749                 },
750                 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
751                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
752                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
753                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
754                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
755                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
756                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
757                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
758                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
759                 },
760                 [VFIO_DEVICE_STATE_ERROR] = {
761                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
762                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
763                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
764                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
765                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
766                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
767                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
768                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
769                 },
770         };
771
772         static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
773                 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
774                 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
775                 [VFIO_DEVICE_STATE_PRE_COPY] =
776                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
777                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
778                                                    VFIO_MIGRATION_P2P |
779                                                    VFIO_MIGRATION_PRE_COPY,
780                 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
781                 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
782                 [VFIO_DEVICE_STATE_RUNNING_P2P] =
783                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
784                 [VFIO_DEVICE_STATE_ERROR] = ~0U,
785         };
786
787         if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
788                     (state_flags_table[cur_fsm] & device->migration_flags) !=
789                         state_flags_table[cur_fsm]))
790                 return -EINVAL;
791
792         if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
793            (state_flags_table[new_fsm] & device->migration_flags) !=
794                         state_flags_table[new_fsm])
795                 return -EINVAL;
796
797         /*
798          * Arcs touching optional and unsupported states are skipped over. The
799          * driver will instead see an arc from the original state to the next
800          * logical state, as per the above comment.
801          */
802         *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
803         while ((state_flags_table[*next_fsm] & device->migration_flags) !=
804                         state_flags_table[*next_fsm])
805                 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
806
807         return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
808 }
809 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
810
811 /*
812  * Convert the drivers's struct file into a FD number and return it to userspace
813  */
814 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
815                                    struct vfio_device_feature_mig_state *mig)
816 {
817         int ret;
818         int fd;
819
820         fd = get_unused_fd_flags(O_CLOEXEC);
821         if (fd < 0) {
822                 ret = fd;
823                 goto out_fput;
824         }
825
826         mig->data_fd = fd;
827         if (copy_to_user(arg, mig, sizeof(*mig))) {
828                 ret = -EFAULT;
829                 goto out_put_unused;
830         }
831         fd_install(fd, filp);
832         return 0;
833
834 out_put_unused:
835         put_unused_fd(fd);
836 out_fput:
837         fput(filp);
838         return ret;
839 }
840
841 static int
842 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
843                                            u32 flags, void __user *arg,
844                                            size_t argsz)
845 {
846         size_t minsz =
847                 offsetofend(struct vfio_device_feature_mig_state, data_fd);
848         struct vfio_device_feature_mig_state mig;
849         struct file *filp = NULL;
850         int ret;
851
852         if (!device->mig_ops)
853                 return -ENOTTY;
854
855         ret = vfio_check_feature(flags, argsz,
856                                  VFIO_DEVICE_FEATURE_SET |
857                                  VFIO_DEVICE_FEATURE_GET,
858                                  sizeof(mig));
859         if (ret != 1)
860                 return ret;
861
862         if (copy_from_user(&mig, arg, minsz))
863                 return -EFAULT;
864
865         if (flags & VFIO_DEVICE_FEATURE_GET) {
866                 enum vfio_device_mig_state curr_state;
867
868                 ret = device->mig_ops->migration_get_state(device,
869                                                            &curr_state);
870                 if (ret)
871                         return ret;
872                 mig.device_state = curr_state;
873                 goto out_copy;
874         }
875
876         /* Handle the VFIO_DEVICE_FEATURE_SET */
877         filp = device->mig_ops->migration_set_state(device, mig.device_state);
878         if (IS_ERR(filp) || !filp)
879                 goto out_copy;
880
881         return vfio_ioct_mig_return_fd(filp, arg, &mig);
882 out_copy:
883         mig.data_fd = -1;
884         if (copy_to_user(arg, &mig, sizeof(mig)))
885                 return -EFAULT;
886         if (IS_ERR(filp))
887                 return PTR_ERR(filp);
888         return 0;
889 }
890
891 static int
892 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
893                                               u32 flags, void __user *arg,
894                                               size_t argsz)
895 {
896         struct vfio_device_feature_mig_data_size data_size = {};
897         unsigned long stop_copy_length;
898         int ret;
899
900         if (!device->mig_ops)
901                 return -ENOTTY;
902
903         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
904                                  sizeof(data_size));
905         if (ret != 1)
906                 return ret;
907
908         ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
909         if (ret)
910                 return ret;
911
912         data_size.stop_copy_length = stop_copy_length;
913         if (copy_to_user(arg, &data_size, sizeof(data_size)))
914                 return -EFAULT;
915
916         return 0;
917 }
918
919 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
920                                                u32 flags, void __user *arg,
921                                                size_t argsz)
922 {
923         struct vfio_device_feature_migration mig = {
924                 .flags = device->migration_flags,
925         };
926         int ret;
927
928         if (!device->mig_ops)
929                 return -ENOTTY;
930
931         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
932                                  sizeof(mig));
933         if (ret != 1)
934                 return ret;
935         if (copy_to_user(arg, &mig, sizeof(mig)))
936                 return -EFAULT;
937         return 0;
938 }
939
940 void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
941                               u32 req_nodes)
942 {
943         struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
944         unsigned long min_gap, curr_gap;
945
946         /* Special shortcut when a single range is required */
947         if (req_nodes == 1) {
948                 unsigned long last;
949
950                 comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
951
952                 /* Empty list */
953                 if (WARN_ON_ONCE(!comb_start))
954                         return;
955
956                 curr = comb_start;
957                 while (curr) {
958                         last = curr->last;
959                         prev = curr;
960                         curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
961                         if (prev != comb_start)
962                                 interval_tree_remove(prev, root);
963                 }
964                 comb_start->last = last;
965                 return;
966         }
967
968         /* Combine ranges which have the smallest gap */
969         while (cur_nodes > req_nodes) {
970                 prev = NULL;
971                 min_gap = ULONG_MAX;
972                 curr = interval_tree_iter_first(root, 0, ULONG_MAX);
973                 while (curr) {
974                         if (prev) {
975                                 curr_gap = curr->start - prev->last;
976                                 if (curr_gap < min_gap) {
977                                         min_gap = curr_gap;
978                                         comb_start = prev;
979                                         comb_end = curr;
980                                 }
981                         }
982                         prev = curr;
983                         curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
984                 }
985
986                 /* Empty list or no nodes to combine */
987                 if (WARN_ON_ONCE(min_gap == ULONG_MAX))
988                         break;
989
990                 comb_start->last = comb_end->last;
991                 interval_tree_remove(comb_end, root);
992                 cur_nodes--;
993         }
994 }
995 EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
996
997 /* Ranges should fit into a single kernel page */
998 #define LOG_MAX_RANGES \
999         (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1000
1001 static int
1002 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1003                                         u32 flags, void __user *arg,
1004                                         size_t argsz)
1005 {
1006         size_t minsz =
1007                 offsetofend(struct vfio_device_feature_dma_logging_control,
1008                             ranges);
1009         struct vfio_device_feature_dma_logging_range __user *ranges;
1010         struct vfio_device_feature_dma_logging_control control;
1011         struct vfio_device_feature_dma_logging_range range;
1012         struct rb_root_cached root = RB_ROOT_CACHED;
1013         struct interval_tree_node *nodes;
1014         u64 iova_end;
1015         u32 nnodes;
1016         int i, ret;
1017
1018         if (!device->log_ops)
1019                 return -ENOTTY;
1020
1021         ret = vfio_check_feature(flags, argsz,
1022                                  VFIO_DEVICE_FEATURE_SET,
1023                                  sizeof(control));
1024         if (ret != 1)
1025                 return ret;
1026
1027         if (copy_from_user(&control, arg, minsz))
1028                 return -EFAULT;
1029
1030         nnodes = control.num_ranges;
1031         if (!nnodes)
1032                 return -EINVAL;
1033
1034         if (nnodes > LOG_MAX_RANGES)
1035                 return -E2BIG;
1036
1037         ranges = u64_to_user_ptr(control.ranges);
1038         nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1039                               GFP_KERNEL);
1040         if (!nodes)
1041                 return -ENOMEM;
1042
1043         for (i = 0; i < nnodes; i++) {
1044                 if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1045                         ret = -EFAULT;
1046                         goto end;
1047                 }
1048                 if (!IS_ALIGNED(range.iova, control.page_size) ||
1049                     !IS_ALIGNED(range.length, control.page_size)) {
1050                         ret = -EINVAL;
1051                         goto end;
1052                 }
1053
1054                 if (check_add_overflow(range.iova, range.length, &iova_end) ||
1055                     iova_end > ULONG_MAX) {
1056                         ret = -EOVERFLOW;
1057                         goto end;
1058                 }
1059
1060                 nodes[i].start = range.iova;
1061                 nodes[i].last = range.iova + range.length - 1;
1062                 if (interval_tree_iter_first(&root, nodes[i].start,
1063                                              nodes[i].last)) {
1064                         /* Range overlapping */
1065                         ret = -EINVAL;
1066                         goto end;
1067                 }
1068                 interval_tree_insert(nodes + i, &root);
1069         }
1070
1071         ret = device->log_ops->log_start(device, &root, nnodes,
1072                                          &control.page_size);
1073         if (ret)
1074                 goto end;
1075
1076         if (copy_to_user(arg, &control, sizeof(control))) {
1077                 ret = -EFAULT;
1078                 device->log_ops->log_stop(device);
1079         }
1080
1081 end:
1082         kfree(nodes);
1083         return ret;
1084 }
1085
1086 static int
1087 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1088                                        u32 flags, void __user *arg,
1089                                        size_t argsz)
1090 {
1091         int ret;
1092
1093         if (!device->log_ops)
1094                 return -ENOTTY;
1095
1096         ret = vfio_check_feature(flags, argsz,
1097                                  VFIO_DEVICE_FEATURE_SET, 0);
1098         if (ret != 1)
1099                 return ret;
1100
1101         return device->log_ops->log_stop(device);
1102 }
1103
1104 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1105                                           unsigned long iova, size_t length,
1106                                           void *opaque)
1107 {
1108         struct vfio_device *device = opaque;
1109
1110         return device->log_ops->log_read_and_clear(device, iova, length, iter);
1111 }
1112
1113 static int
1114 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1115                                          u32 flags, void __user *arg,
1116                                          size_t argsz)
1117 {
1118         size_t minsz =
1119                 offsetofend(struct vfio_device_feature_dma_logging_report,
1120                             bitmap);
1121         struct vfio_device_feature_dma_logging_report report;
1122         struct iova_bitmap *iter;
1123         u64 iova_end;
1124         int ret;
1125
1126         if (!device->log_ops)
1127                 return -ENOTTY;
1128
1129         ret = vfio_check_feature(flags, argsz,
1130                                  VFIO_DEVICE_FEATURE_GET,
1131                                  sizeof(report));
1132         if (ret != 1)
1133                 return ret;
1134
1135         if (copy_from_user(&report, arg, minsz))
1136                 return -EFAULT;
1137
1138         if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1139                 return -EINVAL;
1140
1141         if (check_add_overflow(report.iova, report.length, &iova_end) ||
1142             iova_end > ULONG_MAX)
1143                 return -EOVERFLOW;
1144
1145         iter = iova_bitmap_alloc(report.iova, report.length,
1146                                  report.page_size,
1147                                  u64_to_user_ptr(report.bitmap));
1148         if (IS_ERR(iter))
1149                 return PTR_ERR(iter);
1150
1151         ret = iova_bitmap_for_each(iter, device,
1152                                    vfio_device_log_read_and_clear);
1153
1154         iova_bitmap_free(iter);
1155         return ret;
1156 }
1157
1158 static int vfio_ioctl_device_feature(struct vfio_device *device,
1159                                      struct vfio_device_feature __user *arg)
1160 {
1161         size_t minsz = offsetofend(struct vfio_device_feature, flags);
1162         struct vfio_device_feature feature;
1163
1164         if (copy_from_user(&feature, arg, minsz))
1165                 return -EFAULT;
1166
1167         if (feature.argsz < minsz)
1168                 return -EINVAL;
1169
1170         /* Check unknown flags */
1171         if (feature.flags &
1172             ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1173               VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1174                 return -EINVAL;
1175
1176         /* GET & SET are mutually exclusive except with PROBE */
1177         if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1178             (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1179             (feature.flags & VFIO_DEVICE_FEATURE_GET))
1180                 return -EINVAL;
1181
1182         switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1183         case VFIO_DEVICE_FEATURE_MIGRATION:
1184                 return vfio_ioctl_device_feature_migration(
1185                         device, feature.flags, arg->data,
1186                         feature.argsz - minsz);
1187         case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1188                 return vfio_ioctl_device_feature_mig_device_state(
1189                         device, feature.flags, arg->data,
1190                         feature.argsz - minsz);
1191         case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1192                 return vfio_ioctl_device_feature_logging_start(
1193                         device, feature.flags, arg->data,
1194                         feature.argsz - minsz);
1195         case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1196                 return vfio_ioctl_device_feature_logging_stop(
1197                         device, feature.flags, arg->data,
1198                         feature.argsz - minsz);
1199         case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1200                 return vfio_ioctl_device_feature_logging_report(
1201                         device, feature.flags, arg->data,
1202                         feature.argsz - minsz);
1203         case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1204                 return vfio_ioctl_device_feature_migration_data_size(
1205                         device, feature.flags, arg->data,
1206                         feature.argsz - minsz);
1207         default:
1208                 if (unlikely(!device->ops->device_feature))
1209                         return -EINVAL;
1210                 return device->ops->device_feature(device, feature.flags,
1211                                                    arg->data,
1212                                                    feature.argsz - minsz);
1213         }
1214 }
1215
1216 static long vfio_device_fops_unl_ioctl(struct file *filep,
1217                                        unsigned int cmd, unsigned long arg)
1218 {
1219         struct vfio_device_file *df = filep->private_data;
1220         struct vfio_device *device = df->device;
1221         void __user *uptr = (void __user *)arg;
1222         int ret;
1223
1224         if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1225                 return vfio_df_ioctl_bind_iommufd(df, uptr);
1226
1227         /* Paired with smp_store_release() following vfio_df_open() */
1228         if (!smp_load_acquire(&df->access_granted))
1229                 return -EINVAL;
1230
1231         ret = vfio_device_pm_runtime_get(device);
1232         if (ret)
1233                 return ret;
1234
1235         /* cdev only ioctls */
1236         if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1237                 switch (cmd) {
1238                 case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1239                         ret = vfio_df_ioctl_attach_pt(df, uptr);
1240                         goto out;
1241
1242                 case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1243                         ret = vfio_df_ioctl_detach_pt(df, uptr);
1244                         goto out;
1245                 }
1246         }
1247
1248         switch (cmd) {
1249         case VFIO_DEVICE_FEATURE:
1250                 ret = vfio_ioctl_device_feature(device, uptr);
1251                 break;
1252
1253         default:
1254                 if (unlikely(!device->ops->ioctl))
1255                         ret = -EINVAL;
1256                 else
1257                         ret = device->ops->ioctl(device, cmd, arg);
1258                 break;
1259         }
1260 out:
1261         vfio_device_pm_runtime_put(device);
1262         return ret;
1263 }
1264
1265 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1266                                      size_t count, loff_t *ppos)
1267 {
1268         struct vfio_device_file *df = filep->private_data;
1269         struct vfio_device *device = df->device;
1270
1271         /* Paired with smp_store_release() following vfio_df_open() */
1272         if (!smp_load_acquire(&df->access_granted))
1273                 return -EINVAL;
1274
1275         if (unlikely(!device->ops->read))
1276                 return -EINVAL;
1277
1278         return device->ops->read(device, buf, count, ppos);
1279 }
1280
1281 static ssize_t vfio_device_fops_write(struct file *filep,
1282                                       const char __user *buf,
1283                                       size_t count, loff_t *ppos)
1284 {
1285         struct vfio_device_file *df = filep->private_data;
1286         struct vfio_device *device = df->device;
1287
1288         /* Paired with smp_store_release() following vfio_df_open() */
1289         if (!smp_load_acquire(&df->access_granted))
1290                 return -EINVAL;
1291
1292         if (unlikely(!device->ops->write))
1293                 return -EINVAL;
1294
1295         return device->ops->write(device, buf, count, ppos);
1296 }
1297
1298 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1299 {
1300         struct vfio_device_file *df = filep->private_data;
1301         struct vfio_device *device = df->device;
1302
1303         /* Paired with smp_store_release() following vfio_df_open() */
1304         if (!smp_load_acquire(&df->access_granted))
1305                 return -EINVAL;
1306
1307         if (unlikely(!device->ops->mmap))
1308                 return -EINVAL;
1309
1310         return device->ops->mmap(device, vma);
1311 }
1312
1313 const struct file_operations vfio_device_fops = {
1314         .owner          = THIS_MODULE,
1315         .open           = vfio_device_fops_cdev_open,
1316         .release        = vfio_device_fops_release,
1317         .read           = vfio_device_fops_read,
1318         .write          = vfio_device_fops_write,
1319         .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1320         .compat_ioctl   = compat_ptr_ioctl,
1321         .mmap           = vfio_device_fops_mmap,
1322 };
1323
1324 static struct vfio_device *vfio_device_from_file(struct file *file)
1325 {
1326         struct vfio_device_file *df = file->private_data;
1327
1328         if (file->f_op != &vfio_device_fops)
1329                 return NULL;
1330         return df->device;
1331 }
1332
1333 /**
1334  * vfio_file_is_valid - True if the file is valid vfio file
1335  * @file: VFIO group file or VFIO device file
1336  */
1337 bool vfio_file_is_valid(struct file *file)
1338 {
1339         return vfio_group_from_file(file) ||
1340                vfio_device_from_file(file);
1341 }
1342 EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1343
1344 /**
1345  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1346  *        is always CPU cache coherent
1347  * @file: VFIO group file or VFIO device file
1348  *
1349  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1350  * bit in DMA transactions. A return of false indicates that the user has
1351  * rights to access additional instructions such as wbinvd on x86.
1352  */
1353 bool vfio_file_enforced_coherent(struct file *file)
1354 {
1355         struct vfio_device *device;
1356         struct vfio_group *group;
1357
1358         group = vfio_group_from_file(file);
1359         if (group)
1360                 return vfio_group_enforced_coherent(group);
1361
1362         device = vfio_device_from_file(file);
1363         if (device)
1364                 return device_iommu_capable(device->dev,
1365                                             IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1366
1367         return true;
1368 }
1369 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1370
1371 static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1372 {
1373         struct vfio_device_file *df = file->private_data;
1374
1375         /*
1376          * The kvm is first recorded in the vfio_device_file, and will
1377          * be propagated to vfio_device::kvm when the file is bound to
1378          * iommufd successfully in the vfio device cdev path.
1379          */
1380         spin_lock(&df->kvm_ref_lock);
1381         df->kvm = kvm;
1382         spin_unlock(&df->kvm_ref_lock);
1383 }
1384
1385 /**
1386  * vfio_file_set_kvm - Link a kvm with VFIO drivers
1387  * @file: VFIO group file or VFIO device file
1388  * @kvm: KVM to link
1389  *
1390  * When a VFIO device is first opened the KVM will be available in
1391  * device->kvm if one was associated with the file.
1392  */
1393 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1394 {
1395         struct vfio_group *group;
1396
1397         group = vfio_group_from_file(file);
1398         if (group)
1399                 vfio_group_set_kvm(group, kvm);
1400
1401         if (vfio_device_from_file(file))
1402                 vfio_device_file_set_kvm(file, kvm);
1403 }
1404 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1405
1406 /*
1407  * Sub-module support
1408  */
1409 /*
1410  * Helper for managing a buffer of info chain capabilities, allocate or
1411  * reallocate a buffer with additional @size, filling in @id and @version
1412  * of the capability.  A pointer to the new capability is returned.
1413  *
1414  * NB. The chain is based at the head of the buffer, so new entries are
1415  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1416  * next offsets prior to copying to the user buffer.
1417  */
1418 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1419                                                size_t size, u16 id, u16 version)
1420 {
1421         void *buf;
1422         struct vfio_info_cap_header *header, *tmp;
1423
1424         /* Ensure that the next capability struct will be aligned */
1425         size = ALIGN(size, sizeof(u64));
1426
1427         buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1428         if (!buf) {
1429                 kfree(caps->buf);
1430                 caps->buf = NULL;
1431                 caps->size = 0;
1432                 return ERR_PTR(-ENOMEM);
1433         }
1434
1435         caps->buf = buf;
1436         header = buf + caps->size;
1437
1438         /* Eventually copied to user buffer, zero */
1439         memset(header, 0, size);
1440
1441         header->id = id;
1442         header->version = version;
1443
1444         /* Add to the end of the capability chain */
1445         for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1446                 ; /* nothing */
1447
1448         tmp->next = caps->size;
1449         caps->size += size;
1450
1451         return header;
1452 }
1453 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1454
1455 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1456 {
1457         struct vfio_info_cap_header *tmp;
1458         void *buf = (void *)caps->buf;
1459
1460         /* Capability structs should start with proper alignment */
1461         WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1462
1463         for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1464                 tmp->next += offset;
1465 }
1466 EXPORT_SYMBOL(vfio_info_cap_shift);
1467
1468 int vfio_info_add_capability(struct vfio_info_cap *caps,
1469                              struct vfio_info_cap_header *cap, size_t size)
1470 {
1471         struct vfio_info_cap_header *header;
1472
1473         header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1474         if (IS_ERR(header))
1475                 return PTR_ERR(header);
1476
1477         memcpy(header + 1, cap + 1, size - sizeof(*header));
1478
1479         return 0;
1480 }
1481 EXPORT_SYMBOL(vfio_info_add_capability);
1482
1483 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1484                                        int max_irq_type, size_t *data_size)
1485 {
1486         unsigned long minsz;
1487         size_t size;
1488
1489         minsz = offsetofend(struct vfio_irq_set, count);
1490
1491         if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1492             (hdr->count >= (U32_MAX - hdr->start)) ||
1493             (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1494                                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1495                 return -EINVAL;
1496
1497         if (data_size)
1498                 *data_size = 0;
1499
1500         if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1501                 return -EINVAL;
1502
1503         switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1504         case VFIO_IRQ_SET_DATA_NONE:
1505                 size = 0;
1506                 break;
1507         case VFIO_IRQ_SET_DATA_BOOL:
1508                 size = sizeof(uint8_t);
1509                 break;
1510         case VFIO_IRQ_SET_DATA_EVENTFD:
1511                 size = sizeof(int32_t);
1512                 break;
1513         default:
1514                 return -EINVAL;
1515         }
1516
1517         if (size) {
1518                 if (hdr->argsz - minsz < hdr->count * size)
1519                         return -EINVAL;
1520
1521                 if (!data_size)
1522                         return -EINVAL;
1523
1524                 *data_size = hdr->count * size;
1525         }
1526
1527         return 0;
1528 }
1529 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1530
1531 /*
1532  * Pin contiguous user pages and return their associated host pages for local
1533  * domain only.
1534  * @device [in]  : device
1535  * @iova [in]    : starting IOVA of user pages to be pinned.
1536  * @npage [in]   : count of pages to be pinned.  This count should not
1537  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1538  * @prot [in]    : protection flags
1539  * @pages[out]   : array of host pages
1540  * Return error or number of pages pinned.
1541  *
1542  * A driver may only call this function if the vfio_device was created
1543  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1544  */
1545 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1546                    int npage, int prot, struct page **pages)
1547 {
1548         /* group->container cannot change while a vfio device is open */
1549         if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1550                 return -EINVAL;
1551         if (!device->ops->dma_unmap)
1552                 return -EINVAL;
1553         if (vfio_device_has_container(device))
1554                 return vfio_device_container_pin_pages(device, iova,
1555                                                        npage, prot, pages);
1556         if (device->iommufd_access) {
1557                 int ret;
1558
1559                 if (iova > ULONG_MAX)
1560                         return -EINVAL;
1561                 /*
1562                  * VFIO ignores the sub page offset, npages is from the start of
1563                  * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1564                  * the sub page offset by doing:
1565                  *     pages[0] + (iova % PAGE_SIZE)
1566                  */
1567                 ret = iommufd_access_pin_pages(
1568                         device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1569                         npage * PAGE_SIZE, pages,
1570                         (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1571                 if (ret)
1572                         return ret;
1573                 return npage;
1574         }
1575         return -EINVAL;
1576 }
1577 EXPORT_SYMBOL(vfio_pin_pages);
1578
1579 /*
1580  * Unpin contiguous host pages for local domain only.
1581  * @device [in]  : device
1582  * @iova [in]    : starting address of user pages to be unpinned.
1583  * @npage [in]   : count of pages to be unpinned.  This count should not
1584  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1585  */
1586 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1587 {
1588         if (WARN_ON(!vfio_assert_device_open(device)))
1589                 return;
1590         if (WARN_ON(!device->ops->dma_unmap))
1591                 return;
1592
1593         if (vfio_device_has_container(device)) {
1594                 vfio_device_container_unpin_pages(device, iova, npage);
1595                 return;
1596         }
1597         if (device->iommufd_access) {
1598                 if (WARN_ON(iova > ULONG_MAX))
1599                         return;
1600                 iommufd_access_unpin_pages(device->iommufd_access,
1601                                            ALIGN_DOWN(iova, PAGE_SIZE),
1602                                            npage * PAGE_SIZE);
1603                 return;
1604         }
1605 }
1606 EXPORT_SYMBOL(vfio_unpin_pages);
1607
1608 /*
1609  * This interface allows the CPUs to perform some sort of virtual DMA on
1610  * behalf of the device.
1611  *
1612  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1613  * into/from a kernel buffer.
1614  *
1615  * As the read/write of user space memory is conducted via the CPUs and is
1616  * not a real device DMA, it is not necessary to pin the user space memory.
1617  *
1618  * @device [in]         : VFIO device
1619  * @iova [in]           : base IOVA of a user space buffer
1620  * @data [in]           : pointer to kernel buffer
1621  * @len [in]            : kernel buffer length
1622  * @write               : indicate read or write
1623  * Return error code on failure or 0 on success.
1624  */
1625 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1626                 size_t len, bool write)
1627 {
1628         if (!data || len <= 0 || !vfio_assert_device_open(device))
1629                 return -EINVAL;
1630
1631         if (vfio_device_has_container(device))
1632                 return vfio_device_container_dma_rw(device, iova,
1633                                                     data, len, write);
1634
1635         if (device->iommufd_access) {
1636                 unsigned int flags = 0;
1637
1638                 if (iova > ULONG_MAX)
1639                         return -EINVAL;
1640
1641                 /* VFIO historically tries to auto-detect a kthread */
1642                 if (!current->mm)
1643                         flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1644                 if (write)
1645                         flags |= IOMMUFD_ACCESS_RW_WRITE;
1646                 return iommufd_access_rw(device->iommufd_access, iova, data,
1647                                          len, flags);
1648         }
1649         return -EINVAL;
1650 }
1651 EXPORT_SYMBOL(vfio_dma_rw);
1652
1653 /*
1654  * Module/class support
1655  */
1656 static int __init vfio_init(void)
1657 {
1658         int ret;
1659
1660         ida_init(&vfio.device_ida);
1661
1662         ret = vfio_group_init();
1663         if (ret)
1664                 return ret;
1665
1666         ret = vfio_virqfd_init();
1667         if (ret)
1668                 goto err_virqfd;
1669
1670         /* /sys/class/vfio-dev/vfioX */
1671         vfio.device_class = class_create("vfio-dev");
1672         if (IS_ERR(vfio.device_class)) {
1673                 ret = PTR_ERR(vfio.device_class);
1674                 goto err_dev_class;
1675         }
1676
1677         ret = vfio_cdev_init(vfio.device_class);
1678         if (ret)
1679                 goto err_alloc_dev_chrdev;
1680
1681         vfio_debugfs_create_root();
1682         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1683         return 0;
1684
1685 err_alloc_dev_chrdev:
1686         class_destroy(vfio.device_class);
1687         vfio.device_class = NULL;
1688 err_dev_class:
1689         vfio_virqfd_exit();
1690 err_virqfd:
1691         vfio_group_cleanup();
1692         return ret;
1693 }
1694
1695 static void __exit vfio_cleanup(void)
1696 {
1697         vfio_debugfs_remove_root();
1698         ida_destroy(&vfio.device_ida);
1699         vfio_cdev_cleanup();
1700         class_destroy(vfio.device_class);
1701         vfio.device_class = NULL;
1702         vfio_virqfd_exit();
1703         vfio_group_cleanup();
1704         xa_destroy(&vfio_device_set_xa);
1705 }
1706
1707 module_init(vfio_init);
1708 module_exit(vfio_cleanup);
1709
1710 MODULE_IMPORT_NS(IOMMUFD);
1711 MODULE_VERSION(DRIVER_VERSION);
1712 MODULE_LICENSE("GPL v2");
1713 MODULE_AUTHOR(DRIVER_AUTHOR);
1714 MODULE_DESCRIPTION(DRIVER_DESC);
1715 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");