/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"
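/*
 * Global state for the meta-driver: the sysfs class, the list of
 * registered IOMMU backends, the list of known groups, and the char
 * device used to expose /dev/vfio/$GROUP.
 */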
static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;
struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};
struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};
struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};
struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	bool				noiommu;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};
struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};
#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions, any use cases other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on their
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);
	ret = iommu_group_add_device(group, dev);
	iommu_group_put(group);
	if (ret)
		return NULL;

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif
/*
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
/*
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);
/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}
static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}
/*
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return ERR_CAST(dev);
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}
/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->notifier.head);

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}
static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}
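/*
 * Deferred group put: drop a group reference from a context where taking
 * vfio.group_lock directly could deadlock (see the iommu group notifier
 * below) by punting the kref_put to a workqueue.
 */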
struct vfio_group_put_work {
	struct work_struct work;
	struct vfio_group *group;
};

static void vfio_group_put_bg(struct work_struct *work)
{
	struct vfio_group_put_work *do_work;

	do_work = container_of(work, struct vfio_group_put_work, work);

	vfio_group_put(do_work->group);
	kfree(do_work);
}

static void vfio_group_schedule_put(struct vfio_group *group)
{
	struct vfio_group_put_work *do_work;

	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
	if (WARN_ON(!do_work))
		return;

	INIT_WORK(&do_work->work, vfio_group_put_bg);
	do_work->group = group;
	schedule_work(&do_work->work);
}
/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}
/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}
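/* Find the vfio_group tracking an iommu_group, taking a reference if found. */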
static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}
static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}
static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);

	return group;
}
/*
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}
static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}
/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}
static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);

	return NULL;
}
/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };
static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
	int i;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
			return true;
	}

	return false;
}
/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists in the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = ACCESS_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}
/*
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	return 0;
}
static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}
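/*
 * iommu_group bus notifier: tracks devices entering and leaving the group
 * so that group viability can be enforced while the group is in use.
 */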
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		pr_debug("%s: Device %s, group %d binding to driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		pr_debug("%s: Device %s, group %d bound to driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		pr_debug("%s: Device %s, group %d unbound from driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	/*
	 * If we're the last reference to the group, the group will be
	 * released, which includes unregistering the iommu group notifier.
	 * We hold a read-lock on that notifier list, unregistering needs
	 * a write-lock... deadlock.  Release our reference asynchronously
	 * to avoid that situation.
	 */
	vfio_group_schedule_put(group);
	return NOTIFY_OK;
}

/*
 * VFIO driver API
 */
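/*
 * Register a device with vfio: find or create its vfio_group from the
 * device's iommu_group and add a vfio_device entry backed by @ops.
 */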
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_group *group;
	struct vfio_device *device;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		if (!strcmp(dev_name(it->dev), buf)) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}
/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);
/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	device = vfio_group_get_device(group, dev);
	if (!device)
		return false;

	vfio_device_put(device);
	return true;
}
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	long ret;
	bool interrupted = false;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			ret = wait_event_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
		} else {
			ret = wait_event_interruptible_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
			if (ret == -ERESTARTSYS) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	} while (ret <= 0);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
/*
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}
/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		down_read(&container->group_lock);

		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);

		up_read(&container->group_lock);
	}

	return ret;
}
#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}
/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	up_read(&container->group_lock);

	return ret;
}
static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};
/*
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}
/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}
/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}
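/*
 * VFIO_GROUP_SET_CONTAINER: validate the container fd passed by the user
 * and attach the group to it, going through the container's IOMMU driver
 * if one is already set.
 */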
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}
static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static int vfio_group_add_container_user(struct vfio_group *group)
{
	if (!atomic_inc_not_zero(&group->container_users))
		return -EINVAL;

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return -EPERM;
	}
	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return -EINVAL;
	}

	return 0;
}

static const struct file_operations vfio_device_fops;
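/*
 * VFIO_GROUP_GET_DEVICE_FD: open the named device through its vfio bus
 * driver and return a new file descriptor backed by vfio_device_fops.
 */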
static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (!device)
		return -ENODEV;

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}
#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Warn if previous user didn't cleanup and re-init to drop them */
	if (WARN_ON(group->notifier.head))
		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}
static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}
#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};
/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
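/*
 * A rough sketch of how an external module (e.g. KVM) might consume this
 * API, assuming it received a group file from userspace; everything other
 * than the vfio_* calls is hypothetical:
 *
 *	struct vfio_group *grp = vfio_group_get_external_user(filep);
 *	if (IS_ERR(grp))
 *		return PTR_ERR(grp);
 *	id = vfio_external_user_iommu_id(grp);
 *	... use the group, keyed by iommu group id ...
 *	vfio_group_put_external_user(grp);
 */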
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;
	int ret;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	ret = vfio_group_add_container_user(group);
	if (ret)
		return ERR_PTR(ret);

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
bool vfio_external_group_match_file(struct vfio_group *test_group,
				    struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	return (filep->f_op == &vfio_group_fops) && (group == test_group);
}
EXPORT_SYMBOL_GPL(vfio_external_group_match_file);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
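/*
 * Adjust the chain's next offsets once the final placement of the buffer
 * within the user-visible structure is known (see the NB above).
 */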
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
	size_t size;

	size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
	header = vfio_info_cap_add(caps, size,
				   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	sparse_cap = container_of(header,
			struct vfio_region_info_cap_sparse_mmap, header);
	sparse_cap->nr_areas = sparse->nr_areas;
	memcpy(sparse_cap->areas, sparse->areas,
	       sparse->nr_areas * sizeof(*sparse->areas));
	return 0;
}
static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_type *type_cap, *cap = cap_type;

	header = vfio_info_cap_add(caps, sizeof(*cap),
				   VFIO_REGION_INFO_CAP_TYPE, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	type_cap = container_of(header, struct vfio_region_info_cap_type,
				header);
	type_cap->type = cap->type;
	type_cap->subtype = cap->subtype;
	return 0;
}
int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
			     void *cap_type)
{
	int ret = -EINVAL;

	if (!cap_type)
		return ret;

	switch (cap_type_id) {
	case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
		ret = sparse_mmap_cap(caps, cap_type);
		break;

	case VFIO_REGION_INFO_CAP_TYPE:
		ret = region_type_cap(caps, cap_type);
		break;
	}

	return ret;
}
EXPORT_SYMBOL(vfio_info_add_capability);
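/*
 * Validate a user-supplied VFIO_DEVICE_SET_IRQS header against the
 * device's IRQ limits and report how much trailing data the flags imply.
 */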
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
		   int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_pin_pages;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

err_pin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
/*
 * Unpin set of host PFNs for local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.  Number of
 *		   user/guest PFNs should not be greater than
 *		   VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_unpin_pages;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

err_unpin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
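/*
 * IOMMU notifier registration: both the register and unregister paths
 * temporarily mark a container user so the container cannot dissolve
 * underneath the caller.
 */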
static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
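/*
 * Record the KVM instance associated with this group and notify any
 * listeners (e.g. mdev vendor drivers) via the group notifier chain.
 */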
void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
static int vfio_register_group_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	int ret;
	bool set_kvm = false;

	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
		set_kvm = true;

	/* clear known events */
	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

	/* refuse to continue if still events remaining */
	if (*events)
		return -EINVAL;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	ret = blocking_notifier_chain_register(&group->notifier, nb);

	/*
	 * The attaching of kvm and vfio_group might already happen, so
	 * here we replay once upon registration.
	 */
	if (!ret && set_kvm && group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
static int vfio_unregister_group_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	ret = blocking_notifier_chain_unregister(&group->notifier, nb);

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
			   unsigned long *events, struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb || !events || (*events == 0))
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_register_group_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);
int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb)
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_unregister_group_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
/*
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}
static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};
static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}
static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
}
module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");