drivers/vfio/vfio.c

   1 /*
   2  * VFIO core
   3  *
   4  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   5  *     Author: Alex Williamson <alex.williamson@redhat.com>
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License version 2 as
   9  * published by the Free Software Foundation.
  10  *
  11  * Derived from original vfio:
  12  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  13  * Author: Tom Lyon, pugs@cisco.com
  14  */
  15
  16 #include <linux/cdev.h>
  17 #include <linux/compat.h>
  18 #include <linux/device.h>
  19 #include <linux/file.h>
  20 #include <linux/anon_inodes.h>
  21 #include <linux/fs.h>
  22 #include <linux/idr.h>
  23 #include <linux/iommu.h>
  24 #include <linux/list.h>
  25 #include <linux/miscdevice.h>
  26 #include <linux/module.h>
  27 #include <linux/mutex.h>
  28 #include <linux/pci.h>
  29 #include <linux/rwsem.h>
  30 #include <linux/sched.h>
  31 #include <linux/slab.h>
  32 #include <linux/stat.h>
  33 #include <linux/string.h>
  34 #include <linux/uaccess.h>
  35 #include <linux/vfio.h>
  36 #include <linux/wait.h>
  37
  38 #define DRIVER_VERSION  "0.3"
  39 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  40 #define DRIVER_DESC     "VFIO - User Level meta-driver"
  41
  42 static struct vfio {
  43         struct class                    *class;
  44         struct list_head                iommu_drivers_list;
  45         struct mutex                    iommu_drivers_lock;
  46         struct list_head                group_list;
  47         struct idr                      group_idr;
  48         struct mutex                    group_lock;
  49         struct cdev                     group_cdev;
  50         dev_t                           group_devt;
  51         wait_queue_head_t               release_q;
  52 } vfio;
  53
  54 struct vfio_iommu_driver {
  55         const struct vfio_iommu_driver_ops      *ops;
  56         struct list_head                        vfio_next;
  57 };
  58
  59 struct vfio_container {
  60         struct kref                     kref;
  61         struct list_head                group_list;
  62         struct rw_semaphore             group_lock;
  63         struct vfio_iommu_driver        *iommu_driver;
  64         void                            *iommu_data;
  65         bool                            noiommu;
  66 };
  67
  68 struct vfio_unbound_dev {
  69         struct device                   *dev;
  70         struct list_head                unbound_next;
  71 };
  72
  73 struct vfio_group {
  74         struct kref                     kref;
  75         int                             minor;
  76         atomic_t                        container_users;
  77         struct iommu_group              *iommu_group;
  78         struct vfio_container           *container;
  79         struct list_head                device_list;
  80         struct mutex                    device_lock;
  81         struct device                   *dev;
  82         struct notifier_block           nb;
  83         struct list_head                vfio_next;
  84         struct list_head                container_next;
  85         struct list_head                unbound_list;
  86         struct mutex                    unbound_lock;
  87         atomic_t                        opened;
  88         bool                            noiommu;
  89         struct kvm                      *kvm;
  90         struct blocking_notifier_head   notifier;
  91 };
  92
  93 struct vfio_device {
  94         struct kref                     kref;
  95         struct device                   *dev;
  96         const struct vfio_device_ops    *ops;
  97         struct vfio_group               *group;
  98         struct list_head                group_next;
  99         void                            *device_data;
 100 };
 101
 102 #ifdef CONFIG_VFIO_NOIOMMU
 103 static bool noiommu __read_mostly;
 104 module_param_named(enable_unsafe_noiommu_mode,
 105                    noiommu, bool, S_IRUGO | S_IWUSR);
 106 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
 107 #endif
 108
 109 /*
 110  * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 111  * and remove functions, any use cases other than acquiring the first
 112  * reference for the purpose of calling vfio_add_group_dev() or removing
 113  * that symmetric reference after vfio_del_group_dev() should use the raw
 114  * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 115  * removes the device from the dummy group and cannot be nested.
 116  */
 117 struct iommu_group *vfio_iommu_group_get(struct device *dev)
 118 {
 119         struct iommu_group *group;
 120         int __maybe_unused ret;
 121
 122         group = iommu_group_get(dev);
 123
 124 #ifdef CONFIG_VFIO_NOIOMMU
 125         /*
 126          * With noiommu enabled, an IOMMU group will be created for a device
 127          * that doesn't already have one and doesn't have an iommu_ops on their
 128          * bus.  We set iommudata simply to be able to identify these groups
 129          * as special use and for reclamation later.
 130          */
 131         if (group || !noiommu || iommu_present(dev->bus))
 132                 return group;
 133
 134         group = iommu_group_alloc();
 135         if (IS_ERR(group))
 136                 return NULL;
 137
 138         iommu_group_set_name(group, "vfio-noiommu");
 139         iommu_group_set_iommudata(group, &noiommu, NULL);
 140         ret = iommu_group_add_device(group, dev);
 141         iommu_group_put(group);
 142         if (ret)
 143                 return NULL;
 144
 145         /*
 146          * Where to taint?  At this point we've added an IOMMU group for a
 147          * device that is not backed by iommu_ops, therefore any iommu_
 148          * callback using iommu_ops can legitimately Oops.  So, while we may
 149          * be about to give a DMA capable device to a user without IOMMU
 150          * protection, which is clearly taint-worthy, let's go ahead and do
 151          * it here.
 152          */
 153         add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 154         dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 155 #endif
 156
 157         return group;
 158 }
 159 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 160
 161 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 162 {
 163 #ifdef CONFIG_VFIO_NOIOMMU
 164         if (iommu_group_get_iommudata(group) == &noiommu)
 165                 iommu_group_remove_device(dev);
 166 #endif
 167
 168         iommu_group_put(group);
 169 }
 170 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
 171
 172 #ifdef CONFIG_VFIO_NOIOMMU
 173 static void *vfio_noiommu_open(unsigned long arg)
 174 {
 175         if (arg != VFIO_NOIOMMU_IOMMU)
 176                 return ERR_PTR(-EINVAL);
 177         if (!capable(CAP_SYS_RAWIO))
 178                 return ERR_PTR(-EPERM);
 179
 180         return NULL;
 181 }
 182
 183 static void vfio_noiommu_release(void *iommu_data)
 184 {
 185 }
 186
 187 static long vfio_noiommu_ioctl(void *iommu_data,
 188                                unsigned int cmd, unsigned long arg)
 189 {
 190         if (cmd == VFIO_CHECK_EXTENSION)
 191                 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 192
 193         return -ENOTTY;
 194 }
 195
 196 static int vfio_noiommu_attach_group(void *iommu_data,
 197                                      struct iommu_group *iommu_group)
 198 {
 199         return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 200 }
 201
 202 static void vfio_noiommu_detach_group(void *iommu_data,
 203                                       struct iommu_group *iommu_group)
 204 {
 205 }
 206
 207 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
 208         .name = "vfio-noiommu",
 209         .owner = THIS_MODULE,
 210         .open = vfio_noiommu_open,
 211         .release = vfio_noiommu_release,
 212         .ioctl = vfio_noiommu_ioctl,
 213         .attach_group = vfio_noiommu_attach_group,
 214         .detach_group = vfio_noiommu_detach_group,
 215 };
 216 #endif
 217
 218
 219 /**
 220  * IOMMU driver registration
 221  */
 222 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 223 {
 224         struct vfio_iommu_driver *driver, *tmp;
 225
 226         driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 227         if (!driver)
 228                 return -ENOMEM;
 229
 230         driver->ops = ops;
 231
 232         mutex_lock(&vfio.iommu_drivers_lock);
 233
 234         /* Check for duplicates */
 235         list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 236                 if (tmp->ops == ops) {
 237                         mutex_unlock(&vfio.iommu_drivers_lock);
 238                         kfree(driver);
 239                         return -EINVAL;
 240                 }
 241         }
 242
 243         list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 244
 245         mutex_unlock(&vfio.iommu_drivers_lock);
 246
 247         return 0;
 248 }
 249 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 250
 251 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 252 {
 253         struct vfio_iommu_driver *driver;
 254
 255         mutex_lock(&vfio.iommu_drivers_lock);
 256         list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 257                 if (driver->ops == ops) {
 258                         list_del(&driver->vfio_next);
 259                         mutex_unlock(&vfio.iommu_drivers_lock);
 260                         kfree(driver);
 261                         return;
 262                 }
 263         }
 264         mutex_unlock(&vfio.iommu_drivers_lock);
 265 }
 266 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
 267
 268 /**
 269  * Group minor allocation/free - both called with vfio.group_lock held
 270  */
 271 static int vfio_alloc_group_minor(struct vfio_group *group)
 272 {
 273         return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 274 }
 275
 276 static void vfio_free_group_minor(int minor)
 277 {
 278         idr_remove(&vfio.group_idr, minor);
 279 }
 280
 281 static int vfio_iommu_group_notifier(struct notifier_block *nb,
 282                                      unsigned long action, void *data);
 283 static void vfio_group_get(struct vfio_group *group);
 284
 285 /**
 286  * Container objects - containers are created when /dev/vfio/vfio is
 287  * opened, but their lifecycle extends until the last user is done, so
 288  * it's freed via kref.  Must support container/group/device being
 289  * closed in any order.
 290  */
 291 static void vfio_container_get(struct vfio_container *container)
 292 {
 293         kref_get(&container->kref);
 294 }
 295
 296 static void vfio_container_release(struct kref *kref)
 297 {
 298         struct vfio_container *container;
 299         container = container_of(kref, struct vfio_container, kref);
 300
 301         kfree(container);
 302 }
 303
 304 static void vfio_container_put(struct vfio_container *container)
 305 {
 306         kref_put(&container->kref, vfio_container_release);
 307 }
 308
 309 static void vfio_group_unlock_and_free(struct vfio_group *group)
 310 {
 311         mutex_unlock(&vfio.group_lock);
 312         /*
 313          * Unregister outside of lock.  A spurious callback is harmless now
 314          * that the group is no longer in vfio.group_list.
 315          */
 316         iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 317         kfree(group);
 318 }
 319
 320 /**
 321  * Group objects - create, release, get, put, search
 322  */
 323 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 324 {
 325         struct vfio_group *group, *tmp;
 326         struct device *dev;
 327         int ret, minor;
 328
 329         group = kzalloc(sizeof(*group), GFP_KERNEL);
 330         if (!group)
 331                 return ERR_PTR(-ENOMEM);
 332
 333         kref_init(&group->kref);
 334         INIT_LIST_HEAD(&group->device_list);
 335         mutex_init(&group->device_lock);
 336         INIT_LIST_HEAD(&group->unbound_list);
 337         mutex_init(&group->unbound_lock);
 338         atomic_set(&group->container_users, 0);
 339         atomic_set(&group->opened, 0);
 340         group->iommu_group = iommu_group;
 341 #ifdef CONFIG_VFIO_NOIOMMU
 342         group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 343 #endif
 344         BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 345
 346         group->nb.notifier_call = vfio_iommu_group_notifier;
 347
 348         /*
 349          * blocking notifiers acquire a rwsem around registering and hold
 350          * it around callback.  Therefore, need to register outside of
 351          * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 352          * do anything unless it can find the group in vfio.group_list, so
 353          * no harm in registering early.
 354          */
 355         ret = iommu_group_register_notifier(iommu_group, &group->nb);
 356         if (ret) {
 357                 kfree(group);
 358                 return ERR_PTR(ret);
 359         }
 360
 361         mutex_lock(&vfio.group_lock);
 362
 363         /* Did we race creating this group? */
 364         list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 365                 if (tmp->iommu_group == iommu_group) {
 366                         vfio_group_get(tmp);
 367                         vfio_group_unlock_and_free(group);
 368                         return tmp;
 369                 }
 370         }
 371
 372         minor = vfio_alloc_group_minor(group);
 373         if (minor < 0) {
 374                 vfio_group_unlock_and_free(group);
 375                 return ERR_PTR(minor);
 376         }
 377
 378         dev = device_create(vfio.class, NULL,
 379                             MKDEV(MAJOR(vfio.group_devt), minor),
 380                             group, "%s%d", group->noiommu ? "noiommu-" : "",
 381                             iommu_group_id(iommu_group));
 382         if (IS_ERR(dev)) {
 383                 vfio_free_group_minor(minor);
 384                 vfio_group_unlock_and_free(group);
 385                 return ERR_CAST(dev);
 386         }
 387
 388         group->minor = minor;
 389         group->dev = dev;
 390
 391         list_add(&group->vfio_next, &vfio.group_list);
 392
 393         mutex_unlock(&vfio.group_lock);
 394
 395         return group;
 396 }
 397
 398 /* called with vfio.group_lock held */
 399 static void vfio_group_release(struct kref *kref)
 400 {
 401         struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 402         struct vfio_unbound_dev *unbound, *tmp;
 403         struct iommu_group *iommu_group = group->iommu_group;
 404
 405         WARN_ON(!list_empty(&group->device_list));
 406         WARN_ON(group->notifier.head);
 407
 408         list_for_each_entry_safe(unbound, tmp,
 409                                  &group->unbound_list, unbound_next) {
 410                 list_del(&unbound->unbound_next);
 411                 kfree(unbound);
 412         }
 413
 414         device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 415         list_del(&group->vfio_next);
 416         vfio_free_group_minor(group->minor);
 417         vfio_group_unlock_and_free(group);
 418         iommu_group_put(iommu_group);
 419 }
 420
 421 static void vfio_group_put(struct vfio_group *group)
 422 {
 423         kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 424 }
 425
 426 struct vfio_group_put_work {
 427         struct work_struct work;
 428         struct vfio_group *group;
 429 };
 430
 431 static void vfio_group_put_bg(struct work_struct *work)
 432 {
 433         struct vfio_group_put_work *do_work;
 434
 435         do_work = container_of(work, struct vfio_group_put_work, work);
 436
 437         vfio_group_put(do_work->group);
 438         kfree(do_work);
 439 }
 440
 441 static void vfio_group_schedule_put(struct vfio_group *group)
 442 {
 443         struct vfio_group_put_work *do_work;
 444
 445         do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 446         if (WARN_ON(!do_work))
 447                 return;
 448
 449         INIT_WORK(&do_work->work, vfio_group_put_bg);
 450         do_work->group = group;
 451         schedule_work(&do_work->work);
 452 }
 453
 454 /* Assume group_lock or group reference is held */
 455 static void vfio_group_get(struct vfio_group *group)
 456 {
 457         kref_get(&group->kref);
 458 }
 459
 460 /*
 461  * Not really a try as we will sleep for mutex, but we need to make
 462  * sure the group pointer is valid under lock and get a reference.
 463  */
 464 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 465 {
 466         struct vfio_group *target = group;
 467
 468         mutex_lock(&vfio.group_lock);
 469         list_for_each_entry(group, &vfio.group_list, vfio_next) {
 470                 if (group == target) {
 471                         vfio_group_get(group);
 472                         mutex_unlock(&vfio.group_lock);
 473                         return group;
 474                 }
 475         }
 476         mutex_unlock(&vfio.group_lock);
 477
 478         return NULL;
 479 }
 480
 481 static
 482 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 483 {
 484         struct vfio_group *group;
 485
 486         mutex_lock(&vfio.group_lock);
 487         list_for_each_entry(group, &vfio.group_list, vfio_next) {
 488                 if (group->iommu_group == iommu_group) {
 489                         vfio_group_get(group);
 490                         mutex_unlock(&vfio.group_lock);
 491                         return group;
 492                 }
 493         }
 494         mutex_unlock(&vfio.group_lock);
 495
 496         return NULL;
 497 }
 498
 499 static struct vfio_group *vfio_group_get_from_minor(int minor)
 500 {
 501         struct vfio_group *group;
 502
 503         mutex_lock(&vfio.group_lock);
 504         group = idr_find(&vfio.group_idr, minor);
 505         if (!group) {
 506                 mutex_unlock(&vfio.group_lock);
 507                 return NULL;
 508         }
 509         vfio_group_get(group);
 510         mutex_unlock(&vfio.group_lock);
 511
 512         return group;
 513 }
 514
 515 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 516 {
 517         struct iommu_group *iommu_group;
 518         struct vfio_group *group;
 519
 520         iommu_group = iommu_group_get(dev);
 521         if (!iommu_group)
 522                 return NULL;
 523
 524         group = vfio_group_get_from_iommu(iommu_group);
 525         iommu_group_put(iommu_group);
 526
 527         return group;
 528 }
 529
 530 /**
 531  * Device objects - create, release, get, put, search
 532  */
 533 static
 534 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
 535                                              struct device *dev,
 536                                              const struct vfio_device_ops *ops,
 537                                              void *device_data)
 538 {
 539         struct vfio_device *device;
 540
 541         device = kzalloc(sizeof(*device), GFP_KERNEL);
 542         if (!device)
 543                 return ERR_PTR(-ENOMEM);
 544
 545         kref_init(&device->kref);
 546         device->dev = dev;
 547         device->group = group;
 548         device->ops = ops;
 549         device->device_data = device_data;
 550         dev_set_drvdata(dev, device);
 551
 552         /* No need to get group_lock, caller has group reference */
 553         vfio_group_get(group);
 554
 555         mutex_lock(&group->device_lock);
 556         list_add(&device->group_next, &group->device_list);
 557         mutex_unlock(&group->device_lock);
 558
 559         return device;
 560 }
 561
 562 static void vfio_device_release(struct kref *kref)
 563 {
 564         struct vfio_device *device = container_of(kref,
 565                                                   struct vfio_device, kref);
 566         struct vfio_group *group = device->group;
 567
 568         list_del(&device->group_next);
 569         mutex_unlock(&group->device_lock);
 570
 571         dev_set_drvdata(device->dev, NULL);
 572
 573         kfree(device);
 574
 575         /* vfio_del_group_dev may be waiting for this device */
 576         wake_up(&vfio.release_q);
 577 }
 578
 579 /* Device reference always implies a group reference */
 580 void vfio_device_put(struct vfio_device *device)
 581 {
 582         struct vfio_group *group = device->group;
 583         kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
 584         vfio_group_put(group);
 585 }
 586 EXPORT_SYMBOL_GPL(vfio_device_put);
 587
 588 static void vfio_device_get(struct vfio_device *device)
 589 {
 590         vfio_group_get(device->group);
 591         kref_get(&device->kref);
 592 }
 593
 594 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 595                                                  struct device *dev)
 596 {
 597         struct vfio_device *device;
 598
 599         mutex_lock(&group->device_lock);
 600         list_for_each_entry(device, &group->device_list, group_next) {
 601                 if (device->dev == dev) {
 602                         vfio_device_get(device);
 603                         mutex_unlock(&group->device_lock);
 604                         return device;
 605                 }
 606         }
 607         mutex_unlock(&group->device_lock);
 608         return NULL;
 609 }
 610
 611 /*
 612  * Some drivers, like pci-stub, are only used to prevent other drivers from
 613  * claiming a device and are therefore perfectly legitimate for a user owned
 614  * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 615  * of the device, but it does prevent the user from having direct access to
 616  * the device, which is useful in some circumstances.
 617  *
 618  * We also assume that we can include PCI interconnect devices, ie. bridges.
 619  * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 620  * then all of the downstream devices will be part of the same IOMMU group as
 621  * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 622  * breaks anything, it only does so for user owned devices downstream.  Note
 623  * that error notification via MSI can be affected for platforms that handle
 624  * MSI within the same IOVA space as DMA.
 625  */
 626 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
 627
 628 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
 629 {
 630         int i;
 631
 632         if (dev_is_pci(dev)) {
 633                 struct pci_dev *pdev = to_pci_dev(dev);
 634
 635                 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 636                         return true;
 637         }
 638
 639         for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
 640                 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
 641                         return true;
 642         }
 643
 644         return false;
 645 }
 646
 647 /*
 648  * A vfio group is viable for use by userspace if all devices are in
 649  * one of the following states:
 650  *  - driver-less
 651  *  - bound to a vfio driver
 652  *  - bound to a whitelisted driver
 653  *  - a PCI interconnect device
 654  *
 655  * We use two methods to determine whether a device is bound to a vfio
 656  * driver.  The first is to test whether the device exists in the vfio
 657  * group.  The second is to test if the device exists on the group
 658  * unbound_list, indicating it's in the middle of transitioning from
 659  * a vfio driver to driver-less.
 660  */
 661 static int vfio_dev_viable(struct device *dev, void *data)
 662 {
 663         struct vfio_group *group = data;
 664         struct vfio_device *device;
 665         struct device_driver *drv = ACCESS_ONCE(dev->driver);
 666         struct vfio_unbound_dev *unbound;
 667         int ret = -EINVAL;
 668
 669         mutex_lock(&group->unbound_lock);
 670         list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 671                 if (dev == unbound->dev) {
 672                         ret = 0;
 673                         break;
 674                 }
 675         }
 676         mutex_unlock(&group->unbound_lock);
 677
 678         if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
 679                 return 0;
 680
 681         device = vfio_group_get_device(group, dev);
 682         if (device) {
 683                 vfio_device_put(device);
 684                 return 0;
 685         }
 686
 687         return ret;
 688 }
 689
 690 /**
 691  * Async device support
 692  */
 693 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 694 {
 695         struct vfio_device *device;
 696
 697         /* Do we already know about it?  We shouldn't */
 698         device = vfio_group_get_device(group, dev);
 699         if (WARN_ON_ONCE(device)) {
 700                 vfio_device_put(device);
 701                 return 0;
 702         }
 703
 704         /* Nothing to do for idle groups */
 705         if (!atomic_read(&group->container_users))
 706                 return 0;
 707
 708         /* TODO Prevent device auto probing */
 709         WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
 710              iommu_group_id(group->iommu_group));
 711
 712         return 0;
 713 }
 714
 715 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 716 {
 717         /* We don't care what happens when the group isn't in use */
 718         if (!atomic_read(&group->container_users))
 719                 return 0;
 720
 721         return vfio_dev_viable(dev, group);
 722 }
 723
 724 static int vfio_iommu_group_notifier(struct notifier_block *nb,
 725                                      unsigned long action, void *data)
 726 {
 727         struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 728         struct device *dev = data;
 729         struct vfio_unbound_dev *unbound;
 730
 731         /*
 732          * Need to go through a group_lock lookup to get a reference or we
 733          * risk racing a group being removed.  Ignore spurious notifies.
 734          */
 735         group = vfio_group_try_get(group);
 736         if (!group)
 737                 return NOTIFY_OK;
 738
 739         switch (action) {
 740         case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 741                 vfio_group_nb_add_dev(group, dev);
 742                 break;
 743         case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 744                 /*
 745                  * Nothing to do here.  If the device is in use, then the
 746                  * vfio sub-driver should block the remove callback until
 747                  * it is unused.  If the device is unused or attached to a
 748                  * stub driver, then it should be released and we don't
 749                  * care that it will be going away.
 750                  */
 751                 break;
 752         case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 753                 pr_debug("%s: Device %s, group %d binding to driver\n",
 754                          __func__, dev_name(dev),
 755                          iommu_group_id(group->iommu_group));
 756                 break;
 757         case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 758                 pr_debug("%s: Device %s, group %d bound to driver %s\n",
 759                          __func__, dev_name(dev),
 760                          iommu_group_id(group->iommu_group), dev->driver->name);
 761                 BUG_ON(vfio_group_nb_verify(group, dev));
 762                 break;
 763         case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 764                 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
 765                          __func__, dev_name(dev),
 766                          iommu_group_id(group->iommu_group), dev->driver->name);
 767                 break;
 768         case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 769                 pr_debug("%s: Device %s, group %d unbound from driver\n",
 770                          __func__, dev_name(dev),
 771                          iommu_group_id(group->iommu_group));
 772                 /*
 773                  * XXX An unbound device in a live group is ok, but we'd
 774                  * really like to avoid the above BUG_ON by preventing other
 775                  * drivers from binding to it.  Once that occurs, we have to
 776                  * stop the system to maintain isolation.  At a minimum, we'd
 777                  * want a toggle to disable driver auto probe for this device.
 778                  */
 779
 780                 mutex_lock(&group->unbound_lock);
 781                 list_for_each_entry(unbound,
 782                                     &group->unbound_list, unbound_next) {
 783                         if (dev == unbound->dev) {
 784                                 list_del(&unbound->unbound_next);
 785                                 kfree(unbound);
 786                                 break;
 787                         }
 788                 }
 789                 mutex_unlock(&group->unbound_lock);
 790                 break;
 791         }
 792
 793         /*
 794          * If we're the last reference to the group, the group will be
 795          * released, which includes unregistering the iommu group notifier.
 796          * We hold a read-lock on that notifier list, unregistering needs
 797          * a write-lock... deadlock.  Release our reference asynchronously
 798          * to avoid that situation.
 799          */
 800         vfio_group_schedule_put(group);
 801         return NOTIFY_OK;
 802 }
 803
 804 /**
 805  * VFIO driver API
 806  */
 807 int vfio_add_group_dev(struct device *dev,
 808                        const struct vfio_device_ops *ops, void *device_data)
 809 {
 810         struct iommu_group *iommu_group;
 811         struct vfio_group *group;
 812         struct vfio_device *device;
 813
 814         iommu_group = iommu_group_get(dev);
 815         if (!iommu_group)
 816                 return -EINVAL;
 817
 818         group = vfio_group_get_from_iommu(iommu_group);
 819         if (!group) {
 820                 group = vfio_create_group(iommu_group);
 821                 if (IS_ERR(group)) {
 822                         iommu_group_put(iommu_group);
 823                         return PTR_ERR(group);
 824                 }
 825         } else {
 826                 /*
 827                  * A found vfio_group already holds a reference to the
 828                  * iommu_group.  A created vfio_group keeps the reference.
 829                  */
 830                 iommu_group_put(iommu_group);
 831         }
 832
 833         device = vfio_group_get_device(group, dev);
 834         if (device) {
 835                 WARN(1, "Device %s already exists on group %d\n",
 836                      dev_name(dev), iommu_group_id(iommu_group));
 837                 vfio_device_put(device);
 838                 vfio_group_put(group);
 839                 return -EBUSY;
 840         }
 841
 842         device = vfio_group_create_device(group, dev, ops, device_data);
 843         if (IS_ERR(device)) {
 844                 vfio_group_put(group);
 845                 return PTR_ERR(device);
 846         }
 847
 848         /*
 849          * Drop all but the vfio_device reference.  The vfio_device holds
 850          * a reference to the vfio_group, which holds a reference to the
 851          * iommu_group.
 852          */
 853         vfio_group_put(group);
 854
 855         return 0;
 856 }
 857 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 858
 859 /**
 860  * Get a reference to the vfio_device for a device.  Even if the
 861  * caller thinks they own the device, they could be racing with a
 862  * release call path, so we can't trust drvdata for the shortcut.
 863  * Go the long way around, from the iommu_group to the vfio_group
 864  * to the vfio_device.
 865  */
 866 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 867 {
 868         struct vfio_group *group;
 869         struct vfio_device *device;
 870
 871         group = vfio_group_get_from_dev(dev);
 872         if (!group)
 873                 return NULL;
 874
 875         device = vfio_group_get_device(group, dev);
 876         vfio_group_put(group);
 877
 878         return device;
 879 }
 880 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 881
 882 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 883                                                      char *buf)
 884 {
 885         struct vfio_device *it, *device = NULL;
 886
 887         mutex_lock(&group->device_lock);
 888         list_for_each_entry(it, &group->device_list, group_next) {
 889                 if (!strcmp(dev_name(it->dev), buf)) {
 890                         device = it;
 891                         vfio_device_get(device);
 892                         break;
 893                 }
 894         }
 895         mutex_unlock(&group->device_lock);
 896
 897         return device;
 898 }
 899
 900 /*
 901  * Caller must hold a reference to the vfio_device
 902  */
 903 void *vfio_device_data(struct vfio_device *device)
 904 {
 905         return device->device_data;
 906 }
 907 EXPORT_SYMBOL_GPL(vfio_device_data);
 908
 909 /* Given a referenced group, check if it contains the device */
 910 static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
 911 {
 912         struct vfio_device *device;
 913
 914         device = vfio_group_get_device(group, dev);
 915         if (!device)
 916                 return false;
 917
 918         vfio_device_put(device);
 919         return true;
 920 }
 921
 922 /*
 923  * Decrement the device reference count and wait for the device to be
 924  * removed.  Open file descriptors for the device... */
 925 void *vfio_del_group_dev(struct device *dev)
 926 {
 927         struct vfio_device *device = dev_get_drvdata(dev);
 928         struct vfio_group *group = device->group;
 929         void *device_data = device->device_data;
 930         struct vfio_unbound_dev *unbound;
 931         unsigned int i = 0;
 932         long ret;
 933         bool interrupted = false;
 934
 935         /*
 936          * The group exists so long as we have a device reference.  Get
 937          * a group reference and use it to scan for the device going away.
 938          */
 939         vfio_group_get(group);
 940
 941         /*
 942          * When the device is removed from the group, the group suddenly
 943          * becomes non-viable; the device has a driver (until the unbind
 944          * completes), but it's not present in the group.  This is bad news
 945          * for any external users that need to re-acquire a group reference
 946          * in order to match and release their existing reference.  To
 947          * solve this, we track such devices on the unbound_list to bridge
 948          * the gap until they're fully unbound.
 949          */
 950         unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 951         if (unbound) {
 952                 unbound->dev = dev;
 953                 mutex_lock(&group->unbound_lock);
 954                 list_add(&unbound->unbound_next, &group->unbound_list);
 955                 mutex_unlock(&group->unbound_lock);
 956         }
 957         WARN_ON(!unbound);
 958
 959         vfio_device_put(device);
 960
 961         /*
 962          * If the device is still present in the group after the above
 963          * 'put', then it is in use and we need to request it from the
 964          * bus driver.  The driver may in turn need to request the
 965          * device from the user.  We send the request on an arbitrary
 966          * interval with counter to allow the driver to take escalating
 967          * measures to release the device if it has the ability to do so.
 968          */
 969         do {
 970                 device = vfio_group_get_device(group, dev);
 971                 if (!device)
 972                         break;
 973
 974                 if (device->ops->request)
 975                         device->ops->request(device_data, i++);
 976
 977                 vfio_device_put(device);
 978
 979                 if (interrupted) {
 980                         ret = wait_event_timeout(vfio.release_q,
 981                                         !vfio_dev_present(group, dev), HZ * 10);
 982                 } else {
 983                         ret = wait_event_interruptible_timeout(vfio.release_q,
 984                                         !vfio_dev_present(group, dev), HZ * 10);
 985                         if (ret == -ERESTARTSYS) {
 986                                 interrupted = true;
 987                                 dev_warn(dev,
 988                                          "Device is currently in use, task"
 989                                          " \"%s\" (%d) "
 990                                          "blocked until device is released",
 991                                          current->comm, task_pid_nr(current));
 992                         }
 993                 }
 994         } while (ret <= 0);
 995
 996         vfio_group_put(group);
 997
 998         return device_data;
 999 }
1000 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
1001
1002 /**
1003  * VFIO base fd, /dev/vfio/vfio
1004  */
1005 static long vfio_ioctl_check_extension(struct vfio_container *container,
1006                                        unsigned long arg)
1007 {
1008         struct vfio_iommu_driver *driver;
1009         long ret = 0;
1010
1011         down_read(&container->group_lock);
1012
1013         driver = container->iommu_driver;
1014
1015         switch (arg) {
1016                 /* No base extensions yet */
1017         default:
1018                 /*
1019                  * If no driver is set, poll all registered drivers for
1020                  * extensions and return the first positive result.  If
1021                  * a driver is already set, further queries will be passed
1022                  * only to that driver.
1023                  */
1024                 if (!driver) {
1025                         mutex_lock(&vfio.iommu_drivers_lock);
1026                         list_for_each_entry(driver, &vfio.iommu_drivers_list,
1027                                             vfio_next) {
1028
1029 #ifdef CONFIG_VFIO_NOIOMMU
1030                                 if (!list_empty(&container->group_list) &&
1031                                     (container->noiommu !=
1032                                      (driver->ops == &vfio_noiommu_ops)))
1033                                         continue;
1034 #endif
1035
1036                                 if (!try_module_get(driver->ops->owner))
1037                                         continue;
1038
1039                                 ret = driver->ops->ioctl(NULL,
1040                                                          VFIO_CHECK_EXTENSION,
1041                                                          arg);
1042                                 module_put(driver->ops->owner);
1043                                 if (ret > 0)
1044                                         break;
1045                         }
1046                         mutex_unlock(&vfio.iommu_drivers_lock);
1047                 } else
1048                         ret = driver->ops->ioctl(container->iommu_data,
1049                                                  VFIO_CHECK_EXTENSION, arg);
1050         }
1051
1052         up_read(&container->group_lock);
1053
1054         return ret;
1055 }
1056
1057 /* hold write lock on container->group_lock */
1058 static int __vfio_container_attach_groups(struct vfio_container *container,
1059                                           struct vfio_iommu_driver *driver,
1060                                           void *data)
1061 {
1062         struct vfio_group *group;
1063         int ret = -ENODEV;
1064
1065         list_for_each_entry(group, &container->group_list, container_next) {
1066                 ret = driver->ops->attach_group(data, group->iommu_group);
1067                 if (ret)
1068                         goto unwind;
1069         }
1070
1071         return ret;
1072
1073 unwind:
1074         list_for_each_entry_continue_reverse(group, &container->group_list,
1075                                              container_next) {
1076                 driver->ops->detach_group(data, group->iommu_group);
1077         }
1078
1079         return ret;
1080 }
1081
1082 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1083                                  unsigned long arg)
1084 {
1085         struct vfio_iommu_driver *driver;
1086         long ret = -ENODEV;
1087
1088         down_write(&container->group_lock);
1089
1090         /*
1091          * The container is designed to be an unprivileged interface while
1092          * the group can be assigned to specific users.  Therefore, only by
1093          * adding a group to a container does the user get the privilege of
1094          * enabling the iommu, which may allocate finite resources.  There
1095          * is no unset_iommu, but by removing all the groups from a container,
1096          * the container is deprivileged and returns to an unset state.
1097          */
1098         if (list_empty(&container->group_list) || container->iommu_driver) {
1099                 up_write(&container->group_lock);
1100                 return -EINVAL;
1101         }
1102
1103         mutex_lock(&vfio.iommu_drivers_lock);
1104         list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1105                 void *data;
1106
1107 #ifdef CONFIG_VFIO_NOIOMMU
1108                 /*
1109                  * Only noiommu containers can use vfio-noiommu and noiommu
1110                  * containers can only use vfio-noiommu.
1111                  */
1112                 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1113                         continue;
1114 #endif
1115
1116                 if (!try_module_get(driver->ops->owner))
1117                         continue;
1118
1119                 /*
1120                  * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1121                  * so test which iommu driver reported support for this
1122                  * extension and call open on them.  We also pass them the
1123                  * magic, allowing a single driver to support multiple
1124                  * interfaces if they'd like.
1125                  */
1126                 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1127                         module_put(driver->ops->owner);
1128                         continue;
1129                 }
1130
1131                 data = driver->ops->open(arg);
1132                 if (IS_ERR(data)) {
1133                         ret = PTR_ERR(data);
1134                         module_put(driver->ops->owner);
1135                         continue;
1136                 }
1137
1138                 ret = __vfio_container_attach_groups(container, driver, data);
1139                 if (ret) {
1140                         driver->ops->release(data);
1141                         module_put(driver->ops->owner);
1142                         continue;
1143                 }
1144
1145                 container->iommu_driver = driver;
1146                 container->iommu_data = data;
1147                 break;
1148         }
1149
1150         mutex_unlock(&vfio.iommu_drivers_lock);
1151         up_write(&container->group_lock);
1152
1153         return ret;
1154 }
1155
1156 static long vfio_fops_unl_ioctl(struct file *filep,
1157                                 unsigned int cmd, unsigned long arg)
1158 {
1159         struct vfio_container *container = filep->private_data;
1160         struct vfio_iommu_driver *driver;
1161         void *data;
1162         long ret = -EINVAL;
1163
1164         if (!container)
1165                 return ret;
1166
1167         switch (cmd) {
1168         case VFIO_GET_API_VERSION:
1169                 ret = VFIO_API_VERSION;
1170                 break;
1171         case VFIO_CHECK_EXTENSION:
1172                 ret = vfio_ioctl_check_extension(container, arg);
1173                 break;
1174         case VFIO_SET_IOMMU:
1175                 ret = vfio_ioctl_set_iommu(container, arg);
1176                 break;
1177         default:
1178                 down_read(&container->group_lock);
1179
1180                 driver = container->iommu_driver;
1181                 data = container->iommu_data;
1182
1183                 if (driver) /* passthrough all unrecognized ioctls */
1184                         ret = driver->ops->ioctl(data, cmd, arg);
1185
1186                 up_read(&container->group_lock);
1187         }
1188
1189         return ret;
1190 }
1191
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point for the container fd: run @arg through
 * compat_ptr() and hand the call to the regular ioctl handler.
 */
static long vfio_fops_compat_ioctl(struct file *filep,
                                   unsigned int cmd, unsigned long arg)
{
        unsigned long native_arg = (unsigned long)compat_ptr(arg);

        return vfio_fops_unl_ioctl(filep, cmd, native_arg);
}
#endif  /* CONFIG_COMPAT */
1200
1201 static int vfio_fops_open(struct inode *inode, struct file *filep)
1202 {
1203         struct vfio_container *container;
1204
1205         container = kzalloc(sizeof(*container), GFP_KERNEL);
1206         if (!container)
1207                 return -ENOMEM;
1208
1209         INIT_LIST_HEAD(&container->group_list);
1210         init_rwsem(&container->group_lock);
1211         kref_init(&container->kref);
1212
1213         filep->private_data = container;
1214
1215         return 0;
1216 }
1217
1218 static int vfio_fops_release(struct inode *inode, struct file *filep)
1219 {
1220         struct vfio_container *container = filep->private_data;
1221
1222         filep->private_data = NULL;
1223
1224         vfio_container_put(container);
1225
1226         return 0;
1227 }
1228
1229 /*
1230  * Once an iommu driver is set, we optionally pass read/write/mmap
1231  * on to the driver, allowing management interfaces beyond ioctl.
1232  */
1233 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1234                               size_t count, loff_t *ppos)
1235 {
1236         struct vfio_container *container = filep->private_data;
1237         struct vfio_iommu_driver *driver;
1238         ssize_t ret = -EINVAL;
1239
1240         down_read(&container->group_lock);
1241
1242         driver = container->iommu_driver;
1243         if (likely(driver && driver->ops->read))
1244                 ret = driver->ops->read(container->iommu_data,
1245                                         buf, count, ppos);
1246
1247         up_read(&container->group_lock);
1248
1249         return ret;
1250 }
1251
1252 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1253                                size_t count, loff_t *ppos)
1254 {
1255         struct vfio_container *container = filep->private_data;
1256         struct vfio_iommu_driver *driver;
1257         ssize_t ret = -EINVAL;
1258
1259         down_read(&container->group_lock);
1260
1261         driver = container->iommu_driver;
1262         if (likely(driver && driver->ops->write))
1263                 ret = driver->ops->write(container->iommu_data,
1264                                          buf, count, ppos);
1265
1266         up_read(&container->group_lock);
1267
1268         return ret;
1269 }
1270
1271 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1272 {
1273         struct vfio_container *container = filep->private_data;
1274         struct vfio_iommu_driver *driver;
1275         int ret = -EINVAL;
1276
1277         down_read(&container->group_lock);
1278
1279         driver = container->iommu_driver;
1280         if (likely(driver && driver->ops->mmap))
1281                 ret = driver->ops->mmap(container->iommu_data, vma);
1282
1283         up_read(&container->group_lock);
1284
1285         return ret;
1286 }
1287
1288 static const struct file_operations vfio_fops = {
1289         .owner          = THIS_MODULE,
1290         .open           = vfio_fops_open,
1291         .release        = vfio_fops_release,
1292         .read           = vfio_fops_read,
1293         .write          = vfio_fops_write,
1294         .unlocked_ioctl = vfio_fops_unl_ioctl,
1295 #ifdef CONFIG_COMPAT
1296         .compat_ioctl   = vfio_fops_compat_ioctl,
1297 #endif
1298         .mmap           = vfio_fops_mmap,
1299 };
1300
1301 /**
1302  * VFIO Group fd, /dev/vfio/$GROUP
1303  */
1304 static void __vfio_group_unset_container(struct vfio_group *group)
1305 {
1306         struct vfio_container *container = group->container;
1307         struct vfio_iommu_driver *driver;
1308
1309         down_write(&container->group_lock);
1310
1311         driver = container->iommu_driver;
1312         if (driver)
1313                 driver->ops->detach_group(container->iommu_data,
1314                                           group->iommu_group);
1315
1316         group->container = NULL;
1317         list_del(&group->container_next);
1318
1319         /* Detaching the last group deprivileges a container, remove iommu */
1320         if (driver && list_empty(&container->group_list)) {
1321                 driver->ops->release(container->iommu_data);
1322                 module_put(driver->ops->owner);
1323                 container->iommu_driver = NULL;
1324                 container->iommu_data = NULL;
1325         }
1326
1327         up_write(&container->group_lock);
1328
1329         vfio_container_put(container);
1330 }
1331
1332 /*
1333  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1334  * if there was no container to unset.  Since the ioctl is called on
1335  * the group, we know that still exists, therefore the only valid
1336  * transition here is 1->0.
1337  */
1338 static int vfio_group_unset_container(struct vfio_group *group)
1339 {
1340         int users = atomic_cmpxchg(&group->container_users, 1, 0);
1341
1342         if (!users)
1343                 return -EINVAL;
1344         if (users != 1)
1345                 return -EBUSY;
1346
1347         __vfio_group_unset_container(group);
1348
1349         return 0;
1350 }
1351
1352 /*
1353  * When removing container users, anything that removes the last user
1354  * implicitly removes the group from the container.  That is, if the
1355  * group file descriptor is closed, as well as any device file descriptors,
1356  * the group is free.
1357  */
1358 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1359 {
1360         if (0 == atomic_dec_if_positive(&group->container_users))
1361                 __vfio_group_unset_container(group);
1362 }
1363
1364 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1365 {
1366         struct fd f;
1367         struct vfio_container *container;
1368         struct vfio_iommu_driver *driver;
1369         int ret = 0;
1370
1371         if (atomic_read(&group->container_users))
1372                 return -EINVAL;
1373
1374         if (group->noiommu && !capable(CAP_SYS_RAWIO))
1375                 return -EPERM;
1376
1377         f = fdget(container_fd);
1378         if (!f.file)
1379                 return -EBADF;
1380
1381         /* Sanity check, is this really our fd? */
1382         if (f.file->f_op != &vfio_fops) {
1383                 fdput(f);
1384                 return -EINVAL;
1385         }
1386
1387         container = f.file->private_data;
1388         WARN_ON(!container); /* fget ensures we don't race vfio_release */
1389
1390         down_write(&container->group_lock);
1391
1392         /* Real groups and fake groups cannot mix */
1393         if (!list_empty(&container->group_list) &&
1394             container->noiommu != group->noiommu) {
1395                 ret = -EPERM;
1396                 goto unlock_out;
1397         }
1398
1399         driver = container->iommu_driver;
1400         if (driver) {
1401                 ret = driver->ops->attach_group(container->iommu_data,
1402                                                 group->iommu_group);
1403                 if (ret)
1404                         goto unlock_out;
1405         }
1406
1407         group->container = container;
1408         container->noiommu = group->noiommu;
1409         list_add(&group->container_next, &container->group_list);
1410
1411         /* Get a reference on the container and mark a user within the group */
1412         vfio_container_get(container);
1413         atomic_inc(&group->container_users);
1414
1415 unlock_out:
1416         up_write(&container->group_lock);
1417         fdput(f);
1418         return ret;
1419 }
1420
1421 static bool vfio_group_viable(struct vfio_group *group)
1422 {
1423         return (iommu_group_for_each_dev(group->iommu_group,
1424                                          group, vfio_dev_viable) == 0);
1425 }
1426
1427 static int vfio_group_add_container_user(struct vfio_group *group)
1428 {
1429         if (!atomic_inc_not_zero(&group->container_users))
1430                 return -EINVAL;
1431
1432         if (group->noiommu) {
1433                 atomic_dec(&group->container_users);
1434                 return -EPERM;
1435         }
1436         if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1437                 atomic_dec(&group->container_users);
1438                 return -EINVAL;
1439         }
1440
1441         return 0;
1442 }
1443
1444 static const struct file_operations vfio_device_fops;
1445
1446 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1447 {
1448         struct vfio_device *device;
1449         struct file *filep;
1450         int ret;
1451
1452         if (0 == atomic_read(&group->container_users) ||
1453             !group->container->iommu_driver || !vfio_group_viable(group))
1454                 return -EINVAL;
1455
1456         if (group->noiommu && !capable(CAP_SYS_RAWIO))
1457                 return -EPERM;
1458
1459         device = vfio_device_get_from_name(group, buf);
1460         if (!device)
1461                 return -ENODEV;
1462
1463         ret = device->ops->open(device->device_data);
1464         if (ret) {
1465                 vfio_device_put(device);
1466                 return ret;
1467         }
1468
1469         /*
1470          * We can't use anon_inode_getfd() because we need to modify
1471          * the f_mode flags directly to allow more than just ioctls
1472          */
1473         ret = get_unused_fd_flags(O_CLOEXEC);
1474         if (ret < 0) {
1475                 device->ops->release(device->device_data);
1476                 vfio_device_put(device);
1477                 return ret;
1478         }
1479
1480         filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1481                                    device, O_RDWR);
1482         if (IS_ERR(filep)) {
1483                 put_unused_fd(ret);
1484                 ret = PTR_ERR(filep);
1485                 device->ops->release(device->device_data);
1486                 vfio_device_put(device);
1487                 return ret;
1488         }
1489
1490         /*
1491          * TODO: add an anon_inode interface to do this.
1492          * Appears to be missing by lack of need rather than
1493          * explicitly prevented.  Now there's need.
1494          */
1495         filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1496
1497         atomic_inc(&group->container_users);
1498
1499         fd_install(ret, filep);
1500
1501         if (group->noiommu)
1502                 dev_warn(device->dev, "vfio-noiommu device opened by user "
1503                          "(%s:%d)\n", current->comm, task_pid_nr(current));
1504
1505         return ret;
1506 }
1507
1508 static long vfio_group_fops_unl_ioctl(struct file *filep,
1509                                       unsigned int cmd, unsigned long arg)
1510 {
1511         struct vfio_group *group = filep->private_data;
1512         long ret = -ENOTTY;
1513
1514         switch (cmd) {
1515         case VFIO_GROUP_GET_STATUS:
1516         {
1517                 struct vfio_group_status status;
1518                 unsigned long minsz;
1519
1520                 minsz = offsetofend(struct vfio_group_status, flags);
1521
1522                 if (copy_from_user(&status, (void __user *)arg, minsz))
1523                         return -EFAULT;
1524
1525                 if (status.argsz < minsz)
1526                         return -EINVAL;
1527
1528                 status.flags = 0;
1529
1530                 if (vfio_group_viable(group))
1531                         status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1532
1533                 if (group->container)
1534                         status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1535
1536                 if (copy_to_user((void __user *)arg, &status, minsz))
1537                         return -EFAULT;
1538
1539                 ret = 0;
1540                 break;
1541         }
1542         case VFIO_GROUP_SET_CONTAINER:
1543         {
1544                 int fd;
1545
1546                 if (get_user(fd, (int __user *)arg))
1547                         return -EFAULT;
1548
1549                 if (fd < 0)
1550                         return -EINVAL;
1551
1552                 ret = vfio_group_set_container(group, fd);
1553                 break;
1554         }
1555         case VFIO_GROUP_UNSET_CONTAINER:
1556                 ret = vfio_group_unset_container(group);
1557                 break;
1558         case VFIO_GROUP_GET_DEVICE_FD:
1559         {
1560                 char *buf;
1561
1562                 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1563                 if (IS_ERR(buf))
1564                         return PTR_ERR(buf);
1565
1566                 ret = vfio_group_get_device_fd(group, buf);
1567                 kfree(buf);
1568                 break;
1569         }
1570         }
1571
1572         return ret;
1573 }
1574
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point for the group fd: run @arg through
 * compat_ptr() and hand the call to the regular ioctl handler.
 */
static long vfio_group_fops_compat_ioctl(struct file *filep,
                                         unsigned int cmd, unsigned long arg)
{
        unsigned long native_arg = (unsigned long)compat_ptr(arg);

        return vfio_group_fops_unl_ioctl(filep, cmd, native_arg);
}
#endif  /* CONFIG_COMPAT */
1583
1584 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1585 {
1586         struct vfio_group *group;
1587         int opened;
1588
1589         group = vfio_group_get_from_minor(iminor(inode));
1590         if (!group)
1591                 return -ENODEV;
1592
1593         if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1594                 vfio_group_put(group);
1595                 return -EPERM;
1596         }
1597
1598         /* Do we need multiple instances of the group open?  Seems not. */
1599         opened = atomic_cmpxchg(&group->opened, 0, 1);
1600         if (opened) {
1601                 vfio_group_put(group);
1602                 return -EBUSY;
1603         }
1604
1605         /* Is something still in use from a previous open? */
1606         if (group->container) {
1607                 atomic_dec(&group->opened);
1608                 vfio_group_put(group);
1609                 return -EBUSY;
1610         }
1611
1612         /* Warn if previous user didn't cleanup and re-init to drop them */
1613         if (WARN_ON(group->notifier.head))
1614                 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1615
1616         filep->private_data = group;
1617
1618         return 0;
1619 }
1620
1621 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1622 {
1623         struct vfio_group *group = filep->private_data;
1624
1625         filep->private_data = NULL;
1626
1627         vfio_group_try_dissolve_container(group);
1628
1629         atomic_dec(&group->opened);
1630
1631         vfio_group_put(group);
1632
1633         return 0;
1634 }
1635
1636 static const struct file_operations vfio_group_fops = {
1637         .owner          = THIS_MODULE,
1638         .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1639 #ifdef CONFIG_COMPAT
1640         .compat_ioctl   = vfio_group_fops_compat_ioctl,
1641 #endif
1642         .open           = vfio_group_fops_open,
1643         .release        = vfio_group_fops_release,
1644 };
1645
1646 /**
1647  * VFIO Device fd
1648  */
1649 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1650 {
1651         struct vfio_device *device = filep->private_data;
1652
1653         device->ops->release(device->device_data);
1654
1655         vfio_group_try_dissolve_container(device->group);
1656
1657         vfio_device_put(device);
1658
1659         return 0;
1660 }
1661
1662 static long vfio_device_fops_unl_ioctl(struct file *filep,
1663                                        unsigned int cmd, unsigned long arg)
1664 {
1665         struct vfio_device *device = filep->private_data;
1666
1667         if (unlikely(!device->ops->ioctl))
1668                 return -EINVAL;
1669
1670         return device->ops->ioctl(device->device_data, cmd, arg);
1671 }
1672
1673 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1674                                      size_t count, loff_t *ppos)
1675 {
1676         struct vfio_device *device = filep->private_data;
1677
1678         if (unlikely(!device->ops->read))
1679                 return -EINVAL;
1680
1681         return device->ops->read(device->device_data, buf, count, ppos);
1682 }
1683
1684 static ssize_t vfio_device_fops_write(struct file *filep,
1685                                       const char __user *buf,
1686                                       size_t count, loff_t *ppos)
1687 {
1688         struct vfio_device *device = filep->private_data;
1689
1690         if (unlikely(!device->ops->write))
1691                 return -EINVAL;
1692
1693         return device->ops->write(device->device_data, buf, count, ppos);
1694 }
1695
1696 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1697 {
1698         struct vfio_device *device = filep->private_data;
1699
1700         if (unlikely(!device->ops->mmap))
1701                 return -EINVAL;
1702
1703         return device->ops->mmap(device->device_data, vma);
1704 }
1705
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point for a device fd: run @arg through
 * compat_ptr() and hand the call to the regular ioctl handler.
 */
static long vfio_device_fops_compat_ioctl(struct file *filep,
                                          unsigned int cmd, unsigned long arg)
{
        unsigned long native_arg = (unsigned long)compat_ptr(arg);

        return vfio_device_fops_unl_ioctl(filep, cmd, native_arg);
}
#endif  /* CONFIG_COMPAT */
1714
1715 static const struct file_operations vfio_device_fops = {
1716         .owner          = THIS_MODULE,
1717         .release        = vfio_device_fops_release,
1718         .read           = vfio_device_fops_read,
1719         .write          = vfio_device_fops_write,
1720         .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1721 #ifdef CONFIG_COMPAT
1722         .compat_ioctl   = vfio_device_fops_compat_ioctl,
1723 #endif
1724         .mmap           = vfio_device_fops_mmap,
1725 };
1726
1727 /**
1728  * External user API, exported by symbols to be linked dynamically.
1729  *
1730  * The protocol includes:
1731  *  1. do normal VFIO init operation:
1732  *      - opening a new container;
1733  *      - attaching group(s) to it;
1734  *      - setting an IOMMU driver for a container.
1735  * When IOMMU is set for a container, all groups in it are
1736  * considered ready to use by an external user.
1737  *
1738  * 2. User space passes a group fd to an external user.
1739  * The external user calls vfio_group_get_external_user()
1740  * to verify that:
1741  *      - the group is initialized;
1742  *      - IOMMU is set for it.
 * If both checks pass, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from being disposed of before KVM exits.
1746  *
1747  * 3. The external user calls vfio_external_user_iommu_id()
1748  * to know an IOMMU ID.
1749  *
1750  * 4. When the external KVM finishes, it calls
1751  * vfio_group_put_external_user() to release the VFIO group.
1752  * This call decrements the container user counter.
1753  */
1754 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1755 {
1756         struct vfio_group *group = filep->private_data;
1757         int ret;
1758
1759         if (filep->f_op != &vfio_group_fops)
1760                 return ERR_PTR(-EINVAL);
1761
1762         ret = vfio_group_add_container_user(group);
1763         if (ret)
1764                 return ERR_PTR(ret);
1765
1766         vfio_group_get(group);
1767
1768         return group;
1769 }
1770 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1771
/*
 * Release a group obtained through vfio_group_get_external_user(): first
 * give up the container user taken there, then drop the group reference.
 */
void vfio_group_put_external_user(struct vfio_group *group)
{
	/* undo vfio_group_add_container_user() */
	vfio_group_try_dissolve_container(group);

	/* undo vfio_group_get() */
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1778
1779 bool vfio_external_group_match_file(struct vfio_group *test_group,
1780                                     struct file *filep)
1781 {
1782         struct vfio_group *group = filep->private_data;
1783
1784         return (filep->f_op == &vfio_group_fops) && (group == test_group);
1785 }
1786 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1787
1788 int vfio_external_user_iommu_id(struct vfio_group *group)
1789 {
1790         return iommu_group_id(group->iommu_group);
1791 }
1792 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1793
1794 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1795 {
1796         return vfio_ioctl_check_extension(group->container, arg);
1797 }
1798 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1799
1800 /**
1801  * Sub-module support
1802  */
1803 /*
1804  * Helper for managing a buffer of info chain capabilities, allocate or
1805  * reallocate a buffer with additional @size, filling in @id and @version
1806  * of the capability.  A pointer to the new capability is returned.
1807  *
1808  * NB. The chain is based at the head of the buffer, so new entries are
1809  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1810  * next offsets prior to copying to the user buffer.
1811  */
1812 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1813                                                size_t size, u16 id, u16 version)
1814 {
1815         void *buf;
1816         struct vfio_info_cap_header *header, *tmp;
1817
1818         buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1819         if (!buf) {
1820                 kfree(caps->buf);
1821                 caps->size = 0;
1822                 return ERR_PTR(-ENOMEM);
1823         }
1824
1825         caps->buf = buf;
1826         header = buf + caps->size;
1827
1828         /* Eventually copied to user buffer, zero */
1829         memset(header, 0, size);
1830
1831         header->id = id;
1832         header->version = version;
1833
1834         /* Add to the end of the capability chain */
1835         for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1836                 ; /* nothing */
1837
1838         tmp->next = caps->size;
1839         caps->size += size;
1840
1841         return header;
1842 }
1843 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1844
1845 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1846 {
1847         struct vfio_info_cap_header *tmp;
1848         void *buf = (void *)caps->buf;
1849
1850         for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1851                 tmp->next += offset;
1852 }
1853 EXPORT_SYMBOL(vfio_info_cap_shift);
1854
1855 static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
1856 {
1857         struct vfio_info_cap_header *header;
1858         struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
1859         size_t size;
1860
1861         size = sizeof(*sparse) + sparse->nr_areas *  sizeof(*sparse->areas);
1862         header = vfio_info_cap_add(caps, size,
1863                                    VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
1864         if (IS_ERR(header))
1865                 return PTR_ERR(header);
1866
1867         sparse_cap = container_of(header,
1868                         struct vfio_region_info_cap_sparse_mmap, header);
1869         sparse_cap->nr_areas = sparse->nr_areas;
1870         memcpy(sparse_cap->areas, sparse->areas,
1871                sparse->nr_areas * sizeof(*sparse->areas));
1872         return 0;
1873 }
1874
1875 static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
1876 {
1877         struct vfio_info_cap_header *header;
1878         struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
1879
1880         header = vfio_info_cap_add(caps, sizeof(*cap),
1881                                    VFIO_REGION_INFO_CAP_TYPE, 1);
1882         if (IS_ERR(header))
1883                 return PTR_ERR(header);
1884
1885         type_cap = container_of(header, struct vfio_region_info_cap_type,
1886                                 header);
1887         type_cap->type = cap->type;
1888         type_cap->subtype = cap->subtype;
1889         return 0;
1890 }
1891
1892 int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
1893                              void *cap_type)
1894 {
1895         int ret = -EINVAL;
1896
1897         if (!cap_type)
1898                 return 0;
1899
1900         switch (cap_type_id) {
1901         case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1902                 ret = sparse_mmap_cap(caps, cap_type);
1903                 break;
1904
1905         case VFIO_REGION_INFO_CAP_TYPE:
1906                 ret = region_type_cap(caps, cap_type);
1907                 break;
1908         }
1909
1910         return ret;
1911 }
1912 EXPORT_SYMBOL(vfio_info_add_capability);
1913
1914 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1915                                        int max_irq_type, size_t *data_size)
1916 {
1917         unsigned long minsz;
1918         size_t size;
1919
1920         minsz = offsetofend(struct vfio_irq_set, count);
1921
1922         if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1923             (hdr->count >= (U32_MAX - hdr->start)) ||
1924             (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1925                                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1926                 return -EINVAL;
1927
1928         if (data_size)
1929                 *data_size = 0;
1930
1931         if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1932                 return -EINVAL;
1933
1934         switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1935         case VFIO_IRQ_SET_DATA_NONE:
1936                 size = 0;
1937                 break;
1938         case VFIO_IRQ_SET_DATA_BOOL:
1939                 size = sizeof(uint8_t);
1940                 break;
1941         case VFIO_IRQ_SET_DATA_EVENTFD:
1942                 size = sizeof(int32_t);
1943                 break;
1944         default:
1945                 return -EINVAL;
1946         }
1947
1948         if (size) {
1949                 if (hdr->argsz - minsz < hdr->count * size)
1950                         return -EINVAL;
1951
1952                 if (!data_size)
1953                         return -EINVAL;
1954
1955                 *data_size = hdr->count * size;
1956         }
1957
1958         return 0;
1959 }
1960 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1961
1962 /*
1963  * Pin a set of guest PFNs and return their associated host PFNs for local
1964  * domain only.
1965  * @dev [in]     : device
1966  * @user_pfn [in]: array of user/guest PFNs to be pinned.
1967  * @npage [in]   : count of elements in user_pfn array.  This count should not
1968  *                 be greater VFIO_PIN_PAGES_MAX_ENTRIES.
1969  * @prot [in]    : protection flags
1970  * @phys_pfn[out]: array of host PFNs
1971  * Return error or number of pages pinned.
1972  */
1973 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1974                    int prot, unsigned long *phys_pfn)
1975 {
1976         struct vfio_container *container;
1977         struct vfio_group *group;
1978         struct vfio_iommu_driver *driver;
1979         int ret;
1980
1981         if (!dev || !user_pfn || !phys_pfn || !npage)
1982                 return -EINVAL;
1983
1984         if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1985                 return -E2BIG;
1986
1987         group = vfio_group_get_from_dev(dev);
1988         if (!group)
1989                 return -ENODEV;
1990
1991         ret = vfio_group_add_container_user(group);
1992         if (ret)
1993                 goto err_pin_pages;
1994
1995         container = group->container;
1996         down_read(&container->group_lock);
1997
1998         driver = container->iommu_driver;
1999         if (likely(driver && driver->ops->pin_pages))
2000                 ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
2001                                              npage, prot, phys_pfn);
2002         else
2003                 ret = -ENOTTY;
2004
2005         up_read(&container->group_lock);
2006         vfio_group_try_dissolve_container(group);
2007
2008 err_pin_pages:
2009         vfio_group_put(group);
2010         return ret;
2011 }
2012 EXPORT_SYMBOL(vfio_pin_pages);
2013
2014 /*
2015  * Unpin set of host PFNs for local domain only.
2016  * @dev [in]     : device
2017  * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
2018  *                 PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2019  * @npage [in]   : count of elements in user_pfn array.  This count should not
2020  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2021  * Return error or number of pages unpinned.
2022  */
2023 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
2024 {
2025         struct vfio_container *container;
2026         struct vfio_group *group;
2027         struct vfio_iommu_driver *driver;
2028         int ret;
2029
2030         if (!dev || !user_pfn || !npage)
2031                 return -EINVAL;
2032
2033         if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2034                 return -E2BIG;
2035
2036         group = vfio_group_get_from_dev(dev);
2037         if (!group)
2038                 return -ENODEV;
2039
2040         ret = vfio_group_add_container_user(group);
2041         if (ret)
2042                 goto err_unpin_pages;
2043
2044         container = group->container;
2045         down_read(&container->group_lock);
2046
2047         driver = container->iommu_driver;
2048         if (likely(driver && driver->ops->unpin_pages))
2049                 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2050                                                npage);
2051         else
2052                 ret = -ENOTTY;
2053
2054         up_read(&container->group_lock);
2055         vfio_group_try_dissolve_container(group);
2056
2057 err_unpin_pages:
2058         vfio_group_put(group);
2059         return ret;
2060 }
2061 EXPORT_SYMBOL(vfio_unpin_pages);
2062
2063 static int vfio_register_iommu_notifier(struct vfio_group *group,
2064                                         unsigned long *events,
2065                                         struct notifier_block *nb)
2066 {
2067         struct vfio_container *container;
2068         struct vfio_iommu_driver *driver;
2069         int ret;
2070
2071         ret = vfio_group_add_container_user(group);
2072         if (ret)
2073                 return -EINVAL;
2074
2075         container = group->container;
2076         down_read(&container->group_lock);
2077
2078         driver = container->iommu_driver;
2079         if (likely(driver && driver->ops->register_notifier))
2080                 ret = driver->ops->register_notifier(container->iommu_data,
2081                                                      events, nb);
2082         else
2083                 ret = -ENOTTY;
2084
2085         up_read(&container->group_lock);
2086         vfio_group_try_dissolve_container(group);
2087
2088         return ret;
2089 }
2090
2091 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2092                                           struct notifier_block *nb)
2093 {
2094         struct vfio_container *container;
2095         struct vfio_iommu_driver *driver;
2096         int ret;
2097
2098         ret = vfio_group_add_container_user(group);
2099         if (ret)
2100                 return -EINVAL;
2101
2102         container = group->container;
2103         down_read(&container->group_lock);
2104
2105         driver = container->iommu_driver;
2106         if (likely(driver && driver->ops->unregister_notifier))
2107                 ret = driver->ops->unregister_notifier(container->iommu_data,
2108                                                        nb);
2109         else
2110                 ret = -ENOTTY;
2111
2112         up_read(&container->group_lock);
2113         vfio_group_try_dissolve_container(group);
2114
2115         return ret;
2116 }
2117
2118 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2119 {
2120         group->kvm = kvm;
2121         blocking_notifier_call_chain(&group->notifier,
2122                                 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2123 }
2124 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2125
2126 static int vfio_register_group_notifier(struct vfio_group *group,
2127                                         unsigned long *events,
2128                                         struct notifier_block *nb)
2129 {
2130         struct vfio_container *container;
2131         int ret;
2132         bool set_kvm = false;
2133
2134         if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2135                 set_kvm = true;
2136
2137         /* clear known events */
2138         *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2139
2140         /* refuse to continue if still events remaining */
2141         if (*events)
2142                 return -EINVAL;
2143
2144         ret = vfio_group_add_container_user(group);
2145         if (ret)
2146                 return -EINVAL;
2147
2148         container = group->container;
2149         down_read(&container->group_lock);
2150
2151         ret = blocking_notifier_chain_register(&group->notifier, nb);
2152
2153         /*
2154          * The attaching of kvm and vfio_group might already happen, so
2155          * here we replay once upon registration.
2156          */
2157         if (!ret && set_kvm && group->kvm)
2158                 blocking_notifier_call_chain(&group->notifier,
2159                                         VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2160
2161         up_read(&container->group_lock);
2162         vfio_group_try_dissolve_container(group);
2163
2164         return ret;
2165 }
2166
2167 static int vfio_unregister_group_notifier(struct vfio_group *group,
2168                                          struct notifier_block *nb)
2169 {
2170         struct vfio_container *container;
2171         int ret;
2172
2173         ret = vfio_group_add_container_user(group);
2174         if (ret)
2175                 return -EINVAL;
2176
2177         container = group->container;
2178         down_read(&container->group_lock);
2179
2180         ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2181
2182         up_read(&container->group_lock);
2183         vfio_group_try_dissolve_container(group);
2184
2185         return ret;
2186 }
2187
2188 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2189                            unsigned long *events, struct notifier_block *nb)
2190 {
2191         struct vfio_group *group;
2192         int ret;
2193
2194         if (!dev || !nb || !events || (*events == 0))
2195                 return -EINVAL;
2196
2197         group = vfio_group_get_from_dev(dev);
2198         if (!group)
2199                 return -ENODEV;
2200
2201         switch (type) {
2202         case VFIO_IOMMU_NOTIFY:
2203                 ret = vfio_register_iommu_notifier(group, events, nb);
2204                 break;
2205         case VFIO_GROUP_NOTIFY:
2206                 ret = vfio_register_group_notifier(group, events, nb);
2207                 break;
2208         default:
2209                 ret = -EINVAL;
2210         }
2211
2212         vfio_group_put(group);
2213         return ret;
2214 }
2215 EXPORT_SYMBOL(vfio_register_notifier);
2216
2217 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2218                              struct notifier_block *nb)
2219 {
2220         struct vfio_group *group;
2221         int ret;
2222
2223         if (!dev || !nb)
2224                 return -EINVAL;
2225
2226         group = vfio_group_get_from_dev(dev);
2227         if (!group)
2228                 return -ENODEV;
2229
2230         switch (type) {
2231         case VFIO_IOMMU_NOTIFY:
2232                 ret = vfio_unregister_iommu_notifier(group, nb);
2233                 break;
2234         case VFIO_GROUP_NOTIFY:
2235                 ret = vfio_unregister_group_notifier(group, nb);
2236                 break;
2237         default:
2238                 ret = -EINVAL;
2239         }
2240
2241         vfio_group_put(group);
2242         return ret;
2243 }
2244 EXPORT_SYMBOL(vfio_unregister_notifier);
2245
2246 /**
2247  * Module/class support
2248  */
2249 static char *vfio_devnode(struct device *dev, umode_t *mode)
2250 {
2251         return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2252 }
2253
2254 static struct miscdevice vfio_dev = {
2255         .minor = VFIO_MINOR,
2256         .name = "vfio",
2257         .fops = &vfio_fops,
2258         .nodename = "vfio/vfio",
2259         .mode = S_IRUGO | S_IWUGO,
2260 };
2261
2262 static int __init vfio_init(void)
2263 {
2264         int ret;
2265
2266         idr_init(&vfio.group_idr);
2267         mutex_init(&vfio.group_lock);
2268         mutex_init(&vfio.iommu_drivers_lock);
2269         INIT_LIST_HEAD(&vfio.group_list);
2270         INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2271         init_waitqueue_head(&vfio.release_q);
2272
2273         ret = misc_register(&vfio_dev);
2274         if (ret) {
2275                 pr_err("vfio: misc device register failed\n");
2276                 return ret;
2277         }
2278
2279         /* /dev/vfio/$GROUP */
2280         vfio.class = class_create(THIS_MODULE, "vfio");
2281         if (IS_ERR(vfio.class)) {
2282                 ret = PTR_ERR(vfio.class);
2283                 goto err_class;
2284         }
2285
2286         vfio.class->devnode = vfio_devnode;
2287
2288         ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
2289         if (ret)
2290                 goto err_alloc_chrdev;
2291
2292         cdev_init(&vfio.group_cdev, &vfio_group_fops);
2293         ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
2294         if (ret)
2295                 goto err_cdev_add;
2296
2297         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2298
2299 #ifdef CONFIG_VFIO_NOIOMMU
2300         vfio_register_iommu_driver(&vfio_noiommu_ops);
2301 #endif
2302         return 0;
2303
2304 err_cdev_add:
2305         unregister_chrdev_region(vfio.group_devt, MINORMASK);
2306 err_alloc_chrdev:
2307         class_destroy(vfio.class);
2308         vfio.class = NULL;
2309 err_class:
2310         misc_deregister(&vfio_dev);
2311         return ret;
2312 }
2313
2314 static void __exit vfio_cleanup(void)
2315 {
2316         WARN_ON(!list_empty(&vfio.group_list));
2317
2318 #ifdef CONFIG_VFIO_NOIOMMU
2319         vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2320 #endif
2321         idr_destroy(&vfio.group_idr);
2322         cdev_del(&vfio.group_cdev);
2323         unregister_chrdev_region(vfio.group_devt, MINORMASK);
2324         class_destroy(vfio.class);
2325         vfio.class = NULL;
2326         misc_deregister(&vfio_dev);
2327 }
2328
2329 module_init(vfio_init);
2330 module_exit(vfio_cleanup);
2331
2332 MODULE_VERSION(DRIVER_VERSION);
2333 MODULE_LICENSE("GPL v2");
2334 MODULE_AUTHOR(DRIVER_AUTHOR);
2335 MODULE_DESCRIPTION(DRIVER_DESC);
2336 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2337 MODULE_ALIAS("devname:vfio/vfio");
2338 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");