Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 15 Aug 2023 06:03:44 +0000 (06:03 +0000)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 15 Aug 2023 06:03:44 +0000 (06:03 +0000)
Pull virtio fixes from Michael Tsirkin:
 "Just a bunch of bugfixes all over the place"

* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (26 commits)
  virtio-mem: check if the config changed before fake offlining memory
  virtio-mem: keep retrying on offline_and_remove_memory() errors in Sub Block Mode (SBM)
  virtio-mem: convert most offline_and_remove_memory() errors to -EBUSY
  virtio-mem: remove unsafe unplug in Big Block Mode (BBM)
  pds_vdpa: fix up debugfs feature bit printing
  pds_vdpa: alloc irq vectors on DRIVER_OK
  pds_vdpa: clean and reset vqs entries
  pds_vdpa: always allow offering VIRTIO_NET_F_MAC
  pds_vdpa: reset to vdpa specified mac
  virtio-net: Zero max_tx_vq field for VIRTIO_NET_CTRL_MQ_HASH_CONFIG case
  vdpa/mlx5: Fix crash on shutdown for when no ndev exists
  vdpa/mlx5: Delete control vq iotlb in destroy_mr only when necessary
  vdpa/mlx5: Fix mr->initialized semantics
  vdpa/mlx5: Correct default number of queues when MQ is on
  virtio-vdpa: Fix cpumask memory leak in virtio_vdpa_find_vqs()
  vduse: Use proper spinlock for IRQ injection
  vdpa: Enable strict validation for netlinks ops
  vdpa: Add max vqp attr to vdpa_nl_policy for nlattr length check
  vdpa: Add queue index attr to vdpa_nl_policy for nlattr length check
  vdpa: Add features attr to vdpa_nl_policy for nlattr length check
  ...

17 files changed:
MAINTAINERS
drivers/net/virtio_net.c
drivers/vdpa/mlx5/core/mlx5_vdpa.h
drivers/vdpa/mlx5/core/mr.c
drivers/vdpa/mlx5/net/mlx5_vnet.c
drivers/vdpa/pds/Makefile
drivers/vdpa/pds/debugfs.c
drivers/vdpa/pds/vdpa_dev.c
drivers/vdpa/pds/vdpa_dev.h
drivers/vdpa/vdpa.c
drivers/vdpa/vdpa_user/vduse_dev.c
drivers/vhost/scsi.c
drivers/virtio/virtio_mem.c
drivers/virtio/virtio_mmio.c
drivers/virtio/virtio_pci_common.c
drivers/virtio/virtio_pci_legacy.c
drivers/virtio/virtio_vdpa.c

index 0903d87b17cbe9b2c3b9f6a4ed27fc60ec04a328..4227aac551f60b36501b334a8a12f6cdd93fec87 100644 (file)
@@ -22474,7 +22474,6 @@ L:      virtualization@lists.linux-foundation.org
 S:     Maintained
 F:     drivers/block/virtio_blk.c
 F:     drivers/scsi/virtio_scsi.c
-F:     drivers/vhost/scsi.c
 F:     include/uapi/linux/virtio_blk.h
 F:     include/uapi/linux/virtio_scsi.h
 
@@ -22573,6 +22572,16 @@ F:     include/linux/vhost_iotlb.h
 F:     include/uapi/linux/vhost.h
 F:     kernel/vhost_task.c
 
+VIRTIO HOST (VHOST-SCSI)
+M:     "Michael S. Tsirkin" <mst@redhat.com>
+M:     Jason Wang <jasowang@redhat.com>
+M:     Mike Christie <michael.christie@oracle.com>
+R:     Paolo Bonzini <pbonzini@redhat.com>
+R:     Stefan Hajnoczi <stefanha@redhat.com>
+L:     virtualization@lists.linux-foundation.org
+S:     Maintained
+F:     drivers/vhost/scsi.c
+
 VIRTIO I2C DRIVER
 M:     Conghui Chen <conghui.chen@intel.com>
 M:     Viresh Kumar <viresh.kumar@linaro.org>
index 1270c8d23463fa35849dd290b7df7496198aaf9b..8db38634ae82d2a575d3866e059d1e22567ab53c 100644 (file)
@@ -2761,7 +2761,7 @@ static void virtnet_init_default_rss(struct virtnet_info *vi)
                vi->ctrl->rss.indirection_table[i] = indir_val;
        }
 
-       vi->ctrl->rss.max_tx_vq = vi->curr_queue_pairs;
+       vi->ctrl->rss.max_tx_vq = vi->has_rss ? vi->curr_queue_pairs : 0;
        vi->ctrl->rss.hash_key_length = vi->rss_key_size;
 
        netdev_rss_key_fill(vi->ctrl->rss.key, vi->rss_key_size);
index 25fc4120b618de16483ae2bd89af9bb866206bcd..b53420e874acb331d4a9f9bc2bcb2bfb60312fd2 100644 (file)
@@ -31,6 +31,7 @@ struct mlx5_vdpa_mr {
        struct list_head head;
        unsigned long num_directs;
        unsigned long num_klms;
+       /* state of dvq mr */
        bool initialized;
 
        /* serialize mkey creation and destruction */
@@ -121,6 +122,7 @@ int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *io
 int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
                        unsigned int asid);
 void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev);
+void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid);
 
 #define mlx5_vdpa_warn(__dev, format, ...)                                                         \
        dev_warn((__dev)->mdev->device, "%s:%d:(pid %d) warning: " format, __func__, __LINE__,     \
index 03e5432297912a8da178f9f798c29fb2f6fb66db..5a1971fcd87b109d33be10ae8bdb678428e5d2d8 100644 (file)
@@ -489,60 +489,103 @@ static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr
        }
 }
 
-void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev)
+static void _mlx5_vdpa_destroy_cvq_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid)
+{
+       if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] != asid)
+               return;
+
+       prune_iotlb(mvdev);
+}
+
+static void _mlx5_vdpa_destroy_dvq_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid)
 {
        struct mlx5_vdpa_mr *mr = &mvdev->mr;
 
-       mutex_lock(&mr->mkey_mtx);
+       if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid)
+               return;
+
        if (!mr->initialized)
-               goto out;
+               return;
 
-       prune_iotlb(mvdev);
        if (mr->user_mr)
                destroy_user_mr(mvdev, mr);
        else
                destroy_dma_mr(mvdev, mr);
 
        mr->initialized = false;
-out:
+}
+
+void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid)
+{
+       struct mlx5_vdpa_mr *mr = &mvdev->mr;
+
+       mutex_lock(&mr->mkey_mtx);
+
+       _mlx5_vdpa_destroy_dvq_mr(mvdev, asid);
+       _mlx5_vdpa_destroy_cvq_mr(mvdev, asid);
+
        mutex_unlock(&mr->mkey_mtx);
 }
 
-static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
-                               struct vhost_iotlb *iotlb, unsigned int asid)
+void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev)
+{
+       mlx5_vdpa_destroy_mr_asid(mvdev, mvdev->group2asid[MLX5_VDPA_CVQ_GROUP]);
+       mlx5_vdpa_destroy_mr_asid(mvdev, mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]);
+}
+
+static int _mlx5_vdpa_create_cvq_mr(struct mlx5_vdpa_dev *mvdev,
+                                   struct vhost_iotlb *iotlb,
+                                   unsigned int asid)
+{
+       if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] != asid)
+               return 0;
+
+       return dup_iotlb(mvdev, iotlb);
+}
+
+static int _mlx5_vdpa_create_dvq_mr(struct mlx5_vdpa_dev *mvdev,
+                                   struct vhost_iotlb *iotlb,
+                                   unsigned int asid)
 {
        struct mlx5_vdpa_mr *mr = &mvdev->mr;
        int err;
 
-       if (mr->initialized)
+       if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid)
                return 0;
 
-       if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) {
-               if (iotlb)
-                       err = create_user_mr(mvdev, iotlb);
-               else
-                       err = create_dma_mr(mvdev, mr);
+       if (mr->initialized)
+               return 0;
 
-               if (err)
-                       return err;
-       }
+       if (iotlb)
+               err = create_user_mr(mvdev, iotlb);
+       else
+               err = create_dma_mr(mvdev, mr);
 
-       if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid) {
-               err = dup_iotlb(mvdev, iotlb);
-               if (err)
-                       goto out_err;
-       }
+       if (err)
+               return err;
 
        mr->initialized = true;
+
+       return 0;
+}
+
+static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
+                               struct vhost_iotlb *iotlb, unsigned int asid)
+{
+       int err;
+
+       err = _mlx5_vdpa_create_dvq_mr(mvdev, iotlb, asid);
+       if (err)
+               return err;
+
+       err = _mlx5_vdpa_create_cvq_mr(mvdev, iotlb, asid);
+       if (err)
+               goto out_err;
+
        return 0;
 
 out_err:
-       if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) {
-               if (iotlb)
-                       destroy_user_mr(mvdev, mr);
-               else
-                       destroy_dma_mr(mvdev, mr);
-       }
+       _mlx5_vdpa_destroy_dvq_mr(mvdev, asid);
 
        return err;
 }
index 9138ef2fb2c853270ec11284a0824e84200276d3..37be945a0230884be614ea557cf2db921f2ed6d0 100644 (file)
@@ -2517,7 +2517,15 @@ static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
        else
                ndev->rqt_size = 1;
 
-       ndev->cur_num_vqs = 2 * ndev->rqt_size;
+       /* Device must start with 1 queue pair, as per VIRTIO v1.2 spec, section
+        * 5.1.6.5.5 "Device operation in multiqueue mode":
+        *
+        * Multiqueue is disabled by default.
+        * The driver enables multiqueue by sending a command using class
+        * VIRTIO_NET_CTRL_MQ. The command selects the mode of multiqueue
+        * operation, as follows: ...
+        */
+       ndev->cur_num_vqs = 2;
 
        update_cvq_info(mvdev);
        return err;
@@ -2636,7 +2644,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
                goto err_mr;
 
        teardown_driver(ndev);
-       mlx5_vdpa_destroy_mr(mvdev);
+       mlx5_vdpa_destroy_mr_asid(mvdev, asid);
        err = mlx5_vdpa_create_mr(mvdev, iotlb, asid);
        if (err)
                goto err_mr;
@@ -2652,7 +2660,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
        return 0;
 
 err_setup:
-       mlx5_vdpa_destroy_mr(mvdev);
+       mlx5_vdpa_destroy_mr_asid(mvdev, asid);
 err_mr:
        return err;
 }
@@ -3548,17 +3556,6 @@ static void mlx5v_remove(struct auxiliary_device *adev)
        kfree(mgtdev);
 }
 
-static void mlx5v_shutdown(struct auxiliary_device *auxdev)
-{
-       struct mlx5_vdpa_mgmtdev *mgtdev;
-       struct mlx5_vdpa_net *ndev;
-
-       mgtdev = auxiliary_get_drvdata(auxdev);
-       ndev = mgtdev->ndev;
-
-       free_irqs(ndev);
-}
-
 static const struct auxiliary_device_id mlx5v_id_table[] = {
        { .name = MLX5_ADEV_NAME ".vnet", },
        {},
@@ -3570,7 +3567,6 @@ static struct auxiliary_driver mlx5v_driver = {
        .name = "vnet",
        .probe = mlx5v_probe,
        .remove = mlx5v_remove,
-       .shutdown = mlx5v_shutdown,
        .id_table = mlx5v_id_table,
 };
 
index 2e22418e3ab3058d0974dbf99aea59a3ef217560..c2d314d4614d3a8e84708fa9d6b47c53024ce229 100644 (file)
@@ -5,6 +5,5 @@ obj-$(CONFIG_PDS_VDPA) := pds_vdpa.o
 
 pds_vdpa-y := aux_drv.o \
              cmds.o \
+             debugfs.o \
              vdpa_dev.o
-
-pds_vdpa-$(CONFIG_DEBUG_FS) += debugfs.o
index 21a0dc0cb607ca5768dab0bf982dbee1cab8b863..9b04aad6ec35d7499da38d1209fb53a1cdae91a9 100644 (file)
@@ -176,6 +176,7 @@ static int identity_show(struct seq_file *seq, void *v)
 {
        struct pds_vdpa_aux *vdpa_aux = seq->private;
        struct vdpa_mgmt_dev *mgmt;
+       u64 hw_features;
 
        seq_printf(seq, "aux_dev:            %s\n",
                   dev_name(&vdpa_aux->padev->aux_dev.dev));
@@ -183,8 +184,9 @@ static int identity_show(struct seq_file *seq, void *v)
        mgmt = &vdpa_aux->vdpa_mdev;
        seq_printf(seq, "max_vqs:            %d\n", mgmt->max_supported_vqs);
        seq_printf(seq, "config_attr_mask:   %#llx\n", mgmt->config_attr_mask);
-       seq_printf(seq, "supported_features: %#llx\n", mgmt->supported_features);
-       print_feature_bits_all(seq, mgmt->supported_features);
+       hw_features = le64_to_cpu(vdpa_aux->ident.hw_features);
+       seq_printf(seq, "hw_features:        %#llx\n", hw_features);
+       print_feature_bits_all(seq, hw_features);
 
        return 0;
 }
@@ -200,7 +202,6 @@ static int config_show(struct seq_file *seq, void *v)
 {
        struct pds_vdpa_device *pdsv = seq->private;
        struct virtio_net_config vc;
-       u64 driver_features;
        u8 status;
 
        memcpy_fromio(&vc, pdsv->vdpa_aux->vd_mdev.device,
@@ -223,12 +224,8 @@ static int config_show(struct seq_file *seq, void *v)
        status = vp_modern_get_status(&pdsv->vdpa_aux->vd_mdev);
        seq_printf(seq, "dev_status:           %#x\n", status);
        print_status_bits(seq, status);
-
-       seq_printf(seq, "req_features:         %#llx\n", pdsv->req_features);
-       print_feature_bits_all(seq, pdsv->req_features);
-       driver_features = vp_modern_get_driver_features(&pdsv->vdpa_aux->vd_mdev);
-       seq_printf(seq, "driver_features:      %#llx\n", driver_features);
-       print_feature_bits_all(seq, driver_features);
+       seq_printf(seq, "negotiated_features:  %#llx\n", pdsv->negotiated_features);
+       print_feature_bits_all(seq, pdsv->negotiated_features);
        seq_printf(seq, "vdpa_index:           %d\n", pdsv->vdpa_index);
        seq_printf(seq, "num_vqs:              %d\n", pdsv->num_vqs);
 
index 5071a4d58f8db37f6d619d2f2d42f44d7f5188d6..52b2449182ad71976cc68cb58aa8b52a77ff5cea 100644 (file)
@@ -126,11 +126,9 @@ static void pds_vdpa_release_irq(struct pds_vdpa_device *pdsv, int qid)
 static void pds_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready)
 {
        struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
-       struct pci_dev *pdev = pdsv->vdpa_aux->padev->vf_pdev;
        struct device *dev = &pdsv->vdpa_dev.dev;
        u64 driver_features;
        u16 invert_idx = 0;
-       int irq;
        int err;
 
        dev_dbg(dev, "%s: qid %d ready %d => %d\n",
@@ -143,19 +141,6 @@ static void pds_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool re
                invert_idx = PDS_VDPA_PACKED_INVERT_IDX;
 
        if (ready) {
-               irq = pci_irq_vector(pdev, qid);
-               snprintf(pdsv->vqs[qid].irq_name, sizeof(pdsv->vqs[qid].irq_name),
-                        "vdpa-%s-%d", dev_name(dev), qid);
-
-               err = request_irq(irq, pds_vdpa_isr, 0,
-                                 pdsv->vqs[qid].irq_name, &pdsv->vqs[qid]);
-               if (err) {
-                       dev_err(dev, "%s: no irq for qid %d: %pe\n",
-                               __func__, qid, ERR_PTR(err));
-                       return;
-               }
-               pdsv->vqs[qid].irq = irq;
-
                /* Pass vq setup info to DSC using adminq to gather up and
                 * send all info at once so FW can do its full set up in
                 * one easy operation
@@ -164,7 +149,6 @@ static void pds_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool re
                if (err) {
                        dev_err(dev, "Failed to init vq %d: %pe\n",
                                qid, ERR_PTR(err));
-                       pds_vdpa_release_irq(pdsv, qid);
                        ready = false;
                }
        } else {
@@ -172,7 +156,6 @@ static void pds_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool re
                if (err)
                        dev_err(dev, "%s: reset_vq failed qid %d: %pe\n",
                                __func__, qid, ERR_PTR(err));
-               pds_vdpa_release_irq(pdsv, qid);
        }
 
        pdsv->vqs[qid].ready = ready;
@@ -318,6 +301,7 @@ static int pds_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 featur
        struct device *dev = &pdsv->vdpa_dev.dev;
        u64 driver_features;
        u64 nego_features;
+       u64 hw_features;
        u64 missing;
 
        if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) && features) {
@@ -325,21 +309,26 @@ static int pds_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 featur
                return -EOPNOTSUPP;
        }
 
-       pdsv->req_features = features;
-
        /* Check for valid feature bits */
-       nego_features = features & le64_to_cpu(pdsv->vdpa_aux->ident.hw_features);
-       missing = pdsv->req_features & ~nego_features;
+       nego_features = features & pdsv->supported_features;
+       missing = features & ~nego_features;
        if (missing) {
                dev_err(dev, "Can't support all requested features in %#llx, missing %#llx features\n",
-                       pdsv->req_features, missing);
+                       features, missing);
                return -EOPNOTSUPP;
        }
 
+       pdsv->negotiated_features = nego_features;
+
        driver_features = pds_vdpa_get_driver_features(vdpa_dev);
        dev_dbg(dev, "%s: %#llx => %#llx\n",
                __func__, driver_features, nego_features);
 
+       /* if we're faking the F_MAC, strip it before writing to device */
+       hw_features = le64_to_cpu(pdsv->vdpa_aux->ident.hw_features);
+       if (!(hw_features & BIT_ULL(VIRTIO_NET_F_MAC)))
+               nego_features &= ~BIT_ULL(VIRTIO_NET_F_MAC);
+
        if (driver_features == nego_features)
                return 0;
 
@@ -352,7 +341,7 @@ static u64 pds_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
 {
        struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
 
-       return vp_modern_get_driver_features(&pdsv->vdpa_aux->vd_mdev);
+       return pdsv->negotiated_features;
 }
 
 static void pds_vdpa_set_config_cb(struct vdpa_device *vdpa_dev,
@@ -389,6 +378,72 @@ static u8 pds_vdpa_get_status(struct vdpa_device *vdpa_dev)
        return vp_modern_get_status(&pdsv->vdpa_aux->vd_mdev);
 }
 
+static int pds_vdpa_request_irqs(struct pds_vdpa_device *pdsv)
+{
+       struct pci_dev *pdev = pdsv->vdpa_aux->padev->vf_pdev;
+       struct pds_vdpa_aux *vdpa_aux = pdsv->vdpa_aux;
+       struct device *dev = &pdsv->vdpa_dev.dev;
+       int max_vq, nintrs, qid, err;
+
+       max_vq = vdpa_aux->vdpa_mdev.max_supported_vqs;
+
+       nintrs = pci_alloc_irq_vectors(pdev, max_vq, max_vq, PCI_IRQ_MSIX);
+       if (nintrs < 0) {
+               dev_err(dev, "Couldn't get %d msix vectors: %pe\n",
+                       max_vq, ERR_PTR(nintrs));
+               return nintrs;
+       }
+
+       for (qid = 0; qid < pdsv->num_vqs; ++qid) {
+               int irq = pci_irq_vector(pdev, qid);
+
+               snprintf(pdsv->vqs[qid].irq_name, sizeof(pdsv->vqs[qid].irq_name),
+                        "vdpa-%s-%d", dev_name(dev), qid);
+
+               err = request_irq(irq, pds_vdpa_isr, 0,
+                                 pdsv->vqs[qid].irq_name,
+                                 &pdsv->vqs[qid]);
+               if (err) {
+                       dev_err(dev, "%s: no irq for qid %d: %pe\n",
+                               __func__, qid, ERR_PTR(err));
+                       goto err_release;
+               }
+
+               pdsv->vqs[qid].irq = irq;
+       }
+
+       vdpa_aux->nintrs = nintrs;
+
+       return 0;
+
+err_release:
+       while (qid--)
+               pds_vdpa_release_irq(pdsv, qid);
+
+       pci_free_irq_vectors(pdev);
+
+       vdpa_aux->nintrs = 0;
+
+       return err;
+}
+
+static void pds_vdpa_release_irqs(struct pds_vdpa_device *pdsv)
+{
+       struct pci_dev *pdev = pdsv->vdpa_aux->padev->vf_pdev;
+       struct pds_vdpa_aux *vdpa_aux = pdsv->vdpa_aux;
+       int qid;
+
+       if (!vdpa_aux->nintrs)
+               return;
+
+       for (qid = 0; qid < pdsv->num_vqs; qid++)
+               pds_vdpa_release_irq(pdsv, qid);
+
+       pci_free_irq_vectors(pdev);
+
+       vdpa_aux->nintrs = 0;
+}
+
 static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
 {
        struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
@@ -399,6 +454,11 @@ static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
        old_status = pds_vdpa_get_status(vdpa_dev);
        dev_dbg(dev, "%s: old %#x new %#x\n", __func__, old_status, status);
 
+       if (status & ~old_status & VIRTIO_CONFIG_S_DRIVER_OK) {
+               if (pds_vdpa_request_irqs(pdsv))
+                       status = old_status | VIRTIO_CONFIG_S_FAILED;
+       }
+
        pds_vdpa_cmd_set_status(pdsv, status);
 
        /* Note: still working with FW on the need for this reset cmd */
@@ -409,6 +469,8 @@ static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
                        pdsv->vqs[i].avail_idx = 0;
                        pdsv->vqs[i].used_idx = 0;
                }
+
+               pds_vdpa_cmd_set_mac(pdsv, pdsv->mac);
        }
 
        if (status & ~old_status & VIRTIO_CONFIG_S_FEATURES_OK) {
@@ -418,6 +480,20 @@ static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
                                                        i, &pdsv->vqs[i].notify_pa);
                }
        }
+
+       if (old_status & ~status & VIRTIO_CONFIG_S_DRIVER_OK)
+               pds_vdpa_release_irqs(pdsv);
+}
+
+static void pds_vdpa_init_vqs_entry(struct pds_vdpa_device *pdsv, int qid,
+                                   void __iomem *notify)
+{
+       memset(&pdsv->vqs[qid], 0, sizeof(pdsv->vqs[0]));
+       pdsv->vqs[qid].qid = qid;
+       pdsv->vqs[qid].pdsv = pdsv;
+       pdsv->vqs[qid].ready = false;
+       pdsv->vqs[qid].irq = VIRTIO_MSI_NO_VECTOR;
+       pdsv->vqs[qid].notify = notify;
 }
 
 static int pds_vdpa_reset(struct vdpa_device *vdpa_dev)
@@ -441,14 +517,17 @@ static int pds_vdpa_reset(struct vdpa_device *vdpa_dev)
                        if (err)
                                dev_err(dev, "%s: reset_vq failed qid %d: %pe\n",
                                        __func__, i, ERR_PTR(err));
-                       pds_vdpa_release_irq(pdsv, i);
-                       memset(&pdsv->vqs[i], 0, sizeof(pdsv->vqs[0]));
-                       pdsv->vqs[i].ready = false;
                }
        }
 
        pds_vdpa_set_status(vdpa_dev, 0);
 
+       if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+               /* Reset the vq info */
+               for (i = 0; i < pdsv->num_vqs && !err; i++)
+                       pds_vdpa_init_vqs_entry(pdsv, i, pdsv->vqs[i].notify);
+       }
+
        return 0;
 }
 
@@ -532,7 +611,6 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
        struct device *dma_dev;
        struct pci_dev *pdev;
        struct device *dev;
-       u8 mac[ETH_ALEN];
        int err;
        int i;
 
@@ -563,7 +641,7 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 
        if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
                u64 unsupp_features =
-                       add_config->device_features & ~mgmt->supported_features;
+                       add_config->device_features & ~pdsv->supported_features;
 
                if (unsupp_features) {
                        dev_err(dev, "Unsupported features: %#llx\n", unsupp_features);
@@ -614,29 +692,30 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
        }
 
        /* Set a mac, either from the user config if provided
-        * or set a random mac if default is 00:..:00
+        * or use the device's mac if not 00:..:00
+        * or set a random mac
         */
        if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
-               ether_addr_copy(mac, add_config->net.mac);
-               pds_vdpa_cmd_set_mac(pdsv, mac);
+               ether_addr_copy(pdsv->mac, add_config->net.mac);
        } else {
                struct virtio_net_config __iomem *vc;
 
                vc = pdsv->vdpa_aux->vd_mdev.device;
-               memcpy_fromio(mac, vc->mac, sizeof(mac));
-               if (is_zero_ether_addr(mac)) {
-                       eth_random_addr(mac);
-                       dev_info(dev, "setting random mac %pM\n", mac);
-                       pds_vdpa_cmd_set_mac(pdsv, mac);
+               memcpy_fromio(pdsv->mac, vc->mac, sizeof(pdsv->mac));
+               if (is_zero_ether_addr(pdsv->mac) &&
+                   (pdsv->supported_features & BIT_ULL(VIRTIO_NET_F_MAC))) {
+                       eth_random_addr(pdsv->mac);
+                       dev_info(dev, "setting random mac %pM\n", pdsv->mac);
                }
        }
+       pds_vdpa_cmd_set_mac(pdsv, pdsv->mac);
 
        for (i = 0; i < pdsv->num_vqs; i++) {
-               pdsv->vqs[i].qid = i;
-               pdsv->vqs[i].pdsv = pdsv;
-               pdsv->vqs[i].irq = VIRTIO_MSI_NO_VECTOR;
-               pdsv->vqs[i].notify = vp_modern_map_vq_notify(&pdsv->vdpa_aux->vd_mdev,
-                                                             i, &pdsv->vqs[i].notify_pa);
+               void __iomem *notify;
+
+               notify = vp_modern_map_vq_notify(&pdsv->vdpa_aux->vd_mdev,
+                                                i, &pdsv->vqs[i].notify_pa);
+               pds_vdpa_init_vqs_entry(pdsv, i, notify);
        }
 
        pdsv->vdpa_dev.mdev = &vdpa_aux->vdpa_mdev;
@@ -746,24 +825,19 @@ int pds_vdpa_get_mgmt_info(struct pds_vdpa_aux *vdpa_aux)
 
        max_vqs = min_t(u16, dev_intrs, max_vqs);
        mgmt->max_supported_vqs = min_t(u16, PDS_VDPA_MAX_QUEUES, max_vqs);
-       vdpa_aux->nintrs = mgmt->max_supported_vqs;
+       vdpa_aux->nintrs = 0;
 
        mgmt->ops = &pds_vdpa_mgmt_dev_ops;
        mgmt->id_table = pds_vdpa_id_table;
        mgmt->device = dev;
        mgmt->supported_features = le64_to_cpu(vdpa_aux->ident.hw_features);
+
+       /* advertise F_MAC even if the device doesn't */
+       mgmt->supported_features |= BIT_ULL(VIRTIO_NET_F_MAC);
+
        mgmt->config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR);
        mgmt->config_attr_mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP);
        mgmt->config_attr_mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES);
 
-       err = pci_alloc_irq_vectors(pdev, vdpa_aux->nintrs, vdpa_aux->nintrs,
-                                   PCI_IRQ_MSIX);
-       if (err < 0) {
-               dev_err(dev, "Couldn't get %d msix vectors: %pe\n",
-                       vdpa_aux->nintrs, ERR_PTR(err));
-               return err;
-       }
-       vdpa_aux->nintrs = err;
-
        return 0;
 }
index a1bc37de953746e1fc9250308bf81a701304e1fe..d984ba24a7dae13d928e4402139d9c906845e90a 100644 (file)
@@ -35,10 +35,11 @@ struct pds_vdpa_device {
        struct pds_vdpa_aux *vdpa_aux;
 
        struct pds_vdpa_vq_info vqs[PDS_VDPA_MAX_QUEUES];
-       u64 supported_features;         /* specified device features */
-       u64 req_features;               /* features requested by vdpa */
+       u64 supported_features;         /* supported device features */
+       u64 negotiated_features;        /* negotiated features */
        u8 vdpa_index;                  /* rsvd for future subdevice use */
        u8 num_vqs;                     /* num vqs in use */
+       u8 mac[ETH_ALEN];               /* mac selected when the device was added */
        struct vdpa_callback config_cb;
        struct notifier_block nb;
 };
index 965e32529eb856bd1452bcaabbabbba19aeb4bad..a7612e0783b36a89a61d5482a305c668a748ddac 100644 (file)
@@ -1247,44 +1247,41 @@ static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
        [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
        [VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING },
        [VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR,
+       [VDPA_ATTR_DEV_NET_CFG_MAX_VQP] = { .type = NLA_U16 },
        /* virtio spec 1.1 section 5.1.4.1 for valid MTU range */
        [VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68),
+       [VDPA_ATTR_DEV_QUEUE_INDEX] = { .type = NLA_U32 },
+       [VDPA_ATTR_DEV_FEATURES] = { .type = NLA_U64 },
 };
 
 static const struct genl_ops vdpa_nl_ops[] = {
        {
                .cmd = VDPA_CMD_MGMTDEV_GET,
-               .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = vdpa_nl_cmd_mgmtdev_get_doit,
                .dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit,
        },
        {
                .cmd = VDPA_CMD_DEV_NEW,
-               .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = vdpa_nl_cmd_dev_add_set_doit,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = VDPA_CMD_DEV_DEL,
-               .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = vdpa_nl_cmd_dev_del_set_doit,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = VDPA_CMD_DEV_GET,
-               .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = vdpa_nl_cmd_dev_get_doit,
                .dumpit = vdpa_nl_cmd_dev_get_dumpit,
        },
        {
                .cmd = VDPA_CMD_DEV_CONFIG_GET,
-               .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = vdpa_nl_cmd_dev_config_get_doit,
                .dumpit = vdpa_nl_cmd_dev_config_get_dumpit,
        },
        {
                .cmd = VDPA_CMD_DEV_VSTATS_GET,
-               .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = vdpa_nl_cmd_dev_stats_get_doit,
                .flags = GENL_ADMIN_PERM,
        },
index dc38ed21319da92201b94708775507e0c7ce75be..df7869537ef146fdc62b6d29180e5e051c7c8d2c 100644 (file)
@@ -935,10 +935,10 @@ static void vduse_dev_irq_inject(struct work_struct *work)
 {
        struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);
 
-       spin_lock_irq(&dev->irq_lock);
+       spin_lock_bh(&dev->irq_lock);
        if (dev->config_cb.callback)
                dev->config_cb.callback(dev->config_cb.private);
-       spin_unlock_irq(&dev->irq_lock);
+       spin_unlock_bh(&dev->irq_lock);
 }
 
 static void vduse_vq_irq_inject(struct work_struct *work)
@@ -946,10 +946,10 @@ static void vduse_vq_irq_inject(struct work_struct *work)
        struct vduse_virtqueue *vq = container_of(work,
                                        struct vduse_virtqueue, inject);
 
-       spin_lock_irq(&vq->irq_lock);
+       spin_lock_bh(&vq->irq_lock);
        if (vq->ready && vq->cb.callback)
                vq->cb.callback(vq->cb.private);
-       spin_unlock_irq(&vq->irq_lock);
+       spin_unlock_bh(&vq->irq_lock);
 }
 
 static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq)
index c83f7f043470d687b32d92f6dfe864b1e93964c3..abef0619c7901af0f36232a558e46a1b1dbf16ae 100644 (file)
@@ -25,6 +25,8 @@
 #include <linux/fs.h>
 #include <linux/vmalloc.h>
 #include <linux/miscdevice.h>
+#include <linux/blk_types.h>
+#include <linux/bio.h>
 #include <asm/unaligned.h>
 #include <scsi/scsi_common.h>
 #include <scsi/scsi_proto.h>
@@ -75,6 +77,9 @@ struct vhost_scsi_cmd {
        u32 tvc_prot_sgl_count;
        /* Saved unpacked SCSI LUN for vhost_scsi_target_queue_cmd() */
        u32 tvc_lun;
+       u32 copied_iov:1;
+       const void *saved_iter_addr;
+       struct iov_iter saved_iter;
        /* Pointer to the SGL formatted memory from virtio-scsi */
        struct scatterlist *tvc_sgl;
        struct scatterlist *tvc_prot_sgl;
@@ -328,8 +333,13 @@ static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd)
        int i;
 
        if (tv_cmd->tvc_sgl_count) {
-               for (i = 0; i < tv_cmd->tvc_sgl_count; i++)
-                       put_page(sg_page(&tv_cmd->tvc_sgl[i]));
+               for (i = 0; i < tv_cmd->tvc_sgl_count; i++) {
+                       if (tv_cmd->copied_iov)
+                               __free_page(sg_page(&tv_cmd->tvc_sgl[i]));
+                       else
+                               put_page(sg_page(&tv_cmd->tvc_sgl[i]));
+               }
+               kfree(tv_cmd->saved_iter_addr);
        }
        if (tv_cmd->tvc_prot_sgl_count) {
                for (i = 0; i < tv_cmd->tvc_prot_sgl_count; i++)
@@ -504,6 +514,28 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
        mutex_unlock(&vq->mutex);
 }
 
+static int vhost_scsi_copy_sgl_to_iov(struct vhost_scsi_cmd *cmd)
+{
+       struct iov_iter *iter = &cmd->saved_iter;
+       struct scatterlist *sg = cmd->tvc_sgl;
+       struct page *page;
+       size_t len;
+       int i;
+
+       for (i = 0; i < cmd->tvc_sgl_count; i++) {
+               page = sg_page(&sg[i]);
+               len = sg[i].length;
+               /* On a short copy, log the requested length and give up. */
+               if (copy_page_to_iter(page, 0, len, iter) != len) {
+                       pr_err("Could not copy data while handling misaligned cmd. Error %zu\n",
+                              len);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
 /* Fill in status and signal that we are done processing this command
  *
  * This is scheduled in the vhost work queue so we are called with the owner
@@ -527,15 +559,20 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 
                pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__,
                        cmd, se_cmd->residual_count, se_cmd->scsi_status);
-
                memset(&v_rsp, 0, sizeof(v_rsp));
-               v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, se_cmd->residual_count);
-               /* TODO is status_qualifier field needed? */
-               v_rsp.status = se_cmd->scsi_status;
-               v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
-                                                se_cmd->scsi_sense_length);
-               memcpy(v_rsp.sense, cmd->tvc_sense_buf,
-                      se_cmd->scsi_sense_length);
+
+               if (cmd->saved_iter_addr && vhost_scsi_copy_sgl_to_iov(cmd)) {
+                       v_rsp.response = VIRTIO_SCSI_S_BAD_TARGET;
+               } else {
+                       v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq,
+                                                    se_cmd->residual_count);
+                       /* TODO is status_qualifier field needed? */
+                       v_rsp.status = se_cmd->scsi_status;
+                       v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
+                                                        se_cmd->scsi_sense_length);
+                       memcpy(v_rsp.sense, cmd->tvc_sense_buf,
+                              se_cmd->scsi_sense_length);
+               }
 
                iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iov,
                              cmd->tvc_in_iovs, sizeof(v_rsp));
@@ -613,12 +650,12 @@ static int
 vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd,
                      struct iov_iter *iter,
                      struct scatterlist *sgl,
-                     bool write)
+                     bool is_prot)
 {
        struct page **pages = cmd->tvc_upages;
        struct scatterlist *sg = sgl;
-       ssize_t bytes;
-       size_t offset;
+       ssize_t bytes, mapped_bytes;
+       size_t offset, mapped_offset;
        unsigned int npages = 0;
 
        bytes = iov_iter_get_pages2(iter, pages, LONG_MAX,
@@ -627,13 +664,53 @@ vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd,
        if (bytes <= 0)
                return bytes < 0 ? bytes : -EFAULT;
 
+       mapped_bytes = bytes;
+       mapped_offset = offset;
+
        while (bytes) {
                unsigned n = min_t(unsigned, PAGE_SIZE - offset, bytes);
+               /*
+                * The block layer requires bios/requests to be a multiple of
+                * 512 bytes, but Windows can send us vecs that are misaligned.
+                * This can result in bios and later requests with misaligned
+                * sizes if we have to break up a cmd/scatterlist into multiple
+                * bios.
+                *
+                * We currently only break up a command into multiple bios if
+                * we hit the vec/seg limit, so check if our sgl_count is
+                * greater than the max and if a vec in the cmd has a
+                * misaligned offset/size.
+                */
+               if (!is_prot &&
+                   (offset & (SECTOR_SIZE - 1) || n & (SECTOR_SIZE - 1)) &&
+                   cmd->tvc_sgl_count > BIO_MAX_VECS) {
+                       WARN_ONCE(true,
+                                 "vhost-scsi detected misaligned IO. Performance may be degraded.");
+                       goto revert_iter_get_pages;
+               }
+
                sg_set_page(sg++, pages[npages++], n, offset);
                bytes -= n;
                offset = 0;
        }
+
        return npages;
+
+revert_iter_get_pages:
+       iov_iter_revert(iter, mapped_bytes);
+
+       npages = 0;
+       while (mapped_bytes) {
+               unsigned int n = min_t(unsigned int, PAGE_SIZE - mapped_offset,
+                                      mapped_bytes);
+
+               put_page(pages[npages++]);
+
+               mapped_bytes -= n;
+               mapped_offset = 0;
+       }
+
+       return -EINVAL;
 }
 
 static int
@@ -657,25 +734,80 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls)
 }
 
 static int
-vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
-                     struct iov_iter *iter,
-                     struct scatterlist *sg, int sg_count)
+vhost_scsi_copy_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
+                          struct scatterlist *sg, int sg_count)
+{
+       size_t len = iov_iter_count(iter);
+       unsigned int nbytes = 0;
+       struct page *page;
+       int i;
+
+       if (cmd->tvc_data_direction == DMA_FROM_DEVICE) {
+               cmd->saved_iter_addr = dup_iter(&cmd->saved_iter, iter,
+                                               GFP_KERNEL);
+               if (!cmd->saved_iter_addr)
+                       return -ENOMEM;
+       }
+
+       for (i = 0; i < sg_count; i++) {
+               page = alloc_page(GFP_KERNEL);
+               if (!page) {
+                       i--;
+                       goto err;
+               }
+
+               nbytes = min_t(unsigned int, PAGE_SIZE, len);
+               sg_set_page(&sg[i], page, nbytes, 0);
+
+               if (cmd->tvc_data_direction == DMA_TO_DEVICE &&
+                   copy_page_from_iter(page, 0, nbytes, iter) != nbytes)
+                       goto err;
+
+               len -= nbytes;
+       }
+
+       cmd->copied_iov = 1;
+       return 0;
+
+err:
+       pr_err("Could not read %u bytes while handling misaligned cmd\n",
+              nbytes);
+
+       for (; i >= 0; i--)
+               __free_page(sg_page(&sg[i]));
+       kfree(cmd->saved_iter_addr);
+       return -ENOMEM;
+}
+
+static int
+vhost_scsi_map_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
+                         struct scatterlist *sg, int sg_count, bool is_prot)
 {
        struct scatterlist *p = sg;
+       size_t revert_bytes;
        int ret;
 
        while (iov_iter_count(iter)) {
-               ret = vhost_scsi_map_to_sgl(cmd, iter, sg, write);
+               ret = vhost_scsi_map_to_sgl(cmd, iter, sg, is_prot);
                if (ret < 0) {
+                       revert_bytes = 0;
+
                        while (p < sg) {
-                               struct page *page = sg_page(p++);
-                               if (page)
+                               struct page *page = sg_page(p);
+
+                               if (page) {
                                        put_page(page);
+                                       revert_bytes += p->length;
+                               }
+                               p++;
                        }
+
+                       iov_iter_revert(iter, revert_bytes);
                        return ret;
                }
                sg += ret;
        }
+
        return 0;
 }
 
@@ -685,7 +817,6 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
                 size_t data_bytes, struct iov_iter *data_iter)
 {
        int sgl_count, ret;
-       bool write = (cmd->tvc_data_direction == DMA_FROM_DEVICE);
 
        if (prot_bytes) {
                sgl_count = vhost_scsi_calc_sgls(prot_iter, prot_bytes,
@@ -698,9 +829,9 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
                pr_debug("%s prot_sg %p prot_sgl_count %u\n", __func__,
                         cmd->tvc_prot_sgl, cmd->tvc_prot_sgl_count);
 
-               ret = vhost_scsi_iov_to_sgl(cmd, write, prot_iter,
-                                           cmd->tvc_prot_sgl,
-                                           cmd->tvc_prot_sgl_count);
+               ret = vhost_scsi_map_iov_to_sgl(cmd, prot_iter,
+                                               cmd->tvc_prot_sgl,
+                                               cmd->tvc_prot_sgl_count, true);
                if (ret < 0) {
                        cmd->tvc_prot_sgl_count = 0;
                        return ret;
@@ -716,8 +847,14 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
        pr_debug("%s data_sg %p data_sgl_count %u\n", __func__,
                  cmd->tvc_sgl, cmd->tvc_sgl_count);
 
-       ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
-                                   cmd->tvc_sgl, cmd->tvc_sgl_count);
+       ret = vhost_scsi_map_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,
+                                       cmd->tvc_sgl_count, false);
+       if (ret == -EINVAL) {
+               sg_init_table(cmd->tvc_sgl, cmd->tvc_sgl_count);
+               ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,
+                                                cmd->tvc_sgl_count);
+       }
+
        if (ret < 0) {
                cmd->tvc_sgl_count = 0;
                return ret;
index 835f6cc2fb6644d445f005a1d74f77a17b4a8261..fa5226c198cc678775717d3d1db77c7090cfefd6 100644 (file)
@@ -38,11 +38,6 @@ module_param(bbm_block_size, ulong, 0444);
 MODULE_PARM_DESC(bbm_block_size,
                 "Big Block size in bytes. Default is 0 (auto-detection).");
 
-static bool bbm_safe_unplug = true;
-module_param(bbm_safe_unplug, bool, 0444);
-MODULE_PARM_DESC(bbm_safe_unplug,
-            "Use a safe unplug mechanism in BBM, avoiding long/endless loops");
-
 /*
  * virtio-mem currently supports the following modes of operation:
  *
@@ -173,6 +168,13 @@ struct virtio_mem {
                        /* The number of subblocks per Linux memory block. */
                        uint32_t sbs_per_mb;
 
+                       /*
+                        * Some of the Linux memory blocks tracked as "partially
+                        * plugged" are completely unplugged and can be offlined
+                        * and removed -- which previously failed.
+                        */
+                       bool have_unplugged_mb;
+
                        /* Summary of all memory block states. */
                        unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
 
@@ -746,11 +748,15 @@ static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
                 * immediately instead of waiting.
                 */
                virtio_mem_retry(vm);
-       } else {
-               dev_dbg(&vm->vdev->dev,
-                       "offlining and removing memory failed: %d\n", rc);
+               return 0;
        }
-       return rc;
+       dev_dbg(&vm->vdev->dev, "offlining and removing memory failed: %d\n", rc);
+       /*
+        * We don't really expect this to fail, because we fake-offlined all
+        * memory already. But it could fail in corner cases.
+        */
+       WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY);
+       return rc == -ENOMEM ? -ENOMEM : -EBUSY;
 }
 
 /*
@@ -766,6 +772,34 @@ static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
        return virtio_mem_offline_and_remove_memory(vm, addr, size);
 }
 
+/*
+ * Try (offlining and) removing memory from Linux in case all subblocks are
+ * unplugged. Can be called on online and offline memory blocks. Returns 0 if
+ * subblocks are still plugged or the removal succeeded, an error otherwise.
+ * On success, the memory block state is set to VIRTIO_MEM_SBM_MB_UNUSED.
+ */
+static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm,
+                                                 unsigned long mb_id)
+{
+       int rc;
+
+       /*
+        * Nothing to do while any subblock is still plugged: only a
+        * completely unplugged memory block gets offlined and removed.
+        */
+       if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
+               return 0;
+
+       /* Runs without hotplug_mutex; works for online and offline memory. */
+       mutex_unlock(&vm->hotplug_mutex);
+       rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
+       mutex_lock(&vm->hotplug_mutex);
+       if (!rc)
+               virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                           VIRTIO_MEM_SBM_MB_UNUSED);
+       return rc;
+}
+
 /*
  * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a
  * all Linux memory blocks covered by the big block.
@@ -1155,7 +1189,8 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
  * Try to allocate a range, marking pages fake-offline, effectively
  * fake-offlining them.
  */
-static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
+static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn,
+                                  unsigned long nr_pages)
 {
        const bool is_movable = is_zone_movable_page(pfn_to_page(pfn));
        int rc, retry_count;
@@ -1168,6 +1203,14 @@ static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
         * some guarantees.
         */
        for (retry_count = 0; retry_count < 5; retry_count++) {
+               /*
+                * If the config changed, stop immediately and go back to the
+                * main loop: avoid trying to keep unplugging if the device
+                * might have decided to not remove any more memory.
+                */
+               if (atomic_read(&vm->config_changed))
+                       return -EAGAIN;
+
                rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
                                        GFP_KERNEL);
                if (rc == -ENOMEM)
@@ -1917,7 +1960,7 @@ static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
        start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
                             sb_id * vm->sbm.sb_size);
 
-       rc = virtio_mem_fake_offline(start_pfn, nr_pages);
+       rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages);
        if (rc)
                return rc;
 
@@ -1989,20 +2032,10 @@ static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
        }
 
 unplugged:
-       /*
-        * Once all subblocks of a memory block were unplugged, offline and
-        * remove it. This will usually not fail, as no memory is in use
-        * anymore - however some other notifiers might NACK the request.
-        */
-       if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
-               mutex_unlock(&vm->hotplug_mutex);
-               rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
-               mutex_lock(&vm->hotplug_mutex);
-               if (!rc)
-                       virtio_mem_sbm_set_mb_state(vm, mb_id,
-                                                   VIRTIO_MEM_SBM_MB_UNUSED);
-       }
-
+       rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id);
+       if (rc)
+               vm->sbm.have_unplugged_mb = 1;
+       /* Ignore errors, this is not critical. We'll retry later. */
        return 0;
 }
 
@@ -2111,38 +2144,32 @@ static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
                         VIRTIO_MEM_BBM_BB_ADDED))
                return -EINVAL;
 
-       if (bbm_safe_unplug) {
-               /*
-                * Start by fake-offlining all memory. Once we marked the device
-                * block as fake-offline, all newly onlined memory will
-                * automatically be kept fake-offline. Protect from concurrent
-                * onlining/offlining until we have a consistent state.
-                */
-               mutex_lock(&vm->hotplug_mutex);
-               virtio_mem_bbm_set_bb_state(vm, bb_id,
-                                           VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
+       /*
+        * Start by fake-offlining all memory. Once we marked the device
+        * block as fake-offline, all newly onlined memory will
+        * automatically be kept fake-offline. Protect from concurrent
+        * onlining/offlining until we have a consistent state.
+        */
+       mutex_lock(&vm->hotplug_mutex);
+       virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
 
-               for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
-                       page = pfn_to_online_page(pfn);
-                       if (!page)
-                               continue;
+       for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+               page = pfn_to_online_page(pfn);
+               if (!page)
+                       continue;
 
-                       rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
-                       if (rc) {
-                               end_pfn = pfn;
-                               goto rollback_safe_unplug;
-                       }
+               rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION);
+               if (rc) {
+                       end_pfn = pfn;
+                       goto rollback;
                }
-               mutex_unlock(&vm->hotplug_mutex);
        }
+       mutex_unlock(&vm->hotplug_mutex);
 
        rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
        if (rc) {
-               if (bbm_safe_unplug) {
-                       mutex_lock(&vm->hotplug_mutex);
-                       goto rollback_safe_unplug;
-               }
-               return rc;
+               mutex_lock(&vm->hotplug_mutex);
+               goto rollback;
        }
 
        rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
@@ -2154,7 +2181,7 @@ static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
                                            VIRTIO_MEM_BBM_BB_UNUSED);
        return rc;
 
-rollback_safe_unplug:
+rollback:
        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                page = pfn_to_online_page(pfn);
                if (!page)
@@ -2260,12 +2287,13 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
 
 /*
  * Try to unplug all blocks that couldn't be unplugged before, for example,
- * because the hypervisor was busy.
+ * because the hypervisor was busy. Further, offline and remove any memory
+ * blocks where we previously failed.
  */
-static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
+static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm)
 {
        unsigned long id;
-       int rc;
+       int rc = 0;
 
        if (!vm->in_sbm) {
                virtio_mem_bbm_for_each_bb(vm, id,
@@ -2287,6 +2315,27 @@ static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
                                            VIRTIO_MEM_SBM_MB_UNUSED);
        }
 
+       if (!vm->sbm.have_unplugged_mb)
+               return 0;
+
+       /*
+        * Let's retry (offlining and) removing completely unplugged Linux
+        * memory blocks.
+        */
+       vm->sbm.have_unplugged_mb = false;
+
+       mutex_lock(&vm->hotplug_mutex);
+       virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL)
+               rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
+       virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL)
+               rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
+       virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
+               rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
+       mutex_unlock(&vm->hotplug_mutex);
+
+       if (rc)
+               vm->sbm.have_unplugged_mb = true;
+       /* Ignore errors, this is not critical. We'll retry later. */
        return 0;
 }
 
@@ -2368,9 +2417,9 @@ retry:
                virtio_mem_refresh_config(vm);
        }
 
-       /* Unplug any leftovers from previous runs */
+       /* Cleanup any leftovers from previous runs */
        if (!rc)
-               rc = virtio_mem_unplug_pending_mb(vm);
+               rc = virtio_mem_cleanup_pending_mb(vm);
 
        if (!rc && vm->requested_size != vm->plugged_size) {
                if (vm->requested_size > vm->plugged_size) {
@@ -2382,6 +2431,13 @@ retry:
                }
        }
 
+       /*
+        * Keep retrying to offline and remove completely unplugged Linux
+        * memory blocks.
+        */
+       if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb)
+               rc = -EBUSY;
+
        switch (rc) {
        case 0:
                vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
index a46a4a29e9295f7225bfbc631642e0e0a81f92d6..97760f611295941a36331573e53159ec5223eacc 100644 (file)
@@ -607,9 +607,8 @@ static void virtio_mmio_release_dev(struct device *_d)
        struct virtio_device *vdev =
                        container_of(_d, struct virtio_device, dev);
        struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
-       struct platform_device *pdev = vm_dev->pdev;
 
-       devm_kfree(&pdev->dev, vm_dev);
+       kfree(vm_dev);
 }
 
 /* Platform device */
@@ -620,7 +619,7 @@ static int virtio_mmio_probe(struct platform_device *pdev)
        unsigned long magic;
        int rc;
 
-       vm_dev = devm_kzalloc(&pdev->dev, sizeof(*vm_dev), GFP_KERNEL);
+       vm_dev = kzalloc(sizeof(*vm_dev), GFP_KERNEL);
        if (!vm_dev)
                return -ENOMEM;
 
index a6c86f916dbdf5d87eb9dbaa277bfa60a6ea6a5b..c2524a7207cfaeed4149748c7890b3ce0f7e6ccd 100644 (file)
@@ -557,8 +557,6 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
 
        pci_set_master(pci_dev);
 
-       vp_dev->is_legacy = vp_dev->ldev.ioaddr ? true : false;
-
        rc = register_virtio_device(&vp_dev->vdev);
        reg_dev = vp_dev;
        if (rc)
index 2257f1b3d8ae1b5561b154955ce860d17c176944..d9cbb02b35a112e0940fa45999dd98261ba5e690 100644 (file)
@@ -223,6 +223,7 @@ int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev)
        vp_dev->config_vector = vp_config_vector;
        vp_dev->setup_vq = setup_vq;
        vp_dev->del_vq = del_vq;
+       vp_dev->is_legacy = true;
 
        return 0;
 }
index 989e2d7184ce463aeef8b4cd6df968f9d8839015..961161da59000d301deef6db6207e63a34df669a 100644 (file)
@@ -393,11 +393,13 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
        cb.callback = virtio_vdpa_config_cb;
        cb.private = vd_dev;
        ops->set_config_cb(vdpa, &cb);
+       kfree(masks);
 
        return 0;
 
 err_setup_vq:
        virtio_vdpa_del_vqs(vdev);
+       kfree(masks);
        return err;
 }