IB/mlx4: Fix data corruption triggered by wrong headroom marking order
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index b5a24fbef70d95768e3bd9a572dff993aa67e854..85c51bdc36f1d403130fdf81655c51c7eca6e817 100644
@@ -415,9 +415,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
        return 0;
 
 err_wrid:
-       if (pd->uobject && !init_attr->srq)
-               mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
-       else {
+       if (pd->uobject) {
+               if (!init_attr->srq)
+                       mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context),
+                                             &qp->db);
+       } else {
                kfree(qp->sq.wrid);
                kfree(qp->rq.wrid);
        }
@@ -742,7 +744,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
                        printk(KERN_ERR "path MTU (%u) is invalid\n",
                               attr->path_mtu);
-                       return -EINVAL;
+                       goto out;
                }
                context->mtu_msgmax = (attr->path_mtu << 5) | 31;
        }
@@ -781,10 +783,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 
        if (attr_mask & IB_QP_AV) {
                if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path,
-                                 attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) {
-                       err = -EINVAL;
+                                 attr_mask & IB_QP_PORT ? attr->port_num : qp->port))
                        goto out;
-               }
 
                optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
                           MLX4_QP_OPTPAR_SCHED_QUEUE);
@@ -798,15 +798,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
        if (attr_mask & IB_QP_ALT_PATH) {
                if (attr->alt_port_num == 0 ||
                    attr->alt_port_num > dev->dev->caps.num_ports)
-                       return -EINVAL;
+                       goto out;
 
                if (attr->alt_pkey_index >=
                    dev->dev->caps.pkey_table_len[attr->alt_port_num])
-                       return -EINVAL;
+                       goto out;
 
                if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
                                  attr->alt_port_num))
-                       return -EINVAL;
+                       goto out;
 
                context->alt_path.pkey_index = attr->alt_pkey_index;
                context->alt_path.ackto = attr->alt_timeout << 3;
@@ -1209,15 +1209,44 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
        memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
        dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
        dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+}
+
+static void set_mlx_icrc_seg(void *dseg)
+{
+       u32 *t = dseg;
+       struct mlx4_wqe_inline_seg *iseg = dseg;
 
+       t[1] = 0;
+
+       /*
+        * Need a barrier here before writing the byte_count field to
+        * make sure that all the data is visible before the
+        * byte_count field is set.  Otherwise, if the segment begins
+        * a new cacheline, the HCA prefetcher could grab the 64-byte
+        * chunk and get a valid (!= 0xffffffff) byte count but
+        * stale data, and end up sending the wrong data.
+        */
+       wmb();
+
+       iseg->byte_count = cpu_to_be32((1 << 31) | 4);
 }
 
-static void set_data_seg(struct mlx4_wqe_data_seg *dseg,
-                        struct ib_sge *sg)
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
 {
-       dseg->byte_count = cpu_to_be32(sg->length);
        dseg->lkey       = cpu_to_be32(sg->lkey);
        dseg->addr       = cpu_to_be64(sg->addr);
+
+       /*
+        * Need a barrier here before writing the byte_count field to
+        * make sure that all the data is visible before the
+        * byte_count field is set.  Otherwise, if the segment begins
+        * a new cacheline, the HCA prefetcher could grab the 64-byte
+        * chunk and get a valid (!= 0xffffffff) byte count but
+        * stale data, and end up sending the wrong data.
+        */
+       wmb();
+
+       dseg->byte_count = cpu_to_be32(sg->length);
 }
 
 int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
@@ -1226,6 +1255,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
        struct mlx4_ib_qp *qp = to_mqp(ibqp);
        void *wqe;
        struct mlx4_wqe_ctrl_seg *ctrl;
+       struct mlx4_wqe_data_seg *dseg;
        unsigned long flags;
        int nreq;
        int err = 0;
@@ -1325,22 +1355,27 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                        break;
                }
 
-               for (i = 0; i < wr->num_sge; ++i) {
-                       set_data_seg(wqe, wr->sg_list + i);
+               /*
+                * Write data segments in reverse order, so as to
+                * overwrite cacheline stamp last within each
+                * cacheline.  This avoids issues with WQE
+                * prefetching.
+                */
 
-                       wqe  += sizeof (struct mlx4_wqe_data_seg);
-                       size += sizeof (struct mlx4_wqe_data_seg) / 16;
-               }
+               dseg = wqe;
+               dseg += wr->num_sge - 1;
+               size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
 
                /* Add one more inline data segment for ICRC for MLX sends */
-               if (qp->ibqp.qp_type == IB_QPT_SMI || qp->ibqp.qp_type == IB_QPT_GSI) {
-                       ((struct mlx4_wqe_inline_seg *) wqe)->byte_count =
-                               cpu_to_be32((1 << 31) | 4);
-                       ((u32 *) wqe)[1] = 0;
-                       wqe  += sizeof (struct mlx4_wqe_data_seg);
+               if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI ||
+                            qp->ibqp.qp_type == IB_QPT_GSI)) {
+                       set_mlx_icrc_seg(dseg + 1);
                        size += sizeof (struct mlx4_wqe_data_seg) / 16;
                }
 
+               for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+                       set_data_seg(dseg, wr->sg_list + i);
+
                ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
                                    MLX4_WQE_CTRL_FENCE : 0) | size;
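
The ordering discipline the patch enforces can be sketched in plain C, independent of the driver. This is a minimal illustration, not mlx4 code: struct hw_seg, STAMP, post_seg() and the userspace wmb() below are hypothetical stand-ins for the real mlx4_wqe_data_seg, the 0xffffffff headroom stamp, set_data_seg() and the kernel's write barrier.

    #include <stdint.h>

    /* Userspace stand-in for the kernel's wmb(); a full memory
     * barrier is stronger than needed but always sufficient. */
    #define wmb()  __sync_synchronize()

    #define STAMP  0xffffffffu  /* headroom stamp: "not yet valid" */

    struct hw_seg {                        /* stand-in for mlx4_wqe_data_seg */
            volatile uint32_t byte_count;  /* word the HCA tests against STAMP */
            uint32_t lkey;
            uint64_t addr;
    };

    static void post_seg(struct hw_seg *seg, uint32_t lkey,
                         uint64_t addr, uint32_t len)
    {
            /* 1. Fill in every field except the one that marks the
             *    segment valid. */
            seg->lkey = lkey;
            seg->addr = addr;

            /* 2. Make the stores above visible before the stamp is
             *    overwritten. */
            wmb();

            /* 3. Only now replace the stamp: a prefetcher that sees
             *    byte_count != STAMP also sees valid data. */
            seg->byte_count = len;
    }

Writing the data segments of a multi-segment WQE in reverse order serves the same end at cacheline granularity: within each 64-byte chunk, the word that makes the chunk look valid to the prefetcher is the last one overwritten.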