Merge tag 'vfio-v6.6-rc1' of https://github.com/awilliam/linux-vfio
[sfrench/cifs-2.6.git] / drivers / vfio / pci / mlx5 / cmd.c
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5
6 #include "cmd.h"
7
/* CQ_* status codes: zero for OK, negative values for EMPTY and POLL_ERR. */
enum {
	CQ_OK = 0,
	CQ_EMPTY = -1,
	CQ_POLL_ERR = -2,
};
9
10 static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
11 {
12         int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
13         void *query_cap = NULL, *cap;
14         int ret;
15
16         query_cap = kzalloc(query_sz, GFP_KERNEL);
17         if (!query_cap)
18                 return -ENOMEM;
19
20         ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
21                                             MLX5_CAP_GENERAL_2);
22         if (ret)
23                 goto out;
24
25         cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
26         if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
27                 ret = -EOPNOTSUPP;
28 out:
29         kfree(query_cap);
30         return ret;
31 }
32
33 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
34                                   u16 *vhca_id);
35 static void
36 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37
38 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
39 {
40         struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
41         u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
42         u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
43         int err;
44
45         lockdep_assert_held(&mvdev->state_mutex);
46         if (mvdev->mdev_detach)
47                 return -ENOTCONN;
48
49         /*
50          * In case PRE_COPY is used, saving_migf is exposed while the device is
51          * running. Make sure to run only once there is no active save command.
52          * Running both in parallel, might end-up with a failure in the save
53          * command once it will try to turn on 'tracking' on a suspended device.
54          */
55         if (migf) {
56                 err = wait_for_completion_interruptible(&migf->save_comp);
57                 if (err)
58                         return err;
59         }
60
61         MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
62         MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
63         MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
64
65         err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
66         if (migf)
67                 complete(&migf->save_comp);
68
69         return err;
70 }
71
72 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
73 {
74         u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
75         u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
76
77         lockdep_assert_held(&mvdev->state_mutex);
78         if (mvdev->mdev_detach)
79                 return -ENOTCONN;
80
81         MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
82         MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
83         MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
84
85         return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
86 }
87
88 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
89                                           size_t *state_size, u8 query_flags)
90 {
91         u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
92         u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
93         bool inc = query_flags & MLX5VF_QUERY_INC;
94         int ret;
95
96         lockdep_assert_held(&mvdev->state_mutex);
97         if (mvdev->mdev_detach)
98                 return -ENOTCONN;
99
100         /*
101          * In case PRE_COPY is used, saving_migf is exposed while device is
102          * running. Make sure to run only once there is no active save command.
103          * Running both in parallel, might end-up with a failure in the
104          * incremental query command on un-tracked vhca.
105          */
106         if (inc) {
107                 ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
108                 if (ret)
109                         return ret;
110                 if (mvdev->saving_migf->state ==
111                     MLX5_MIGF_STATE_PRE_COPY_ERROR) {
112                         /*
113                          * In case we had a PRE_COPY error, only query full
114                          * image for final image
115                          */
116                         if (!(query_flags & MLX5VF_QUERY_FINAL)) {
117                                 *state_size = 0;
118                                 complete(&mvdev->saving_migf->save_comp);
119                                 return 0;
120                         }
121                         query_flags &= ~MLX5VF_QUERY_INC;
122                 }
123         }
124
125         MLX5_SET(query_vhca_migration_state_in, in, opcode,
126                  MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
127         MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
128         MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
129         MLX5_SET(query_vhca_migration_state_in, in, incremental,
130                  query_flags & MLX5VF_QUERY_INC);
131
132         ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
133                                   out);
134         if (inc)
135                 complete(&mvdev->saving_migf->save_comp);
136
137         if (ret)
138                 return ret;
139
140         *state_size = MLX5_GET(query_vhca_migration_state_out, out,
141                                required_umem_size);
142         return 0;
143 }
144
145 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
146 {
147         /* Mark the tracker under an error and wake it up if it's running */
148         mvdev->tracker.is_err = true;
149         complete(&mvdev->tracker_comp);
150 }
151
152 static int mlx5fv_vf_event(struct notifier_block *nb,
153                            unsigned long event, void *data)
154 {
155         struct mlx5vf_pci_core_device *mvdev =
156                 container_of(nb, struct mlx5vf_pci_core_device, nb);
157
158         switch (event) {
159         case MLX5_PF_NOTIFY_ENABLE_VF:
160                 mutex_lock(&mvdev->state_mutex);
161                 mvdev->mdev_detach = false;
162                 mlx5vf_state_mutex_unlock(mvdev);
163                 break;
164         case MLX5_PF_NOTIFY_DISABLE_VF:
165                 mlx5vf_cmd_close_migratable(mvdev);
166                 mutex_lock(&mvdev->state_mutex);
167                 mvdev->mdev_detach = true;
168                 mlx5vf_state_mutex_unlock(mvdev);
169                 break;
170         default:
171                 break;
172         }
173
174         return 0;
175 }
176
177 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
178 {
179         if (!mvdev->migrate_cap)
180                 return;
181
182         /* Must be done outside the lock to let it progress */
183         set_tracker_error(mvdev);
184         mutex_lock(&mvdev->state_mutex);
185         mlx5vf_disable_fds(mvdev);
186         _mlx5vf_free_page_tracker_resources(mvdev);
187         mlx5vf_state_mutex_unlock(mvdev);
188 }
189
190 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
191 {
192         if (!mvdev->migrate_cap)
193                 return;
194
195         mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
196                                                 &mvdev->nb);
197         destroy_workqueue(mvdev->cb_wq);
198 }
199
200 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
201                                const struct vfio_migration_ops *mig_ops,
202                                const struct vfio_log_ops *log_ops)
203 {
204         struct pci_dev *pdev = mvdev->core_device.pdev;
205         int ret;
206
207         if (!pdev->is_virtfn)
208                 return;
209
210         mvdev->mdev = mlx5_vf_get_core_dev(pdev);
211         if (!mvdev->mdev)
212                 return;
213
214         if (!MLX5_CAP_GEN(mvdev->mdev, migration))
215                 goto end;
216
217         mvdev->vf_id = pci_iov_vf_id(pdev);
218         if (mvdev->vf_id < 0)
219                 goto end;
220
221         ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
222         if (ret)
223                 goto end;
224
225         if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
226                                    &mvdev->vhca_id))
227                 goto end;
228
229         mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
230         if (!mvdev->cb_wq)
231                 goto end;
232
233         mutex_init(&mvdev->state_mutex);
234         spin_lock_init(&mvdev->reset_lock);
235         mvdev->nb.notifier_call = mlx5fv_vf_event;
236         ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
237                                                     &mvdev->nb);
238         if (ret) {
239                 destroy_workqueue(mvdev->cb_wq);
240                 goto end;
241         }
242
243         mvdev->migrate_cap = 1;
244         mvdev->core_device.vdev.migration_flags =
245                 VFIO_MIGRATION_STOP_COPY |
246                 VFIO_MIGRATION_P2P;
247         mvdev->core_device.vdev.mig_ops = mig_ops;
248         init_completion(&mvdev->tracker_comp);
249         if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
250                 mvdev->core_device.vdev.log_ops = log_ops;
251
252         if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
253             MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
254                 mvdev->core_device.vdev.migration_flags |=
255                         VFIO_MIGRATION_PRE_COPY;
256
257 end:
258         mlx5_vf_put_core_dev(mvdev->mdev);
259 }
260
261 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
262                                   u16 *vhca_id)
263 {
264         u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
265         int out_size;
266         void *out;
267         int ret;
268
269         out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
270         out = kzalloc(out_size, GFP_KERNEL);
271         if (!out)
272                 return -ENOMEM;
273
274         MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
275         MLX5_SET(query_hca_cap_in, in, other_function, 1);
276         MLX5_SET(query_hca_cap_in, in, function_id, function_id);
277         MLX5_SET(query_hca_cap_in, in, op_mod,
278                  MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
279                  HCA_CAP_OPMOD_GET_CUR);
280
281         ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
282         if (ret)
283                 goto err_exec;
284
285         *vhca_id = MLX5_GET(query_hca_cap_out, out,
286                             capability.cmd_hca_cap.vhca_id);
287
288 err_exec:
289         kfree(out);
290         return ret;
291 }
292
293 static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
294                         struct mlx5_vhca_data_buffer *buf,
295                         struct mlx5_vhca_recv_buf *recv_buf,
296                         u32 *mkey)
297 {
298         size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
299                                 recv_buf->npages;
300         int err = 0, inlen;
301         __be64 *mtt;
302         void *mkc;
303         u32 *in;
304
305         inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
306                 sizeof(*mtt) * round_up(npages, 2);
307
308         in = kvzalloc(inlen, GFP_KERNEL);
309         if (!in)
310                 return -ENOMEM;
311
312         MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
313                  DIV_ROUND_UP(npages, 2));
314         mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
315
316         if (buf) {
317                 struct sg_dma_page_iter dma_iter;
318
319                 for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
320                         *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
321         } else {
322                 int i;
323
324                 for (i = 0; i < npages; i++)
325                         *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
326         }
327
328         mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
329         MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
330         MLX5_SET(mkc, mkc, lr, 1);
331         MLX5_SET(mkc, mkc, lw, 1);
332         MLX5_SET(mkc, mkc, rr, 1);
333         MLX5_SET(mkc, mkc, rw, 1);
334         MLX5_SET(mkc, mkc, pd, pdn);
335         MLX5_SET(mkc, mkc, bsf_octword_size, 0);
336         MLX5_SET(mkc, mkc, qpn, 0xffffff);
337         MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
338         MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
339         MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
340         err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
341         kvfree(in);
342         return err;
343 }
344
345 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
346 {
347         struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
348         struct mlx5_core_dev *mdev = mvdev->mdev;
349         int ret;
350
351         lockdep_assert_held(&mvdev->state_mutex);
352         if (mvdev->mdev_detach)
353                 return -ENOTCONN;
354
355         if (buf->dmaed || !buf->allocated_length)
356                 return -EINVAL;
357
358         ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
359         if (ret)
360                 return ret;
361
362         ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
363         if (ret)
364                 goto err;
365
366         buf->dmaed = true;
367
368         return 0;
369 err:
370         dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
371         return ret;
372 }
373
374 void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
375 {
376         struct mlx5_vf_migration_file *migf = buf->migf;
377         struct sg_page_iter sg_iter;
378
379         lockdep_assert_held(&migf->mvdev->state_mutex);
380         WARN_ON(migf->mvdev->mdev_detach);
381
382         if (buf->dmaed) {
383                 mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
384                 dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
385                                   buf->dma_dir, 0);
386         }
387
388         /* Undo alloc_pages_bulk_array() */
389         for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
390                 __free_page(sg_page_iter_page(&sg_iter));
391         sg_free_append_table(&buf->table);
392         kfree(buf);
393 }
394
395 struct mlx5_vhca_data_buffer *
396 mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
397                          size_t length,
398                          enum dma_data_direction dma_dir)
399 {
400         struct mlx5_vhca_data_buffer *buf;
401         int ret;
402
403         buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
404         if (!buf)
405                 return ERR_PTR(-ENOMEM);
406
407         buf->dma_dir = dma_dir;
408         buf->migf = migf;
409         if (length) {
410                 ret = mlx5vf_add_migration_pages(buf,
411                                 DIV_ROUND_UP_ULL(length, PAGE_SIZE));
412                 if (ret)
413                         goto end;
414
415                 if (dma_dir != DMA_NONE) {
416                         ret = mlx5vf_dma_data_buffer(buf);
417                         if (ret)
418                                 goto end;
419                 }
420         }
421
422         return buf;
423 end:
424         mlx5vf_free_data_buffer(buf);
425         return ERR_PTR(ret);
426 }
427
428 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
429 {
430         spin_lock_irq(&buf->migf->list_lock);
431         list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
432         spin_unlock_irq(&buf->migf->list_lock);
433 }
434
435 struct mlx5_vhca_data_buffer *
436 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
437                        size_t length, enum dma_data_direction dma_dir)
438 {
439         struct mlx5_vhca_data_buffer *buf, *temp_buf;
440         struct list_head free_list;
441
442         lockdep_assert_held(&migf->mvdev->state_mutex);
443         if (migf->mvdev->mdev_detach)
444                 return ERR_PTR(-ENOTCONN);
445
446         INIT_LIST_HEAD(&free_list);
447
448         spin_lock_irq(&migf->list_lock);
449         list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
450                 if (buf->dma_dir == dma_dir) {
451                         list_del_init(&buf->buf_elm);
452                         if (buf->allocated_length >= length) {
453                                 spin_unlock_irq(&migf->list_lock);
454                                 goto found;
455                         }
456                         /*
457                          * Prevent holding redundant buffers. Put in a free
458                          * list and call at the end not under the spin lock
459                          * (&migf->list_lock) to mlx5vf_free_data_buffer which
460                          * might sleep.
461                          */
462                         list_add(&buf->buf_elm, &free_list);
463                 }
464         }
465         spin_unlock_irq(&migf->list_lock);
466         buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
467
468 found:
469         while ((temp_buf = list_first_entry_or_null(&free_list,
470                                 struct mlx5_vhca_data_buffer, buf_elm))) {
471                 list_del(&temp_buf->buf_elm);
472                 mlx5vf_free_data_buffer(temp_buf);
473         }
474
475         return buf;
476 }
477
478 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
479 {
480         struct mlx5vf_async_data *async_data = container_of(_work,
481                 struct mlx5vf_async_data, work);
482         struct mlx5_vf_migration_file *migf = container_of(async_data,
483                 struct mlx5_vf_migration_file, async_data);
484
485         mutex_lock(&migf->lock);
486         if (async_data->status) {
487                 mlx5vf_put_data_buffer(async_data->buf);
488                 if (async_data->header_buf)
489                         mlx5vf_put_data_buffer(async_data->header_buf);
490                 if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
491                         migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
492                 else
493                         migf->state = MLX5_MIGF_STATE_ERROR;
494                 wake_up_interruptible(&migf->poll_wait);
495         }
496         mutex_unlock(&migf->lock);
497         kvfree(async_data->out);
498         complete(&migf->save_comp);
499         fput(migf->filp);
500 }
501
502 static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
503                           size_t image_size, bool initial_pre_copy)
504 {
505         struct mlx5_vf_migration_file *migf = header_buf->migf;
506         struct mlx5_vf_migration_header header = {};
507         unsigned long flags;
508         struct page *page;
509         u8 *to_buff;
510
511         header.record_size = cpu_to_le64(image_size);
512         header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
513         header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
514         page = mlx5vf_get_migration_page(header_buf, 0);
515         if (!page)
516                 return -EINVAL;
517         to_buff = kmap_local_page(page);
518         memcpy(to_buff, &header, sizeof(header));
519         kunmap_local(to_buff);
520         header_buf->length = sizeof(header);
521         header_buf->start_pos = header_buf->migf->max_pos;
522         migf->max_pos += header_buf->length;
523         spin_lock_irqsave(&migf->list_lock, flags);
524         list_add_tail(&header_buf->buf_elm, &migf->buf_list);
525         spin_unlock_irqrestore(&migf->list_lock, flags);
526         if (initial_pre_copy)
527                 migf->pre_copy_initial_bytes += sizeof(header);
528         return 0;
529 }
530
531 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
532 {
533         struct mlx5vf_async_data *async_data = container_of(context,
534                         struct mlx5vf_async_data, cb_work);
535         struct mlx5_vf_migration_file *migf = container_of(async_data,
536                         struct mlx5_vf_migration_file, async_data);
537
538         if (!status) {
539                 size_t image_size;
540                 unsigned long flags;
541                 bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
542                                 !async_data->last_chunk;
543
544                 image_size = MLX5_GET(save_vhca_state_out, async_data->out,
545                                       actual_image_size);
546                 if (async_data->header_buf) {
547                         status = add_buf_header(async_data->header_buf, image_size,
548                                                 initial_pre_copy);
549                         if (status)
550                                 goto err;
551                 }
552                 async_data->buf->length = image_size;
553                 async_data->buf->start_pos = migf->max_pos;
554                 migf->max_pos += async_data->buf->length;
555                 spin_lock_irqsave(&migf->list_lock, flags);
556                 list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
557                 spin_unlock_irqrestore(&migf->list_lock, flags);
558                 if (initial_pre_copy)
559                         migf->pre_copy_initial_bytes += image_size;
560                 migf->state = async_data->last_chunk ?
561                         MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
562                 wake_up_interruptible(&migf->poll_wait);
563         }
564
565 err:
566         /*
567          * The error and the cleanup flows can't run from an
568          * interrupt context
569          */
570         if (status == -EREMOTEIO)
571                 status = MLX5_GET(save_vhca_state_out, async_data->out, status);
572         async_data->status = status;
573         queue_work(migf->mvdev->cb_wq, &async_data->work);
574 }
575
576 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
577                                struct mlx5_vf_migration_file *migf,
578                                struct mlx5_vhca_data_buffer *buf, bool inc,
579                                bool track)
580 {
581         u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
582         u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
583         struct mlx5_vhca_data_buffer *header_buf = NULL;
584         struct mlx5vf_async_data *async_data;
585         int err;
586
587         lockdep_assert_held(&mvdev->state_mutex);
588         if (mvdev->mdev_detach)
589                 return -ENOTCONN;
590
591         err = wait_for_completion_interruptible(&migf->save_comp);
592         if (err)
593                 return err;
594
595         if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
596                 /*
597                  * In case we had a PRE_COPY error, SAVE is triggered only for
598                  * the final image, read device full image.
599                  */
600                 inc = false;
601
602         MLX5_SET(save_vhca_state_in, in, opcode,
603                  MLX5_CMD_OP_SAVE_VHCA_STATE);
604         MLX5_SET(save_vhca_state_in, in, op_mod, 0);
605         MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
606         MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
607         MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
608         MLX5_SET(save_vhca_state_in, in, incremental, inc);
609         MLX5_SET(save_vhca_state_in, in, set_track, track);
610
611         async_data = &migf->async_data;
612         async_data->buf = buf;
613         async_data->last_chunk = !track;
614         async_data->out = kvzalloc(out_size, GFP_KERNEL);
615         if (!async_data->out) {
616                 err = -ENOMEM;
617                 goto err_out;
618         }
619
620         if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
621                 if (async_data->last_chunk && migf->buf_header) {
622                         header_buf = migf->buf_header;
623                         migf->buf_header = NULL;
624                 } else {
625                         header_buf = mlx5vf_get_data_buffer(migf,
626                                 sizeof(struct mlx5_vf_migration_header), DMA_NONE);
627                         if (IS_ERR(header_buf)) {
628                                 err = PTR_ERR(header_buf);
629                                 goto err_free;
630                         }
631                 }
632         }
633
634         if (async_data->last_chunk)
635                 migf->state = MLX5_MIGF_STATE_SAVE_LAST;
636
637         async_data->header_buf = header_buf;
638         get_file(migf->filp);
639         err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
640                                async_data->out,
641                                out_size, mlx5vf_save_callback,
642                                &async_data->cb_work);
643         if (err)
644                 goto err_exec;
645
646         return 0;
647
648 err_exec:
649         if (header_buf)
650                 mlx5vf_put_data_buffer(header_buf);
651         fput(migf->filp);
652 err_free:
653         kvfree(async_data->out);
654 err_out:
655         complete(&migf->save_comp);
656         return err;
657 }
658
659 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
660                                struct mlx5_vf_migration_file *migf,
661                                struct mlx5_vhca_data_buffer *buf)
662 {
663         u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
664         u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
665         int err;
666
667         lockdep_assert_held(&mvdev->state_mutex);
668         if (mvdev->mdev_detach)
669                 return -ENOTCONN;
670
671         if (!buf->dmaed) {
672                 err = mlx5vf_dma_data_buffer(buf);
673                 if (err)
674                         return err;
675         }
676
677         MLX5_SET(load_vhca_state_in, in, opcode,
678                  MLX5_CMD_OP_LOAD_VHCA_STATE);
679         MLX5_SET(load_vhca_state_in, in, op_mod, 0);
680         MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
681         MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
682         MLX5_SET(load_vhca_state_in, in, size, buf->length);
683         return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
684 }
685
686 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
687 {
688         int err;
689
690         lockdep_assert_held(&migf->mvdev->state_mutex);
691         if (migf->mvdev->mdev_detach)
692                 return -ENOTCONN;
693
694         err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
695         return err;
696 }
697
698 void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
699 {
700         lockdep_assert_held(&migf->mvdev->state_mutex);
701         if (migf->mvdev->mdev_detach)
702                 return;
703
704         mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
705 }
706
707 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
708 {
709         struct mlx5_vhca_data_buffer *entry;
710
711         lockdep_assert_held(&migf->mvdev->state_mutex);
712         WARN_ON(migf->mvdev->mdev_detach);
713
714         if (migf->buf) {
715                 mlx5vf_free_data_buffer(migf->buf);
716                 migf->buf = NULL;
717         }
718
719         if (migf->buf_header) {
720                 mlx5vf_free_data_buffer(migf->buf_header);
721                 migf->buf_header = NULL;
722         }
723
724         list_splice(&migf->avail_list, &migf->buf_list);
725
726         while ((entry = list_first_entry_or_null(&migf->buf_list,
727                                 struct mlx5_vhca_data_buffer, buf_elm))) {
728                 list_del(&entry->buf_elm);
729                 mlx5vf_free_data_buffer(entry);
730         }
731
732         mlx5vf_cmd_dealloc_pd(migf);
733 }
734
735 static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
736                                  struct mlx5vf_pci_core_device *mvdev,
737                                  struct rb_root_cached *ranges, u32 nnodes)
738 {
739         int max_num_range =
740                 MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
741         struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
742         int record_size = MLX5_ST_SZ_BYTES(page_track_range);
743         u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
744         struct interval_tree_node *node = NULL;
745         u64 total_ranges_len = 0;
746         u32 num_ranges = nnodes;
747         u8 log_addr_space_size;
748         void *range_list_ptr;
749         void *obj_context;
750         void *cmd_hdr;
751         int inlen;
752         void *in;
753         int err;
754         int i;
755
756         if (num_ranges > max_num_range) {
757                 vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
758                 num_ranges = max_num_range;
759         }
760
761         inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
762                                  record_size * num_ranges;
763         in = kzalloc(inlen, GFP_KERNEL);
764         if (!in)
765                 return -ENOMEM;
766
767         cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
768                                general_obj_in_cmd_hdr);
769         MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
770                  MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
771         MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
772                  MLX5_OBJ_TYPE_PAGE_TRACK);
773         obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
774         MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
775         MLX5_SET(page_track, obj_context, track_type, 1);
776         MLX5_SET(page_track, obj_context, log_page_size,
777                  ilog2(tracker->host_qp->tracked_page_size));
778         MLX5_SET(page_track, obj_context, log_msg_size,
779                  ilog2(tracker->host_qp->max_msg_size));
780         MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
781         MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
782
783         range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
784         node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
785         for (i = 0; i < num_ranges; i++) {
786                 void *addr_range_i_base = range_list_ptr + record_size * i;
787                 unsigned long length = node->last - node->start + 1;
788
789                 MLX5_SET64(page_track_range, addr_range_i_base, start_address,
790                            node->start);
791                 MLX5_SET64(page_track_range, addr_range_i_base, length, length);
792                 total_ranges_len += length;
793                 node = interval_tree_iter_next(node, 0, ULONG_MAX);
794         }
795
796         WARN_ON(node);
797         log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
798         if (log_addr_space_size <
799             (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
800             log_addr_space_size >
801             (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
802                 err = -EOPNOTSUPP;
803                 goto out;
804         }
805
806         MLX5_SET(page_track, obj_context, log_addr_space_size,
807                  log_addr_space_size);
808         err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
809         if (err)
810                 goto out;
811
812         tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
813 out:
814         kfree(in);
815         return err;
816 }
817
818 static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
819                                       u32 tracker_id)
820 {
821         u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
822         u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
823
824         MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
825         MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
826         MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
827
828         return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
829 }
830
831 static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
832                                      u32 tracker_id, unsigned long iova,
833                                      unsigned long length, u32 tracker_state)
834 {
835         u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
836         u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
837         void *obj_context;
838         void *cmd_hdr;
839
840         cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
841         MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
842         MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
843         MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
844
845         obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
846         MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
847         MLX5_SET64(page_track, obj_context, range_start_address, iova);
848         MLX5_SET64(page_track, obj_context, length, length);
849         MLX5_SET(page_track, obj_context, state, tracker_state);
850
851         return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
852 }
853
854 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
855                              struct mlx5_vhca_cq_buf *buf, int nent,
856                              int cqe_size)
857 {
858         struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
859         u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
860         u8 log_wq_sz = ilog2(cqe_size);
861         int err;
862
863         err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
864                                        mdev->priv.numa_node);
865         if (err)
866                 return err;
867
868         mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
869         buf->cqe_size = cqe_size;
870         buf->nent = nent;
871         return 0;
872 }
873
874 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
875 {
876         struct mlx5_cqe64 *cqe64;
877         void *cqe;
878         int i;
879
880         for (i = 0; i < buf->nent; i++) {
881                 cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
882                 cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
883                 cqe64->op_own = MLX5_CQE_INVALID << 4;
884         }
885 }
886
887 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
888                               struct mlx5_vhca_cq *cq)
889 {
890         mlx5_core_destroy_cq(mdev, &cq->mcq);
891         mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
892         mlx5_db_free(mdev, &cq->db);
893 }
894
895 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
896 {
897         if (type != MLX5_EVENT_TYPE_CQ_ERROR)
898                 return;
899
900         set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
901                                        tracker.cq.mcq));
902 }
903
904 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
905                                  void *data)
906 {
907         struct mlx5_vhca_page_tracker *tracker =
908                 mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
909         struct mlx5vf_pci_core_device *mvdev = container_of(
910                 tracker, struct mlx5vf_pci_core_device, tracker);
911         struct mlx5_eqe *eqe = data;
912         u8 event_type = (u8)type;
913         u8 queue_type;
914         int qp_num;
915
916         switch (event_type) {
917         case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
918         case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
919         case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
920                 queue_type = eqe->data.qp_srq.type;
921                 if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
922                         break;
923                 qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
924                 if (qp_num != tracker->host_qp->qpn &&
925                     qp_num != tracker->fw_qp->qpn)
926                         break;
927                 set_tracker_error(mvdev);
928                 break;
929         default:
930                 break;
931         }
932
933         return NOTIFY_OK;
934 }
935
936 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
937                                struct mlx5_eqe *eqe)
938 {
939         struct mlx5vf_pci_core_device *mvdev =
940                 container_of(mcq, struct mlx5vf_pci_core_device,
941                              tracker.cq.mcq);
942
943         complete(&mvdev->tracker_comp);
944 }
945
946 static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
947                             struct mlx5_vhca_page_tracker *tracker,
948                             size_t ncqe)
949 {
950         int cqe_size = cache_line_size() == 128 ? 128 : 64;
951         u32 out[MLX5_ST_SZ_DW(create_cq_out)];
952         struct mlx5_vhca_cq *cq;
953         int inlen, err, eqn;
954         void *cqc, *in;
955         __be64 *pas;
956         int vector;
957
958         cq = &tracker->cq;
959         ncqe = roundup_pow_of_two(ncqe);
960         err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
961         if (err)
962                 return err;
963
964         cq->ncqe = ncqe;
965         cq->mcq.set_ci_db = cq->db.db;
966         cq->mcq.arm_db = cq->db.db + 1;
967         cq->mcq.cqe_sz = cqe_size;
968         err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
969         if (err)
970                 goto err_db_free;
971
972         init_cq_frag_buf(&cq->buf);
973         inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
974                 MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
975                 cq->buf.frag_buf.npages;
976         in = kvzalloc(inlen, GFP_KERNEL);
977         if (!in) {
978                 err = -ENOMEM;
979                 goto err_buff;
980         }
981
982         vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
983         err = mlx5_comp_eqn_get(mdev, vector, &eqn);
984         if (err)
985                 goto err_vec;
986
987         cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
988         MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
989         MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
990         MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
991         MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
992                  MLX5_ADAPTER_PAGE_SHIFT);
993         MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
994         pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
995         mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
996         cq->mcq.comp = mlx5vf_cq_complete;
997         cq->mcq.event = mlx5vf_cq_event;
998         err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
999         if (err)
1000                 goto err_vec;
1001
1002         mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1003                     cq->mcq.cons_index);
1004         kvfree(in);
1005         return 0;
1006
1007 err_vec:
1008         kvfree(in);
1009 err_buff:
1010         mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
1011 err_db_free:
1012         mlx5_db_free(mdev, &cq->db);
1013         return err;
1014 }
1015
1016 static struct mlx5_vhca_qp *
1017 mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
1018                     struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
1019 {
1020         u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
1021         struct mlx5_vhca_qp *qp;
1022         u8 log_rq_stride;
1023         u8 log_rq_sz;
1024         void *qpc;
1025         int inlen;
1026         void *in;
1027         int err;
1028
1029         qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
1030         if (!qp)
1031                 return ERR_PTR(-ENOMEM);
1032
1033         err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
1034         if (err)
1035                 goto err_free;
1036
1037         if (max_recv_wr) {
1038                 qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
1039                 log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
1040                 log_rq_sz = ilog2(qp->rq.wqe_cnt);
1041                 err = mlx5_frag_buf_alloc_node(mdev,
1042                         wq_get_byte_sz(log_rq_sz, log_rq_stride),
1043                         &qp->buf, mdev->priv.numa_node);
1044                 if (err)
1045                         goto err_db_free;
1046                 mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
1047         }
1048
1049         qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
1050         inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
1051                 MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
1052                 qp->buf.npages;
1053         in = kvzalloc(inlen, GFP_KERNEL);
1054         if (!in) {
1055                 err = -ENOMEM;
1056                 goto err_in;
1057         }
1058
1059         qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
1060         MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
1061         MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
1062         MLX5_SET(qpc, qpc, pd, tracker->pdn);
1063         MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
1064         MLX5_SET(qpc, qpc, log_page_size,
1065                  qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
1066         MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
1067         if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
1068                 MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
1069         MLX5_SET(qpc, qpc, no_sq, 1);
1070         if (max_recv_wr) {
1071                 MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
1072                 MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
1073                 MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
1074                 MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
1075                 MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
1076                 mlx5_fill_page_frag_array(&qp->buf,
1077                                           (__be64 *)MLX5_ADDR_OF(create_qp_in,
1078                                                                  in, pas));
1079         } else {
1080                 MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
1081         }
1082
1083         MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
1084         err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
1085         kvfree(in);
1086         if (err)
1087                 goto err_in;
1088
1089         qp->qpn = MLX5_GET(create_qp_out, out, qpn);
1090         return qp;
1091
1092 err_in:
1093         if (max_recv_wr)
1094                 mlx5_frag_buf_free(mdev, &qp->buf);
1095 err_db_free:
1096         mlx5_db_free(mdev, &qp->db);
1097 err_free:
1098         kfree(qp);
1099         return ERR_PTR(err);
1100 }
1101
1102 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
1103 {
1104         struct mlx5_wqe_data_seg *data;
1105         unsigned int ix;
1106
1107         WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
1108         ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
1109         data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
1110         data->byte_count = cpu_to_be32(qp->max_msg_size);
1111         data->lkey = cpu_to_be32(qp->recv_buf.mkey);
1112         data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
1113         qp->rq.pc++;
1114         /* Make sure that descriptors are written before doorbell record. */
1115         dma_wmb();
1116         *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
1117 }
1118
1119 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1120                               struct mlx5_vhca_qp *qp, u32 remote_qpn,
1121                               bool host_qp)
1122 {
1123         u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1124         u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1125         u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
1126         void *qpc;
1127         int ret;
1128
1129         /* Init */
1130         qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1131         MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1132         MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1133         MLX5_SET(qpc, qpc, rre, 1);
1134         MLX5_SET(qpc, qpc, rwe, 1);
1135         MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1136         MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1137         ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1138         if (ret)
1139                 return ret;
1140
1141         if (host_qp) {
1142                 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1143                 int i;
1144
1145                 for (i = 0; i < qp->rq.wqe_cnt; i++) {
1146                         mlx5vf_post_recv(qp);
1147                         recv_buf->next_rq_offset += qp->max_msg_size;
1148                 }
1149         }
1150
1151         /* RTR */
1152         qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1153         MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1154         MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1155         MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1156         MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1157         MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1158         MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1159         MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1160         MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1161         MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1162         ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
1163         if (ret || host_qp)
1164                 return ret;
1165
1166         /* RTS */
1167         qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1168         MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1169         MLX5_SET(qpc, qpc, retry_count, 7);
1170         MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1171         MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1172         MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1173         MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1174
1175         return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
1176 }
1177
1178 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
1179                               struct mlx5_vhca_qp *qp)
1180 {
1181         u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
1182
1183         MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
1184         MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
1185         mlx5_cmd_exec_in(mdev, destroy_qp, in);
1186
1187         mlx5_frag_buf_free(mdev, &qp->buf);
1188         mlx5_db_free(mdev, &qp->db);
1189         kfree(qp);
1190 }
1191
1192 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1193 {
1194         int i;
1195
1196         /* Undo alloc_pages_bulk_array() */
1197         for (i = 0; i < recv_buf->npages; i++)
1198                 __free_page(recv_buf->page_list[i]);
1199
1200         kvfree(recv_buf->page_list);
1201 }
1202
1203 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1204                             unsigned int npages)
1205 {
1206         unsigned int filled = 0, done = 0;
1207         int i;
1208
1209         recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
1210                                        GFP_KERNEL_ACCOUNT);
1211         if (!recv_buf->page_list)
1212                 return -ENOMEM;
1213
1214         for (;;) {
1215                 filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
1216                                                 npages - done,
1217                                                 recv_buf->page_list + done);
1218                 if (!filled)
1219                         goto err;
1220
1221                 done += filled;
1222                 if (done == npages)
1223                         break;
1224         }
1225
1226         recv_buf->npages = npages;
1227         return 0;
1228
1229 err:
1230         for (i = 0; i < npages; i++) {
1231                 if (recv_buf->page_list[i])
1232                         __free_page(recv_buf->page_list[i]);
1233         }
1234
1235         kvfree(recv_buf->page_list);
1236         return -ENOMEM;
1237 }
1238
1239 static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1240                                    struct mlx5_vhca_recv_buf *recv_buf)
1241 {
1242         int i, j;
1243
1244         recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
1245                                        sizeof(*recv_buf->dma_addrs),
1246                                        GFP_KERNEL_ACCOUNT);
1247         if (!recv_buf->dma_addrs)
1248                 return -ENOMEM;
1249
1250         for (i = 0; i < recv_buf->npages; i++) {
1251                 recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1252                                                       recv_buf->page_list[i],
1253                                                       0, PAGE_SIZE,
1254                                                       DMA_FROM_DEVICE);
1255                 if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
1256                         goto error;
1257         }
1258         return 0;
1259
1260 error:
1261         for (j = 0; j < i; j++)
1262                 dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1263                                  PAGE_SIZE, DMA_FROM_DEVICE);
1264
1265         kvfree(recv_buf->dma_addrs);
1266         return -ENOMEM;
1267 }
1268
1269 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1270                                       struct mlx5_vhca_recv_buf *recv_buf)
1271 {
1272         int i;
1273
1274         for (i = 0; i < recv_buf->npages; i++)
1275                 dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1276                                  PAGE_SIZE, DMA_FROM_DEVICE);
1277
1278         kvfree(recv_buf->dma_addrs);
1279 }
1280
1281 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1282                                           struct mlx5_vhca_qp *qp)
1283 {
1284         struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1285
1286         mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1287         unregister_dma_recv_pages(mdev, recv_buf);
1288         free_recv_pages(&qp->recv_buf);
1289 }
1290
1291 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
1292                                           struct mlx5_vhca_qp *qp, u32 pdn,
1293                                           u64 rq_size)
1294 {
1295         unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
1296         struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1297         int err;
1298
1299         err = alloc_recv_pages(recv_buf, npages);
1300         if (err < 0)
1301                 return err;
1302
1303         err = register_dma_recv_pages(mdev, recv_buf);
1304         if (err)
1305                 goto end;
1306
1307         err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
1308         if (err)
1309                 goto err_create_mkey;
1310
1311         return 0;
1312
1313 err_create_mkey:
1314         unregister_dma_recv_pages(mdev, recv_buf);
1315 end:
1316         free_recv_pages(recv_buf);
1317         return err;
1318 }
1319
1320 static void
1321 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
1322 {
1323         struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1324         struct mlx5_core_dev *mdev = mvdev->mdev;
1325
1326         lockdep_assert_held(&mvdev->state_mutex);
1327
1328         if (!mvdev->log_active)
1329                 return;
1330
1331         WARN_ON(mvdev->mdev_detach);
1332
1333         mlx5_eq_notifier_unregister(mdev, &tracker->nb);
1334         mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
1335         mlx5vf_destroy_qp(mdev, tracker->fw_qp);
1336         mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
1337         mlx5vf_destroy_qp(mdev, tracker->host_qp);
1338         mlx5vf_destroy_cq(mdev, &tracker->cq);
1339         mlx5_core_dealloc_pd(mdev, tracker->pdn);
1340         mlx5_put_uars_page(mdev, tracker->uar);
1341         mvdev->log_active = false;
1342 }
1343
1344 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1345 {
1346         struct mlx5vf_pci_core_device *mvdev = container_of(
1347                 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1348
1349         mutex_lock(&mvdev->state_mutex);
1350         if (!mvdev->log_active)
1351                 goto end;
1352
1353         _mlx5vf_free_page_tracker_resources(mvdev);
1354         mvdev->log_active = false;
1355 end:
1356         mlx5vf_state_mutex_unlock(mvdev);
1357         return 0;
1358 }
1359
1360 int mlx5vf_start_page_tracker(struct vfio_device *vdev,
1361                               struct rb_root_cached *ranges, u32 nnodes,
1362                               u64 *page_size)
1363 {
1364         struct mlx5vf_pci_core_device *mvdev = container_of(
1365                 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1366         struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1367         u8 log_tracked_page = ilog2(*page_size);
1368         struct mlx5_vhca_qp *host_qp;
1369         struct mlx5_vhca_qp *fw_qp;
1370         struct mlx5_core_dev *mdev;
1371         u32 max_msg_size = PAGE_SIZE;
1372         u64 rq_size = SZ_2M;
1373         u32 max_recv_wr;
1374         int err;
1375
1376         mutex_lock(&mvdev->state_mutex);
1377         if (mvdev->mdev_detach) {
1378                 err = -ENOTCONN;
1379                 goto end;
1380         }
1381
1382         if (mvdev->log_active) {
1383                 err = -EINVAL;
1384                 goto end;
1385         }
1386
1387         mdev = mvdev->mdev;
1388         memset(tracker, 0, sizeof(*tracker));
1389         tracker->uar = mlx5_get_uars_page(mdev);
1390         if (IS_ERR(tracker->uar)) {
1391                 err = PTR_ERR(tracker->uar);
1392                 goto end;
1393         }
1394
1395         err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
1396         if (err)
1397                 goto err_uar;
1398
1399         max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
1400         err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
1401         if (err)
1402                 goto err_dealloc_pd;
1403
1404         host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
1405         if (IS_ERR(host_qp)) {
1406                 err = PTR_ERR(host_qp);
1407                 goto err_cq;
1408         }
1409
1410         host_qp->max_msg_size = max_msg_size;
1411         if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1412                                 pg_track_log_min_page_size)) {
1413                 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1414                                 pg_track_log_min_page_size);
1415         } else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1416                                 pg_track_log_max_page_size)) {
1417                 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1418                                 pg_track_log_max_page_size);
1419         }
1420
1421         host_qp->tracked_page_size = (1ULL << log_tracked_page);
1422         err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
1423                                              rq_size);
1424         if (err)
1425                 goto err_host_qp;
1426
1427         fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
1428         if (IS_ERR(fw_qp)) {
1429                 err = PTR_ERR(fw_qp);
1430                 goto err_recv_resources;
1431         }
1432
1433         err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
1434         if (err)
1435                 goto err_activate;
1436
1437         err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
1438         if (err)
1439                 goto err_activate;
1440
1441         tracker->host_qp = host_qp;
1442         tracker->fw_qp = fw_qp;
1443         err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
1444         if (err)
1445                 goto err_activate;
1446
1447         MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
1448         mlx5_eq_notifier_register(mdev, &tracker->nb);
1449         *page_size = host_qp->tracked_page_size;
1450         mvdev->log_active = true;
1451         mlx5vf_state_mutex_unlock(mvdev);
1452         return 0;
1453
1454 err_activate:
1455         mlx5vf_destroy_qp(mdev, fw_qp);
1456 err_recv_resources:
1457         mlx5vf_free_qp_recv_resources(mdev, host_qp);
1458 err_host_qp:
1459         mlx5vf_destroy_qp(mdev, host_qp);
1460 err_cq:
1461         mlx5vf_destroy_cq(mdev, &tracker->cq);
1462 err_dealloc_pd:
1463         mlx5_core_dealloc_pd(mdev, tracker->pdn);
1464 err_uar:
1465         mlx5_put_uars_page(mdev, tracker->uar);
1466 end:
1467         mlx5vf_state_mutex_unlock(mvdev);
1468         return err;
1469 }
1470
1471 static void
1472 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1473                   struct iova_bitmap *dirty)
1474 {
1475         u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1476         u32 nent = size / entry_size;
1477         struct page *page;
1478         u64 addr;
1479         u64 *buf;
1480         int i;
1481
1482         if (WARN_ON(index >= qp->recv_buf.npages ||
1483                     (nent > qp->max_msg_size / entry_size)))
1484                 return;
1485
1486         page = qp->recv_buf.page_list[index];
1487         buf = kmap_local_page(page);
1488         for (i = 0; i < nent; i++) {
1489                 addr = MLX5_GET(page_track_report_entry, buf + i,
1490                                 dirty_address_low);
1491                 addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1492                                       dirty_address_high) << 32;
1493                 iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1494         }
1495         kunmap_local(buf);
1496 }
1497
1498 static void
1499 mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
1500               struct iova_bitmap *dirty, int *tracker_status)
1501 {
1502         u32 size;
1503         int ix;
1504
1505         qp->rq.cc++;
1506         *tracker_status = be32_to_cpu(cqe->immediate) >> 28;
1507         size = be32_to_cpu(cqe->byte_cnt);
1508         ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);
1509
1510         /* zero length CQE, no data */
1511         WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
1512         if (size)
1513                 set_report_output(size, ix, qp, dirty);
1514
1515         qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
1516         mlx5vf_post_recv(qp);
1517 }
1518
1519 static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
1520 {
1521         return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
1522 }
1523
1524 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
1525 {
1526         void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
1527         struct mlx5_cqe64 *cqe64;
1528
1529         cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1530
1531         if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
1532             !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
1533                 return cqe64;
1534         } else {
1535                 return NULL;
1536         }
1537 }
1538
1539 static int
1540 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1541                    struct iova_bitmap *dirty, int *tracker_status)
1542 {
1543         struct mlx5_cqe64 *cqe;
1544         u8 opcode;
1545
1546         cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1547         if (!cqe)
1548                 return CQ_EMPTY;
1549
1550         ++cq->mcq.cons_index;
1551         /*
1552          * Make sure we read CQ entry contents after we've checked the
1553          * ownership bit.
1554          */
1555         rmb();
1556         opcode = get_cqe_opcode(cqe);
1557         switch (opcode) {
1558         case MLX5_CQE_RESP_SEND_IMM:
1559                 mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1560                 return CQ_OK;
1561         default:
1562                 return CQ_POLL_ERR;
1563         }
1564 }
1565
1566 int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
1567                                   unsigned long length,
1568                                   struct iova_bitmap *dirty)
1569 {
1570         struct mlx5vf_pci_core_device *mvdev = container_of(
1571                 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1572         struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1573         struct mlx5_vhca_cq *cq = &tracker->cq;
1574         struct mlx5_core_dev *mdev;
1575         int poll_err, err;
1576
1577         mutex_lock(&mvdev->state_mutex);
1578         if (!mvdev->log_active) {
1579                 err = -EINVAL;
1580                 goto end;
1581         }
1582
1583         if (mvdev->mdev_detach) {
1584                 err = -ENOTCONN;
1585                 goto end;
1586         }
1587
1588         mdev = mvdev->mdev;
1589         err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
1590                                         MLX5_PAGE_TRACK_STATE_REPORTING);
1591         if (err)
1592                 goto end;
1593
1594         tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
1595         while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
1596                !tracker->is_err) {
1597                 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
1598                                               &tracker->status);
1599                 if (poll_err == CQ_EMPTY) {
1600                         mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1601                                     cq->mcq.cons_index);
1602                         poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
1603                                                       dirty, &tracker->status);
1604                         if (poll_err == CQ_EMPTY) {
1605                                 wait_for_completion(&mvdev->tracker_comp);
1606                                 continue;
1607                         }
1608                 }
1609                 if (poll_err == CQ_POLL_ERR) {
1610                         err = -EIO;
1611                         goto end;
1612                 }
1613                 mlx5_cq_set_ci(&cq->mcq);
1614         }
1615
1616         if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
1617                 tracker->is_err = true;
1618
1619         if (tracker->is_err)
1620                 err = -EIO;
1621 end:
1622         mlx5vf_state_mutex_unlock(mvdev);
1623         return err;
1624 }