1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2019 Mellanox Technologies. */
4 #include <linux/interrupt.h>
5 #include <linux/notifier.h>
6 #include <linux/module.h>
7 #include <linux/mlx5/driver.h>
11 #ifdef CONFIG_RFS_ACCEL
12 #include <linux/cpu_rmap.h>
15 #define MLX5_MAX_IRQ_NAME (32)
16 /* max irq_index is 255. three chars */
17 #define MLX5_MAX_IRQ_IDX_CHARS (3)
19 #define MLX5_SFS_PER_CTRL_IRQ 64
20 #define MLX5_IRQ_CTRL_SF_MAX 8
21 /* min num of vectors for SFs to be enabled */
22 #define MLX5_IRQ_VEC_COMP_BASE_SF 2
24 #define MLX5_EQ_SHARE_IRQ_MAX_COMP (8)
25 #define MLX5_EQ_SHARE_IRQ_MAX_CTRL (UINT_MAX)
26 #define MLX5_EQ_SHARE_IRQ_MIN_COMP (1)
27 #define MLX5_EQ_SHARE_IRQ_MIN_CTRL (4)
28 #define MLX5_EQ_REFS_PER_IRQ (2)
31 struct atomic_notifier_head nh;
33 char name[MLX5_MAX_IRQ_NAME];
34 struct mlx5_irq_pool *pool;
40 struct mlx5_irq_pool {
41 char name[MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS];
42 struct xa_limit xa_num_irqs;
43 struct mutex lock; /* sync IRQs creations */
47 struct mlx5_core_dev *dev;
50 struct mlx5_irq_table {
51 struct mlx5_irq_pool *pf_pool;
52 struct mlx5_irq_pool *sf_ctrl_pool;
53 struct mlx5_irq_pool *sf_comp_pool;
57 * mlx5_get_default_msix_vec_count - Get the default number of MSI-X vectors
58 * to be assigned to each VF.
60 * @num_vfs: Number of enabled VFs
62 int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs)
64 int num_vf_msix, min_msix, max_msix;
66 num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix);
70 min_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size);
71 max_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size);
73 /* Limit maximum number of MSI-X vectors so the default configuration
74 * has some available in the pool. This will allow the user to increase
75 * the number of vectors in a VF without having to first size-down other
78 return max(min(num_vf_msix / num_vfs, max_msix / 2), min_msix);
82 * mlx5_set_msix_vec_count - Set dynamically allocated MSI-X on the VF
84 * @function_id: Internal PCI VF function ID
85 * @msix_vec_count: Number of MSI-X vectors to set
87 int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id,
90 int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
91 int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
92 void *hca_cap = NULL, *query_cap = NULL, *cap;
93 int num_vf_msix, min_msix, max_msix;
96 num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix);
100 if (!MLX5_CAP_GEN(dev, vport_group_manager) || !mlx5_core_is_pf(dev))
103 min_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size);
104 max_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size);
106 if (msix_vec_count < min_msix)
109 if (msix_vec_count > max_msix)
112 query_cap = kzalloc(query_sz, GFP_KERNEL);
113 hca_cap = kzalloc(set_sz, GFP_KERNEL);
114 if (!hca_cap || !query_cap) {
119 ret = mlx5_vport_get_other_func_cap(dev, function_id, query_cap);
123 cap = MLX5_ADDR_OF(set_hca_cap_in, hca_cap, capability);
124 memcpy(cap, MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability),
125 MLX5_UN_SZ_BYTES(hca_cap_union));
126 MLX5_SET(cmd_hca_cap, cap, dynamic_msix_table_size, msix_vec_count);
128 MLX5_SET(set_hca_cap_in, hca_cap, opcode, MLX5_CMD_OP_SET_HCA_CAP);
129 MLX5_SET(set_hca_cap_in, hca_cap, other_function, 1);
130 MLX5_SET(set_hca_cap_in, hca_cap, function_id, function_id);
132 MLX5_SET(set_hca_cap_in, hca_cap, op_mod,
133 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1);
134 ret = mlx5_cmd_exec_in(dev, set_hca_cap, hca_cap);
141 static void irq_release(struct mlx5_irq *irq)
143 struct mlx5_irq_pool *pool = irq->pool;
145 xa_erase(&pool->irqs, irq->index);
146 /* free_irq requires that affinity and rmap will be cleared
147 * before calling it. This is why there is asymmetry with set_rmap
148 * which should be called after alloc_irq but before request_irq.
150 irq_set_affinity_hint(irq->irqn, NULL);
151 free_cpumask_var(irq->mask);
152 free_irq(irq->irqn, &irq->nh);
156 static void irq_put(struct mlx5_irq *irq)
158 struct mlx5_irq_pool *pool = irq->pool;
160 mutex_lock(&pool->lock);
164 mutex_unlock(&pool->lock);
167 static int irq_get_locked(struct mlx5_irq *irq)
169 lockdep_assert_held(&irq->pool->lock);
170 if (WARN_ON_ONCE(!irq->refcount))
176 static int irq_get(struct mlx5_irq *irq)
180 mutex_lock(&irq->pool->lock);
181 err = irq_get_locked(irq);
182 mutex_unlock(&irq->pool->lock);
186 static irqreturn_t irq_int_handler(int irq, void *nh)
188 atomic_notifier_call_chain(nh, 0, NULL);
192 static void irq_sf_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
194 snprintf(name, MLX5_MAX_IRQ_NAME, "%s%d", pool->name, vecidx);
197 static void irq_set_name(char *name, int vecidx)
200 snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async%d", vecidx);
204 snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d",
205 vecidx - MLX5_IRQ_VEC_COMP_BASE);
208 static struct mlx5_irq *irq_request(struct mlx5_irq_pool *pool, int i)
210 struct mlx5_core_dev *dev = pool->dev;
211 char name[MLX5_MAX_IRQ_NAME];
212 struct mlx5_irq *irq;
215 irq = kzalloc(sizeof(*irq), GFP_KERNEL);
217 return ERR_PTR(-ENOMEM);
218 irq->irqn = pci_irq_vector(dev->pdev, i);
220 irq_set_name(name, i);
222 irq_sf_set_name(pool, name, i);
223 ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh);
224 snprintf(irq->name, MLX5_MAX_IRQ_NAME,
225 "%s@pci:%s", name, pci_name(dev->pdev));
226 err = request_irq(irq->irqn, irq_int_handler, 0, irq->name,
229 mlx5_core_err(dev, "Failed to request irq. err = %d\n", err);
232 if (!zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) {
233 mlx5_core_warn(dev, "zalloc_cpumask_var failed\n");
240 err = xa_err(xa_store(&pool->irqs, irq->index, irq, GFP_KERNEL));
242 mlx5_core_err(dev, "Failed to alloc xa entry for irq(%u). err = %d\n",
248 free_cpumask_var(irq->mask);
250 free_irq(irq->irqn, &irq->nh);
256 int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
262 /* Something very bad happens here, we are enabling EQ
263 * on a non-existent IRQ.
266 ret = atomic_notifier_chain_register(&irq->nh, nb);
272 int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
276 err = atomic_notifier_chain_unregister(&irq->nh, nb);
281 struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq)
286 int mlx5_irq_get_index(struct mlx5_irq *irq)
293 /* creating an irq from irq_pool */
294 static struct mlx5_irq *irq_pool_create_irq(struct mlx5_irq_pool *pool,
295 struct cpumask *affinity)
297 struct mlx5_irq *irq;
301 err = xa_alloc(&pool->irqs, &irq_index, NULL, pool->xa_num_irqs,
305 irq = irq_request(pool, irq_index);
308 cpumask_copy(irq->mask, affinity);
309 irq_set_affinity_hint(irq->irqn, irq->mask);
313 /* looking for the irq with the smallest refcount and the same affinity */
314 static struct mlx5_irq *irq_pool_find_least_loaded(struct mlx5_irq_pool *pool,
315 struct cpumask *affinity)
317 int start = pool->xa_num_irqs.min;
318 int end = pool->xa_num_irqs.max;
319 struct mlx5_irq *irq = NULL;
320 struct mlx5_irq *iter;
323 lockdep_assert_held(&pool->lock);
324 xa_for_each_range(&pool->irqs, index, iter, start, end) {
325 if (!cpumask_equal(iter->mask, affinity))
327 if (iter->refcount < pool->min_threshold)
329 if (!irq || iter->refcount < irq->refcount)
335 /* requesting an irq from a given pool according to given affinity */
336 static struct mlx5_irq *irq_pool_request_affinity(struct mlx5_irq_pool *pool,
337 struct cpumask *affinity)
339 struct mlx5_irq *least_loaded_irq, *new_irq;
341 mutex_lock(&pool->lock);
342 least_loaded_irq = irq_pool_find_least_loaded(pool, affinity);
343 if (least_loaded_irq &&
344 least_loaded_irq->refcount < pool->min_threshold)
346 new_irq = irq_pool_create_irq(pool, affinity);
347 if (IS_ERR(new_irq)) {
348 if (!least_loaded_irq) {
349 mlx5_core_err(pool->dev, "Didn't find IRQ for cpu = %u\n",
350 cpumask_first(affinity));
351 mutex_unlock(&pool->lock);
354 /* We failed to create a new IRQ for the requested affinity,
355 * sharing existing IRQ.
359 least_loaded_irq = new_irq;
362 irq_get_locked(least_loaded_irq);
363 if (least_loaded_irq->refcount > pool->max_threshold)
364 mlx5_core_dbg(pool->dev, "IRQ %u overloaded, pool_name: %s, %u EQs on this irq\n",
365 least_loaded_irq->irqn, pool->name,
366 least_loaded_irq->refcount / MLX5_EQ_REFS_PER_IRQ);
368 mutex_unlock(&pool->lock);
369 return least_loaded_irq;
372 /* requesting an irq from a given pool according to given index */
373 static struct mlx5_irq *
374 irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx,
375 struct cpumask *affinity)
377 struct mlx5_irq *irq;
379 mutex_lock(&pool->lock);
380 irq = xa_load(&pool->irqs, vecidx);
385 irq = irq_request(pool, vecidx);
386 if (IS_ERR(irq) || !affinity)
388 cpumask_copy(irq->mask, affinity);
389 irq_set_affinity_hint(irq->irqn, irq->mask);
391 mutex_unlock(&pool->lock);
395 static struct mlx5_irq_pool *find_sf_irq_pool(struct mlx5_irq_table *irq_table,
396 int i, struct cpumask *affinity)
398 if (cpumask_empty(affinity) && i == MLX5_IRQ_EQ_CTRL)
399 return irq_table->sf_ctrl_pool;
400 return irq_table->sf_comp_pool;
404 * mlx5_irq_release - release an IRQ back to the system.
405 * @irq: irq to be released.
407 void mlx5_irq_release(struct mlx5_irq *irq)
409 synchronize_irq(irq->irqn);
414 * mlx5_irq_request - request an IRQ for mlx5 device.
415 * @dev: mlx5 device that is requesting the IRQ.
416 * @vecidx: vector index of the IRQ. This argument is ignored if affinity is
418 * @affinity: cpumask requested for this IRQ.
420 * This function returns a pointer to IRQ, or ERR_PTR in case of error.
422 struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
423 struct cpumask *affinity)
425 struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
426 struct mlx5_irq_pool *pool;
427 struct mlx5_irq *irq;
429 if (mlx5_core_is_sf(dev)) {
430 pool = find_sf_irq_pool(irq_table, vecidx, affinity);
432 /* we don't have IRQs for SFs, using the PF IRQs */
434 if (cpumask_empty(affinity) && !strcmp(pool->name, "mlx5_sf_comp"))
435 /* In case an SF user requests an IRQ with vecidx */
436 irq = irq_pool_request_vector(pool, vecidx, NULL);
438 irq = irq_pool_request_affinity(pool, affinity);
442 pool = irq_table->pf_pool;
443 irq = irq_pool_request_vector(pool, vecidx, affinity);
447 mlx5_core_dbg(dev, "irq %u mapped to cpu %*pbl, %u EQs on this irq\n",
448 irq->irqn, cpumask_pr_args(affinity),
449 irq->refcount / MLX5_EQ_REFS_PER_IRQ);
453 static struct mlx5_irq_pool *
454 irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name,
455 u32 min_threshold, u32 max_threshold)
457 struct mlx5_irq_pool *pool = kvzalloc(sizeof(*pool), GFP_KERNEL);
460 return ERR_PTR(-ENOMEM);
462 mutex_init(&pool->lock);
463 xa_init_flags(&pool->irqs, XA_FLAGS_ALLOC);
464 pool->xa_num_irqs.min = start;
465 pool->xa_num_irqs.max = start + size - 1;
467 snprintf(pool->name, MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS,
469 pool->min_threshold = min_threshold * MLX5_EQ_REFS_PER_IRQ;
470 pool->max_threshold = max_threshold * MLX5_EQ_REFS_PER_IRQ;
471 mlx5_core_dbg(dev, "pool->name = %s, pool->size = %d, pool->start = %d",
476 static void irq_pool_free(struct mlx5_irq_pool *pool)
478 struct mlx5_irq *irq;
481 /* There are cases in which we are destroying the irq_table before
482 * freeing all the IRQs, fast teardown for example. Hence, free the irqs
483 * which might not have been freed.
485 xa_for_each(&pool->irqs, index, irq)
487 xa_destroy(&pool->irqs);
488 mutex_destroy(&pool->lock);
492 static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pf_vec)
494 struct mlx5_irq_table *table = dev->priv.irq_table;
495 int num_sf_ctrl_by_msix;
496 int num_sf_ctrl_by_sfs;
501 table->pf_pool = irq_pool_alloc(dev, 0, pf_vec, NULL,
502 MLX5_EQ_SHARE_IRQ_MIN_COMP,
503 MLX5_EQ_SHARE_IRQ_MAX_COMP);
504 if (IS_ERR(table->pf_pool))
505 return PTR_ERR(table->pf_pool);
506 if (!mlx5_sf_max_functions(dev))
508 if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) {
509 mlx5_core_dbg(dev, "Not enught IRQs for SFs. SF may run at lower performance\n");
513 /* init sf_ctrl_pool */
514 num_sf_ctrl_by_msix = DIV_ROUND_UP(sf_vec, MLX5_COMP_EQS_PER_SF);
515 num_sf_ctrl_by_sfs = DIV_ROUND_UP(mlx5_sf_max_functions(dev),
516 MLX5_SFS_PER_CTRL_IRQ);
517 num_sf_ctrl = min_t(int, num_sf_ctrl_by_msix, num_sf_ctrl_by_sfs);
518 num_sf_ctrl = min_t(int, MLX5_IRQ_CTRL_SF_MAX, num_sf_ctrl);
519 table->sf_ctrl_pool = irq_pool_alloc(dev, pf_vec, num_sf_ctrl,
521 MLX5_EQ_SHARE_IRQ_MIN_CTRL,
522 MLX5_EQ_SHARE_IRQ_MAX_CTRL);
523 if (IS_ERR(table->sf_ctrl_pool)) {
524 err = PTR_ERR(table->sf_ctrl_pool);
527 /* init sf_comp_pool */
528 table->sf_comp_pool = irq_pool_alloc(dev, pf_vec + num_sf_ctrl,
529 sf_vec - num_sf_ctrl, "mlx5_sf_comp",
530 MLX5_EQ_SHARE_IRQ_MIN_COMP,
531 MLX5_EQ_SHARE_IRQ_MAX_COMP);
532 if (IS_ERR(table->sf_comp_pool)) {
533 err = PTR_ERR(table->sf_comp_pool);
538 irq_pool_free(table->sf_ctrl_pool);
540 irq_pool_free(table->pf_pool);
544 static void irq_pools_destroy(struct mlx5_irq_table *table)
546 if (table->sf_ctrl_pool) {
547 irq_pool_free(table->sf_comp_pool);
548 irq_pool_free(table->sf_ctrl_pool);
550 irq_pool_free(table->pf_pool);
555 int mlx5_irq_table_init(struct mlx5_core_dev *dev)
557 struct mlx5_irq_table *irq_table;
559 if (mlx5_core_is_sf(dev))
562 irq_table = kvzalloc(sizeof(*irq_table), GFP_KERNEL);
566 dev->priv.irq_table = irq_table;
570 void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
572 if (mlx5_core_is_sf(dev))
575 kvfree(dev->priv.irq_table);
578 int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table)
580 return table->pf_pool->xa_num_irqs.max - table->pf_pool->xa_num_irqs.min;
583 int mlx5_irq_table_create(struct mlx5_core_dev *dev)
585 int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
586 MLX5_CAP_GEN(dev, max_num_eqs) :
587 1 << MLX5_CAP_GEN(dev, log_max_eq);
592 if (mlx5_core_is_sf(dev))
595 pf_vec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
596 MLX5_IRQ_VEC_COMP_BASE;
597 pf_vec = min_t(int, pf_vec, num_eqs);
598 if (pf_vec <= MLX5_IRQ_VEC_COMP_BASE)
602 if (mlx5_sf_max_functions(dev))
603 total_vec += MLX5_IRQ_CTRL_SF_MAX +
604 MLX5_COMP_EQS_PER_SF * mlx5_sf_max_functions(dev);
606 total_vec = pci_alloc_irq_vectors(dev->pdev, MLX5_IRQ_VEC_COMP_BASE + 1,
607 total_vec, PCI_IRQ_MSIX);
610 pf_vec = min(pf_vec, total_vec);
612 err = irq_pools_init(dev, total_vec - pf_vec, pf_vec);
614 pci_free_irq_vectors(dev->pdev);
619 void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
621 struct mlx5_irq_table *table = dev->priv.irq_table;
623 if (mlx5_core_is_sf(dev))
626 /* There are cases where IRQs will still be in use when we reach
627 * here. Hence, make sure all the irqs are released.
629 irq_pools_destroy(table);
630 pci_free_irq_vectors(dev->pdev);
633 int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table)
635 if (table->sf_comp_pool)
636 return table->sf_comp_pool->xa_num_irqs.max -
637 table->sf_comp_pool->xa_num_irqs.min + 1;
639 return mlx5_irq_table_get_num_comp(table);
642 struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev)
644 #ifdef CONFIG_MLX5_SF
645 if (mlx5_core_is_sf(dev))
646 return dev->priv.parent_mdev->priv.irq_table;
648 return dev->priv.irq_table;