// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Basic Transport Functions exploiting Infiniband API
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
#include <linux/reboot.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"

#define SMC_LGR_NUM_INCR	256
#define SMC_LGR_FREE_DELAY_SERV	(600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT	(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
#define SMC_LGR_FREE_DELAY_FAST	(8 * HZ)

static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
	.num = 0,
};

static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */
static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);

struct smc_ib_up_work {
	struct work_struct	work;
	struct smc_link_group	*lgr;
	struct smc_ib_device	*smcibdev;
	u8			ibport;
};

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc);
static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft);

static void smc_link_up_work(struct work_struct *work);
static void smc_link_down_work(struct work_struct *work);

/* return head of link group list and its lock for a given link group */
static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
						  spinlock_t **lgr_lock)
{
	if (lgr->is_smcd) {
		*lgr_lock = &lgr->smcd->lgr_lock;
		return &lgr->smcd->lgr_list;
	}

	*lgr_lock = &smc_lgr_list.lock;
	return &smc_lgr_list.list;
}

static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
	/* client link group creation always follows the server link group
	 * creation. For client use a somewhat higher removal delay time,
	 * otherwise there is a risk of out-of-sync link groups.
	 */
	if (!lgr->freeing && !lgr->freefast) {
		mod_delayed_work(system_wq, &lgr->free_work,
				 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
				 SMC_LGR_FREE_DELAY_CLNT :
				 SMC_LGR_FREE_DELAY_SERV);
	}
}

void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
{
	if (!lgr->freeing && !lgr->freefast) {
		lgr->freefast = 1;
		mod_delayed_work(system_wq, &lgr->free_work,
				 SMC_LGR_FREE_DELAY_FAST);
	}
}

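/* Note: mod_delayed_work() re-arms an already pending delayed work item,
 * so the fast variant above shortens the 600s/610s delay of a regular
 * free that is already scheduled rather than queueing a second instance.
 */
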
/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn	connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
	struct rb_node **link, *parent = NULL;
	u32 token = conn->alert_token_local;

	link = &conn->lgr->conns_all.rb_node;
	while (*link) {
		struct smc_connection *cur = rb_entry(*link,
					struct smc_connection, alert_node);

		parent = *link;
		if (cur->alert_token_local > token)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	/* Put the new node there */
	rb_link_node(&conn->alert_node, parent, link);
	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}

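/* The lookup counterpart, smc_lgr_find_conn() (a helper defined outside
 * this section, assumed to be the inline from smc_core.h), walks the same
 * tree: descend left while cur->alert_token_local > token, right while it
 * is smaller, and stop on equality - a plain rbtree search keyed by the
 * alert token.
 */
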
/* Register connection in link group by assigning an alert token
 * registered in a search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static int smc_lgr_register_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	static atomic_t nexttoken = ATOMIC_INIT(0);

	/* find a new alert_token_local value not yet used by some connection
	 * in this link group
	 */
	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
	while (!conn->alert_token_local) {
		conn->alert_token_local = atomic_inc_return(&nexttoken);
		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
			conn->alert_token_local = 0;
	}
	smc_lgr_add_alert_token(conn);

	/* assign the new connection to a link */
	if (!conn->lgr->is_smcd) {
		struct smc_link *lnk;
		int i;

		/* tbd - link balancing */
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			lnk = &conn->lgr->lnk[i];
			if (lnk->state == SMC_LNK_ACTIVATING ||
			    lnk->state == SMC_LNK_ACTIVE)
				conn->lnk = lnk;
		}
		if (!conn->lnk)
			return SMC_CLC_DECL_NOACTLINK;
	}
	conn->lgr->conns_num++;
	return 0;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	struct smc_link_group *lgr = conn->lgr;

	rb_erase(&conn->alert_node, &lgr->conns_all);
	lgr->conns_num--;
	conn->alert_token_local = 0;
	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection from lgr
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	write_lock_bh(&lgr->conns_lock);
	if (conn->alert_token_local) {
		__smc_lgr_unregister_conn(conn);
	}
	write_unlock_bh(&lgr->conns_lock);
	conn->lgr = NULL;
}

void smc_lgr_cleanup_early(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;
	spinlock_t *lgr_lock;

	if (!lgr)
		return;

	smc_conn_free(conn);
	smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(lgr_lock);
	smc_lgr_schedule_free_work_fast(lgr);
}

static void smc_lgr_free(struct smc_link_group *lgr);

static void smc_lgr_free_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(to_delayed_work(work),
						  struct smc_link_group,
						  free_work);
	spinlock_t *lgr_lock;
	bool conns;
	int i;

	smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	if (lgr->freeing) {
		spin_unlock_bh(lgr_lock);
		return;
	}
	read_lock_bh(&lgr->conns_lock);
	conns = RB_EMPTY_ROOT(&lgr->conns_all);
	read_unlock_bh(&lgr->conns_lock);
	if (!conns) { /* number of lgr connections is no longer zero */
		spin_unlock_bh(lgr_lock);
		return;
	}
	list_del_init(&lgr->list); /* remove from smc_lgr_list */
	lgr->freeing = 1; /* this instance does the freeing, no new schedule */
	spin_unlock_bh(lgr_lock);
	cancel_delayed_work(&lgr->free_work);

	if (lgr->is_smcd && !lgr->terminating)
		smc_ism_signal_shutdown(lgr);
	if (!lgr->is_smcd) {
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			struct smc_link *lnk = &lgr->lnk[i];

			if (smc_link_usable(lnk))
				lnk->state = SMC_LNK_INACTIVE;
		}
		wake_up_interruptible_all(&lgr->llc_waiter);
	}
	smc_lgr_free(lgr);
}

static void smc_lgr_terminate_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(work, struct smc_link_group,
						  terminate_work);

	__smc_lgr_terminate(lgr, true);
}

/* return next unique link id for the lgr */
static u8 smcr_next_link_id(struct smc_link_group *lgr)
{
	bool id_in_use;
	u8 link_id;
	int i;

	do {
		id_in_use = false;
		link_id = ++lgr->next_link_id;
		if (!link_id)	/* skip zero as link_id */
			link_id = ++lgr->next_link_id;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (smc_link_usable(&lgr->lnk[i]) &&
			    lgr->lnk[i].link_id == link_id) {
				id_in_use = true; /* id taken, try next one */
				break;
			}
		}
	} while (id_in_use);
	return link_id;
}

int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
		   u8 link_idx, struct smc_init_info *ini)
{
	u8 rndvec[3];
	int rc;

	get_device(&ini->ib_dev->ibdev->dev);
	atomic_inc(&ini->ib_dev->lnk_cnt);
	lnk->state = SMC_LNK_ACTIVATING;
	lnk->link_id = smcr_next_link_id(lgr);
	lnk->lgr = lgr;
	lnk->link_idx = link_idx;
	lnk->smcibdev = ini->ib_dev;
	lnk->ibport = ini->ib_port;
	lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
	INIT_WORK(&lnk->link_down_wrk, smc_link_down_work);
	if (!ini->ib_dev->initialized) {
		rc = smc_ib_setup_per_ibdev(ini->ib_dev);
		if (rc)
			goto out;
	}
	get_random_bytes(rndvec, sizeof(rndvec));
	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
		(rndvec[2] << 16);
	rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
				  ini->vlan_id, lnk->gid, &lnk->sgid_index);
	if (rc)
		goto out;
	rc = smc_llc_link_init(lnk);
	if (rc)
		goto out;
	rc = smc_wr_alloc_link_mem(lnk);
	if (rc)
		goto clear_llc_lnk;
	rc = smc_ib_create_protection_domain(lnk);
	if (rc)
		goto free_link_mem;
	rc = smc_ib_create_queue_pair(lnk);
	if (rc)
		goto dealloc_pd;
	rc = smc_wr_create_link(lnk);
	if (rc)
		goto destroy_qp;
	return 0;

destroy_qp:
	smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
	smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
	smc_wr_free_link_mem(lnk);
clear_llc_lnk:
	smc_llc_link_clear(lnk);
out:
	put_device(&ini->ib_dev->ibdev->dev);
	memset(lnk, 0, sizeof(struct smc_link));
	lnk->state = SMC_LNK_UNUSED;
	if (!atomic_dec_return(&ini->ib_dev->lnk_cnt))
		wake_up(&ini->ib_dev->lnks_deleted);
	return rc;
}

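/* Usage sketch: smc_lgr_create() below brings up the initial link with
 *	lnk = &lgr->lnk[SMC_SINGLE_LINK];
 *	rc = smcr_link_init(lgr, lnk, SMC_SINGLE_LINK, ini);
 * The error labels unwind QP, PD, WR memory and LLC state in the reverse
 * order of setup, so callers only need to propagate rc.
 */
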
/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
{
	struct smc_link_group *lgr;
	struct list_head *lgr_list;
	struct smc_link *lnk;
	spinlock_t *lgr_lock;
	u8 link_idx;
	int rc = 0;
	int i;

	if (ini->is_smcd && ini->vlan_id) {
		if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) {
			rc = SMC_CLC_DECL_ISMVLANERR;
			goto out;
		}
	}

	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
	if (!lgr) {
		rc = SMC_CLC_DECL_MEM;
		goto ism_put_vlan;
	}
	lgr->is_smcd = ini->is_smcd;
	lgr->sync_err = 0;
	lgr->terminating = 0;
	lgr->freefast = 0;
	lgr->freeing = 0;
	lgr->vlan_id = ini->vlan_id;
	mutex_init(&lgr->sndbufs_lock);
	mutex_init(&lgr->rmbs_lock);
	rwlock_init(&lgr->conns_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		INIT_LIST_HEAD(&lgr->sndbufs[i]);
		INIT_LIST_HEAD(&lgr->rmbs[i]);
	}
	lgr->next_link_id = 0;
	smc_lgr_list.num += SMC_LGR_NUM_INCR;
	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
	INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work);
	lgr->conns_all = RB_ROOT;
	if (ini->is_smcd) {
		/* SMC-D specific settings */
		get_device(&ini->ism_dev->dev);
		lgr->peer_gid = ini->ism_gid;
		lgr->smcd = ini->ism_dev;
		lgr_list = &ini->ism_dev->lgr_list;
		lgr_lock = &lgr->smcd->lgr_lock;
		lgr->peer_shutdown = 0;
		atomic_inc(&ini->ism_dev->lgr_cnt);
	} else {
		/* SMC-R specific settings */
		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
		memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
		       SMC_SYSTEMID_LEN);
		memcpy(lgr->pnet_id, ini->ib_dev->pnetid[ini->ib_port - 1],
		       SMC_MAX_PNETID_LEN);
		smc_llc_lgr_init(lgr, smc);

		link_idx = SMC_SINGLE_LINK;
		lnk = &lgr->lnk[link_idx];
		rc = smcr_link_init(lgr, lnk, link_idx, ini);
		if (rc)
			goto free_lgr;
		lgr_list = &smc_lgr_list.list;
		lgr_lock = &smc_lgr_list.lock;
		atomic_inc(&lgr_cnt);
	}
	smc->conn.lgr = lgr;
	spin_lock_bh(lgr_lock);
	list_add(&lgr->list, lgr_list);
	spin_unlock_bh(lgr_lock);
	return 0;

free_lgr:
	kfree(lgr);
ism_put_vlan:
	if (ini->is_smcd && ini->vlan_id)
		smc_ism_put_vlan(ini->ism_dev, ini->vlan_id);
out:
	if (rc < 0) {
		if (rc == -ENOMEM)
			rc = SMC_CLC_DECL_MEM;
		else
			rc = SMC_CLC_DECL_INTERR;
	}
	return rc;
}

static int smc_write_space(struct smc_connection *conn)
{
	int buffer_len = conn->peer_rmbe_size;
	union smc_host_cursor prod;
	union smc_host_cursor cons;
	int space;

	smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
	smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
	/* determine rx_buf space */
	space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod);
	return space;
}

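/* Example (sketch): with peer_rmbe_size = 65536, cons = 1000 and
 * prod = 1500, smc_curs_diff() yields 500 bytes in flight, so 65036 bytes
 * of the peer's RMB remain writable. Cursor wrap-around (prod < cons in
 * raw terms) is handled inside smc_curs_diff().
 */
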
static int smc_switch_cursor(struct smc_sock *smc)
{
	struct smc_connection *conn = &smc->conn;
	union smc_host_cursor cons, fin;
	int rc = 0;
	int diff;

	smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn);
	smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn);
	/* set prod cursor to old state, enforce tx_rdma_writes() */
	smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn);
	smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);

	if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) {
		/* cons cursor advanced more than fin, and prod was set
		 * equal to fin above, so now prod is smaller than cons.
		 * Fix that.
		 */
		diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons);
		smc_curs_add(conn->sndbuf_desc->len,
			     &conn->tx_curs_sent, diff);
		smc_curs_add(conn->sndbuf_desc->len,
			     &conn->tx_curs_fin, diff);

		smp_mb__before_atomic();
		atomic_add(diff, &conn->sndbuf_space);
		smp_mb__after_atomic();

		smc_curs_add(conn->peer_rmbe_size,
			     &conn->local_tx_ctrl.prod, diff);
		smc_curs_add(conn->peer_rmbe_size,
			     &conn->local_tx_ctrl_fin, diff);
	}
	/* recalculate, value is used by tx_rdma_writes() */
	atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn));

	if (smc->sk.sk_state != SMC_INIT &&
	    smc->sk.sk_state != SMC_CLOSED) {
		rc = smcr_cdc_msg_send_validation(conn);
		if (!rc) {
			schedule_delayed_work(&conn->tx_work, 0);
			smc->sk.sk_data_ready(&smc->sk);
		}
	}
	return rc;
}

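/* Numeric sketch of the rewind above: assume fin = 100 and cons = 300 in
 * the peer's RMB, i.e. the peer already consumed 200 bytes that were never
 * confirmed as sent on the old link. diff = 200 then advances
 * tx_curs_sent/tx_curs_fin, sndbuf_space and both prod cursors, so the
 * replay on the new link starts exactly at the first unconsumed byte.
 */
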
struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
				  struct smc_link *from_lnk, bool is_dev_err)
{
	struct smc_link *to_lnk = NULL;
	struct smc_connection *conn;
	struct smc_sock *smc;
	struct rb_node *node;
	int i, rc = 0;

	/* link is inactive, wake up tx waiters */
	smc_wr_wakeup_tx_wait(from_lnk);

	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		if (lgr->lnk[i].state != SMC_LNK_ACTIVE ||
		    i == from_lnk->link_idx)
			continue;
		if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev &&
		    from_lnk->ibport == lgr->lnk[i].ibport) {
			continue;
		}
		to_lnk = &lgr->lnk[i];
		break;
	}
	if (!to_lnk) {
		smc_lgr_terminate_sched(lgr);
		return NULL;
	}
again:
	read_lock_bh(&lgr->conns_lock);
	for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) {
		conn = rb_entry(node, struct smc_connection, alert_node);
		if (conn->lnk != from_lnk)
			continue;
		smc = container_of(conn, struct smc_sock, conn);
		/* conn->lnk not yet set in SMC_INIT state */
		if (smc->sk.sk_state == SMC_INIT)
			continue;
		if (smc->sk.sk_state == SMC_CLOSED ||
		    smc->sk.sk_state == SMC_PEERCLOSEWAIT1 ||
		    smc->sk.sk_state == SMC_PEERCLOSEWAIT2 ||
		    smc->sk.sk_state == SMC_APPFINCLOSEWAIT ||
		    smc->sk.sk_state == SMC_APPCLOSEWAIT1 ||
		    smc->sk.sk_state == SMC_APPCLOSEWAIT2 ||
		    smc->sk.sk_state == SMC_PEERFINCLOSEWAIT ||
		    smc->sk.sk_state == SMC_PEERABORTWAIT ||
		    smc->sk.sk_state == SMC_PROCESSABORT) {
			spin_lock_bh(&conn->send_lock);
			conn->lnk = to_lnk;
			spin_unlock_bh(&conn->send_lock);
			continue;
		}
		sock_hold(&smc->sk);
		read_unlock_bh(&lgr->conns_lock);
		/* avoid race with smcr_tx_sndbuf_nonempty() */
		spin_lock_bh(&conn->send_lock);
		conn->lnk = to_lnk;
		rc = smc_switch_cursor(smc);
		spin_unlock_bh(&conn->send_lock);
		sock_put(&smc->sk);
		if (rc) {
			smcr_link_down_cond_sched(to_lnk);
			return NULL;
		}
		goto again;
	}
	read_unlock_bh(&lgr->conns_lock);
	return to_lnk;
}

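/* The 'goto again' above restarts the rbtree walk from rb_first() because
 * conns_lock was dropped while switching one connection. The walk still
 * makes progress: every connection already handled has conn->lnk == to_lnk
 * and is skipped on the next pass.
 */
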
static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc,
			   struct smc_link_group *lgr)
{
	int rc;

	if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) {
		/* unregister rmb with peer */
		rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
		if (!rc) {
			/* protect against smc_llc_cli_rkey_exchange() */
			mutex_lock(&lgr->llc_conf_mutex);
			smc_llc_do_delete_rkey(lgr, rmb_desc);
			rmb_desc->is_conf_rkey = false;
			mutex_unlock(&lgr->llc_conf_mutex);
			smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
		}
	}

	if (rmb_desc->is_reg_err) {
		/* buf registration failed, reuse not possible */
		mutex_lock(&lgr->rmbs_lock);
		list_del(&rmb_desc->list);
		mutex_unlock(&lgr->rmbs_lock);

		smc_buf_free(lgr, true, rmb_desc);
	} else {
		rmb_desc->used = 0;
	}
}

static void smc_buf_unuse(struct smc_connection *conn,
			  struct smc_link_group *lgr)
{
	if (conn->sndbuf_desc)
		conn->sndbuf_desc->used = 0;
	if (conn->rmb_desc && lgr->is_smcd)
		conn->rmb_desc->used = 0;
	else if (conn->rmb_desc)
		smcr_buf_unuse(conn->rmb_desc, lgr);
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	if (lgr->is_smcd) {
		if (!list_empty(&lgr->list))
			smc_ism_unset_conn(conn);
		tasklet_kill(&conn->rx_tsklet);
	} else {
		smc_cdc_tx_dismiss_slots(conn);
	}
	if (!list_empty(&lgr->list)) {
		smc_lgr_unregister_conn(conn);
		smc_buf_unuse(conn, lgr); /* allow buffer reuse */
	}

	if (!lgr->conns_num)
		smc_lgr_schedule_free_work(lgr);
}

/* unregister a link from a buf_desc */
static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb,
				struct smc_link *lnk)
{
	if (is_rmb)
		buf_desc->is_reg_mr[lnk->link_idx] = false;
	if (!buf_desc->is_map_ib[lnk->link_idx])
		return;
	if (is_rmb) {
		if (buf_desc->mr_rx[lnk->link_idx]) {
			smc_ib_put_memory_region(
					buf_desc->mr_rx[lnk->link_idx]);
			buf_desc->mr_rx[lnk->link_idx] = NULL;
		}
		smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE);
	} else {
		smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE);
	}
	sg_free_table(&buf_desc->sgt[lnk->link_idx]);
	buf_desc->is_map_ib[lnk->link_idx] = false;
}

/* unmap all buffers of lgr for a deleted link */
static void smcr_buf_unmap_lgr(struct smc_link *lnk)
{
	struct smc_link_group *lgr = lnk->lgr;
	struct smc_buf_desc *buf_desc, *bf;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		mutex_lock(&lgr->rmbs_lock);
		list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list)
			smcr_buf_unmap_link(buf_desc, true, lnk);
		mutex_unlock(&lgr->rmbs_lock);
		mutex_lock(&lgr->sndbufs_lock);
		list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i],
					 list)
			smcr_buf_unmap_link(buf_desc, false, lnk);
		mutex_unlock(&lgr->sndbufs_lock);
	}
}

static void smcr_rtoken_clear_link(struct smc_link *lnk)
{
	struct smc_link_group *lgr = lnk->lgr;
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		lgr->rtokens[i][lnk->link_idx].rkey = 0;
		lgr->rtokens[i][lnk->link_idx].dma_addr = 0;
	}
}

/* must be called under lgr->llc_conf_mutex lock */
void smcr_link_clear(struct smc_link *lnk)
{
	struct smc_ib_device *smcibdev;

	if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED)
		return;
	lnk->peer_qpn = 0;
	smc_llc_link_clear(lnk);
	smcr_buf_unmap_lgr(lnk);
	smcr_rtoken_clear_link(lnk);
	smc_ib_modify_qp_reset(lnk);
	smc_wr_free_link(lnk);
	smc_ib_destroy_queue_pair(lnk);
	smc_ib_dealloc_protection_domain(lnk);
	smc_wr_free_link_mem(lnk);
	put_device(&lnk->smcibdev->ibdev->dev);
	smcibdev = lnk->smcibdev;
	memset(lnk, 0, sizeof(struct smc_link));
	lnk->state = SMC_LNK_UNUSED;
	if (!atomic_dec_return(&smcibdev->lnk_cnt))
		wake_up(&smcibdev->lnks_deleted);
}

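/* Teardown here mirrors smcr_link_init() in reverse: WR buffers, QP, PD
 * and the device reference are released before the link struct is zeroed
 * and marked SMC_LNK_UNUSED for reuse by a later add-link flow.
 */
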
static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
			  struct smc_buf_desc *buf_desc)
{
	int i;

	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
		smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]);

	if (buf_desc->pages)
		__free_pages(buf_desc->pages, buf_desc->order);
	kfree(buf_desc);
}

static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
			  struct smc_buf_desc *buf_desc)
{
	if (is_dmb) {
		/* restore original buf len */
		buf_desc->len += sizeof(struct smcd_cdc_msg);
		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
	} else {
		kfree(buf_desc->cpu_addr);
	}
	kfree(buf_desc);
}

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc)
{
	if (lgr->is_smcd)
		smcd_buf_free(lgr, is_rmb, buf_desc);
	else
		smcr_buf_free(lgr, is_rmb, buf_desc);
}

static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf_desc;
	struct list_head *buf_list;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		if (is_rmb)
			buf_list = &lgr->rmbs[i];
		else
			buf_list = &lgr->sndbufs[i];
		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
					 list) {
			list_del(&buf_desc->list);
			smc_buf_free(lgr, is_rmb, buf_desc);
		}
	}
}

static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
	/* free send buffers */
	__smc_lgr_free_bufs(lgr, false);
	/* free rmbs */
	__smc_lgr_free_bufs(lgr, true);
}

/* remove a link group */
static void smc_lgr_free(struct smc_link_group *lgr)
{
	int i;

	smc_lgr_free_bufs(lgr);
	if (lgr->is_smcd) {
		if (!lgr->terminating) {
			smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
			put_device(&lgr->smcd->dev);
		}
		if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
			wake_up(&lgr->smcd->lgrs_deleted);
	} else {
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state != SMC_LNK_UNUSED)
				smcr_link_clear(&lgr->lnk[i]);
		}
		smc_llc_lgr_clear(lgr);
		if (!atomic_dec_return(&lgr_cnt))
			wake_up(&lgrs_deleted);
	}
	kfree(lgr);
}

static void smcd_unregister_all_dmbs(struct smc_link_group *lgr)
{
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		struct smc_buf_desc *buf_desc;

		list_for_each_entry(buf_desc, &lgr->rmbs[i], list) {
			buf_desc->len += sizeof(struct smcd_cdc_msg);
			smc_ism_unregister_dmb(lgr->smcd, buf_desc);
		}
	}
}

static void smc_sk_wake_ups(struct smc_sock *smc)
{
	smc->sk.sk_write_space(&smc->sk);
	smc->sk.sk_data_ready(&smc->sk);
	smc->sk.sk_state_change(&smc->sk);
}

/* kill a connection */
static void smc_conn_kill(struct smc_connection *conn, bool soft)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);

	if (conn->lgr->is_smcd && conn->lgr->peer_shutdown)
		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
	else
		smc_close_abort(conn);
	conn->killed = 1;
	smc->sk.sk_err = ECONNABORTED;
	smc_sk_wake_ups(smc);
	if (conn->lgr->is_smcd) {
		smc_ism_unset_conn(conn);
		if (soft)
			tasklet_kill(&conn->rx_tsklet);
		else
			tasklet_unlock_wait(&conn->rx_tsklet);
	} else {
		smc_cdc_tx_dismiss_slots(conn);
	}
	smc_lgr_unregister_conn(conn);
	smc_close_active_abort(smc);
}

static void smc_lgr_cleanup(struct smc_link_group *lgr)
{
	int i;

	if (lgr->is_smcd) {
		smc_ism_signal_shutdown(lgr);
		smcd_unregister_all_dmbs(lgr);
		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
		put_device(&lgr->smcd->dev);
	} else {
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			struct smc_link *lnk = &lgr->lnk[i];

			if (smc_link_usable(lnk))
				lnk->state = SMC_LNK_INACTIVE;
		}
		wake_up_interruptible_all(&lgr->llc_waiter);
	}
}

/* terminate link group
 * @soft: true if link group shutdown can take its time
 *	  false if immediate link group shutdown is required
 */
static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
{
	struct smc_connection *conn;
	struct smc_sock *smc;
	struct rb_node *node;

	if (lgr->terminating)
		return;	/* lgr already terminating */
	/* cancel free_work sync; this instance terminates the lgr itself */
	cancel_delayed_work_sync(&lgr->free_work);
	lgr->terminating = 1;

	/* kill remaining link group connections */
	read_lock_bh(&lgr->conns_lock);
	node = rb_first(&lgr->conns_all);
	while (node) {
		read_unlock_bh(&lgr->conns_lock);
		conn = rb_entry(node, struct smc_connection, alert_node);
		smc = container_of(conn, struct smc_sock, conn);
		sock_hold(&smc->sk); /* sock_put below */
		lock_sock(&smc->sk);
		smc_conn_kill(conn, soft);
		release_sock(&smc->sk);
		sock_put(&smc->sk); /* sock_hold above */
		read_lock_bh(&lgr->conns_lock);
		node = rb_first(&lgr->conns_all);
	}
	read_unlock_bh(&lgr->conns_lock);
	smc_lgr_cleanup(lgr);
	if (soft)
		smc_lgr_schedule_free_work_fast(lgr);
	else
		smc_lgr_free(lgr);
}

/* unlink link group and schedule termination */
void smc_lgr_terminate_sched(struct smc_link_group *lgr)
{
	spinlock_t *lgr_lock;

	smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) {
		spin_unlock_bh(lgr_lock);
		return;	/* lgr already terminating */
	}
	list_del_init(&lgr->list);
	spin_unlock_bh(lgr_lock);
	schedule_work(&lgr->terminate_work);
}

/* Called when peer lgr shutdown (regularly or abnormally) is received */
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);

	/* run common cleanup function and build free list */
	spin_lock_bh(&dev->lgr_lock);
	list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
		if ((!peer_gid || lgr->peer_gid == peer_gid) &&
		    (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
			if (peer_gid) /* peer triggered termination */
				lgr->peer_shutdown = 1;
			list_move(&lgr->list, &lgr_free_list);
		}
	}
	spin_unlock_bh(&dev->lgr_lock);

	/* cancel the regular free workers and actually free lgrs */
	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		schedule_work(&lgr->terminate_work);
	}
}

/* Called when an SMCD device is removed or the smc module is unloaded */
void smc_smcd_terminate_all(struct smcd_dev *smcd)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_free_list);

	spin_lock_bh(&smcd->lgr_lock);
	list_splice_init(&smcd->lgr_list, &lgr_free_list);
	list_for_each_entry(lgr, &lgr_free_list, list)
		lgr->freeing = 1;
	spin_unlock_bh(&smcd->lgr_lock);

	list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		__smc_lgr_terminate(lgr, false);
	}

	if (atomic_read(&smcd->lgr_cnt))
		wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt));
}

/* Called when an SMCR device is removed or the smc module is unloaded.
 * If smcibdev is given, all SMCR link groups using this device are terminated.
 * If smcibdev is NULL, all SMCR link groups are terminated.
 */
void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_free_list);
	int i;

	spin_lock_bh(&smc_lgr_list.lock);
	if (!smcibdev) {
		list_splice_init(&smc_lgr_list.list, &lgr_free_list);
		list_for_each_entry(lgr, &lgr_free_list, list)
			lgr->freeing = 1;
	} else {
		list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
			for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
				if (lgr->lnk[i].smcibdev == smcibdev)
					smcr_link_down_cond_sched(&lgr->lnk[i]);
			}
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		__smc_lgr_terminate(lgr, false);
	}

	if (smcibdev) {
		if (atomic_read(&smcibdev->lnk_cnt))
			wait_event(smcibdev->lnks_deleted,
				   !atomic_read(&smcibdev->lnk_cnt));
	} else {
		if (atomic_read(&lgr_cnt))
			wait_event(lgrs_deleted, !atomic_read(&lgr_cnt));
	}
}

/* link is up - establish alternate link if applicable */
static void smcr_link_up(struct smc_link_group *lgr,
			 struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link *link = NULL;

	if (list_empty(&lgr->list) ||
	    lgr->type == SMC_LGR_SYMMETRIC ||
	    lgr->type == SMC_LGR_ASYMMETRIC_PEER)
		return;

	if (lgr->role == SMC_SERV) {
		/* trigger local add link processing */
		link = smc_llc_usable_link(lgr);
		if (!link)
			return;
		smc_llc_srv_add_link_local(link);
	} else {
		/* invite server to start add link processing */
		u8 gid[SMC_GID_SIZE];

		if (smc_ib_determine_gid(smcibdev, ibport, lgr->vlan_id, gid,
					 NULL))
			return;
		if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
			/* some other llc task is ongoing */
			wait_event_interruptible_timeout(lgr->llc_waiter,
				(lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE),
				SMC_LLC_WAIT_TIME);
		}
		if (list_empty(&lgr->list) ||
		    !smc_ib_port_active(smcibdev, ibport))
			return; /* lgr or device no longer active */
		link = smc_llc_usable_link(lgr);
		if (!link)
			return;
		smc_llc_send_add_link(link, smcibdev->mac[ibport - 1], gid,
				      NULL, SMC_LLC_REQ);
	}
}

void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_ib_up_work *ib_work;
	struct smc_link_group *lgr, *n;

	list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) {
		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
			    SMC_MAX_PNETID_LEN) ||
		    lgr->type == SMC_LGR_SYMMETRIC ||
		    lgr->type == SMC_LGR_ASYMMETRIC_PEER)
			continue;
		ib_work = kmalloc(sizeof(*ib_work), GFP_KERNEL);
		if (!ib_work)
			continue;
		INIT_WORK(&ib_work->work, smc_link_up_work);
		ib_work->lgr = lgr;
		ib_work->smcibdev = smcibdev;
		ib_work->ibport = ibport;
		schedule_work(&ib_work->work);
	}
}

/* link is down - switch connections to alternate link,
 * must be called under lgr->llc_conf_mutex lock
 */
static void smcr_link_down(struct smc_link *lnk)
{
	struct smc_link_group *lgr = lnk->lgr;
	struct smc_link *to_lnk;
	int del_link_id;

	if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list))
		return;

	smc_ib_modify_qp_reset(lnk);
	to_lnk = smc_switch_conns(lgr, lnk, true);
	if (!to_lnk) { /* no backup link available */
		smcr_link_clear(lnk);
		return;
	}
	lgr->type = SMC_LGR_SINGLE;
	del_link_id = lnk->link_id;

	if (lgr->role == SMC_SERV) {
		/* trigger local delete link processing */
		smc_llc_srv_delete_link_local(to_lnk, del_link_id);
	} else {
		if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
			/* another llc task is ongoing */
			mutex_unlock(&lgr->llc_conf_mutex);
			wait_event_interruptible_timeout(lgr->llc_waiter,
				(lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE),
				SMC_LLC_WAIT_TIME);
			mutex_lock(&lgr->llc_conf_mutex);
		}
		smc_llc_send_delete_link(to_lnk, del_link_id, SMC_LLC_REQ, true,
					 SMC_LLC_DEL_LOST_PATH);
	}
}

/* must be called under lgr->llc_conf_mutex lock */
void smcr_link_down_cond(struct smc_link *lnk)
{
	if (smc_link_downing(&lnk->state))
		smcr_link_down(lnk);
}

/* will get the lgr->llc_conf_mutex lock */
void smcr_link_down_cond_sched(struct smc_link *lnk)
{
	if (smc_link_downing(&lnk->state))
		schedule_work(&lnk->link_down_wrk);
}

void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr, *n;
	int i;

	list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) {
		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
			    SMC_MAX_PNETID_LEN))
			continue; /* lgr is not affected */
		if (list_empty(&lgr->list))
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			struct smc_link *lnk = &lgr->lnk[i];

			if (smc_link_usable(lnk) &&
			    lnk->smcibdev == smcibdev && lnk->ibport == ibport)
				smcr_link_down_cond_sched(lnk);
		}
	}
}

static void smc_link_up_work(struct work_struct *work)
{
	struct smc_ib_up_work *ib_work = container_of(work,
						      struct smc_ib_up_work,
						      work);
	struct smc_link_group *lgr = ib_work->lgr;

	if (list_empty(&lgr->list))
		goto out;
	smcr_link_up(lgr, ib_work->smcibdev, ib_work->ibport);
out:
	kfree(ib_work);
}

static void smc_link_down_work(struct work_struct *work)
{
	struct smc_link *link = container_of(work, struct smc_link,
					     link_down_wrk);
	struct smc_link_group *lgr = link->lgr;

	if (list_empty(&lgr->list))
		return;
	wake_up_interruptible_all(&lgr->llc_waiter);
	mutex_lock(&lgr->llc_conf_mutex);
	smcr_link_down(link);
	mutex_unlock(&lgr->llc_conf_mutex);
}

/* Determine vlan of internal TCP socket.
 * @ini: the determined vlan id is stored into ini->vlan_id
 */
int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct net_device *ndev;
	int i, nest_lvl, rc = 0;

	ini->vlan_id = 0;
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	ndev = dst->dev;
	if (is_vlan_dev(ndev)) {
		ini->vlan_id = vlan_dev_vlan_id(ndev);
		goto out_rel;
	}

	rtnl_lock();
	nest_lvl = ndev->lower_level;
	for (i = 0; i < nest_lvl; i++) {
		struct list_head *lower = &ndev->adj_list.lower;

		if (list_empty(lower))
			break;
		lower = lower->next;
		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
		if (is_vlan_dev(ndev)) {
			ini->vlan_id = vlan_dev_vlan_id(ndev);
			break;
		}
	}
	rtnl_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

static bool smcr_lgr_match(struct smc_link_group *lgr,
			   struct smc_clc_msg_local *lcl,
			   enum smc_lgr_role role, u32 clcqpn)
{
	int i;

	if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) ||
	    lgr->role != role)
		return false;

	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		if (lgr->lnk[i].state != SMC_LNK_ACTIVE)
			continue;
		if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) &&
		    !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) &&
		    !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac)))
			return true;
	}
	return false;
}

static bool smcd_lgr_match(struct smc_link_group *lgr,
			   struct smcd_dev *smcismdev, u64 peer_gid)
{
	return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
}

/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
{
	struct smc_connection *conn = &smc->conn;
	struct list_head *lgr_list;
	struct smc_link_group *lgr;
	enum smc_lgr_role role;
	spinlock_t *lgr_lock;
	int rc = 0;

	lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list;
	lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock;
	ini->cln_first_contact = SMC_FIRST_CONTACT;
	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	if (role == SMC_CLNT && ini->srv_first_contact)
		/* create new link group as well */
		goto create;

	/* determine if an existing link group can be reused */
	spin_lock_bh(lgr_lock);
	list_for_each_entry(lgr, lgr_list, list) {
		write_lock_bh(&lgr->conns_lock);
		if ((ini->is_smcd ?
		     smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
		     smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
		    !lgr->sync_err &&
		    lgr->vlan_id == ini->vlan_id &&
		    (role == SMC_CLNT ||
		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
			/* link group found */
			ini->cln_first_contact = SMC_REUSE_CONTACT;
			conn->lgr = lgr;
			rc = smc_lgr_register_conn(conn); /* add conn to lgr */
			write_unlock_bh(&lgr->conns_lock);
			if (!rc && delayed_work_pending(&lgr->free_work))
				cancel_delayed_work(&lgr->free_work);
			break;
		}
		write_unlock_bh(&lgr->conns_lock);
	}
	spin_unlock_bh(lgr_lock);
	if (rc)
		return rc;

	if (role == SMC_CLNT && !ini->srv_first_contact &&
	    ini->cln_first_contact == SMC_FIRST_CONTACT) {
		/* Server reuses a link group, but Client wants to start
		 * a new one
		 * send out_of_sync decline, reason synchr. error
		 */
		return SMC_CLC_DECL_SYNCERR;
	}

create:
	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
		rc = smc_lgr_create(smc, ini);
		if (rc)
			goto out;
		lgr = conn->lgr;
		write_lock_bh(&lgr->conns_lock);
		rc = smc_lgr_register_conn(conn); /* add smc conn to lgr */
		write_unlock_bh(&lgr->conns_lock);
		if (rc)
			goto out;
	}
	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
	conn->urg_state = SMC_URG_READ;
	if (ini->is_smcd) {
		conn->rx_off = sizeof(struct smcd_cdc_msg);
		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
	}
#ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&conn->acurs_lock);
#endif

out:
	return rc;
}

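/* First-contact decision in short: a client that was told by the server
 * to create a new link group (srv_first_contact) skips reuse entirely;
 * otherwise both sides try to match an existing lgr, and a client that
 * found none while the server reused one declines with
 * SMC_CLC_DECL_SYNCERR to keep both ends in sync.
 */
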
/* convert the RMB size into the compressed notation - minimum 16K.
 * In contrast to plain ilog2, this rounds towards the next power of 2,
 * so the socket application gets at least its desired sndbuf / rcvbuf size.
 */
static u8 smc_compress_bufsize(int size)
{
	u8 compressed;

	if (size <= SMC_BUF_MIN_SIZE)
		size = SMC_BUF_MIN_SIZE;

	size = (size - 1) >> 14;
	compressed = ilog2(size) + 1;
	if (compressed >= SMC_RMBE_SIZES)
		compressed = SMC_RMBE_SIZES - 1;
	return compressed;
}

/* convert the RMB size from compressed notation into integer */
int smc_uncompress_bufsize(u8 compressed)
{
	u32 size;

	size = 0x00000001 << (((int)compressed) + 14);
	return (int)size;
}

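/* Worked example of the round trip, assuming SMC_BUF_MIN_SIZE is 16KB:
 * smc_compress_bufsize(16384) -> 0, and smc_compress_bufsize(49152) ->
 * ((49151 >> 14) = 2, ilog2(2) + 1) = 2; smc_uncompress_bufsize(2) =
 * 1 << 16 = 65536, i.e. a 48KB request is rounded up to the next power
 * of two, 64KB.
 */
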
/* try to reuse a sndbuf or rmb description slot for a certain
 * buffer size; if not available, return NULL
 */
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
					     struct mutex *lock,
					     struct list_head *buf_list)
{
	struct smc_buf_desc *buf_slot;

	mutex_lock(lock);
	list_for_each_entry(buf_slot, buf_list, list) {
		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
			mutex_unlock(lock);
			return buf_slot;
		}
	}
	mutex_unlock(lock);
	return NULL;
}

/* one of the conditions for announcing a receiver's current window size is
 * that it "results in a minimum increase in the window size of 10% of the
 * receive buffer space" [RFC7609]
 */
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}

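/* Example: for a 64KB RMB this caps the update limit at
 * min(65536 / 10, SOCK_MIN_SNDBUF / 2) bytes - the peer is only told
 * about freed receive space once at least that much has accumulated,
 * which limits consumer-cursor update traffic.
 */
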
/* map an rmb buf to a link */
static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb,
			     struct smc_link *lnk)
{
	int rc;

	if (buf_desc->is_map_ib[lnk->link_idx])
		return 0;

	rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL);
	if (rc)
		return rc;
	sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl,
		   buf_desc->cpu_addr, buf_desc->len);

	/* map sg table to DMA address */
	rc = smc_ib_buf_map_sg(lnk, buf_desc,
			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
	/* SMC protocol depends on mapping to one DMA address only */
	if (rc != 1) {
		rc = -EAGAIN;
		goto free_table;
	}

	/* create a new memory region for the RMB */
	if (is_rmb) {
		rc = smc_ib_get_memory_region(lnk->roce_pd,
					      IB_ACCESS_REMOTE_WRITE |
					      IB_ACCESS_LOCAL_WRITE,
					      buf_desc, lnk->link_idx);
		if (rc)
			goto buf_unmap;
		smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE);
	}
	buf_desc->is_map_ib[lnk->link_idx] = true;
	return 0;

buf_unmap:
	smc_ib_buf_unmap_sg(lnk, buf_desc,
			    is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
free_table:
	sg_free_table(&buf_desc->sgt[lnk->link_idx]);
	return rc;
}

/* register a new rmb on IB device,
 * must be called under lgr->llc_conf_mutex lock
 */
int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc)
{
	if (list_empty(&link->lgr->list))
		return -ENOLINK;
	if (!rmb_desc->is_reg_mr[link->link_idx]) {
		/* register memory region for new rmb */
		if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) {
			rmb_desc->is_reg_err = true;
			return -EFAULT;
		}
		rmb_desc->is_reg_mr[link->link_idx] = true;
	}
	return 0;
}

static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock,
			     struct list_head *lst, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf;
	int rc = 0;

	mutex_lock(lock);
	list_for_each_entry_safe(buf_desc, bf, lst, list) {
		if (!buf_desc->used)
			continue;
		rc = smcr_buf_map_link(buf_desc, is_rmb, lnk);
		if (rc)
			goto out;
	}
out:
	mutex_unlock(lock);
	return rc;
}

/* map all used buffers of lgr for a new link */
int smcr_buf_map_lgr(struct smc_link *lnk)
{
	struct smc_link_group *lgr = lnk->lgr;
	int i, rc = 0;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock,
				       &lgr->rmbs[i], true);
		if (rc)
			return rc;
		rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock,
				       &lgr->sndbufs[i], false);
		if (rc)
			return rc;
	}
	return 0;
}

/* register all used buffers of lgr for a new link,
 * must be called under lgr->llc_conf_mutex lock
 */
int smcr_buf_reg_lgr(struct smc_link *lnk)
{
	struct smc_link_group *lgr = lnk->lgr;
	struct smc_buf_desc *buf_desc, *bf;
	int i, rc = 0;

	mutex_lock(&lgr->rmbs_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) {
			if (!buf_desc->used)
				continue;
			rc = smcr_link_reg_rmb(lnk, buf_desc);
			if (rc)
				goto out;
		}
	}
out:
	mutex_unlock(&lgr->rmbs_lock);
	return rc;
}

static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
						bool is_rmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;

	/* try to alloc a new buffer */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);

	buf_desc->order = get_order(bufsize);
	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
				      __GFP_NOMEMALLOC | __GFP_COMP |
				      __GFP_NORETRY | __GFP_ZERO,
				      buf_desc->order);
	if (!buf_desc->pages) {
		kfree(buf_desc);
		return ERR_PTR(-EAGAIN);
	}
	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
	buf_desc->len = bufsize;
	return buf_desc;
}

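/* The GFP flags above are deliberate: __GFP_NORETRY and __GFP_NOMEMALLOC
 * make the high-order allocation fail fast instead of stalling under
 * memory pressure; __smc_buf_create() then retries with the next smaller
 * compressed buffer size (the -EAGAIN/continue path below).
 */
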
/* map buf_desc on all usable links,
 * unused buffers stay mapped as long as the link is up
 */
static int smcr_buf_map_usable_links(struct smc_link_group *lgr,
				     struct smc_buf_desc *buf_desc, bool is_rmb)
{
	int i, rc = 0;

	/* protect against parallel link reconfiguration */
	mutex_lock(&lgr->llc_conf_mutex);
	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		struct smc_link *lnk = &lgr->lnk[i];

		if (!smc_link_usable(lnk))
			continue;
		if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) {
			rc = -ENOMEM;
			goto out;
		}
	}
out:
	mutex_unlock(&lgr->llc_conf_mutex);
	return rc;
}

#define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */

static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
						bool is_dmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	int rc;

	if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
		return ERR_PTR(-EAGAIN);

	/* try to alloc a new DMB */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);
	if (is_dmb) {
		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
		if (rc) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
		/* CDC header stored in buf. So, pretend it was smaller */
		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
	} else {
		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
					     __GFP_NOWARN | __GFP_NORETRY |
					     __GFP_NOMEMALLOC);
		if (!buf_desc->cpu_addr) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->len = bufsize;
	}
	return buf_desc;
}

static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
{
	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	struct list_head *buf_list;
	int bufsize, bufsize_short;
	struct mutex *lock;	/* lock buffer list */
	int sk_buf_size;

	if (is_rmb)
		/* use socket recv buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_rcvbuf / 2;
	else
		/* use socket send buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_sndbuf / 2;

	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
	     bufsize_short >= 0; bufsize_short--) {

		if (is_rmb) {
			lock = &lgr->rmbs_lock;
			buf_list = &lgr->rmbs[bufsize_short];
		} else {
			lock = &lgr->sndbufs_lock;
			buf_list = &lgr->sndbufs[bufsize_short];
		}
		bufsize = smc_uncompress_bufsize(bufsize_short);
		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
			continue;

		/* check for reusable slot in the link group */
		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
		if (buf_desc) {
			memset(buf_desc->cpu_addr, 0, bufsize);
			break; /* found reusable slot */
		}

		if (is_smcd)
			buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
		else
			buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);

		if (PTR_ERR(buf_desc) == -ENOMEM)
			break;
		if (IS_ERR(buf_desc))
			continue;

		buf_desc->used = 1;
		mutex_lock(lock);
		list_add(&buf_desc->list, buf_list);
		mutex_unlock(lock);
		break; /* found */
	}

	if (IS_ERR(buf_desc))
		return -ENOMEM;

	if (!is_smcd) {
		if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) {
			smcr_buf_unuse(buf_desc, lgr);
			return -ENOMEM;
		}
	}

	if (is_rmb) {
		conn->rmb_desc = buf_desc;
		conn->rmbe_size_short = bufsize_short;
		smc->sk.sk_rcvbuf = bufsize * 2;
		atomic_set(&conn->bytes_to_rcv, 0);
		conn->rmbe_update_limit =
			smc_rmb_wnd_update_limit(buf_desc->len);
		if (is_smcd)
			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
	} else {
		conn->sndbuf_desc = buf_desc;
		smc->sk.sk_sndbuf = bufsize * 2;
		atomic_set(&conn->sndbuf_space, bufsize);
	}
	return 0;
}

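/* Sizing sketch: with a sk_rcvbuf of e.g. 4MB the loop starts at
 * smc_compress_bufsize(2MB) and walks down until a reusable slot or a
 * fresh allocation succeeds; the resulting sk_rcvbuf/sk_sndbuf is set to
 * twice the RMB/sndbuf size actually obtained.
 */
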
void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
	if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk))
		return;
	smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
	if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk))
		return;
	smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
	int i;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		if (!smc_link_usable(&conn->lgr->lnk[i]))
			continue;
		smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc,
				       DMA_FROM_DEVICE);
	}
}

void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
	int i;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		if (!smc_link_usable(&conn->lgr->lnk[i]))
			continue;
		smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc,
					  DMA_FROM_DEVICE);
	}
}

/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
 * extra RMB for every connection in a link group)
 */
int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
	int rc;

	/* create send buffer */
	rc = __smc_buf_create(smc, is_smcd, false);
	if (rc)
		return rc;
	/* create rmb */
	rc = __smc_buf_create(smc, is_smcd, true);
	if (rc)
		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
	return rc;
}

static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
	int i;

	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
			return i;
	}
	return -ENOSPC;
}

static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx,
				   u32 rkey)
{
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (test_bit(i, lgr->rtokens_used_mask) &&
		    lgr->rtokens[i][lnk_idx].rkey == rkey)
			return i;
	}
	return -ENOENT;
}

/* set rtoken for a new link to an existing rmb */
void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
		    __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey)
{
	int rtok_idx;

	rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known));
	if (rtok_idx == -ENOENT)
		return;
	lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey);
	lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr);
}

/* set rtoken for a new link whose link_id is given */
void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
		     __be64 nw_vaddr, __be32 nw_rkey)
{
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	bool found = false;
	int link_idx;

	for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) {
		if (lgr->lnk[link_idx].link_id == link_id) {
			found = true;
			break;
		}
	}
	if (!found)
		return;
	lgr->rtokens[rtok_idx][link_idx].rkey = rkey;
	lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr;
}

/* add a new rtoken from peer */
int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
		    lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			/* already in list */
			return i;
		}
	}
	i = smc_rmb_reserve_rtoken_idx(lgr);
	if (i < 0)
		return i;
	lgr->rtokens[i][lnk->link_idx].rkey = rkey;
	lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr;
	return i;
}

/* delete an rtoken from all links */
int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	u32 rkey = ntohl(nw_rkey);
	int i, j;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) {
				lgr->rtokens[i][j].rkey = 0;
				lgr->rtokens[i][j].dma_addr = 0;
			}
			clear_bit(i, lgr->rtokens_used_mask);
			return 0;
		}
	}
	return -ENOENT;
}

1833 int smc_rmb_rtoken_handling(struct smc_connection *conn,
1834 struct smc_link *lnk,
1835 struct smc_clc_msg_accept_confirm *clc)
1837 conn->rtoken_idx = smc_rtoken_add(lnk, clc->rmb_dma_addr,
1839 if (conn->rtoken_idx < 0)
1840 return conn->rtoken_idx;
static void smc_core_going_away(void)
{
	struct smc_ib_device *smcibdev;
	struct smcd_dev *smcd;

	spin_lock(&smc_ib_devices.lock);
	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
		int i;

		for (i = 0; i < SMC_MAX_PORTS; i++)
			set_bit(i, smcibdev->ports_going_away);
	}
	spin_unlock(&smc_ib_devices.lock);

	spin_lock(&smcd_dev_list.lock);
	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
		smcd->going_away = 1;
	}
	spin_unlock(&smcd_dev_list.lock);
}

/* Clean up all SMC link groups */
static void smc_lgrs_shutdown(void)
{
	struct smcd_dev *smcd;

	smc_core_going_away();

	smc_smcr_terminate_all(NULL);

	spin_lock(&smcd_dev_list.lock);
	list_for_each_entry(smcd, &smcd_dev_list.list, list)
		smc_smcd_terminate_all(smcd);
	spin_unlock(&smcd_dev_list.lock);
}

static int smc_core_reboot_event(struct notifier_block *this,
				 unsigned long event, void *ptr)
{
	smc_lgrs_shutdown();
	smc_ib_unregister_client();
	return 0;
}

static struct notifier_block smc_reboot_notifier = {
	.notifier_call = smc_core_reboot_event,
};

int __init smc_core_init(void)
{
	return register_reboot_notifier(&smc_reboot_notifier);
}

/* Called (from smc_exit) when module is removed */
void smc_core_exit(void)
{
	unregister_reboot_notifier(&smc_reboot_notifier);
	smc_lgrs_shutdown();
}