net/smc: add SMC-D support in data transfer
net/smc/smc_core.c
// SPDX-License-Identifier: GPL-2.0
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  Basic Transport Functions exploiting InfiniBand API
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"

#define SMC_LGR_NUM_INCR		256
#define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
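/* in jiffies: ~600s of idle time before a server link group is freed,
 * ~610s for a client link group; see the rationale in
 * smc_lgr_schedule_free_work() below
 */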

static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
	.num = 0,
};

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc);

static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
	/* client link group creation always follows the server link group
	 * creation. For client use a somewhat higher removal delay time,
	 * otherwise there is a risk of out-of-sync link groups.
	 */
	mod_delayed_work(system_wq, &lgr->free_work,
			 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
			 SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV);
}

/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn	connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
	struct rb_node **link, *parent = NULL;
	u32 token = conn->alert_token_local;

	link = &conn->lgr->conns_all.rb_node;
	while (*link) {
		struct smc_connection *cur = rb_entry(*link,
					struct smc_connection, alert_node);

		parent = *link;
		if (cur->alert_token_local > token)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	/* Put the new node there */
	rb_link_node(&conn->alert_node, parent, link);
	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}
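
/* The matching lookup, smc_lgr_find_conn() (used by smc_lgr_register_conn()
 * below), is the standard rbtree descent on the same key; a minimal sketch,
 * assuming only the fields used above:
 *
 *	struct rb_node *node = lgr->conns_all.rb_node;
 *
 *	while (node) {
 *		struct smc_connection *cur = rb_entry(node,
 *					struct smc_connection, alert_node);
 *
 *		if (cur->alert_token_local > token)
 *			node = node->rb_left;
 *		else if (cur->alert_token_local < token)
 *			node = node->rb_right;
 *		else
 *			return cur;
 *	}
 *	return NULL;
 */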

/* Register connection in link group by assigning an alert token,
 * which is inserted into the link group's search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static void smc_lgr_register_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	static atomic_t nexttoken = ATOMIC_INIT(0);

	/* find a new alert_token_local value not yet used by some connection
	 * in this link group
	 */
	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
	while (!conn->alert_token_local) {
		conn->alert_token_local = atomic_inc_return(&nexttoken);
		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
			conn->alert_token_local = 0;
	}
	smc_lgr_add_alert_token(conn);
	conn->lgr->conns_num++;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	struct smc_link_group *lgr = conn->lgr;

	rb_erase(&conn->alert_node, &lgr->conns_all);
	lgr->conns_num--;
	conn->alert_token_local = 0;
	conn->lgr = NULL;
	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection and trigger lgr freeing if applicable
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;
	int reduced = 0;

	write_lock_bh(&lgr->conns_lock);
	if (conn->alert_token_local) {
		reduced = 1;
		__smc_lgr_unregister_conn(conn);
	}
	write_unlock_bh(&lgr->conns_lock);
	if (!reduced || lgr->conns_num)
		return;
	smc_lgr_schedule_free_work(lgr);
}

static void smc_lgr_free_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(to_delayed_work(work),
						  struct smc_link_group,
						  free_work);
	bool conns;

	spin_lock_bh(&smc_lgr_list.lock);
	if (list_empty(&lgr->list))
		goto free;
	read_lock_bh(&lgr->conns_lock);
	conns = RB_EMPTY_ROOT(&lgr->conns_all);
	read_unlock_bh(&lgr->conns_lock);
	if (!conns) { /* number of lgr connections is no longer zero */
		spin_unlock_bh(&smc_lgr_list.lock);
		return;
	}
	list_del_init(&lgr->list); /* remove from smc_lgr_list */
free:
	spin_unlock_bh(&smc_lgr_list.lock);
	if (!delayed_work_pending(&lgr->free_work)) {
		if (!lgr->is_smcd &&
		    lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
			smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
		smc_lgr_free(lgr);
	}
}

/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc, bool is_smcd,
			  struct smc_ib_device *smcibdev, u8 ibport,
			  char *peer_systemid, unsigned short vlan_id,
			  struct smcd_dev *smcismdev, u64 peer_gid)
{
	struct smc_link_group *lgr;
	struct smc_link *lnk;
	u8 rndvec[3];
	int rc = 0;
	int i;

	if (is_smcd && vlan_id) {
		rc = smc_ism_get_vlan(smcismdev, vlan_id);
		if (rc)
			goto out;
	}

	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
	if (!lgr) {
		rc = -ENOMEM;
		goto out;
	}
	lgr->is_smcd = is_smcd;
	lgr->sync_err = 0;
	lgr->vlan_id = vlan_id;
	rwlock_init(&lgr->sndbufs_lock);
	rwlock_init(&lgr->rmbs_lock);
	rwlock_init(&lgr->conns_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		INIT_LIST_HEAD(&lgr->sndbufs[i]);
		INIT_LIST_HEAD(&lgr->rmbs[i]);
	}
	smc_lgr_list.num += SMC_LGR_NUM_INCR;
	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
	lgr->conns_all = RB_ROOT;
	if (is_smcd) {
		/* SMC-D specific settings */
		lgr->peer_gid = peer_gid;
		lgr->smcd = smcismdev;
	} else {
		/* SMC-R specific settings */
		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
		memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);

		lnk = &lgr->lnk[SMC_SINGLE_LINK];
		/* initialize link */
		lnk->state = SMC_LNK_ACTIVATING;
		lnk->link_id = SMC_SINGLE_LINK;
		lnk->smcibdev = smcibdev;
		lnk->ibport = ibport;
		lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
		if (!smcibdev->initialized)
			smc_ib_setup_per_ibdev(smcibdev);
		get_random_bytes(rndvec, sizeof(rndvec));
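		/* an InfiniBand packet sequence number (PSN) is 24 bits
		 * wide, hence the three random bytes assembled below
		 * (little-endian)
		 */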
		lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
			(rndvec[2] << 16);
		rc = smc_llc_link_init(lnk);
		if (rc)
			goto free_lgr;
		rc = smc_wr_alloc_link_mem(lnk);
		if (rc)
			goto clear_llc_lnk;
		rc = smc_ib_create_protection_domain(lnk);
		if (rc)
			goto free_link_mem;
		rc = smc_ib_create_queue_pair(lnk);
		if (rc)
			goto dealloc_pd;
		rc = smc_wr_create_link(lnk);
		if (rc)
			goto destroy_qp;
	}
	smc->conn.lgr = lgr;
	spin_lock_bh(&smc_lgr_list.lock);
	list_add(&lgr->list, &smc_lgr_list.list);
	spin_unlock_bh(&smc_lgr_list.lock);
	return 0;

destroy_qp:
	smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
	smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
	smc_wr_free_link_mem(lnk);
clear_llc_lnk:
	smc_llc_link_clear(lnk);
free_lgr:
	kfree(lgr);
out:
	return rc;
}
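
/* The unwind ladder above is the usual kernel goto-cleanup idiom: each
 * failing step jumps to the label that releases everything acquired before
 * it, in reverse order of acquisition.
 */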

static void smc_buf_unuse(struct smc_connection *conn)
{
	if (conn->sndbuf_desc)
		conn->sndbuf_desc->used = 0;
	if (conn->rmb_desc) {
		if (!conn->rmb_desc->regerr) {
			conn->rmb_desc->reused = 1;
			conn->rmb_desc->used = 0;
		} else {
			/* buf registration failed, reuse not possible */
			struct smc_link_group *lgr = conn->lgr;

			write_lock_bh(&lgr->rmbs_lock);
			list_del(&conn->rmb_desc->list);
			write_unlock_bh(&lgr->rmbs_lock);

			smc_buf_free(lgr, true, conn->rmb_desc);
		}
	}
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
	if (!conn->lgr)
		return;
	if (conn->lgr->is_smcd) {
		smc_ism_unset_conn(conn);
		tasklet_kill(&conn->rx_tsklet);
	} else {
		smc_cdc_tx_dismiss_slots(conn);
	}
	smc_lgr_unregister_conn(conn);
	smc_buf_unuse(conn);
}

static void smc_link_clear(struct smc_link *lnk)
{
	lnk->peer_qpn = 0;
	smc_llc_link_clear(lnk);
	smc_ib_modify_qp_reset(lnk);
	smc_wr_free_link(lnk);
	smc_ib_destroy_queue_pair(lnk);
	smc_ib_dealloc_protection_domain(lnk);
	smc_wr_free_link_mem(lnk);
}

static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
			  struct smc_buf_desc *buf_desc)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

	if (is_rmb) {
		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
			smc_ib_put_memory_region(
					buf_desc->mr_rx[SMC_SINGLE_LINK]);
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_FROM_DEVICE);
	} else {
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_TO_DEVICE);
	}
	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
	if (buf_desc->pages)
		__free_pages(buf_desc->pages, buf_desc->order);
	kfree(buf_desc);
}

static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
			  struct smc_buf_desc *buf_desc)
{
	if (is_dmb) {
		/* restore original buf len */
		buf_desc->len += sizeof(struct smcd_cdc_msg);
		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
	} else {
		kfree(buf_desc->cpu_addr);
	}
	kfree(buf_desc);
}

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc)
{
	if (lgr->is_smcd)
		smcd_buf_free(lgr, is_rmb, buf_desc);
	else
		smcr_buf_free(lgr, is_rmb, buf_desc);
}

static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf_desc;
	struct list_head *buf_list;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		if (is_rmb)
			buf_list = &lgr->rmbs[i];
		else
			buf_list = &lgr->sndbufs[i];
		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
					 list) {
			list_del(&buf_desc->list);
			smc_buf_free(lgr, is_rmb, buf_desc);
		}
	}
}

static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
	/* free send buffers */
	__smc_lgr_free_bufs(lgr, false);
	/* free rmbs */
	__smc_lgr_free_bufs(lgr, true);
}

/* remove a link group */
void smc_lgr_free(struct smc_link_group *lgr)
{
	smc_lgr_free_bufs(lgr);
	if (lgr->is_smcd)
		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
	else
		smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
	kfree(lgr);
}

void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* terminate link group abnormally */
static void __smc_lgr_terminate(struct smc_link_group *lgr)
{
	struct smc_connection *conn;
	struct smc_sock *smc;
	struct rb_node *node;

	if (lgr->terminating)
		return;	/* lgr already terminating */
	lgr->terminating = 1;
	if (!list_empty(&lgr->list)) /* forget lgr */
		list_del_init(&lgr->list);
	if (!lgr->is_smcd)
		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);

	write_lock_bh(&lgr->conns_lock);
	node = rb_first(&lgr->conns_all);
	while (node) {
		conn = rb_entry(node, struct smc_connection, alert_node);
		smc = container_of(conn, struct smc_sock, conn);
		sock_hold(&smc->sk); /* sock_put in close work */
		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
		__smc_lgr_unregister_conn(conn);
		write_unlock_bh(&lgr->conns_lock);
		if (!schedule_work(&conn->close_work))
			sock_put(&smc->sk);
		write_lock_bh(&lgr->conns_lock);
		node = rb_first(&lgr->conns_all);
	}
	write_unlock_bh(&lgr->conns_lock);
	if (!lgr->is_smcd)
		wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
	smc_lgr_schedule_free_work(lgr);
}

void smc_lgr_terminate(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	__smc_lgr_terminate(lgr);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* Called when IB port is terminated */
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr, *l;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (!lgr->is_smcd &&
		    lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
			__smc_lgr_terminate(lgr);
	}
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* Called when SMC-D device is terminated or peer is lost */
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);

	/* run common cleanup function and build free list */
	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (lgr->is_smcd && lgr->smcd == dev &&
		    (!peer_gid || lgr->peer_gid == peer_gid) &&
		    !list_empty(&lgr->list)) {
			__smc_lgr_terminate(lgr);
			list_move(&lgr->list, &lgr_free_list);
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	/* cancel the regular free workers and actually free lgrs */
	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		cancel_delayed_work_sync(&lgr->free_work);
		smc_lgr_free(lgr);
	}
}
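
/* Note the difference from smc_port_terminate() above: there the freeing is
 * left to the delayed free_work, while here the affected link groups are
 * moved to a private list under the lock and freed synchronously, since the
 * ISM device or the peer is already gone.
 */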

/* Determine vlan of internal TCP socket.
 * @vlan_id: address to store the determined vlan id into
 */
static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct net_device *ndev;
	int i, nest_lvl, rc = 0;

	*vlan_id = 0;
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	ndev = dst->dev;
	if (is_vlan_dev(ndev)) {
		*vlan_id = vlan_dev_vlan_id(ndev);
		goto out_rel;
	}

	rtnl_lock();
	nest_lvl = dev_get_nest_level(ndev);
	for (i = 0; i < nest_lvl; i++) {
		struct list_head *lower = &ndev->adj_list.lower;

		if (list_empty(lower))
			break;
		lower = lower->next;
		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
		if (is_vlan_dev(ndev)) {
			*vlan_id = vlan_dev_vlan_id(ndev);
			break;
		}
	}
	rtnl_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

/* determine the link gid matching the vlan id of the link group */
static int smc_link_determine_gid(struct smc_link_group *lgr)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
	struct ib_gid_attr gattr;
	union ib_gid gid;
	int i;

	if (!lgr->vlan_id) {
		lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
		return 0;
	}

	for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
	     i++) {
		if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
				 &gattr))
			continue;
		if (gattr.ndev) {
			if (is_vlan_dev(gattr.ndev) &&
			    vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
				lnk->gid = gid;
				dev_put(gattr.ndev);
				return 0;
			}
			dev_put(gattr.ndev);
		}
	}
	return -ENODEV;
}

static bool smcr_lgr_match(struct smc_link_group *lgr,
			   struct smc_clc_msg_local *lcl,
			   enum smc_lgr_role role)
{
	return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
		       SMC_SYSTEMID_LEN) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
			SMC_GID_SIZE) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
			sizeof(lcl->mac)) &&
		lgr->role == role;
}

static bool smcd_lgr_match(struct smc_link_group *lgr,
			   struct smcd_dev *smcismdev, u64 peer_gid)
{
	return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
}
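
/* To summarize the two matchers: an SMC-R link group is reusable only if
 * peer system id, peer GID, peer MAC and the local role all match, whereas
 * an SMC-D link group merely has to sit on the same ISM device and lead to
 * the same peer GID. Matching vlan_id and a clean sync_err are checked
 * separately by the caller, smc_conn_create() below.
 */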

/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
		    struct smc_ib_device *smcibdev, u8 ibport,
		    struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
		    u64 peer_gid)
{
	struct smc_connection *conn = &smc->conn;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link_group *lgr;
	unsigned short vlan_id;
	enum smc_lgr_role role;
	int rc = 0;

	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
	if (rc)
		return rc;

	if ((role == SMC_CLNT) && srv_first_contact)
		/* create new link group as well */
		goto create;

	/* determine if an existing link group can be reused */
	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		write_lock_bh(&lgr->conns_lock);
		if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) :
		     smcr_lgr_match(lgr, lcl, role)) &&
		    !lgr->sync_err &&
		    lgr->vlan_id == vlan_id &&
		    (role == SMC_CLNT ||
		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
			/* link group found */
			local_contact = SMC_REUSE_CONTACT;
			conn->lgr = lgr;
			smc_lgr_register_conn(conn); /* add smc conn to lgr */
			write_unlock_bh(&lgr->conns_lock);
			break;
		}
		write_unlock_bh(&lgr->conns_lock);
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	if (role == SMC_CLNT && !srv_first_contact &&
	    (local_contact == SMC_FIRST_CONTACT)) {
		/* Server reuses a link group, but Client wants to start
		 * a new one; send out_of_sync decline, reason synchr. error
		 */
		return -ENOLINK;
	}

create:
	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport,
				    lcl->id_for_peer, vlan_id, smcd, peer_gid);
		if (rc)
			goto out;
		smc_lgr_register_conn(conn); /* add smc conn to lgr */
		if (!is_smcd)
			rc = smc_link_determine_gid(conn->lgr);
	}
	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
	conn->urg_state = SMC_URG_READ;
	if (is_smcd) {
		conn->rx_off = sizeof(struct smcd_cdc_msg);
		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
	}
#ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&conn->acurs_lock);
#endif

out:
	return rc ? rc : local_contact;
}

/* convert the RMB size into the compressed notation - minimum 16K.
 * In contrast to plain ilog2, this rounds towards the next power of 2,
 * so the socket application gets at least its desired sndbuf / rcvbuf size.
 */
static u8 smc_compress_bufsize(int size)
{
	u8 compressed;

	if (size <= SMC_BUF_MIN_SIZE)
		return 0;

	size = (size - 1) >> 14;
	compressed = ilog2(size) + 1;
	if (compressed >= SMC_RMBE_SIZES)
		compressed = SMC_RMBE_SIZES - 1;
	return compressed;
}

/* convert the RMB size from compressed notation into integer */
int smc_uncompress_bufsize(u8 compressed)
{
	u32 size;

	size = 0x00000001 << (((int)compressed) + 14);
	return (int)size;
}
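
/* A worked example of the round trip, assuming SMC_BUF_MIN_SIZE is the 16KB
 * that "minimum 16K" above refers to:
 *   smc_compress_bufsize(16384) -> 0	(the 16KB minimum)
 *   smc_compress_bufsize(65536) -> 2	((65536 - 1) >> 14 = 3, ilog2(3) + 1)
 *   smc_uncompress_bufsize(2)   -> 65536	(1 << (2 + 14))
 *   smc_compress_bufsize(65537) -> 3	which uncompresses to 128KB, i.e. a
 *					requested size is rounded up, not down
 */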

/* try to reuse a sndbuf or rmb description slot for a certain
 * buffer size; if not available, return NULL
 */
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
					     rwlock_t *lock,
					     struct list_head *buf_list)
{
	struct smc_buf_desc *buf_slot;

	read_lock_bh(lock);
	list_for_each_entry(buf_slot, buf_list, list) {
		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
			read_unlock_bh(lock);
			return buf_slot;
		}
	}
	read_unlock_bh(lock);
	return NULL;
}

/* one of the conditions for announcing a receiver's current window size is
 * that it "results in a minimum increase in the window size of 10% of the
 * receive buffer space" [RFC7609]
 */
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}
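
/* E.g. a 16KB RMB yields a limit of 1638 bytes (the 10% share), while for a
 * 64KB RMB the 10% share (6553) is capped at SOCK_MIN_SNDBUF / 2 (a little
 * over 2KB; the exact value depends on sizeof(struct sk_buff)), so large
 * RMBs still announce window updates reasonably often.
 */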

static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
						bool is_rmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	struct smc_link *lnk;
	int rc;

	/* try to alloc a new buffer */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);

	buf_desc->order = get_order(bufsize);
	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
				      __GFP_NOMEMALLOC | __GFP_COMP |
				      __GFP_NORETRY | __GFP_ZERO,
				      buf_desc->order);
	if (!buf_desc->pages) {
		kfree(buf_desc);
		return ERR_PTR(-EAGAIN);
	}
	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);

	/* build the sg table from the pages */
	lnk = &lgr->lnk[SMC_SINGLE_LINK];
	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
			    GFP_KERNEL);
	if (rc) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(rc);
	}
	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
		   buf_desc->cpu_addr, bufsize);

	/* map sg table to DMA address */
	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
	/* SMC protocol depends on mapping to one DMA address only */
	if (rc != 1) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(-EAGAIN);
	}

	/* create a new memory region for the RMB */
	if (is_rmb) {
		rc = smc_ib_get_memory_region(lnk->roce_pd,
					      IB_ACCESS_REMOTE_WRITE |
					      IB_ACCESS_LOCAL_WRITE,
					      buf_desc);
		if (rc) {
			smc_buf_free(lgr, is_rmb, buf_desc);
			return ERR_PTR(rc);
		}
	}

	buf_desc->len = bufsize;
	return buf_desc;
}

#define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */

static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
						bool is_dmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	int rc;

	if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
		return ERR_PTR(-EAGAIN);

	/* try to alloc a new DMB */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);
	if (is_dmb) {
		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
		if (rc) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
		/* CDC header stored in buf. So, pretend it was smaller */
		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
	} else {
		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
					     __GFP_NOWARN | __GFP_NORETRY |
					     __GFP_NOMEMALLOC);
		if (!buf_desc->cpu_addr) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->len = bufsize;
	}
	return buf_desc;
}

static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
{
	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	struct list_head *buf_list;
	int bufsize, bufsize_short;
	int sk_buf_size;
	rwlock_t *lock;

	if (is_rmb)
		/* use socket recv buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_rcvbuf / 2;
	else
		/* use socket send buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_sndbuf / 2;

	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
	     bufsize_short >= 0; bufsize_short--) {

		if (is_rmb) {
			lock = &lgr->rmbs_lock;
			buf_list = &lgr->rmbs[bufsize_short];
		} else {
			lock = &lgr->sndbufs_lock;
			buf_list = &lgr->sndbufs[bufsize_short];
		}
		bufsize = smc_uncompress_bufsize(bufsize_short);
		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
			continue;

		/* check for reusable slot in the link group */
		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
		if (buf_desc) {
			memset(buf_desc->cpu_addr, 0, bufsize);
			break; /* found reusable slot */
		}

		if (is_smcd)
			buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
		else
			buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);

		if (PTR_ERR(buf_desc) == -ENOMEM)
			break;
		if (IS_ERR(buf_desc))
			continue;

		buf_desc->used = 1;
		write_lock_bh(lock);
		list_add(&buf_desc->list, buf_list);
		write_unlock_bh(lock);
		break; /* found */
	}

	if (IS_ERR(buf_desc))
		return -ENOMEM;

	if (is_rmb) {
		conn->rmb_desc = buf_desc;
		conn->rmbe_size_short = bufsize_short;
		smc->sk.sk_rcvbuf = bufsize * 2;
		atomic_set(&conn->bytes_to_rcv, 0);
		conn->rmbe_update_limit =
			smc_rmb_wnd_update_limit(buf_desc->len);
		if (is_smcd)
			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
	} else {
		conn->sndbuf_desc = buf_desc;
		smc->sk.sk_sndbuf = bufsize * 2;
		atomic_set(&conn->sndbuf_space, bufsize);
	}
	return 0;
}
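
/* A sketch of the descending search above: with sk_rcvbuf = 256KB the start
 * value is 128KB, which compresses to 3, so the loop tries 128KB, 64KB, 32KB
 * and finally 16KB, preferring a reusable slot of that size in the link
 * group over a fresh allocation; -ENOMEM aborts the search, any other
 * allocation failure just falls through to the next smaller size.
 */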

void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->rmb_desc, DMA_FROM_DEVICE);
}

void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->rmb_desc, DMA_FROM_DEVICE);
}
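
/* All four sync helpers above return early for SMC-D link groups: an ISM
 * device moves the data itself, so there is no RDMA-mapped buffer whose DMA
 * view would need syncing for the CPU or the device.
 */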

/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
 * extra RMB for every connection in a link group)
 */
int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
	int rc;

	/* create send buffer */
	rc = __smc_buf_create(smc, is_smcd, false);
	if (rc)
		return rc;
	/* create rmb */
	rc = __smc_buf_create(smc, is_smcd, true);
	if (rc)
		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
	return rc;
}

static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
	int i;

	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
			return i;
	}
	return -ENOSPC;
}

/* add a new rtoken from peer */
int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
{
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			/* already in list */
			return i;
		}
	}
	i = smc_rmb_reserve_rtoken_idx(lgr);
	if (i < 0)
		return i;
	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
	return i;
}

/* delete an rtoken */
int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
{
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;

			clear_bit(i, lgr->rtokens_used_mask);
			return 0;
		}
	}
	return -ENOENT;
}

/* save rkey and dma_addr received from peer during clc handshake */
int smc_rmb_rtoken_handling(struct smc_connection *conn,
			    struct smc_clc_msg_accept_confirm *clc)
{
	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
					  clc->rmb_rkey);
	if (conn->rtoken_idx < 0)
		return conn->rtoken_idx;
	return 0;
}

/* Called (from smc_exit) when module is removed */
void smc_core_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		if (!lgr->is_smcd)
			smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
		cancel_delayed_work_sync(&lgr->free_work);
		smc_lgr_free(lgr); /* free link group */
	}
}