net/smc/af_smc.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
4  *
5  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type;
6  *  applies to SOCK_STREAM sockets only;
7  *  offers an alternative communication option for TCP-protocol sockets,
8  *  applicable with RoCE-cards only
9  *
10  *  Initial restrictions:
11  *    - support for alternate links postponed
12  *
13  *  Copyright IBM Corp. 2016, 2018
14  *
15  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
16  *              based on prototype from Frank Blaschka
17  */
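/*
 * Illustrative userspace usage (a sketch, not part of this file): an AF_SMC
 * socket is created like a TCP socket, only with the SMC address family and
 * protocol (SMCPROTO_SMC for IPv4, SMCPROTO_SMC6 for IPv6); bind()/connect()
 * then take the usual AF_INET/AF_INET6 addresses:
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 */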
18
19 #define KMSG_COMPONENT "smc"
20 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
21
22 #include <linux/module.h>
23 #include <linux/socket.h>
24 #include <linux/workqueue.h>
25 #include <linux/in.h>
26 #include <linux/sched/signal.h>
27 #include <linux/if_vlan.h>
28
29 #include <net/sock.h>
30 #include <net/tcp.h>
31 #include <net/smc.h>
32 #include <asm/ioctls.h>
33
34 #include <net/net_namespace.h>
35 #include <net/netns/generic.h>
36 #include "smc_netns.h"
37
38 #include "smc.h"
39 #include "smc_clc.h"
40 #include "smc_llc.h"
41 #include "smc_cdc.h"
42 #include "smc_core.h"
43 #include "smc_ib.h"
44 #include "smc_ism.h"
45 #include "smc_pnet.h"
46 #include "smc_tx.h"
47 #include "smc_rx.h"
48 #include "smc_close.h"
49
50 static DEFINE_MUTEX(smc_server_lgr_pending);    /* serialize link group
51                                                  * creation on server
52                                                  */
53 static DEFINE_MUTEX(smc_client_lgr_pending);    /* serialize link group
54                                                  * creation on client
55                                                  */
56
57 static void smc_tcp_listen_work(struct work_struct *);
58 static void smc_connect_work(struct work_struct *);
59
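/* forward a keepalive request to the internal clc (TCP) socket */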
60 static void smc_set_keepalive(struct sock *sk, int val)
61 {
62         struct smc_sock *smc = smc_sk(sk);
63
64         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
65 }
66
67 static struct smc_hashinfo smc_v4_hashinfo = {
68         .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
69 };
70
71 static struct smc_hashinfo smc_v6_hashinfo = {
72         .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
73 };
74
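/* add an smc sock to the hash table of its protocol (smc_v4/v6_hashinfo) */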
75 int smc_hash_sk(struct sock *sk)
76 {
77         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
78         struct hlist_head *head;
79
80         head = &h->ht;
81
82         write_lock_bh(&h->lock);
83         sk_add_node(sk, head);
84         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
85         write_unlock_bh(&h->lock);
86
87         return 0;
88 }
89 EXPORT_SYMBOL_GPL(smc_hash_sk);
90
91 void smc_unhash_sk(struct sock *sk)
92 {
93         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
94
95         write_lock_bh(&h->lock);
96         if (sk_del_node_init(sk))
97                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
98         write_unlock_bh(&h->lock);
99 }
100 EXPORT_SYMBOL_GPL(smc_unhash_sk);
101
102 struct proto smc_proto = {
103         .name           = "SMC",
104         .owner          = THIS_MODULE,
105         .keepalive      = smc_set_keepalive,
106         .hash           = smc_hash_sk,
107         .unhash         = smc_unhash_sk,
108         .obj_size       = sizeof(struct smc_sock),
109         .h.smc_hash     = &smc_v4_hashinfo,
110         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
111 };
112 EXPORT_SYMBOL_GPL(smc_proto);
113
114 struct proto smc_proto6 = {
115         .name           = "SMC6",
116         .owner          = THIS_MODULE,
117         .keepalive      = smc_set_keepalive,
118         .hash           = smc_hash_sk,
119         .unhash         = smc_unhash_sk,
120         .obj_size       = sizeof(struct smc_sock),
121         .h.smc_hash     = &smc_v6_hashinfo,
122         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
123 };
124 EXPORT_SYMBOL_GPL(smc_proto6);
125
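/* release an smc sock with the sock lock held by the caller; runs the active
 * close for native SMC connections and, once the sock is in SMC_CLOSED state,
 * releases the clc socket and frees the SMC connection
 */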
126 static int __smc_release(struct smc_sock *smc)
127 {
128         struct sock *sk = &smc->sk;
129         int rc = 0;
130
131         if (!smc->use_fallback) {
132                 rc = smc_close_active(smc);
133                 sock_set_flag(sk, SOCK_DEAD);
134                 sk->sk_shutdown |= SHUTDOWN_MASK;
135         } else {
136                 if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
137                         sock_put(sk); /* passive closing */
138                 if (sk->sk_state == SMC_LISTEN) {
139                         /* wake up clcsock accept */
140                         rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
141                 }
142                 sk->sk_state = SMC_CLOSED;
143                 sk->sk_state_change(sk);
144         }
145
146         sk->sk_prot->unhash(sk);
147
148         if (sk->sk_state == SMC_CLOSED) {
149                 if (smc->clcsock) {
150                         release_sock(sk);
151                         smc_clcsock_release(smc);
152                         lock_sock(sk);
153                 }
154                 if (!smc->use_fallback)
155                         smc_conn_free(&smc->conn);
156         }
157
158         return rc;
159 }
160
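/* close the socket: abort a dangling non-blocking connect, release the smc
 * sock and detach it from the struct socket
 */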
161 static int smc_release(struct socket *sock)
162 {
163         struct sock *sk = sock->sk;
164         struct smc_sock *smc;
165         int rc = 0;
166
167         if (!sk)
168                 goto out;
169
170         smc = smc_sk(sk);
171
172         /* cleanup for a dangling non-blocking connect */
173         if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
174                 tcp_abort(smc->clcsock->sk, ECONNABORTED);
175         flush_work(&smc->connect_work);
176
177         if (sk->sk_state == SMC_LISTEN)
178                 /* smc_close_non_accepted() is called and acquires
179                  * sock lock for child sockets again
180                  */
181                 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
182         else
183                 lock_sock(sk);
184
185         rc = __smc_release(smc);
186
187         /* detach socket */
188         sock_orphan(sk);
189         sock->sk = NULL;
190         release_sock(sk);
191
192         sock_put(sk); /* final sock_put */
193 out:
194         return rc;
195 }
196
197 static void smc_destruct(struct sock *sk)
198 {
199         if (sk->sk_state != SMC_CLOSED)
200                 return;
201         if (!sock_flag(sk, SOCK_DEAD))
202                 return;
203
204         sk_refcnt_debug_dec(sk);
205 }
206
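/* allocate and initialize a new smc sock for the given SMC protocol variant */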
207 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
208                                    int protocol)
209 {
210         struct smc_sock *smc;
211         struct proto *prot;
212         struct sock *sk;
213
214         prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
215         sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
216         if (!sk)
217                 return NULL;
218
219         sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
220         sk->sk_state = SMC_INIT;
221         sk->sk_destruct = smc_destruct;
222         sk->sk_protocol = protocol;
223         smc = smc_sk(sk);
224         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
225         INIT_WORK(&smc->connect_work, smc_connect_work);
226         INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
227         INIT_LIST_HEAD(&smc->accept_q);
228         spin_lock_init(&smc->accept_q_lock);
229         spin_lock_init(&smc->conn.send_lock);
230         sk->sk_prot->hash(sk);
231         sk_refcnt_debug_inc(sk);
232         mutex_init(&smc->clcsock_release_lock);
233
234         return sk;
235 }
236
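/* bind(): replicate the inet_bind() sanity checks, then bind the clc socket */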
237 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
238                     int addr_len)
239 {
240         struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
241         struct sock *sk = sock->sk;
242         struct smc_sock *smc;
243         int rc;
244
245         smc = smc_sk(sk);
246
247         /* replicate tests from inet_bind(), to be safe wrt. future changes */
248         rc = -EINVAL;
249         if (addr_len < sizeof(struct sockaddr_in))
250                 goto out;
251
252         rc = -EAFNOSUPPORT;
253         if (addr->sin_family != AF_INET &&
254             addr->sin_family != AF_INET6 &&
255             addr->sin_family != AF_UNSPEC)
256                 goto out;
257         /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
258         if (addr->sin_family == AF_UNSPEC &&
259             addr->sin_addr.s_addr != htonl(INADDR_ANY))
260                 goto out;
261
262         lock_sock(sk);
263
264         /* Check if socket is already active */
265         rc = -EINVAL;
266         if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
267                 goto out_rel;
268
269         smc->clcsock->sk->sk_reuse = sk->sk_reuse;
270         rc = kernel_bind(smc->clcsock, uaddr, addr_len);
271
272 out_rel:
273         release_sock(sk);
274 out:
275         return rc;
276 }
277
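/* copy selected sock settings and the flags in @mask from @osk to @nsk */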
278 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
279                                    unsigned long mask)
280 {
281         /* options we don't get control of via setsockopt */
282         nsk->sk_type = osk->sk_type;
283         nsk->sk_sndbuf = osk->sk_sndbuf;
284         nsk->sk_rcvbuf = osk->sk_rcvbuf;
285         nsk->sk_sndtimeo = osk->sk_sndtimeo;
286         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
287         nsk->sk_mark = osk->sk_mark;
288         nsk->sk_priority = osk->sk_priority;
289         nsk->sk_rcvlowat = osk->sk_rcvlowat;
290         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
291         nsk->sk_err = osk->sk_err;
292
293         nsk->sk_flags &= ~mask;
294         nsk->sk_flags |= osk->sk_flags & mask;
295 }
296
297 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
298                              (1UL << SOCK_KEEPOPEN) | \
299                              (1UL << SOCK_LINGER) | \
300                              (1UL << SOCK_BROADCAST) | \
301                              (1UL << SOCK_TIMESTAMP) | \
302                              (1UL << SOCK_DBG) | \
303                              (1UL << SOCK_RCVTSTAMP) | \
304                              (1UL << SOCK_RCVTSTAMPNS) | \
305                              (1UL << SOCK_LOCALROUTE) | \
306                              (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
307                              (1UL << SOCK_RXQ_OVFL) | \
308                              (1UL << SOCK_WIFI_STATUS) | \
309                              (1UL << SOCK_NOFCS) | \
310                              (1UL << SOCK_FILTER_LOCKED) | \
311                              (1UL << SOCK_TSTAMP_NEW))
312 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
313  * clc socket (since smc is not called for these options from net/core)
314  */
315 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
316 {
317         smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
318 }
319
320 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
321                              (1UL << SOCK_KEEPOPEN) | \
322                              (1UL << SOCK_LINGER) | \
323                              (1UL << SOCK_DBG))
324 /* copy only settings and flags relevant for smc from clc to smc socket */
325 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
326 {
327         smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
328 }
329
330 /* register a new rmb, send confirm_rkey msg to register with peer */
331 static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
332                        bool conf_rkey)
333 {
334         if (!rmb_desc->wr_reg) {
335                 /* register memory region for new rmb */
336                 if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
337                         rmb_desc->regerr = 1;
338                         return -EFAULT;
339                 }
340                 rmb_desc->wr_reg = 1;
341         }
342         if (!conf_rkey)
343                 return 0;
344         /* exchange confirm_rkey msg with peer */
345         if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
346                 rmb_desc->regerr = 1;
347                 return -EFAULT;
348         }
349         return 0;
350 }
351
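/* client: complete the LLC CONFIRM LINK / ADD LINK handshake for the first
 * link of a new link group
 */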
352 static int smc_clnt_conf_first_link(struct smc_sock *smc)
353 {
354         struct net *net = sock_net(smc->clcsock->sk);
355         struct smc_link_group *lgr = smc->conn.lgr;
356         struct smc_link *link;
357         int rest;
358         int rc;
359
360         link = &lgr->lnk[SMC_SINGLE_LINK];
361         /* receive CONFIRM LINK request from server over RoCE fabric */
362         rest = wait_for_completion_interruptible_timeout(
363                 &link->llc_confirm,
364                 SMC_LLC_WAIT_FIRST_TIME);
365         if (rest <= 0) {
366                 struct smc_clc_msg_decline dclc;
367
368                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
369                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
370                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
371         }
372
373         if (link->llc_confirm_rc)
374                 return SMC_CLC_DECL_RMBE_EC;
375
376         rc = smc_ib_modify_qp_rts(link);
377         if (rc)
378                 return SMC_CLC_DECL_ERR_RDYLNK;
379
380         smc_wr_remember_qp_attr(link);
381
382         if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
383                 return SMC_CLC_DECL_ERR_REGRMB;
384
385         /* send CONFIRM LINK response over RoCE fabric */
386         rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
387         if (rc < 0)
388                 return SMC_CLC_DECL_TIMEOUT_CL;
389
390         /* receive ADD LINK request from server over RoCE fabric */
391         rest = wait_for_completion_interruptible_timeout(&link->llc_add,
392                                                          SMC_LLC_WAIT_TIME);
393         if (rest <= 0) {
394                 struct smc_clc_msg_decline dclc;
395
396                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
397                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
398                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
399         }
400
401         /* send add link reject message, only one link supported for now */
402         rc = smc_llc_send_add_link(link,
403                                    link->smcibdev->mac[link->ibport - 1],
404                                    link->gid, SMC_LLC_RESP);
405         if (rc < 0)
406                 return SMC_CLC_DECL_TIMEOUT_AL;
407
408         smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
409
410         return 0;
411 }
412
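/* save the peer's RMBE parameters from the CLC accept message (SMC-R) */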
413 static void smcr_conn_save_peer_info(struct smc_sock *smc,
414                                      struct smc_clc_msg_accept_confirm *clc)
415 {
416         int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
417
418         smc->conn.peer_rmbe_idx = clc->rmbe_idx;
419         smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
420         smc->conn.peer_rmbe_size = bufsize;
421         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
422         smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
423 }
424
425 static void smcd_conn_save_peer_info(struct smc_sock *smc,
426                                      struct smc_clc_msg_accept_confirm *clc)
427 {
428         int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
429
430         smc->conn.peer_rmbe_idx = clc->dmbe_idx;
431         smc->conn.peer_token = clc->token;
432         /* msg header takes up space in the buffer */
433         smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
434         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
435         smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
436 }
437
438 static void smc_conn_save_peer_info(struct smc_sock *smc,
439                                     struct smc_clc_msg_accept_confirm *clc)
440 {
441         if (smc->conn.lgr->is_smcd)
442                 smcd_conn_save_peer_info(smc, clc);
443         else
444                 smcr_conn_save_peer_info(smc, clc);
445 }
446
447 static void smc_link_save_peer_info(struct smc_link *link,
448                                     struct smc_clc_msg_accept_confirm *clc)
449 {
450         link->peer_qpn = ntoh24(clc->qpn);
451         memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
452         memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
453         link->peer_psn = ntoh24(clc->psn);
454         link->peer_mtu = clc->qp_mtu;
455 }
456
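/* switch this connection to TCP fallback: from now on the clc socket is the
 * one attached to the application's file
 */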
457 static void smc_switch_to_fallback(struct smc_sock *smc)
458 {
459         smc->use_fallback = true;
460         if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
461                 smc->clcsock->file = smc->sk.sk_socket->file;
462                 smc->clcsock->file->private_data = smc->clcsock;
463         }
464 }
465
466 /* fall back during connect */
467 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
468 {
469         smc_switch_to_fallback(smc);
470         smc->fallback_rsn = reason_code;
471         smc_copy_sock_settings_to_clc(smc);
472         smc->connect_nonblock = 0;
473         if (smc->sk.sk_state == SMC_INIT)
474                 smc->sk.sk_state = SMC_ACTIVE;
475         return 0;
476 }
477
478 /* decline and fall back during connect */
479 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
480 {
481         int rc;
482
483         if (reason_code < 0) { /* error, fallback is not possible */
484                 if (smc->sk.sk_state == SMC_INIT)
485                         sock_put(&smc->sk); /* passive closing */
486                 return reason_code;
487         }
488         if (reason_code != SMC_CLC_DECL_PEERDECL) {
489                 rc = smc_clc_send_decline(smc, reason_code);
490                 if (rc < 0) {
491                         if (smc->sk.sk_state == SMC_INIT)
492                                 sock_put(&smc->sk); /* passive closing */
493                         return rc;
494                 }
495         }
496         return smc_connect_fallback(smc, reason_code);
497 }
498
499 /* abort connecting */
500 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
501                              int local_contact)
502 {
503         if (local_contact == SMC_FIRST_CONTACT)
504                 smc_lgr_forget(smc->conn.lgr);
505         if (smc->conn.lgr->is_smcd)
506                 /* there is only one lgr role for SMC-D; use server lock */
507                 mutex_unlock(&smc_server_lgr_pending);
508         else
509                 mutex_unlock(&smc_client_lgr_pending);
510
511         smc_conn_free(&smc->conn);
512         smc->connect_nonblock = 0;
513         return reason_code;
514 }
515
516 /* check if there is an RDMA device available for this connection. */
517 /* called for connect and listen */
518 static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
519 {
520         /* PNET table look up: search active ib_device and port
521          * within same PNETID that also contains the ethernet device
522          * used for the internal TCP socket
523          */
524         smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
525         if (!ini->ib_dev)
526                 return SMC_CLC_DECL_NOSMCRDEV;
527         return 0;
528 }
529
530 /* check if there is an ISM device available for this connection. */
531 /* called for connect and listen */
532 static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
533 {
534         /* Find ISM device with same PNETID as connecting interface  */
535         smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
536         if (!ini->ism_dev)
537                 return SMC_CLC_DECL_NOSMCDDEV;
538         return 0;
539 }
540
541 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
542 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
543                                       struct smc_init_info *ini)
544 {
545         if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
546                 return SMC_CLC_DECL_ISMVLANERR;
547         return 0;
548 }
549
550 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
551  * used, the VLAN ID will be registered again during the connection setup.
552  */
553 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
554                                         struct smc_init_info *ini)
555 {
556         if (!is_smcd)
557                 return 0;
558         if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
559                 return SMC_CLC_DECL_CNFERR;
560         return 0;
561 }
562
563 /* CLC handshake during connect */
564 static int smc_connect_clc(struct smc_sock *smc, int smc_type,
565                            struct smc_clc_msg_accept_confirm *aclc,
566                            struct smc_init_info *ini)
567 {
568         int rc = 0;
569
570         /* do inband token exchange */
571         rc = smc_clc_send_proposal(smc, smc_type, ini);
572         if (rc)
573                 return rc;
574         /* receive SMC Accept CLC message */
575         return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
576                                 CLC_WAIT_TIME);
577 }
578
579 /* setup for RDMA connection of client */
580 static int smc_connect_rdma(struct smc_sock *smc,
581                             struct smc_clc_msg_accept_confirm *aclc,
582                             struct smc_init_info *ini)
583 {
584         struct smc_link *link;
585         int reason_code = 0;
586
587         ini->is_smcd = false;
588         ini->ib_lcl = &aclc->lcl;
589         ini->ib_clcqpn = ntoh24(aclc->qpn);
590         ini->srv_first_contact = aclc->hdr.flag;
591
592         mutex_lock(&smc_client_lgr_pending);
593         reason_code = smc_conn_create(smc, ini);
594         if (reason_code) {
595                 mutex_unlock(&smc_client_lgr_pending);
596                 return reason_code;
597         }
598         link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
599
600         smc_conn_save_peer_info(smc, aclc);
601
602         /* create send buffer and rmb */
603         if (smc_buf_create(smc, false))
604                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
605                                          ini->cln_first_contact);
606
607         if (ini->cln_first_contact == SMC_FIRST_CONTACT)
608                 smc_link_save_peer_info(link, aclc);
609
610         if (smc_rmb_rtoken_handling(&smc->conn, aclc))
611                 return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
612                                          ini->cln_first_contact);
613
614         smc_close_init(smc);
615         smc_rx_init(smc);
616
617         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
618                 if (smc_ib_ready_link(link))
619                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
620                                                  ini->cln_first_contact);
621         } else {
622                 if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
623                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
624                                                  ini->cln_first_contact);
625         }
626         smc_rmb_sync_sg_for_device(&smc->conn);
627
628         reason_code = smc_clc_send_confirm(smc);
629         if (reason_code)
630                 return smc_connect_abort(smc, reason_code,
631                                          ini->cln_first_contact);
632
633         smc_tx_init(smc);
634
635         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
636                 /* QP confirmation over RoCE fabric */
637                 reason_code = smc_clnt_conf_first_link(smc);
638                 if (reason_code)
639                         return smc_connect_abort(smc, reason_code,
640                                                  ini->cln_first_contact);
641         }
642         mutex_unlock(&smc_client_lgr_pending);
643
644         smc_copy_sock_settings_to_clc(smc);
645         smc->connect_nonblock = 0;
646         if (smc->sk.sk_state == SMC_INIT)
647                 smc->sk.sk_state = SMC_ACTIVE;
648
649         return 0;
650 }
651
652 /* setup for ISM connection of client */
653 static int smc_connect_ism(struct smc_sock *smc,
654                            struct smc_clc_msg_accept_confirm *aclc,
655                            struct smc_init_info *ini)
656 {
657         int rc = 0;
658
659         ini->is_smcd = true;
660         ini->ism_gid = aclc->gid;
661         ini->srv_first_contact = aclc->hdr.flag;
662
663         /* there is only one lgr role for SMC-D; use server lock */
664         mutex_lock(&smc_server_lgr_pending);
665         rc = smc_conn_create(smc, ini);
666         if (rc) {
667                 mutex_unlock(&smc_server_lgr_pending);
668                 return rc;
669         }
670
671         /* Create send and receive buffers */
672         if (smc_buf_create(smc, true))
673                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
674                                          ini->cln_first_contact);
675
676         smc_conn_save_peer_info(smc, aclc);
677         smc_close_init(smc);
678         smc_rx_init(smc);
679         smc_tx_init(smc);
680
681         rc = smc_clc_send_confirm(smc);
682         if (rc)
683                 return smc_connect_abort(smc, rc, ini->cln_first_contact);
684         mutex_unlock(&smc_server_lgr_pending);
685
686         smc_copy_sock_settings_to_clc(smc);
687         smc->connect_nonblock = 0;
688         if (smc->sk.sk_state == SMC_INIT)
689                 smc->sk.sk_state = SMC_ACTIVE;
690
691         return 0;
692 }
693
694 /* perform steps before actually connecting */
695 static int __smc_connect(struct smc_sock *smc)
696 {
697         bool ism_supported = false, rdma_supported = false;
698         struct smc_clc_msg_accept_confirm aclc;
699         struct smc_init_info ini = {0};
700         int smc_type;
701         int rc = 0;
702
703         sock_hold(&smc->sk); /* sock put in passive closing */
704
705         if (smc->use_fallback)
706                 return smc_connect_fallback(smc, smc->fallback_rsn);
707
708         /* if peer has not signalled SMC-capability, fall back */
709         if (!tcp_sk(smc->clcsock->sk)->syn_smc)
710                 return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
711
712         /* IPSec connections opt out of SMC-R optimizations */
713         if (using_ipsec(smc))
714                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
715
716         /* get vlan id from IP device */
717         if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
718                 return smc_connect_decline_fallback(smc,
719                                                     SMC_CLC_DECL_GETVLANERR);
720
721         /* check if there is an ism device available */
722         if (!smc_find_ism_device(smc, &ini) &&
723             !smc_connect_ism_vlan_setup(smc, &ini)) {
724                 /* ISM is supported for this connection */
725                 ism_supported = true;
726                 smc_type = SMC_TYPE_D;
727         }
728
729         /* check if there is a rdma device available */
730         if (!smc_find_rdma_device(smc, &ini)) {
731                 /* RDMA is supported for this connection */
732                 rdma_supported = true;
733                 if (ism_supported)
734                         smc_type = SMC_TYPE_B; /* both */
735                 else
736                         smc_type = SMC_TYPE_R; /* only RDMA */
737         }
738
739         /* if neither ISM nor RDMA is supported, fall back */
740         if (!rdma_supported && !ism_supported)
741                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
742
743         /* perform CLC handshake */
744         rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
745         if (rc) {
746                 smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
747                 return smc_connect_decline_fallback(smc, rc);
748         }
749
750         /* depending on previous steps, connect using rdma or ism */
751         if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
752                 rc = smc_connect_rdma(smc, &aclc, &ini);
753         else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
754                 rc = smc_connect_ism(smc, &aclc, &ini);
755         else
756                 rc = SMC_CLC_DECL_MODEUNSUPP;
757         if (rc) {
758                 smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
759                 return smc_connect_decline_fallback(smc, rc);
760         }
761
762         smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
763         return 0;
764 }
765
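/* worker for a non-blocking connect: wait for the TCP connect on the clc
 * socket to complete, then run the SMC handshake via __smc_connect()
 */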
766 static void smc_connect_work(struct work_struct *work)
767 {
768         struct smc_sock *smc = container_of(work, struct smc_sock,
769                                             connect_work);
770         long timeo = smc->sk.sk_sndtimeo;
771         int rc = 0;
772
773         if (!timeo)
774                 timeo = MAX_SCHEDULE_TIMEOUT;
775         lock_sock(smc->clcsock->sk);
776         if (smc->clcsock->sk->sk_err) {
777                 smc->sk.sk_err = smc->clcsock->sk->sk_err;
778         } else if ((1 << smc->clcsock->sk->sk_state) &
779                                         (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
780                 rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
781                 if ((rc == -EPIPE) &&
782                     ((1 << smc->clcsock->sk->sk_state) &
783                                         (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
784                         rc = 0;
785         }
786         release_sock(smc->clcsock->sk);
787         lock_sock(&smc->sk);
788         if (rc != 0 || smc->sk.sk_err) {
789                 smc->sk.sk_state = SMC_CLOSED;
790                 if (rc == -EPIPE || rc == -EAGAIN)
791                         smc->sk.sk_err = EPIPE;
792                 else if (signal_pending(current))
793                         smc->sk.sk_err = -sock_intr_errno(timeo);
794                 goto out;
795         }
796
797         rc = __smc_connect(smc);
798         if (rc < 0)
799                 smc->sk.sk_err = -rc;
800
801 out:
802         if (!sock_flag(&smc->sk, SOCK_DEAD)) {
803                 if (smc->sk.sk_err) {
804                         smc->sk.sk_state_change(&smc->sk);
805                 } else { /* allow polling before and after fallback decision */
806                         smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
807                         smc->sk.sk_write_space(&smc->sk);
808                 }
809         }
810         release_sock(&smc->sk);
811 }
812
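/* connect(): connect the clc (TCP) socket first; the SMC handshake follows,
 * either inline or, for O_NONBLOCK, deferred to the connect worker
 */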
813 static int smc_connect(struct socket *sock, struct sockaddr *addr,
814                        int alen, int flags)
815 {
816         struct sock *sk = sock->sk;
817         struct smc_sock *smc;
818         int rc = -EINVAL;
819
820         smc = smc_sk(sk);
821
822         /* separate smc parameter checking to be safe */
823         if (alen < sizeof(addr->sa_family))
824                 goto out_err;
825         if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
826                 goto out_err;
827
828         lock_sock(sk);
829         switch (sk->sk_state) {
830         default:
831                 goto out;
832         case SMC_ACTIVE:
833                 rc = -EISCONN;
834                 goto out;
835         case SMC_INIT:
836                 rc = 0;
837                 break;
838         }
839
840         smc_copy_sock_settings_to_clc(smc);
841         tcp_sk(smc->clcsock->sk)->syn_smc = 1;
842         if (smc->connect_nonblock) {
843                 rc = -EALREADY;
844                 goto out;
845         }
846         rc = kernel_connect(smc->clcsock, addr, alen, flags);
847         if (rc && rc != -EINPROGRESS)
848                 goto out;
849         if (flags & O_NONBLOCK) {
850                 if (schedule_work(&smc->connect_work))
851                         smc->connect_nonblock = 1;
852                 rc = -EINPROGRESS;
853         } else {
854                 rc = __smc_connect(smc);
855                 if (rc < 0)
856                         goto out;
857                 else
858                         rc = 0; /* success cases including fallback */
859         }
860
861 out:
862         release_sock(sk);
863 out_err:
864         return rc;
865 }
866
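/* accept a connection on the listening clc socket and allocate a new smc
 * sock for it; called and returns with the listen sock locked
 */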
867 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
868 {
869         struct socket *new_clcsock = NULL;
870         struct sock *lsk = &lsmc->sk;
871         struct sock *new_sk;
872         int rc = -EINVAL;
873
874         release_sock(lsk);
875         new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
876         if (!new_sk) {
877                 rc = -ENOMEM;
878                 lsk->sk_err = ENOMEM;
879                 *new_smc = NULL;
880                 lock_sock(lsk);
881                 goto out;
882         }
883         *new_smc = smc_sk(new_sk);
884
885         mutex_lock(&lsmc->clcsock_release_lock);
886         if (lsmc->clcsock)
887                 rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
888         mutex_unlock(&lsmc->clcsock_release_lock);
889         lock_sock(lsk);
890         if  (rc < 0)
891                 lsk->sk_err = -rc;
892         if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
893                 new_sk->sk_prot->unhash(new_sk);
894                 if (new_clcsock)
895                         sock_release(new_clcsock);
896                 new_sk->sk_state = SMC_CLOSED;
897                 sock_set_flag(new_sk, SOCK_DEAD);
898                 sock_put(new_sk); /* final */
899                 *new_smc = NULL;
900                 goto out;
901         }
902
903         (*new_smc)->clcsock = new_clcsock;
904 out:
905         return rc;
906 }
907
908 /* add a just created sock to the accept queue of the listen sock as
909  * candidate for a following socket accept call from user space
910  */
911 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
912 {
913         struct smc_sock *par = smc_sk(parent);
914
915         sock_hold(sk); /* sock_put in smc_accept_unlink() */
916         spin_lock(&par->accept_q_lock);
917         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
918         spin_unlock(&par->accept_q_lock);
919         sk_acceptq_added(parent);
920 }
921
922 /* remove a socket from the accept queue of its parental listening socket */
923 static void smc_accept_unlink(struct sock *sk)
924 {
925         struct smc_sock *par = smc_sk(sk)->listen_smc;
926
927         spin_lock(&par->accept_q_lock);
928         list_del_init(&smc_sk(sk)->accept_q);
929         spin_unlock(&par->accept_q_lock);
930         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
931         sock_put(sk); /* sock_hold in smc_accept_enqueue */
932 }
933
934 /* remove a sock from the accept queue to bind it to a new socket created
935  * for a socket accept call from user space
936  */
937 struct sock *smc_accept_dequeue(struct sock *parent,
938                                 struct socket *new_sock)
939 {
940         struct smc_sock *isk, *n;
941         struct sock *new_sk;
942
943         list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
944                 new_sk = (struct sock *)isk;
945
946                 smc_accept_unlink(new_sk);
947                 if (new_sk->sk_state == SMC_CLOSED) {
948                         new_sk->sk_prot->unhash(new_sk);
949                         if (isk->clcsock) {
950                                 sock_release(isk->clcsock);
951                                 isk->clcsock = NULL;
952                         }
953                         sock_put(new_sk); /* final */
954                         continue;
955                 }
956                 if (new_sock) {
957                         sock_graft(new_sk, new_sock);
958                         if (isk->use_fallback) {
959                                 smc_sk(new_sk)->clcsock->file = new_sock->file;
960                                 isk->clcsock->file->private_data = isk->clcsock;
961                         }
962                 }
963                 return new_sk;
964         }
965         return NULL;
966 }
967
968 /* clean up for a created but never accepted sock */
969 void smc_close_non_accepted(struct sock *sk)
970 {
971         struct smc_sock *smc = smc_sk(sk);
972
973         lock_sock(sk);
974         if (!sk->sk_lingertime)
975                 /* wait for peer closing */
976                 sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
977         __smc_release(smc);
978         release_sock(sk);
979         sock_put(sk); /* final sock_put */
980 }
981
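/* server: complete the LLC CONFIRM LINK / ADD LINK handshake for the first
 * link of a new link group
 */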
982 static int smc_serv_conf_first_link(struct smc_sock *smc)
983 {
984         struct net *net = sock_net(smc->clcsock->sk);
985         struct smc_link_group *lgr = smc->conn.lgr;
986         struct smc_link *link;
987         int rest;
988         int rc;
989
990         link = &lgr->lnk[SMC_SINGLE_LINK];
991
992         if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
993                 return SMC_CLC_DECL_ERR_REGRMB;
994
995         /* send CONFIRM LINK request to client over the RoCE fabric */
996         rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
997         if (rc < 0)
998                 return SMC_CLC_DECL_TIMEOUT_CL;
999
1000         /* receive CONFIRM LINK response from client over the RoCE fabric */
1001         rest = wait_for_completion_interruptible_timeout(
1002                 &link->llc_confirm_resp,
1003                 SMC_LLC_WAIT_FIRST_TIME);
1004         if (rest <= 0) {
1005                 struct smc_clc_msg_decline dclc;
1006
1007                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1008                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1009                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
1010         }
1011
1012         if (link->llc_confirm_resp_rc)
1013                 return SMC_CLC_DECL_RMBE_EC;
1014
1015         /* send ADD LINK request to client over the RoCE fabric */
1016         rc = smc_llc_send_add_link(link,
1017                                    link->smcibdev->mac[link->ibport - 1],
1018                                    link->gid, SMC_LLC_REQ);
1019         if (rc < 0)
1020                 return SMC_CLC_DECL_TIMEOUT_AL;
1021
1022         /* receive ADD LINK response from client over the RoCE fabric */
1023         rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
1024                                                          SMC_LLC_WAIT_TIME);
1025         if (rest <= 0) {
1026                 struct smc_clc_msg_decline dclc;
1027
1028                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1029                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1030                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
1031         }
1032
1033         smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
1034
1035         return 0;
1036 }
1037
1038 /* listen worker: finish */
1039 static void smc_listen_out(struct smc_sock *new_smc)
1040 {
1041         struct smc_sock *lsmc = new_smc->listen_smc;
1042         struct sock *newsmcsk = &new_smc->sk;
1043
1044         if (lsmc->sk.sk_state == SMC_LISTEN) {
1045                 lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1046                 smc_accept_enqueue(&lsmc->sk, newsmcsk);
1047                 release_sock(&lsmc->sk);
1048         } else { /* no longer listening */
1049                 smc_close_non_accepted(newsmcsk);
1050         }
1051
1052         /* Wake up accept */
1053         lsmc->sk.sk_data_ready(&lsmc->sk);
1054         sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1055 }
1056
1057 /* listen worker: finish in state connected */
1058 static void smc_listen_out_connected(struct smc_sock *new_smc)
1059 {
1060         struct sock *newsmcsk = &new_smc->sk;
1061
1062         sk_refcnt_debug_inc(newsmcsk);
1063         if (newsmcsk->sk_state == SMC_INIT)
1064                 newsmcsk->sk_state = SMC_ACTIVE;
1065
1066         smc_listen_out(new_smc);
1067 }
1068
1069 /* listen worker: finish in error state */
1070 static void smc_listen_out_err(struct smc_sock *new_smc)
1071 {
1072         struct sock *newsmcsk = &new_smc->sk;
1073
1074         if (newsmcsk->sk_state == SMC_INIT)
1075                 sock_put(&new_smc->sk); /* passive closing */
1076         newsmcsk->sk_state = SMC_CLOSED;
1077         smc_conn_free(&new_smc->conn);
1078
1079         smc_listen_out(new_smc);
1080 }
1081
1082 /* listen worker: decline and fall back if possible */
1083 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1084                                int local_contact)
1085 {
1086         /* RDMA setup failed, switch back to TCP */
1087         if (local_contact == SMC_FIRST_CONTACT)
1088                 smc_lgr_forget(new_smc->conn.lgr);
1089         if (reason_code < 0) { /* error, no fallback possible */
1090                 smc_listen_out_err(new_smc);
1091                 return;
1092         }
1093         smc_conn_free(&new_smc->conn);
1094         smc_switch_to_fallback(new_smc);
1095         new_smc->fallback_rsn = reason_code;
1096         if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1097                 if (smc_clc_send_decline(new_smc, reason_code) < 0) {
1098                         smc_listen_out_err(new_smc);
1099                         return;
1100                 }
1101         }
1102         smc_listen_out_connected(new_smc);
1103 }
1104
1105 /* listen worker: check prefixes */
1106 static int smc_listen_prfx_check(struct smc_sock *new_smc,
1107                                  struct smc_clc_msg_proposal *pclc)
1108 {
1109         struct smc_clc_msg_proposal_prefix *pclc_prfx;
1110         struct socket *newclcsock = new_smc->clcsock;
1111
1112         pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1113         if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1114                 return SMC_CLC_DECL_DIFFPREFIX;
1115
1116         return 0;
1117 }
1118
1119 /* listen worker: initialize connection and buffers */
1120 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1121                                 struct smc_init_info *ini)
1122 {
1123         int rc;
1124
1125         /* allocate connection / link group */
1126         rc = smc_conn_create(new_smc, ini);
1127         if (rc)
1128                 return rc;
1129
1130         /* create send buffer and rmb */
1131         if (smc_buf_create(new_smc, false))
1132                 return SMC_CLC_DECL_MEM;
1133
1134         return 0;
1135 }
1136
1137 /* listen worker: initialize connection and buffers for SMC-D */
1138 static int smc_listen_ism_init(struct smc_sock *new_smc,
1139                                struct smc_clc_msg_proposal *pclc,
1140                                struct smc_init_info *ini)
1141 {
1142         struct smc_clc_msg_smcd *pclc_smcd;
1143         int rc;
1144
1145         pclc_smcd = smc_get_clc_msg_smcd(pclc);
1146         ini->ism_gid = pclc_smcd->gid;
1147         rc = smc_conn_create(new_smc, ini);
1148         if (rc)
1149                 return rc;
1150
1151         /* Check if peer can be reached via ISM device */
1152         if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1153                             new_smc->conn.lgr->vlan_id,
1154                             new_smc->conn.lgr->smcd)) {
1155                 if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1156                         smc_lgr_forget(new_smc->conn.lgr);
1157                 smc_conn_free(&new_smc->conn);
1158                 return SMC_CLC_DECL_SMCDNOTALK;
1159         }
1160
1161         /* Create send and receive buffers */
1162         if (smc_buf_create(new_smc, true)) {
1163                 if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1164                         smc_lgr_forget(new_smc->conn.lgr);
1165                 smc_conn_free(&new_smc->conn);
1166                 return SMC_CLC_DECL_MEM;
1167         }
1168
1169         return 0;
1170 }
1171
1172 /* listen worker: register buffers */
1173 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1174 {
1175         struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1176
1177         if (local_contact != SMC_FIRST_CONTACT) {
1178                 if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
1179                         return SMC_CLC_DECL_ERR_REGRMB;
1180         }
1181         smc_rmb_sync_sg_for_device(&new_smc->conn);
1182
1183         return 0;
1184 }
1185
1186 /* listen worker: finish RDMA setup */
1187 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
1188                                   struct smc_clc_msg_accept_confirm *cclc,
1189                                   int local_contact)
1190 {
1191         struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1192         int reason_code = 0;
1193
1194         if (local_contact == SMC_FIRST_CONTACT)
1195                 smc_link_save_peer_info(link, cclc);
1196
1197         if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
1198                 reason_code = SMC_CLC_DECL_ERR_RTOK;
1199                 goto decline;
1200         }
1201
1202         if (local_contact == SMC_FIRST_CONTACT) {
1203                 if (smc_ib_ready_link(link)) {
1204                         reason_code = SMC_CLC_DECL_ERR_RDYLNK;
1205                         goto decline;
1206                 }
1207                 /* QP confirmation over RoCE fabric */
1208                 reason_code = smc_serv_conf_first_link(new_smc);
1209                 if (reason_code)
1210                         goto decline;
1211         }
1212         return 0;
1213
1214 decline:
1215         smc_listen_decline(new_smc, reason_code, local_contact);
1216         return reason_code;
1217 }
1218
1219 /* setup for RDMA connection of server */
1220 static void smc_listen_work(struct work_struct *work)
1221 {
1222         struct smc_sock *new_smc = container_of(work, struct smc_sock,
1223                                                 smc_listen_work);
1224         struct socket *newclcsock = new_smc->clcsock;
1225         struct smc_clc_msg_accept_confirm cclc;
1226         struct smc_clc_msg_proposal *pclc;
1227         struct smc_init_info ini = {0};
1228         bool ism_supported = false;
1229         u8 buf[SMC_CLC_MAX_LEN];
1230         int rc = 0;
1231
1232         if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
1233                 return smc_listen_out_err(new_smc);
1234
1235         if (new_smc->use_fallback) {
1236                 smc_listen_out_connected(new_smc);
1237                 return;
1238         }
1239
1240         /* check if peer is smc capable */
1241         if (!tcp_sk(newclcsock->sk)->syn_smc) {
1242                 smc_switch_to_fallback(new_smc);
1243                 new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
1244                 smc_listen_out_connected(new_smc);
1245                 return;
1246         }
1247
1248         /* do inband token exchange -
1249          * wait for and receive SMC Proposal CLC message
1250          */
1251         pclc = (struct smc_clc_msg_proposal *)&buf;
1252         rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
1253                               SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
1254         if (rc)
1255                 goto out_decl;
1256
1257         /* IPSec connections opt out of SMC-R optimizations */
1258         if (using_ipsec(new_smc)) {
1259                 rc = SMC_CLC_DECL_IPSEC;
1260                 goto out_decl;
1261         }
1262
1263         /* check for matching IP prefix and subnet length */
1264         rc = smc_listen_prfx_check(new_smc, pclc);
1265         if (rc)
1266                 goto out_decl;
1267
1268         /* get vlan id from IP device */
1269         if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
1270                 rc = SMC_CLC_DECL_GETVLANERR;
1271                 goto out_decl;
1272         }
1273
1274         mutex_lock(&smc_server_lgr_pending);
1275         smc_close_init(new_smc);
1276         smc_rx_init(new_smc);
1277         smc_tx_init(new_smc);
1278
1279         /* check if ISM is available */
1280         if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
1281                 ini.is_smcd = true; /* prepare ISM check */
1282                 rc = smc_find_ism_device(new_smc, &ini);
1283                 if (!rc)
1284                         rc = smc_listen_ism_init(new_smc, pclc, &ini);
1285                 if (!rc)
1286                         ism_supported = true;
1287                 else if (pclc->hdr.path == SMC_TYPE_D)
1288                         goto out_unlock; /* skip RDMA and decline */
1289         }
1290
1291         /* check if RDMA is available */
1292         if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
1293                 /* prepare RDMA check */
1294                 memset(&ini, 0, sizeof(ini));
1295                 ini.is_smcd = false;
1296                 ini.ib_lcl = &pclc->lcl;
1297                 rc = smc_find_rdma_device(new_smc, &ini);
1298                 if (rc) {
1299                         /* no RDMA device found */
1300                         if (pclc->hdr.path == SMC_TYPE_B)
1301                                 /* neither ISM nor RDMA device found */
1302                                 rc = SMC_CLC_DECL_NOSMCDEV;
1303                         goto out_unlock;
1304                 }
1305                 rc = smc_listen_rdma_init(new_smc, &ini);
1306                 if (rc)
1307                         goto out_unlock;
1308                 rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
1309                 if (rc)
1310                         goto out_unlock;
1311         }
1312
1313         /* send SMC Accept CLC message */
1314         rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
1315         if (rc)
1316                 goto out_unlock;
1317
1318         /* SMC-D does not need this lock any more */
1319         if (ism_supported)
1320                 mutex_unlock(&smc_server_lgr_pending);
1321
1322         /* receive SMC Confirm CLC message */
1323         rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1324                               SMC_CLC_CONFIRM, CLC_WAIT_TIME);
1325         if (rc) {
1326                 if (!ism_supported)
1327                         goto out_unlock;
1328                 goto out_decl;
1329         }
1330
1331         /* finish worker */
1332         if (!ism_supported) {
1333                 rc = smc_listen_rdma_finish(new_smc, &cclc,
1334                                             ini.cln_first_contact);
1335                 mutex_unlock(&smc_server_lgr_pending);
1336                 if (rc)
1337                         return;
1338         }
1339         smc_conn_save_peer_info(new_smc, &cclc);
1340         smc_listen_out_connected(new_smc);
1341         return;
1342
1343 out_unlock:
1344         mutex_unlock(&smc_server_lgr_pending);
1345 out_decl:
1346         smc_listen_decline(new_smc, rc, ini.cln_first_contact);
1347 }
1348
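/* worker for the listening sock: accept incoming clc connections and hand
 * each new smc sock over to smc_listen_work()
 */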
1349 static void smc_tcp_listen_work(struct work_struct *work)
1350 {
1351         struct smc_sock *lsmc = container_of(work, struct smc_sock,
1352                                              tcp_listen_work);
1353         struct sock *lsk = &lsmc->sk;
1354         struct smc_sock *new_smc;
1355         int rc = 0;
1356
1357         lock_sock(lsk);
1358         while (lsk->sk_state == SMC_LISTEN) {
1359                 rc = smc_clcsock_accept(lsmc, &new_smc);
1360                 if (rc)
1361                         goto out;
1362                 if (!new_smc)
1363                         continue;
1364
1365                 new_smc->listen_smc = lsmc;
1366                 new_smc->use_fallback = lsmc->use_fallback;
1367                 new_smc->fallback_rsn = lsmc->fallback_rsn;
1368                 sock_hold(lsk); /* sock_put in smc_listen_work */
1369                 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1370                 smc_copy_sock_settings_to_smc(new_smc);
1371                 new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1372                 new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1373                 sock_hold(&new_smc->sk); /* sock_put in passive closing */
1374                 if (!schedule_work(&new_smc->smc_listen_work))
1375                         sock_put(&new_smc->sk);
1376         }
1377
1378 out:
1379         release_sock(lsk);
1380         sock_put(&lsmc->sk); /* sock_hold in smc_listen */
1381 }
1382
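/* listen(): put the clc socket into listen state and kick off the tcp listen
 * worker
 */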
1383 static int smc_listen(struct socket *sock, int backlog)
1384 {
1385         struct sock *sk = sock->sk;
1386         struct smc_sock *smc;
1387         int rc;
1388
1389         smc = smc_sk(sk);
1390         lock_sock(sk);
1391
1392         rc = -EINVAL;
1393         if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
1394             smc->connect_nonblock)
1395                 goto out;
1396
1397         rc = 0;
1398         if (sk->sk_state == SMC_LISTEN) {
1399                 sk->sk_max_ack_backlog = backlog;
1400                 goto out;
1401         }
1402         /* some socket options are handled in core, so we cannot apply
1403          * them to the clc socket -- copy smc socket options to clc socket
1404          */
1405         smc_copy_sock_settings_to_clc(smc);
1406         if (!smc->use_fallback)
1407                 tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1408
1409         rc = kernel_listen(smc->clcsock, backlog);
1410         if (rc)
1411                 goto out;
1412         sk->sk_max_ack_backlog = backlog;
1413         sk->sk_ack_backlog = 0;
1414         sk->sk_state = SMC_LISTEN;
1415         sock_hold(sk); /* sock_put in smc_tcp_listen_work() */
1416         if (!schedule_work(&smc->tcp_listen_work))
1417                 sock_put(sk);
1418
1419 out:
1420         release_sock(sk);
1421         return rc;
1422 }
1423
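/* accept(): wait for a new sock on the accept queue filled by the listen
 * worker; with sockopt_defer_accept set, additionally wait for data to arrive
 */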
1424 static int smc_accept(struct socket *sock, struct socket *new_sock,
1425                       int flags, bool kern)
1426 {
1427         struct sock *sk = sock->sk, *nsk;
1428         DECLARE_WAITQUEUE(wait, current);
1429         struct smc_sock *lsmc;
1430         long timeo;
1431         int rc = 0;
1432
1433         lsmc = smc_sk(sk);
1434         sock_hold(sk); /* sock_put below */
1435         lock_sock(sk);
1436
1437         if (lsmc->sk.sk_state != SMC_LISTEN) {
1438                 rc = -EINVAL;
1439                 release_sock(sk);
1440                 goto out;
1441         }
1442
1443         /* Wait for an incoming connection */
1444         timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1445         add_wait_queue_exclusive(sk_sleep(sk), &wait);
1446         while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1447                 set_current_state(TASK_INTERRUPTIBLE);
1448                 if (!timeo) {
1449                         rc = -EAGAIN;
1450                         break;
1451                 }
1452                 release_sock(sk);
1453                 timeo = schedule_timeout(timeo);
1454                 /* wakeup by sk_data_ready in smc_listen_work() */
1455                 sched_annotate_sleep();
1456                 lock_sock(sk);
1457                 if (signal_pending(current)) {
1458                         rc = sock_intr_errno(timeo);
1459                         break;
1460                 }
1461         }
1462         set_current_state(TASK_RUNNING);
1463         remove_wait_queue(sk_sleep(sk), &wait);
1464
1465         if (!rc)
1466                 rc = sock_error(nsk);
1467         release_sock(sk);
1468         if (rc)
1469                 goto out;
1470
1471         if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1472                 /* wait till data arrives on the socket */
1473                 timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1474                                                                 MSEC_PER_SEC);
1475                 if (smc_sk(nsk)->use_fallback) {
1476                         struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1477
1478                         lock_sock(clcsk);
1479                         if (skb_queue_empty(&clcsk->sk_receive_queue))
1480                                 sk_wait_data(clcsk, &timeo, NULL);
1481                         release_sock(clcsk);
1482                 } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1483                         lock_sock(nsk);
1484                         smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1485                         release_sock(nsk);
1486                 }
1487         }
1488
1489 out:
1490         sock_put(sk); /* sock_hold above */
1491         return rc;
1492 }
1493
1494 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1495                        int peer)
1496 {
1497         struct smc_sock *smc;
1498
1499         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1500             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1501                 return -ENOTCONN;
1502
1503         smc = smc_sk(sock->sk);
1504
1505         return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1506 }
1507
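/* send on the SMC socket; MSG_FASTOPEN is not supported natively, so it
 * forces a fallback to the TCP clcsock while the socket is still in SMC_INIT;
 * otherwise data is sent either through the clcsock (fallback) or via
 * smc_tx_sendmsg()
 */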
1508 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1509 {
1510         struct sock *sk = sock->sk;
1511         struct smc_sock *smc;
1512         int rc = -EPIPE;
1513
1514         smc = smc_sk(sk);
1515         lock_sock(sk);
1516         if ((sk->sk_state != SMC_ACTIVE) &&
1517             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1518             (sk->sk_state != SMC_INIT))
1519                 goto out;
1520
1521         if (msg->msg_flags & MSG_FASTOPEN) {
1522                 if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
1523                         smc_switch_to_fallback(smc);
1524                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1525                 } else {
1526                         rc = -EINVAL;
1527                         goto out;
1528                 }
1529         }
1530
1531         if (smc->use_fallback)
1532                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1533         else
1534                 rc = smc_tx_sendmsg(smc, msg, len);
1535 out:
1536         release_sock(sk);
1537         return rc;
1538 }
1539
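/* receive on the SMC socket: delegate to the TCP clcsock in fallback mode,
 * otherwise consume received data from the connection's RMB via
 * smc_rx_recvmsg()
 */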
1540 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1541                        int flags)
1542 {
1543         struct sock *sk = sock->sk;
1544         struct smc_sock *smc;
1545         int rc = -ENOTCONN;
1546
1547         smc = smc_sk(sk);
1548         lock_sock(sk);
1549         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1550                 /* socket was connected before, no more data to read */
1551                 rc = 0;
1552                 goto out;
1553         }
1554         if ((sk->sk_state == SMC_INIT) ||
1555             (sk->sk_state == SMC_LISTEN) ||
1556             (sk->sk_state == SMC_CLOSED))
1557                 goto out;
1558
1559         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1560                 rc = 0;
1561                 goto out;
1562         }
1563
1564         if (smc->use_fallback) {
1565                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1566         } else {
1567                 msg->msg_namelen = 0;
1568                 rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1569         }
1570
1571 out:
1572         release_sock(sk);
1573         return rc;
1574 }
1575
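/* report EPOLLIN on a listening socket as soon as a child socket is queued
 * on its accept queue
 */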
1576 static __poll_t smc_accept_poll(struct sock *parent)
1577 {
1578         struct smc_sock *isk = smc_sk(parent);
1579         __poll_t mask = 0;
1580
1581         spin_lock(&isk->accept_q_lock);
1582         if (!list_empty(&isk->accept_q))
1583                 mask = EPOLLIN | EPOLLRDNORM;
1584         spin_unlock(&isk->accept_q_lock);
1585
1586         return mask;
1587 }
1588
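/* poll: in fallback mode delegate to the clcsock, otherwise derive the mask
 * from the SMC state, send/receive buffer fill level, shutdown flags and
 * urgent data indication
 */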
1589 static __poll_t smc_poll(struct file *file, struct socket *sock,
1590                              poll_table *wait)
1591 {
1592         struct sock *sk = sock->sk;
1593         struct smc_sock *smc;
1594         __poll_t mask = 0;
1595
1596         if (!sk)
1597                 return EPOLLNVAL;
1598
1599         smc = smc_sk(sock->sk);
1600         if (smc->use_fallback) {
1601                 /* delegate to CLC child sock */
1602                 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1603                 sk->sk_err = smc->clcsock->sk->sk_err;
1604         } else {
1605                 if (sk->sk_state != SMC_CLOSED)
1606                         sock_poll_wait(file, sock, wait);
1607                 if (sk->sk_err)
1608                         mask |= EPOLLERR;
1609                 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1610                     (sk->sk_state == SMC_CLOSED))
1611                         mask |= EPOLLHUP;
1612                 if (sk->sk_state == SMC_LISTEN) {
1613                         /* woken up by sk_data_ready in smc_listen_work() */
1614                         mask |= smc_accept_poll(sk);
1615                 } else if (smc->use_fallback) { /* as a result of connect_work() */
1616                         mask |= smc->clcsock->ops->poll(file, smc->clcsock,
1617                                                            wait);
1618                         sk->sk_err = smc->clcsock->sk->sk_err;
1619                 } else {
1620                         if ((sk->sk_state != SMC_INIT &&
1621                              atomic_read(&smc->conn.sndbuf_space)) ||
1622                             sk->sk_shutdown & SEND_SHUTDOWN) {
1623                                 mask |= EPOLLOUT | EPOLLWRNORM;
1624                         } else {
1625                                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1626                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1627                         }
1628                         if (atomic_read(&smc->conn.bytes_to_rcv))
1629                                 mask |= EPOLLIN | EPOLLRDNORM;
1630                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1631                                 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1632                         if (sk->sk_state == SMC_APPCLOSEWAIT1)
1633                                 mask |= EPOLLIN;
1634                         if (smc->conn.urg_state == SMC_URG_VALID)
1635                                 mask |= EPOLLPRI;
1636                 }
1637         }
1638
1639         return mask;
1640 }
1641
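/* shutdown: in fallback mode simply shut down the clcsock; otherwise run the
 * SMC close protocol for the requested direction(s) and shut down the clcsock
 * afterwards
 */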
1642 static int smc_shutdown(struct socket *sock, int how)
1643 {
1644         struct sock *sk = sock->sk;
1645         struct smc_sock *smc;
1646         int rc = -EINVAL;
1647         int rc1 = 0;
1648
1649         smc = smc_sk(sk);
1650
1651         if ((how < SHUT_RD) || (how > SHUT_RDWR))
1652                 return rc;
1653
1654         lock_sock(sk);
1655
1656         rc = -ENOTCONN;
1657         if ((sk->sk_state != SMC_ACTIVE) &&
1658             (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1659             (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1660             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1661             (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1662             (sk->sk_state != SMC_APPFINCLOSEWAIT))
1663                 goto out;
1664         if (smc->use_fallback) {
1665                 rc = kernel_sock_shutdown(smc->clcsock, how);
1666                 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1667                 if (sk->sk_shutdown == SHUTDOWN_MASK)
1668                         sk->sk_state = SMC_CLOSED;
1669                 goto out;
1670         }
1671         switch (how) {
1672         case SHUT_RDWR:         /* shutdown in both directions */
1673                 rc = smc_close_active(smc);
1674                 break;
1675         case SHUT_WR:
1676                 rc = smc_close_shutdown_write(smc);
1677                 break;
1678         case SHUT_RD:
1679                 rc = 0;
1680                 /* nothing more to do because peer is not involved */
1681                 break;
1682         }
1683         if (smc->clcsock)
1684                 rc1 = kernel_sock_shutdown(smc->clcsock, how);
1685         /* map sock_shutdown_cmd constants to sk_shutdown value range */
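        /* SHUT_RD(0) -> RCV_SHUTDOWN(1), SHUT_WR(1) -> SEND_SHUTDOWN(2),
         * SHUT_RDWR(2) -> SHUTDOWN_MASK(3)
         */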
1686         sk->sk_shutdown |= how + 1;
1687
1688 out:
1689         release_sock(sk);
1690         return rc ? rc : rc1;
1691 }
1692
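/* setsockopt: options are always mirrored to the clcsock; TCP_ULP and the
 * TCP fastopen options are not supported by SMC and force a fallback while
 * the socket is still in SMC_INIT
 */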
1693 static int smc_setsockopt(struct socket *sock, int level, int optname,
1694                           char __user *optval, unsigned int optlen)
1695 {
1696         struct sock *sk = sock->sk;
1697         struct smc_sock *smc;
1698         int val, rc;
1699
1700         smc = smc_sk(sk);
1701
1702         /* generic setsockopts reaching us here always apply to the
1703          * CLC socket
1704          */
1705         rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1706                                            optval, optlen);
1707         if (smc->clcsock->sk->sk_err) {
1708                 sk->sk_err = smc->clcsock->sk->sk_err;
1709                 sk->sk_error_report(sk);
1710         }
1711         if (rc)
1712                 return rc;
1713
1714         if (optlen < sizeof(int))
1715                 return -EINVAL;
1716         if (get_user(val, (int __user *)optval))
1717                 return -EFAULT;
1718
1719         lock_sock(sk);
1720         switch (optname) {
1721         case TCP_ULP:
1722         case TCP_FASTOPEN:
1723         case TCP_FASTOPEN_CONNECT:
1724         case TCP_FASTOPEN_KEY:
1725         case TCP_FASTOPEN_NO_COOKIE:
1726                 /* option not supported by SMC */
1727                 if (sk->sk_state == SMC_INIT) {
1728                         smc_switch_to_fallback(smc);
1729                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1730                 } else {
1731                         if (!smc->use_fallback)
1732                                 rc = -EINVAL;
1733                 }
1734                 break;
1735         case TCP_NODELAY:
1736                 if (sk->sk_state != SMC_INIT &&
1737                     sk->sk_state != SMC_LISTEN &&
1738                     sk->sk_state != SMC_CLOSED) {
1739                         if (val && !smc->use_fallback)
1740                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1741                                                  0);
1742                 }
1743                 break;
1744         case TCP_CORK:
1745                 if (sk->sk_state != SMC_INIT &&
1746                     sk->sk_state != SMC_LISTEN &&
1747                     sk->sk_state != SMC_CLOSED) {
1748                         if (!val && !smc->use_fallback)
1749                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1750                                                  0);
1751                 }
1752                 break;
1753         case TCP_DEFER_ACCEPT:
1754                 smc->sockopt_defer_accept = val;
1755                 break;
1756         default:
1757                 break;
1758         }
1759         release_sock(sk);
1760
1761         return rc;
1762 }
1763
1764 static int smc_getsockopt(struct socket *sock, int level, int optname,
1765                           char __user *optval, int __user *optlen)
1766 {
1767         struct smc_sock *smc;
1768
1769         smc = smc_sk(sock->sk);
1770         /* socket options apply to the CLC socket */
1771         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1772                                              optval, optlen);
1773 }
1774
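/* ioctl: answer the TCP-compatible queries SIOCINQ/FIONREAD, SIOCOUTQ,
 * SIOCOUTQNSD and SIOCATMARK from the SMC connection state; in fallback mode
 * the request is passed on to the clcsock
 */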
1775 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1776                      unsigned long arg)
1777 {
1778         union smc_host_cursor cons, urg;
1779         struct smc_connection *conn;
1780         struct smc_sock *smc;
1781         int answ;
1782
1783         smc = smc_sk(sock->sk);
1784         conn = &smc->conn;
1785         lock_sock(&smc->sk);
1786         if (smc->use_fallback) {
1787                 if (!smc->clcsock) {
1788                         release_sock(&smc->sk);
1789                         return -EBADF;
1790                 }
1791                 answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1792                 release_sock(&smc->sk);
1793                 return answ;
1794         }
1795         switch (cmd) {
1796         case SIOCINQ: /* same as FIONREAD */
1797                 if (smc->sk.sk_state == SMC_LISTEN) {
1798                         release_sock(&smc->sk);
1799                         return -EINVAL;
1800                 }
1801                 if (smc->sk.sk_state == SMC_INIT ||
1802                     smc->sk.sk_state == SMC_CLOSED)
1803                         answ = 0;
1804                 else
1805                         answ = atomic_read(&smc->conn.bytes_to_rcv);
1806                 break;
1807         case SIOCOUTQ:
1808                 /* output queue size (not sent + not acked) */
1809                 if (smc->sk.sk_state == SMC_LISTEN) {
1810                         release_sock(&smc->sk);
1811                         return -EINVAL;
1812                 }
1813                 if (smc->sk.sk_state == SMC_INIT ||
1814                     smc->sk.sk_state == SMC_CLOSED)
1815                         answ = 0;
1816                 else
1817                         answ = smc->conn.sndbuf_desc->len -
1818                                         atomic_read(&smc->conn.sndbuf_space);
1819                 break;
1820         case SIOCOUTQNSD:
1821                 /* output queue size (not-sent data only) */
1822                 if (smc->sk.sk_state == SMC_LISTEN) {
1823                         release_sock(&smc->sk);
1824                         return -EINVAL;
1825                 }
1826                 if (smc->sk.sk_state == SMC_INIT ||
1827                     smc->sk.sk_state == SMC_CLOSED)
1828                         answ = 0;
1829                 else
1830                         answ = smc_tx_prepared_sends(&smc->conn);
1831                 break;
1832         case SIOCATMARK:
1833                 if (smc->sk.sk_state == SMC_LISTEN) {
1834                         release_sock(&smc->sk);
1835                         return -EINVAL;
1836                 }
1837                 if (smc->sk.sk_state == SMC_INIT ||
1838                     smc->sk.sk_state == SMC_CLOSED) {
1839                         answ = 0;
1840                 } else {
1841                         smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1842                         smc_curs_copy(&urg, &conn->urg_curs, conn);
1843                         answ = smc_curs_diff(conn->rmb_desc->len,
1844                                              &cons, &urg) == 1;
1845                 }
1846                 break;
1847         default:
1848                 release_sock(&smc->sk);
1849                 return -ENOIOCTLCMD;
1850         }
1851         release_sock(&smc->sk);
1852
1853         return put_user(answ, (int __user *)arg);
1854 }
1855
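/* sendpage: in fallback mode hand the page to the TCP clcsock; without
 * fallback, sock_no_sendpage() converts the request into a regular sendmsg
 */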
1856 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1857                             int offset, size_t size, int flags)
1858 {
1859         struct sock *sk = sock->sk;
1860         struct smc_sock *smc;
1861         int rc = -EPIPE;
1862
1863         smc = smc_sk(sk);
1864         lock_sock(sk);
1865         if (sk->sk_state != SMC_ACTIVE) {
1866                 release_sock(sk);
1867                 goto out;
1868         }
1869         release_sock(sk);
1870         if (smc->use_fallback)
1871                 rc = kernel_sendpage(smc->clcsock, page, offset,
1872                                      size, flags);
1873         else
1874                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1875
1876 out:
1877         return rc;
1878 }
1879
1880 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1881  * to splice in conn->splice_pending, and press 'go'. Consumer cursor updates
1882  * are delayed until the respective page has been fully processed.
1883  * Note that subsequent recv() calls have to wait until all splice() processing
1884  * is completed.
1885  */
1886 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1887                                struct pipe_inode_info *pipe, size_t len,
1888                                unsigned int flags)
1889 {
1890         struct sock *sk = sock->sk;
1891         struct smc_sock *smc;
1892         int rc = -ENOTCONN;
1893
1894         smc = smc_sk(sk);
1895         lock_sock(sk);
1896         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1897                 /* socket was connected before, no more data to read */
1898                 rc = 0;
1899                 goto out;
1900         }
1901         if (sk->sk_state == SMC_INIT ||
1902             sk->sk_state == SMC_LISTEN ||
1903             sk->sk_state == SMC_CLOSED)
1904                 goto out;
1905
1906         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1907                 rc = 0;
1908                 goto out;
1909         }
1910
1911         if (smc->use_fallback) {
1912                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1913                                                     pipe, len, flags);
1914         } else {
1915                 if (*ppos) {
1916                         rc = -ESPIPE;
1917                         goto out;
1918                 }
1919                 if (flags & SPLICE_F_NONBLOCK)
1920                         flags = MSG_DONTWAIT;
1921                 else
1922                         flags = 0;
1923                 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1924         }
1925 out:
1926         release_sock(sk);
1927
1928         return rc;
1929 }
1930
1931 /* must look like tcp */
1932 static const struct proto_ops smc_sock_ops = {
1933         .family         = PF_SMC,
1934         .owner          = THIS_MODULE,
1935         .release        = smc_release,
1936         .bind           = smc_bind,
1937         .connect        = smc_connect,
1938         .socketpair     = sock_no_socketpair,
1939         .accept         = smc_accept,
1940         .getname        = smc_getname,
1941         .poll           = smc_poll,
1942         .ioctl          = smc_ioctl,
1943         .listen         = smc_listen,
1944         .shutdown       = smc_shutdown,
1945         .setsockopt     = smc_setsockopt,
1946         .getsockopt     = smc_getsockopt,
1947         .sendmsg        = smc_sendmsg,
1948         .recvmsg        = smc_recvmsg,
1949         .mmap           = sock_no_mmap,
1950         .sendpage       = smc_sendpage,
1951         .splice_read    = smc_splice_read,
1952 };
1953
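/* Usage sketch (not code from this file; assumes the uapi constants AF_SMC,
 * SMCPROTO_SMC and SMCPROTO_SMC6): an SMC socket is created from user space
 * like a TCP socket, with the protocol selecting IPv4 or IPv6 for the
 * internal CLC/TCP socket created below:
 *
 *	int fd4 = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *	int fd6 = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC6);
 *
 * anything else is rejected with ESOCKTNOSUPPORT or EPROTONOSUPPORT
 */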
1954 static int smc_create(struct net *net, struct socket *sock, int protocol,
1955                       int kern)
1956 {
1957         int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1958         struct smc_sock *smc;
1959         struct sock *sk;
1960         int rc;
1961
1962         rc = -ESOCKTNOSUPPORT;
1963         if (sock->type != SOCK_STREAM)
1964                 goto out;
1965
1966         rc = -EPROTONOSUPPORT;
1967         if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1968                 goto out;
1969
1970         rc = -ENOBUFS;
1971         sock->ops = &smc_sock_ops;
1972         sk = smc_sock_alloc(net, sock, protocol);
1973         if (!sk)
1974                 goto out;
1975
1976         /* create internal TCP socket for CLC handshake and fallback */
1977         smc = smc_sk(sk);
1978         smc->use_fallback = false; /* assume RDMA capability first */
1979         smc->fallback_rsn = 0;
1980         rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1981                               &smc->clcsock);
1982         if (rc) {
1983                 sk_common_release(sk);
1984                 goto out;
1985         }
1986         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1987         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1988
1989 out:
1990         return rc;
1991 }
1992
1993 static const struct net_proto_family smc_sock_family_ops = {
1994         .family = PF_SMC,
1995         .owner  = THIS_MODULE,
1996         .create = smc_create,
1997 };
1998
1999 unsigned int smc_net_id;
2000
2001 static __net_init int smc_net_init(struct net *net)
2002 {
2003         return smc_pnet_net_init(net);
2004 }
2005
2006 static void __net_exit smc_net_exit(struct net *net)
2007 {
2008         smc_pnet_net_exit(net);
2009 }
2010
2011 static struct pernet_operations smc_net_ops = {
2012         .init = smc_net_init,
2013         .exit = smc_net_exit,
2014         .id   = &smc_net_id,
2015         .size = sizeof(struct smc_net),
2016 };
2017
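/* module init: register the pernet subsystem, the pnet table support, the
 * LLC and CDC handlers, both SMC protos and the PF_SMC socket family, then
 * register with the IB core and enable the tcp_have_smc static branch so TCP
 * handles the SMC experimental option (syn_smc)
 */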
2018 static int __init smc_init(void)
2019 {
2020         int rc;
2021
2022         rc = register_pernet_subsys(&smc_net_ops);
2023         if (rc)
2024                 return rc;
2025
2026         rc = smc_pnet_init();
2027         if (rc)
2028                 goto out_pernet_subsys;
2029
2030         rc = smc_llc_init();
2031         if (rc) {
2032                 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
2033                 goto out_pnet;
2034         }
2035
2036         rc = smc_cdc_init();
2037         if (rc) {
2038                 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
2039                 goto out_pnet;
2040         }
2041
2042         rc = proto_register(&smc_proto, 1);
2043         if (rc) {
2044                 pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
2045                 goto out_pnet;
2046         }
2047
2048         rc = proto_register(&smc_proto6, 1);
2049         if (rc) {
2050                 pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
2051                 goto out_proto;
2052         }
2053
2054         rc = sock_register(&smc_sock_family_ops);
2055         if (rc) {
2056                 pr_err("%s: sock_register fails with %d\n", __func__, rc);
2057                 goto out_proto6;
2058         }
2059         INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
2060         INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
2061
2062         rc = smc_ib_register_client();
2063         if (rc) {
2064                 pr_err("%s: ib_register fails with %d\n", __func__, rc);
2065                 goto out_sock;
2066         }
2067
2068         static_branch_enable(&tcp_have_smc);
2069         return 0;
2070
2071 out_sock:
2072         sock_unregister(PF_SMC);
2073 out_proto6:
2074         proto_unregister(&smc_proto6);
2075 out_proto:
2076         proto_unregister(&smc_proto);
2077 out_pnet:
2078         smc_pnet_exit();
2079 out_pernet_subsys:
2080         unregister_pernet_subsys(&smc_net_ops);
2081
2082         return rc;
2083 }
2084
2085 static void __exit smc_exit(void)
2086 {
2087         smc_core_exit();
2088         static_branch_disable(&tcp_have_smc);
2089         smc_ib_unregister_client();
2090         sock_unregister(PF_SMC);
2091         proto_unregister(&smc_proto6);
2092         proto_unregister(&smc_proto);
2093         smc_pnet_exit();
2094         unregister_pernet_subsys(&smc_net_ops);
2095 }
2096
2097 module_init(smc_init);
2098 module_exit(smc_exit);
2099
2100 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
2101 MODULE_DESCRIPTION("smc socket address family");
2102 MODULE_LICENSE("GPL");
2103 MODULE_ALIAS_NETPROTO(PF_SMC);