Merge tag 'driver-core-5.0-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git...
[sfrench/cifs-2.6.git] / net / smc / af_smc.c
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - support for alternate links postponed
11  *
12  *  Copyright IBM Corp. 2016, 2018
13  *
14  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
15  *              based on prototype from Frank Blaschka
16  */
17
18 #define KMSG_COMPONENT "smc"
19 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
20
21 #include <linux/module.h>
22 #include <linux/socket.h>
23 #include <linux/workqueue.h>
24 #include <linux/in.h>
25 #include <linux/sched/signal.h>
26 #include <linux/if_vlan.h>
27
28 #include <net/sock.h>
29 #include <net/tcp.h>
30 #include <net/smc.h>
31 #include <asm/ioctls.h>
32
33 #include "smc.h"
34 #include "smc_clc.h"
35 #include "smc_llc.h"
36 #include "smc_cdc.h"
37 #include "smc_core.h"
38 #include "smc_ib.h"
39 #include "smc_ism.h"
40 #include "smc_pnet.h"
41 #include "smc_tx.h"
42 #include "smc_rx.h"
43 #include "smc_close.h"
44
45 static DEFINE_MUTEX(smc_create_lgr_pending);    /* serialize link group
46                                                  * creation
47                                                  */
48
49 static void smc_tcp_listen_work(struct work_struct *);
50 static void smc_connect_work(struct work_struct *);
51
52 static void smc_set_keepalive(struct sock *sk, int val)
53 {
54         struct smc_sock *smc = smc_sk(sk);
55
56         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
57 }
58
/* per-address-family socket hash tables, each guarded by its own rwlock */
static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};
66
67 int smc_hash_sk(struct sock *sk)
68 {
69         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
70         struct hlist_head *head;
71
72         head = &h->ht;
73
74         write_lock_bh(&h->lock);
75         sk_add_node(sk, head);
76         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
77         write_unlock_bh(&h->lock);
78
79         return 0;
80 }
81 EXPORT_SYMBOL_GPL(smc_hash_sk);
82
/* Remove @sk from its protocol's hash list; the "inuse" counter is
 * only decremented if the socket was actually hashed.
 */
void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);
93
/* proto definition for AF_SMC over IPv4 */
struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU, /* socks may be recycled under RCU */
};
EXPORT_SYMBOL_GPL(smc_proto);
105
/* proto definition for AF_SMC over IPv6; differs from smc_proto only
 * in name and hash table
 */
struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU, /* socks may be recycled under RCU */
};
EXPORT_SYMBOL_GPL(smc_proto6);
117
/* Close and release an SMC socket.
 *
 * Aborts a dangling non-blocking connect first so the connect worker
 * cannot run against a socket that is going away, performs the active
 * close, releases the internal CLC/TCP socket and drops the final
 * socket reference.
 * Returns 0 or a negative error from the close/shutdown path.
 */
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);

	/* cleanup for a dangling non-blocking connect */
	if (smc->connect_info && sk->sk_state == SMC_INIT)
		tcp_abort(smc->clcsock->sk, ECONNABORTED);
	flush_work(&smc->connect_work);
	kfree(smc->connect_info);
	smc->connect_info = NULL;

	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}

	sk->sk_prot->unhash(sk);

	if (smc->clcsock) {
		if (smc->use_fallback && sk->sk_state == SMC_LISTEN) {
			/* wake up clcsock accept */
			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
		}
		/* serialize against other readers of smc->clcsock,
		 * e.g. smc_clcsock_accept()
		 */
		mutex_lock(&smc->clcsock_release_lock);
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
		mutex_unlock(&smc->clcsock_release_lock);
	}
	if (smc->use_fallback) {
		if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
			sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sock_put(sk); /* final sock_put */
out:
	return rc;
}
180
181 static void smc_destruct(struct sock *sk)
182 {
183         if (sk->sk_state != SMC_CLOSED)
184                 return;
185         if (!sock_flag(sk, SOCK_DEAD))
186                 return;
187
188         sk_refcnt_debug_dec(sk);
189 }
190
/* Allocate and initialize a new SMC socket for @protocol
 * (SMCPROTO_SMC or SMCPROTO_SMC6).
 * Returns the new sock in state SMC_INIT, or NULL on allocation failure.
 */
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	/* pick v4 or v6 proto; they differ only in their hash table */
	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_WORK(&smc->connect_work, smc_connect_work);
	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	spin_lock_init(&smc->conn.send_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);
	mutex_init(&smc->clcsock_release_lock);

	return sk;
}
220
/* Bind the SMC socket by binding the internal CLC/TCP socket to
 * @uaddr, after replicating inet_bind()'s address family checks.
 * Returns 0 or a negative error.
 */
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	/* propagate SO_REUSEADDR before binding the TCP socket */
	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}
261
/* Copy socket options that are not controlled via setsockopt() from
 * @osk to @nsk, plus the sk_flags bits selected by @mask.
 */
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control via setsockopt for */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	/* replace the masked flag bits with osk's values */
	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}
280
/* sk_flags bits mirrored from the SMC socket to the CLC/TCP socket */
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}
302
/* sk_flags bits mirrored back from the CLC/TCP socket to the SMC socket */
#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
312
313 /* register a new rmb, send confirm_rkey msg to register with peer */
314 static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
315                        bool conf_rkey)
316 {
317         if (!rmb_desc->wr_reg) {
318                 /* register memory region for new rmb */
319                 if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
320                         rmb_desc->regerr = 1;
321                         return -EFAULT;
322                 }
323                 rmb_desc->wr_reg = 1;
324         }
325         if (!conf_rkey)
326                 return 0;
327         /* exchange confirm_rkey msg with peer */
328         if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
329                 rmb_desc->regerr = 1;
330                 return -EFAULT;
331         }
332         return 0;
333 }
334
/* Client side of the LLC handshake for the first link of a new link
 * group: wait for CONFIRM LINK, bring the QP to RTS, register the rmb,
 * answer the CONFIRM LINK and the subsequent ADD LINK request.
 * Returns 0 on success, a positive SMC_CLC_DECL_* reason code, or a
 * negative error from smc_clc_wait_msg().
 */
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		/* timeout or interrupt: check whether the peer declined */
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_ERR_RDYLNK;

	smc_wr_remember_qp_attr(link);

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_ERR_REGRMB;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_CL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		/* timeout or interrupt: check whether the peer declined */
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   link->gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_AL;

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}
395
/* save peer RMB data from the SMC-R CLC accept/confirm message */
static void smcr_conn_save_peer_info(struct smc_sock *smc,
				     struct smc_clc_msg_accept_confirm *clc)
{
	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);

	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = bufsize;
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
	/* note the -1: differs from the SMC-D offset computation */
	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}
407
/* save peer DMB data from the SMC-D CLC accept/confirm message */
static void smcd_conn_save_peer_info(struct smc_sock *smc,
				     struct smc_clc_msg_accept_confirm *clc)
{
	int bufsize = smc_uncompress_bufsize(clc->dmbe_size);

	smc->conn.peer_rmbe_idx = clc->dmbe_idx;
	smc->conn.peer_token = clc->token;
	/* msg header takes up space in the buffer */
	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
}
420
421 static void smc_conn_save_peer_info(struct smc_sock *smc,
422                                     struct smc_clc_msg_accept_confirm *clc)
423 {
424         if (smc->conn.lgr->is_smcd)
425                 smcd_conn_save_peer_info(smc, clc);
426         else
427                 smcr_conn_save_peer_info(smc, clc);
428 }
429
/* save the peer's QP attributes (qpn, gid, mac, psn, mtu) from the CLC
 * accept/confirm message in the link
 */
static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}
439
/* fall back during connect: continue on the internal TCP socket;
 * @reason_code is kept for diagnostics. Always returns 0 since the
 * fallback counts as a successful connect.
 */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
	smc->use_fallback = true;
	smc->fallback_rsn = reason_code;
	/* mirror SOL_SOCKET settings onto the TCP socket that will now
	 * carry the traffic
	 */
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;
	return 0;
}
450
/* decline and fall back during connect: send a CLC decline to the peer
 * (unless the peer declined first) and then continue via TCP fallback.
 * A negative @reason_code is a hard error and is returned as-is;
 * otherwise returns the result of smc_connect_fallback() (0).
 */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
{
	int rc;

	if (reason_code < 0) { /* error, fallback is not possible */
		if (smc->sk.sk_state == SMC_INIT)
			sock_put(&smc->sk); /* passive closing */
		return reason_code;
	}
	if (reason_code != SMC_CLC_DECL_PEERDECL) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0) {
			if (smc->sk.sk_state == SMC_INIT)
				sock_put(&smc->sk); /* passive closing */
			return rc;
		}
	}
	return smc_connect_fallback(smc, reason_code);
}
471
/* abort connecting: forget a link group that was created for this very
 * connection attempt, drop the smc_create_lgr_pending mutex, free the
 * connection and pass @reason_code through to the caller
 */
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
			     int local_contact)
{
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
	return reason_code;
}
482
483 /* check if there is a rdma device available for this connection. */
484 /* called for connect and listen */
485 static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
486                           u8 *ibport, unsigned short vlan_id, u8 gid[])
487 {
488         int reason_code = 0;
489
490         /* PNET table look up: search active ib_device and port
491          * within same PNETID that also contains the ethernet device
492          * used for the internal TCP socket
493          */
494         smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
495                                     gid);
496         if (!(*ibdev))
497                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
498
499         return reason_code;
500 }
501
502 /* check if there is an ISM device available for this connection. */
503 /* called for connect and listen */
504 static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
505 {
506         /* Find ISM device with same PNETID as connecting interface  */
507         smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
508         if (!(*ismdev))
509                 return SMC_CLC_DECL_CNFERR; /* configuration error */
510         return 0;
511 }
512
513 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
514 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
515                                       struct smcd_dev *ismdev,
516                                       unsigned short vlan_id)
517 {
518         if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
519                 return SMC_CLC_DECL_CNFERR;
520         return 0;
521 }
522
523 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
524  * used, the VLAN ID will be registered again during the connection setup.
525  */
526 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
527                                         struct smcd_dev *ismdev,
528                                         unsigned short vlan_id)
529 {
530         if (!is_smcd)
531                 return 0;
532         if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
533                 return SMC_CLC_DECL_CNFERR;
534         return 0;
535 }
536
/* CLC handshake during connect: send the proposal for @smc_type and
 * wait for the peer's SMC accept message in @aclc.
 * Returns 0 on success, or the error from send/wait.
 */
static int smc_connect_clc(struct smc_sock *smc, int smc_type,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smc_ib_device *ibdev, u8 ibport,
			   u8 gid[], struct smcd_dev *ismdev)
{
	int rc = 0;

	/* do inband token exchange */
	rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
	if (rc)
		return rc;
	/* receive SMC Accept CLC message */
	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
				CLC_WAIT_TIME);
}
553
/* setup for RDMA connection of client
 *
 * Evaluates the CLC accept data in @aclc: creates or reuses a link
 * group, sets up send buffer and RMB, sends the CLC confirm and - for
 * the first contact on a new link group - runs the CONFIRM LINK
 * handshake with the server.
 * Returns 0 on success; on failure a decline reason code or negative
 * error after freeing the connection via smc_connect_abort() (which
 * also drops the smc_create_lgr_pending mutex).
 */
static int smc_connect_rdma(struct smc_sock *smc,
			    struct smc_clc_msg_accept_confirm *aclc,
			    struct smc_ib_device *ibdev, u8 ibport)
{
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link *link;
	int reason_code = 0;

	/* serialize link group creation; held until the group is usable */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
					ibport, ntoh24(aclc->qpn), &aclc->lcl,
					NULL, 0);
	if (local_contact < 0) {
		if (local_contact == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (local_contact == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		else
			reason_code = SMC_CLC_DECL_INTERR; /* other error */
		return smc_connect_abort(smc, reason_code, 0);
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, aclc);

	/* create send buffer and rmb */
	if (smc_buf_create(smc, false))
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, aclc);

	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
		return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
					 local_contact);

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		/* new link group: bring the QP to ready-to-send state */
		if (smc_ib_ready_link(link))
			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
						 local_contact);
	} else {
		/* reused link group: only register our rmb with the peer */
		if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
						 local_contact);
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	reason_code = smc_clc_send_confirm(smc);
	if (reason_code)
		return smc_connect_abort(smc, reason_code, local_contact);

	smc_tx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code)
			return smc_connect_abort(smc, reason_code,
						 local_contact);
	}
	mutex_unlock(&smc_create_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
}
626
/* setup for ISM connection of client
 *
 * SMC-D counterpart of smc_connect_rdma(): creates or reuses a link
 * group, sets up buffers and sends the CLC confirm.
 * Returns 0 on success; on failure a decline reason code after freeing
 * the connection via smc_connect_abort() (which also drops the
 * smc_create_lgr_pending mutex).
 */
static int smc_connect_ism(struct smc_sock *smc,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smcd_dev *ismdev)
{
	int local_contact = SMC_FIRST_CONTACT;
	int rc = 0;

	/* serialize link group creation; held until the group is usable */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0,
					NULL, ismdev, aclc->gid);
	/* NOTE(review): every smc_conn_create() error is reported as
	 * SMC_CLC_DECL_MEM here, while smc_connect_rdma() maps -ENOMEM,
	 * -ENOLINK etc. to distinct codes - confirm this is intended
	 */
	if (local_contact < 0)
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0);

	/* Create send and receive buffers */
	if (smc_buf_create(smc, true))
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

	smc_conn_save_peer_info(smc, aclc);
	smc_close_init(smc);
	smc_rx_init(smc);
	smc_tx_init(smc);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		return smc_connect_abort(smc, rc, local_contact);
	mutex_unlock(&smc_create_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
}
661
/* perform steps before actually connecting
 *
 * Decides between SMC-D (ISM), SMC-R (RDMA) and TCP fallback, runs the
 * CLC handshake and completes the chosen variant. Called with the sock
 * lock held (from smc_connect() or smc_connect_work()).
 * Returns 0 on success - including successful fallback - or a negative
 * error.
 */
static int __smc_connect(struct smc_sock *smc)
{
	bool ism_supported = false, rdma_supported = false;
	struct smc_clc_msg_accept_confirm aclc;
	struct smc_ib_device *ibdev;
	struct smcd_dev *ismdev;
	u8 gid[SMC_GID_SIZE];
	unsigned short vlan;
	int smc_type;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (smc->use_fallback)
		return smc_connect_fallback(smc, smc->fallback_rsn);

	/* if peer has not signalled SMC-capability, fall back */
	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);

	/* check for VLAN ID */
	if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);

	/* check if there is an ism device available */
	if (!smc_check_ism(smc, &ismdev) &&
	    !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
		/* ISM is supported for this connection */
		ism_supported = true;
		smc_type = SMC_TYPE_D;
	}

	/* check if there is a rdma device available */
	if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
		/* RDMA is supported for this connection */
		rdma_supported = true;
		if (ism_supported)
			smc_type = SMC_TYPE_B; /* both */
		else
			smc_type = SMC_TYPE_R; /* only RDMA */
	}

	/* if neither ISM nor RDMA are supported, fallback */
	if (!rdma_supported && !ism_supported)
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);

	/* perform CLC handshake */
	rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
	if (rc) {
		smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
		return smc_connect_decline_fallback(smc, rc);
	}

	/* depending on previous steps, connect using rdma or ism */
	if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
		rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
	else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
		rc = smc_connect_ism(smc, &aclc, ismdev);
	else
		rc = SMC_CLC_DECL_MODEUNSUPP;
	if (rc) {
		smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
		return smc_connect_decline_fallback(smc, rc);
	}

	/* drop the temporary CLC-handshake VLAN registration, if any */
	smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
	return 0;
}
736
/* worker for non-blocking connect: performs the TCP connect and the
 * SMC handshake under the sock lock, then signals the result to user
 * space via sk_err/state change or write space
 */
static void smc_connect_work(struct work_struct *work)
{
	struct smc_sock *smc = container_of(work, struct smc_sock,
					    connect_work);
	int rc;

	lock_sock(&smc->sk);
	rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
			    smc->connect_info->alen, smc->connect_info->flags);
	if (smc->clcsock->sk->sk_err) {
		smc->sk.sk_err = smc->clcsock->sk->sk_err;
		goto out;
	}
	if (rc < 0) {
		smc->sk.sk_err = -rc; /* sk_err holds positive errno */
		goto out;
	}

	rc = __smc_connect(smc);
	if (rc < 0)
		smc->sk.sk_err = -rc;

out:
	if (smc->sk.sk_err)
		smc->sk.sk_state_change(&smc->sk);
	else
		smc->sk.sk_write_space(&smc->sk);
	kfree(smc->connect_info);
	smc->connect_info = NULL;
	release_sock(&smc->sk);
}
768
/* Connect the SMC socket. Blocking connects run the TCP connect and
 * the SMC handshake inline; non-blocking connects stash the address in
 * connect_info, hand the work to smc_connect_work() and return
 * -EINPROGRESS.
 */
static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	/* signal SMC capability in the TCP SYN */
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	if (flags & O_NONBLOCK) {
		if (smc->connect_info) {
			rc = -EALREADY;
			goto out;
		}
		/* 2 * sizeof(int) presumably covers the flags and alen
		 * members preceding the copied address - TODO confirm
		 * against struct smc_connect_info
		 */
		smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
		if (!smc->connect_info) {
			rc = -ENOMEM;
			goto out;
		}
		smc->connect_info->alen = alen;
		/* clear O_NONBLOCK so the worker connects in blocking mode */
		smc->connect_info->flags = flags ^ O_NONBLOCK;
		memcpy(&smc->connect_info->addr, addr, alen);
		schedule_work(&smc->connect_work);
		rc = -EINPROGRESS;
	} else {
		rc = kernel_connect(smc->clcsock, addr, alen, flags);
		if (rc)
			goto out;

		rc = __smc_connect(smc);
		if (rc < 0)
			goto out;
		else
			rc = 0; /* success cases including fallback */
	}

out:
	release_sock(sk);
out_err:
	return rc;
}
830
/* Accept a new connection on the internal CLC/TCP listen socket and
 * wrap it in a freshly allocated SMC socket.
 * Called with the listen sock locked; the lock is dropped around the
 * blocking kernel_accept() and re-taken before returning.
 * Returns 0 with *new_smc set, or a negative error with *new_smc NULL.
 */
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc = -EINVAL;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	/* clcsock_release_lock guards against a concurrent smc_release()
	 * freeing lsmc->clcsock underneath us
	 */
	mutex_lock(&lsmc->clcsock_release_lock);
	if (lsmc->clcsock)
		rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	mutex_unlock(&lsmc->clcsock_release_lock);
	lock_sock(lsk);
	if  (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		/* listen socket failed or was closed: undo the allocation */
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}
871
872 /* add a just created sock to the accept queue of the listen sock as
873  * candidate for a following socket accept call from user space
874  */
875 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
876 {
877         struct smc_sock *par = smc_sk(parent);
878
879         sock_hold(sk); /* sock_put in smc_accept_unlink () */
880         spin_lock(&par->accept_q_lock);
881         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
882         spin_unlock(&par->accept_q_lock);
883         sk_acceptq_added(parent);
884 }
885
886 /* remove a socket from the accept queue of its parental listening socket */
887 static void smc_accept_unlink(struct sock *sk)
888 {
889         struct smc_sock *par = smc_sk(sk)->listen_smc;
890
891         spin_lock(&par->accept_q_lock);
892         list_del_init(&smc_sk(sk)->accept_q);
893         spin_unlock(&par->accept_q_lock);
894         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
895         sock_put(sk); /* sock_hold in smc_accept_enqueue */
896 }
897
/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 * Returns the dequeued sock, or NULL if the queue holds no usable sock.
 */
struct sock *smc_accept_dequeue(struct sock *parent,
                                struct socket *new_sock)
{
        struct smc_sock *isk, *n;
        struct sock *new_sk;

        list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
                new_sk = (struct sock *)isk;

                smc_accept_unlink(new_sk);
                if (new_sk->sk_state == SMC_CLOSED) {
                        /* sock was closed while queued: drop it and keep
                         * scanning for a usable one
                         */
                        if (isk->clcsock) {
                                sock_release(isk->clcsock);
                                isk->clcsock = NULL;
                        }
                        new_sk->sk_prot->unhash(new_sk);
                        sock_put(new_sk); /* final */
                        continue;
                }
                if (new_sock)
                        /* attach the child sock to the user's new socket */
                        sock_graft(new_sk, new_sock);
                return new_sk;
        }
        return NULL;
}
926
/* clean up for a created but never accepted sock
 * Tears down the SMC state, releases the internal CLC socket and drops
 * the remaining sock references.
 */
void smc_close_non_accepted(struct sock *sk)
{
        struct smc_sock *smc = smc_sk(sk);

        lock_sock(sk);
        if (!sk->sk_lingertime)
                /* wait for peer closing */
                sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
        if (!smc->use_fallback) {
                /* native SMC sock: run the active close state machine */
                smc_close_active(smc);
                sock_set_flag(sk, SOCK_DEAD);
                sk->sk_shutdown |= SHUTDOWN_MASK;
        }
        if (smc->clcsock) {
                /* release the internal TCP socket */
                struct socket *tcp;

                tcp = smc->clcsock;
                smc->clcsock = NULL;
                sock_release(tcp);
        }
        if (smc->use_fallback) {
                sock_put(sk); /* passive closing */
                sk->sk_state = SMC_CLOSED;
        } else {
                if (sk->sk_state == SMC_CLOSED)
                        smc_conn_free(&smc->conn);
        }
        release_sock(sk);
        sk->sk_prot->unhash(sk);
        sock_put(sk); /* final sock_put */
}
959
/* server side of the CONFIRM LINK / ADD LINK exchange run for the first
 * contact over a new link group
 * Returns 0 on success or an SMC_CLC_DECL_* reason code / negative error.
 */
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
        struct net *net = sock_net(smc->clcsock->sk);
        struct smc_link_group *lgr = smc->conn.lgr;
        struct smc_link *link;
        int rest;
        int rc;

        link = &lgr->lnk[SMC_SINGLE_LINK];

        /* register the connection's rmb before confirming the link */
        if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
                return SMC_CLC_DECL_ERR_REGRMB;

        /* send CONFIRM LINK request to client over the RoCE fabric */
        rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
        if (rc < 0)
                return SMC_CLC_DECL_TIMEOUT_CL;

        /* receive CONFIRM LINK response from client over the RoCE fabric */
        rest = wait_for_completion_interruptible_timeout(
                &link->llc_confirm_resp,
                SMC_LLC_WAIT_FIRST_TIME);
        if (rest <= 0) {
                /* timeout/interrupt: check whether the peer sent a CLC
                 * decline instead of the expected response
                 */
                struct smc_clc_msg_decline dclc;

                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
                                      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
                return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
        }

        if (link->llc_confirm_resp_rc)
                return SMC_CLC_DECL_RMBE_EC;

        /* send ADD LINK request to client over the RoCE fabric */
        rc = smc_llc_send_add_link(link,
                                   link->smcibdev->mac[link->ibport - 1],
                                   link->gid, SMC_LLC_REQ);
        if (rc < 0)
                return SMC_CLC_DECL_TIMEOUT_AL;

        /* receive ADD LINK response from client over the RoCE fabric */
        rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
                                                         SMC_LLC_WAIT_TIME);
        if (rest <= 0) {
                struct smc_clc_msg_decline dclc;

                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
                                      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
                return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
        }

        smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

        return 0;
}
1015
/* listen worker: finish
 * Enqueue the new sock at the listen sock (or dismantle it if the parent
 * stopped listening), wake a sleeping accept() and drop the listen sock
 * reference taken in smc_tcp_listen_work().
 */
static void smc_listen_out(struct smc_sock *new_smc)
{
        struct smc_sock *lsmc = new_smc->listen_smc;
        struct sock *newsmcsk = &new_smc->sk;

        /* lockdep nesting annotation for taking the listen sock lock
         * from worker context
         */
        lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
        if (lsmc->sk.sk_state == SMC_LISTEN) {
                smc_accept_enqueue(&lsmc->sk, newsmcsk);
        } else { /* no longer listening */
                smc_close_non_accepted(newsmcsk);
        }
        release_sock(&lsmc->sk);

        /* Wake up accept */
        lsmc->sk.sk_data_ready(&lsmc->sk);
        sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}
1034
1035 /* listen worker: finish in state connected */
1036 static void smc_listen_out_connected(struct smc_sock *new_smc)
1037 {
1038         struct sock *newsmcsk = &new_smc->sk;
1039
1040         sk_refcnt_debug_inc(newsmcsk);
1041         if (newsmcsk->sk_state == SMC_INIT)
1042                 newsmcsk->sk_state = SMC_ACTIVE;
1043
1044         smc_listen_out(new_smc);
1045 }
1046
/* listen worker: finish in error state
 * Drops the "passive closing" reference if the sock never left SMC_INIT,
 * frees the connection and hands the closed sock back via smc_listen_out().
 */
static void smc_listen_out_err(struct smc_sock *new_smc)
{
        struct sock *newsmcsk = &new_smc->sk;

        if (newsmcsk->sk_state == SMC_INIT)
                sock_put(&new_smc->sk); /* passive closing */
        newsmcsk->sk_state = SMC_CLOSED;
        smc_conn_free(&new_smc->conn);

        smc_listen_out(new_smc);
}
1059
/* listen worker: decline and fall back if possible
 * For a negative @reason_code (internal error) the sock ends in error
 * state; otherwise the connection falls back to TCP and a CLC decline is
 * sent to the peer (unless the peer declined first).
 */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
                               int local_contact)
{
        /* RDMA setup failed, switch back to TCP */
        if (local_contact == SMC_FIRST_CONTACT)
                smc_lgr_forget(new_smc->conn.lgr);
        if (reason_code < 0) { /* error, no fallback possible */
                smc_listen_out_err(new_smc);
                return;
        }
        smc_conn_free(&new_smc->conn);
        new_smc->use_fallback = true;
        new_smc->fallback_rsn = reason_code;
        if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
                /* tell the peer why we decline, unless it declined first */
                if (smc_clc_send_decline(new_smc, reason_code) < 0) {
                        smc_listen_out_err(new_smc);
                        return;
                }
        }
        smc_listen_out_connected(new_smc);
}
1082
1083 /* listen worker: check prefixes */
1084 static int smc_listen_rdma_check(struct smc_sock *new_smc,
1085                                  struct smc_clc_msg_proposal *pclc)
1086 {
1087         struct smc_clc_msg_proposal_prefix *pclc_prfx;
1088         struct socket *newclcsock = new_smc->clcsock;
1089
1090         pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1091         if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1092                 return SMC_CLC_DECL_CNFERR;
1093
1094         return 0;
1095 }
1096
1097 /* listen worker: initialize connection and buffers */
1098 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1099                                 struct smc_clc_msg_proposal *pclc,
1100                                 struct smc_ib_device *ibdev, u8 ibport,
1101                                 int *local_contact)
1102 {
1103         /* allocate connection / link group */
1104         *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0,
1105                                          &pclc->lcl, NULL, 0);
1106         if (*local_contact < 0) {
1107                 if (*local_contact == -ENOMEM)
1108                         return SMC_CLC_DECL_MEM;/* insufficient memory*/
1109                 return SMC_CLC_DECL_INTERR; /* other error */
1110         }
1111
1112         /* create send buffer and rmb */
1113         if (smc_buf_create(new_smc, false))
1114                 return SMC_CLC_DECL_MEM;
1115
1116         return 0;
1117 }
1118
1119 /* listen worker: initialize connection and buffers for SMC-D */
1120 static int smc_listen_ism_init(struct smc_sock *new_smc,
1121                                struct smc_clc_msg_proposal *pclc,
1122                                struct smcd_dev *ismdev,
1123                                int *local_contact)
1124 {
1125         struct smc_clc_msg_smcd *pclc_smcd;
1126
1127         pclc_smcd = smc_get_clc_msg_smcd(pclc);
1128         *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL,
1129                                          ismdev, pclc_smcd->gid);
1130         if (*local_contact < 0) {
1131                 if (*local_contact == -ENOMEM)
1132                         return SMC_CLC_DECL_MEM;/* insufficient memory*/
1133                 return SMC_CLC_DECL_INTERR; /* other error */
1134         }
1135
1136         /* Check if peer can be reached via ISM device */
1137         if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1138                             new_smc->conn.lgr->vlan_id,
1139                             new_smc->conn.lgr->smcd)) {
1140                 if (*local_contact == SMC_FIRST_CONTACT)
1141                         smc_lgr_forget(new_smc->conn.lgr);
1142                 smc_conn_free(&new_smc->conn);
1143                 return SMC_CLC_DECL_CNFERR;
1144         }
1145
1146         /* Create send and receive buffers */
1147         if (smc_buf_create(new_smc, true)) {
1148                 if (*local_contact == SMC_FIRST_CONTACT)
1149                         smc_lgr_forget(new_smc->conn.lgr);
1150                 smc_conn_free(&new_smc->conn);
1151                 return SMC_CLC_DECL_MEM;
1152         }
1153
1154         return 0;
1155 }
1156
1157 /* listen worker: register buffers */
1158 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1159 {
1160         struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1161
1162         if (local_contact != SMC_FIRST_CONTACT) {
1163                 if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
1164                         return SMC_CLC_DECL_ERR_REGRMB;
1165         }
1166         smc_rmb_sync_sg_for_device(&new_smc->conn);
1167
1168         return 0;
1169 }
1170
/* listen worker: finish RDMA setup
 * Returns 0 on success; on failure declines / falls back via
 * smc_listen_decline() and returns the reason code.
 */
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
                                  struct smc_clc_msg_accept_confirm *cclc,
                                  int local_contact)
{
        struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
        int reason_code = 0;

        if (local_contact == SMC_FIRST_CONTACT)
                smc_link_save_peer_info(link, cclc);

        /* process the rmb rtoken received in the peer's confirm message */
        if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
                reason_code = SMC_CLC_DECL_ERR_RTOK;
                goto decline;
        }

        if (local_contact == SMC_FIRST_CONTACT) {
                /* bring the new link up before the LLC handshake */
                if (smc_ib_ready_link(link)) {
                        reason_code = SMC_CLC_DECL_ERR_RDYLNK;
                        goto decline;
                }
                /* QP confirmation over RoCE fabric */
                reason_code = smc_serv_conf_first_link(new_smc);
                if (reason_code)
                        goto decline;
        }
        return 0;

decline:
        smc_listen_decline(new_smc, reason_code, local_contact);
        return reason_code;
}
1203
/* setup for RDMA connection of server
 * Worker scheduled per accepted TCP connection: runs the CLC handshake
 * (proposal/accept/confirm), selects SMC-D or SMC-R, and hands the sock
 * over connected or falls back / declines on failure.
 */
static void smc_listen_work(struct work_struct *work)
{
        struct smc_sock *new_smc = container_of(work, struct smc_sock,
                                                smc_listen_work);
        struct socket *newclcsock = new_smc->clcsock;
        struct smc_clc_msg_accept_confirm cclc;
        struct smc_clc_msg_proposal *pclc;
        struct smc_ib_device *ibdev;
        bool ism_supported = false;
        struct smcd_dev *ismdev;
        u8 buf[SMC_CLC_MAX_LEN];
        int local_contact = 0;
        unsigned short vlan;
        int reason_code = 0;
        int rc = 0;
        u8 ibport;

        if (new_smc->use_fallback) {
                /* fallback already decided: nothing to negotiate */
                smc_listen_out_connected(new_smc);
                return;
        }

        /* check if peer is smc capable */
        if (!tcp_sk(newclcsock->sk)->syn_smc) {
                new_smc->use_fallback = true;
                new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
                smc_listen_out_connected(new_smc);
                return;
        }

        /* do inband token exchange -
         * wait for and receive SMC Proposal CLC message
         */
        pclc = (struct smc_clc_msg_proposal *)&buf;
        reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
                                       SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
        if (reason_code) {
                smc_listen_decline(new_smc, reason_code, 0);
                return;
        }

        /* IPSec connections opt out of SMC-R optimizations */
        if (using_ipsec(new_smc)) {
                smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
                return;
        }

        /* serialize link group creation (see smc_create_lgr_pending);
         * held across the accept/confirm exchange below
         */
        mutex_lock(&smc_create_lgr_pending);
        smc_close_init(new_smc);
        smc_rx_init(new_smc);
        smc_tx_init(new_smc);

        /* check if ISM is available */
        if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
            !smc_check_ism(new_smc, &ismdev) &&
            !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
                ism_supported = true;
        }

        /* check if RDMA is available */
        if (!ism_supported &&
            ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
             smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
             smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
             smc_listen_rdma_check(new_smc, pclc) ||
             smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
                                  &local_contact) ||
             smc_listen_rdma_reg(new_smc, local_contact))) {
                /* SMC not supported, decline */
                mutex_unlock(&smc_create_lgr_pending);
                smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
                                   local_contact);
                return;
        }

        /* send SMC Accept CLC message */
        rc = smc_clc_send_accept(new_smc, local_contact);
        if (rc) {
                mutex_unlock(&smc_create_lgr_pending);
                smc_listen_decline(new_smc, rc, local_contact);
                return;
        }

        /* receive SMC Confirm CLC message */
        reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
                                       SMC_CLC_CONFIRM, CLC_WAIT_TIME);
        if (reason_code) {
                mutex_unlock(&smc_create_lgr_pending);
                smc_listen_decline(new_smc, reason_code, local_contact);
                return;
        }

        /* finish worker */
        if (!ism_supported) {
                if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) {
                        mutex_unlock(&smc_create_lgr_pending);
                        return;
                }
        }
        smc_conn_save_peer_info(new_smc, &cclc);
        mutex_unlock(&smc_create_lgr_pending);
        smc_listen_out_connected(new_smc);
}
1308
/* worker for the listening sock: accept new TCP connections in a loop
 * and schedule one smc_listen_work instance per accepted connection
 */
static void smc_tcp_listen_work(struct work_struct *work)
{
        struct smc_sock *lsmc = container_of(work, struct smc_sock,
                                             tcp_listen_work);
        struct sock *lsk = &lsmc->sk;
        struct smc_sock *new_smc;
        int rc = 0;

        lock_sock(lsk);
        while (lsk->sk_state == SMC_LISTEN) {
                rc = smc_clcsock_accept(lsmc, &new_smc);
                if (rc)
                        goto out;
                if (!new_smc)
                        continue;

                /* child inherits fallback state and buffer sizes from the
                 * listen sock
                 */
                new_smc->listen_smc = lsmc;
                new_smc->use_fallback = lsmc->use_fallback;
                new_smc->fallback_rsn = lsmc->fallback_rsn;
                sock_hold(lsk); /* sock_put in smc_listen_work */
                INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
                smc_copy_sock_settings_to_smc(new_smc);
                new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
                new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
                sock_hold(&new_smc->sk); /* sock_put in passive closing */
                /* drop the reference again if the work was already queued */
                if (!schedule_work(&new_smc->smc_listen_work))
                        sock_put(&new_smc->sk);
        }

out:
        release_sock(lsk);
        sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}
1342
/* move the SMC socket into listen state: put the internal CLC/TCP socket
 * into listen and start the tcp_listen worker that accepts connections
 */
static int smc_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc;

        smc = smc_sk(sk);
        lock_sock(sk);

        rc = -EINVAL;
        if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
                goto out;

        rc = 0;
        if (sk->sk_state == SMC_LISTEN) {
                /* already listening: only update the backlog */
                sk->sk_max_ack_backlog = backlog;
                goto out;
        }
        /* some socket options are handled in core, so we could not apply
         * them to the clc socket -- copy smc socket options to clc socket
         */
        smc_copy_sock_settings_to_clc(smc);
        /* advertise SMC capability in SYN packets unless falling back */
        if (!smc->use_fallback)
                tcp_sk(smc->clcsock->sk)->syn_smc = 1;

        rc = kernel_listen(smc->clcsock, backlog);
        if (rc)
                goto out;
        sk->sk_max_ack_backlog = backlog;
        sk->sk_ack_backlog = 0;
        sk->sk_state = SMC_LISTEN;
        sock_hold(sk); /* sock_hold in tcp_listen_worker */
        /* drop the reference again if the work was already queued */
        if (!schedule_work(&smc->tcp_listen_work))
                sock_put(sk);

out:
        release_sock(sk);
        return rc;
}
1382
/* accept a connection on the SMC listen socket: wait until the listen
 * worker has queued a child sock and dequeue it; optionally defer until
 * data has arrived (TCP_DEFER_ACCEPT emulation)
 */
static int smc_accept(struct socket *sock, struct socket *new_sock,
                      int flags, bool kern)
{
        struct sock *sk = sock->sk, *nsk;
        DECLARE_WAITQUEUE(wait, current);
        struct smc_sock *lsmc;
        long timeo;
        int rc = 0;

        lsmc = smc_sk(sk);
        sock_hold(sk); /* sock_put below */
        lock_sock(sk);

        if (lsmc->sk.sk_state != SMC_LISTEN) {
                rc = -EINVAL;
                release_sock(sk);
                goto out;
        }

        /* Wait for an incoming connection */
        timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
        add_wait_queue_exclusive(sk_sleep(sk), &wait);
        while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (!timeo) {
                        /* non-blocking or timeout expired */
                        rc = -EAGAIN;
                        break;
                }
                /* drop the sock lock while sleeping */
                release_sock(sk);
                timeo = schedule_timeout(timeo);
                /* wakeup by sk_data_ready in smc_listen_work() */
                sched_annotate_sleep();
                lock_sock(sk);
                if (signal_pending(current)) {
                        rc = sock_intr_errno(timeo);
                        break;
                }
        }
        set_current_state(TASK_RUNNING);
        remove_wait_queue(sk_sleep(sk), &wait);

        if (!rc)
                rc = sock_error(nsk);
        release_sock(sk);
        if (rc)
                goto out;

        if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
                /* wait till data arrives on the socket */
                timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
                                                                MSEC_PER_SEC);
                if (smc_sk(nsk)->use_fallback) {
                        /* fallback: wait on the internal TCP socket */
                        struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

                        lock_sock(clcsk);
                        if (skb_queue_empty(&clcsk->sk_receive_queue))
                                sk_wait_data(clcsk, &timeo, NULL);
                        release_sock(clcsk);
                } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
                        lock_sock(nsk);
                        smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
                        release_sock(nsk);
                }
        }

out:
        sock_put(sk); /* sock_hold above */
        return rc;
}
1452
1453 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1454                        int peer)
1455 {
1456         struct smc_sock *smc;
1457
1458         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1459             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1460                 return -ENOTCONN;
1461
1462         smc = smc_sk(sock->sk);
1463
1464         return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1465 }
1466
/* send data on an SMC socket; delegates to the CLC socket in fallback
 * mode, otherwise to the SMC transmit path
 */
static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc = -EPIPE;

        smc = smc_sk(sk);
        lock_sock(sk);
        if ((sk->sk_state != SMC_ACTIVE) &&
            (sk->sk_state != SMC_APPCLOSEWAIT1) &&
            (sk->sk_state != SMC_INIT))
                goto out;

        if (msg->msg_flags & MSG_FASTOPEN) {
                /* MSG_FASTOPEN is not supported by SMC; fall back to TCP
                 * while the connection was not started yet
                 */
                if (sk->sk_state == SMC_INIT) {
                        smc->use_fallback = true;
                        smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
                } else {
                        rc = -EINVAL;
                        goto out;
                }
        }

        if (smc->use_fallback)
                rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
        else
                rc = smc_tx_sendmsg(smc, msg, len);
out:
        release_sock(sk);
        return rc;
}
1498
/* receive data from an SMC socket; delegates to the CLC socket in
 * fallback mode, otherwise to the SMC receive path
 */
static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                       int flags)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc = -ENOTCONN;

        smc = smc_sk(sk);
        lock_sock(sk);
        if ((sk->sk_state == SMC_INIT) ||
            (sk->sk_state == SMC_LISTEN) ||
            (sk->sk_state == SMC_CLOSED))
                goto out;

        if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
                /* peer shut down: report end of stream */
                rc = 0;
                goto out;
        }

        if (smc->use_fallback) {
                rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
        } else {
                msg->msg_namelen = 0;
                rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
        }

out:
        release_sock(sk);
        return rc;
}
1529
1530 static __poll_t smc_accept_poll(struct sock *parent)
1531 {
1532         struct smc_sock *isk = smc_sk(parent);
1533         __poll_t mask = 0;
1534
1535         spin_lock(&isk->accept_q_lock);
1536         if (!list_empty(&isk->accept_q))
1537                 mask = EPOLLIN | EPOLLRDNORM;
1538         spin_unlock(&isk->accept_q_lock);
1539
1540         return mask;
1541 }
1542
/* poll handler for SMC sockets; delegates to the CLC socket in fallback
 * mode, otherwise derives the event mask from the SMC connection state
 */
static __poll_t smc_poll(struct file *file, struct socket *sock,
                             poll_table *wait)
{
        struct sock *sk = sock->sk;
        __poll_t mask = 0;
        struct smc_sock *smc;

        if (!sk)
                return EPOLLNVAL;

        smc = smc_sk(sock->sk);
        if (smc->use_fallback) {
                /* delegate to CLC child sock */
                mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
                sk->sk_err = smc->clcsock->sk->sk_err;
                if (sk->sk_err)
                        mask |= EPOLLERR;
        } else {
                if (sk->sk_state != SMC_CLOSED)
                        sock_poll_wait(file, sock, wait);
                if (sk->sk_err)
                        mask |= EPOLLERR;
                if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
                    (sk->sk_state == SMC_CLOSED))
                        mask |= EPOLLHUP;
                if (sk->sk_state == SMC_LISTEN) {
                        /* woken up by sk_data_ready in smc_listen_work() */
                        mask = smc_accept_poll(sk);
                } else {
                        /* writable while send buffer space is available */
                        if (atomic_read(&smc->conn.sndbuf_space) ||
                            sk->sk_shutdown & SEND_SHUTDOWN) {
                                mask |= EPOLLOUT | EPOLLWRNORM;
                        } else {
                                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                        }
                        if (atomic_read(&smc->conn.bytes_to_rcv))
                                mask |= EPOLLIN | EPOLLRDNORM;
                        if (sk->sk_shutdown & RCV_SHUTDOWN)
                                mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
                        if (sk->sk_state == SMC_APPCLOSEWAIT1)
                                mask |= EPOLLIN;
                        /* urgent data pending */
                        if (smc->conn.urg_state == SMC_URG_VALID)
                                mask |= EPOLLPRI;
                }
        }

        return mask;
}
1592
/* shut down one or both directions of an SMC socket; in fallback mode the
 * shutdown is passed straight to the internal CLC/TCP socket
 */
static int smc_shutdown(struct socket *sock, int how)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc = -EINVAL;
        int rc1 = 0;

        smc = smc_sk(sk);

        if ((how < SHUT_RD) || (how > SHUT_RDWR))
                return rc;

        lock_sock(sk);

        rc = -ENOTCONN;
        /* shutdown is only meaningful in connected/closing states */
        if ((sk->sk_state != SMC_ACTIVE) &&
            (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
            (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
            (sk->sk_state != SMC_APPCLOSEWAIT1) &&
            (sk->sk_state != SMC_APPCLOSEWAIT2) &&
            (sk->sk_state != SMC_APPFINCLOSEWAIT))
                goto out;
        if (smc->use_fallback) {
                rc = kernel_sock_shutdown(smc->clcsock, how);
                sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
                if (sk->sk_shutdown == SHUTDOWN_MASK)
                        sk->sk_state = SMC_CLOSED;
                goto out;
        }
        switch (how) {
        case SHUT_RDWR:         /* shutdown in both directions */
                rc = smc_close_active(smc);
                break;
        case SHUT_WR:
                rc = smc_close_shutdown_write(smc);
                break;
        case SHUT_RD:
                rc = 0;
                /* nothing more to do because peer is not involved */
                break;
        }
        if (smc->clcsock)
                rc1 = kernel_sock_shutdown(smc->clcsock, how);
        /* map sock_shutdown_cmd constants to sk_shutdown value range */
        sk->sk_shutdown |= how + 1;

out:
        release_sock(sk);
        return rc ? rc : rc1;
}
1643
/* Set a socket option on an SMC socket.
 *
 * Every request is first forwarded to the internal CLC (TCP) socket so that
 * generic/TCP options stay in sync there; only afterwards are the few
 * TCP-level options that SMC must react to handled locally.
 */
static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	/* mirror any error state the CLC socket picked up onto the SMC sock */
	if (smc->clcsock->sk->sk_err) {
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk->sk_error_report(sk);
	}
	if (rc)
		return rc;

	/* every option handled below carries an int value */
	if (optlen < sizeof(int))
		return -EINVAL;
	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);
	switch (optname) {
	case TCP_ULP:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT) {
			/* no SMC handshake yet: silently fall back to TCP */
			smc->use_fallback = true;
			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
		} else {
			/* connection already uses SMC: cannot honour option */
			if (!smc->use_fallback)
				rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			/* NODELAY set: kick the TX worker immediately */
			if (val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_CORK:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			/* cork released: push out any pending data now */
			if (!val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_DEFER_ACCEPT:
		/* stored here; consumed by the accept/receive path elsewhere */
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
	release_sock(sk);

	return rc;
}
1710
1711 static int smc_getsockopt(struct socket *sock, int level, int optname,
1712                           char __user *optval, int __user *optlen)
1713 {
1714         struct smc_sock *smc;
1715
1716         smc = smc_sk(sock->sk);
1717         /* socket options apply to the CLC socket */
1718         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1719                                              optval, optlen);
1720 }
1721
1722 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1723                      unsigned long arg)
1724 {
1725         union smc_host_cursor cons, urg;
1726         struct smc_connection *conn;
1727         struct smc_sock *smc;
1728         int answ;
1729
1730         smc = smc_sk(sock->sk);
1731         conn = &smc->conn;
1732         lock_sock(&smc->sk);
1733         if (smc->use_fallback) {
1734                 if (!smc->clcsock) {
1735                         release_sock(&smc->sk);
1736                         return -EBADF;
1737                 }
1738                 answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1739                 release_sock(&smc->sk);
1740                 return answ;
1741         }
1742         switch (cmd) {
1743         case SIOCINQ: /* same as FIONREAD */
1744                 if (smc->sk.sk_state == SMC_LISTEN) {
1745                         release_sock(&smc->sk);
1746                         return -EINVAL;
1747                 }
1748                 if (smc->sk.sk_state == SMC_INIT ||
1749                     smc->sk.sk_state == SMC_CLOSED)
1750                         answ = 0;
1751                 else
1752                         answ = atomic_read(&smc->conn.bytes_to_rcv);
1753                 break;
1754         case SIOCOUTQ:
1755                 /* output queue size (not send + not acked) */
1756                 if (smc->sk.sk_state == SMC_LISTEN) {
1757                         release_sock(&smc->sk);
1758                         return -EINVAL;
1759                 }
1760                 if (smc->sk.sk_state == SMC_INIT ||
1761                     smc->sk.sk_state == SMC_CLOSED)
1762                         answ = 0;
1763                 else
1764                         answ = smc->conn.sndbuf_desc->len -
1765                                         atomic_read(&smc->conn.sndbuf_space);
1766                 break;
1767         case SIOCOUTQNSD:
1768                 /* output queue size (not send only) */
1769                 if (smc->sk.sk_state == SMC_LISTEN) {
1770                         release_sock(&smc->sk);
1771                         return -EINVAL;
1772                 }
1773                 if (smc->sk.sk_state == SMC_INIT ||
1774                     smc->sk.sk_state == SMC_CLOSED)
1775                         answ = 0;
1776                 else
1777                         answ = smc_tx_prepared_sends(&smc->conn);
1778                 break;
1779         case SIOCATMARK:
1780                 if (smc->sk.sk_state == SMC_LISTEN) {
1781                         release_sock(&smc->sk);
1782                         return -EINVAL;
1783                 }
1784                 if (smc->sk.sk_state == SMC_INIT ||
1785                     smc->sk.sk_state == SMC_CLOSED) {
1786                         answ = 0;
1787                 } else {
1788                         smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1789                         smc_curs_copy(&urg, &conn->urg_curs, conn);
1790                         answ = smc_curs_diff(conn->rmb_desc->len,
1791                                              &cons, &urg) == 1;
1792                 }
1793                 break;
1794         default:
1795                 release_sock(&smc->sk);
1796                 return -ENOIOCTLCMD;
1797         }
1798         release_sock(&smc->sk);
1799
1800         return put_user(answ, (int __user *)arg);
1801 }
1802
1803 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1804                             int offset, size_t size, int flags)
1805 {
1806         struct sock *sk = sock->sk;
1807         struct smc_sock *smc;
1808         int rc = -EPIPE;
1809
1810         smc = smc_sk(sk);
1811         lock_sock(sk);
1812         if (sk->sk_state != SMC_ACTIVE) {
1813                 release_sock(sk);
1814                 goto out;
1815         }
1816         release_sock(sk);
1817         if (smc->use_fallback)
1818                 rc = kernel_sendpage(smc->clcsock, page, offset,
1819                                      size, flags);
1820         else
1821                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1822
1823 out:
1824         return rc;
1825 }
1826
1827 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1828  * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
1829  * updates till whenever a respective page has been fully processed.
1830  * Note that subsequent recv() calls have to wait till all splice() processing
1831  * completed.
1832  */
1833 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1834                                struct pipe_inode_info *pipe, size_t len,
1835                                unsigned int flags)
1836 {
1837         struct sock *sk = sock->sk;
1838         struct smc_sock *smc;
1839         int rc = -ENOTCONN;
1840
1841         smc = smc_sk(sk);
1842         lock_sock(sk);
1843
1844         if (sk->sk_state == SMC_INIT ||
1845             sk->sk_state == SMC_LISTEN ||
1846             sk->sk_state == SMC_CLOSED)
1847                 goto out;
1848
1849         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1850                 rc = 0;
1851                 goto out;
1852         }
1853
1854         if (smc->use_fallback) {
1855                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1856                                                     pipe, len, flags);
1857         } else {
1858                 if (*ppos) {
1859                         rc = -ESPIPE;
1860                         goto out;
1861                 }
1862                 if (flags & SPLICE_F_NONBLOCK)
1863                         flags = MSG_DONTWAIT;
1864                 else
1865                         flags = 0;
1866                 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1867         }
1868 out:
1869         release_sock(sk);
1870
1871         return rc;
1872 }
1873
/* must look like tcp */
/* proto_ops for AF_SMC sockets; handlers not implemented for SMC use the
 * generic sock_no_* stubs (socketpair, mmap)
 */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
1896
1897 static int smc_create(struct net *net, struct socket *sock, int protocol,
1898                       int kern)
1899 {
1900         int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1901         struct smc_sock *smc;
1902         struct sock *sk;
1903         int rc;
1904
1905         rc = -ESOCKTNOSUPPORT;
1906         if (sock->type != SOCK_STREAM)
1907                 goto out;
1908
1909         rc = -EPROTONOSUPPORT;
1910         if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1911                 goto out;
1912
1913         rc = -ENOBUFS;
1914         sock->ops = &smc_sock_ops;
1915         sk = smc_sock_alloc(net, sock, protocol);
1916         if (!sk)
1917                 goto out;
1918
1919         /* create internal TCP socket for CLC handshake and fallback */
1920         smc = smc_sk(sk);
1921         smc->use_fallback = false; /* assume rdma capability first */
1922         smc->fallback_rsn = 0;
1923         rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1924                               &smc->clcsock);
1925         if (rc) {
1926                 sk_common_release(sk);
1927                 goto out;
1928         }
1929         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1930         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1931
1932 out:
1933         return rc;
1934 }
1935
/* registration record handed to sock_register(): routes socket(AF_SMC, ...)
 * calls to smc_create()
 */
static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};
1941
/* Module init: bring up the SMC subsystems in dependency order (pnet table,
 * LLC, CDC, both protos, socket family, IB client), then enable the TCP
 * experimental-option handshake. On any failure, previously completed steps
 * are unwound in reverse order via the goto ladder.
 */
static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	/* let TCP advertise the SMC experimental option from now on */
	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}
2001
/* Module exit: tear down in roughly the reverse order of smc_init().
 * smc_core_exit() runs first — presumably to drain existing link groups
 * before unregistering (defined in smc_core; not visible here).
 */
static void __exit smc_exit(void)
{
	smc_core_exit();
	/* stop TCP from offering the SMC experimental option */
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}
2012
module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
/* auto-load this module when a socket(AF_SMC, ...) is requested */
MODULE_ALIAS_NETPROTO(PF_SMC);