1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
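/* Derive the initial sequence number for our reply from the 4-tuple of the
 * incoming segment (daddr/dest come first: they are our local address/port).
 */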
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96         return secure_tcp_seq(ip_hdr(skb)->daddr,
97                               ip_hdr(skb)->saddr,
98                               tcp_hdr(skb)->dest,
99                               tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
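/* Decide whether a TIME-WAIT socket occupying the desired 4-tuple may be
 * recycled for a new outgoing connection; returns 1 if reuse is safe.
 */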
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct inet_timewait_sock *tw = inet_twsk(sktw);
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
114         if (reuse == 2) {
115                 /* This still does not detect *everything* that goes through
116                  * lo, since we require a loopback src or dst address
117                  * or a direct binding to the 'lo' interface.
118                  */
119                 bool loopback = false;
120                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121                         loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123                 if (tw->tw_family == AF_INET6) {
124                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128                                 loopback = true;
129                 } else
130 #endif
131                 {
132                         if (ipv4_is_loopback(tw->tw_daddr) ||
133                             ipv4_is_loopback(tw->tw_rcv_saddr))
134                                 loopback = true;
135                 }
136                 if (!loopback)
137                         reuse = 0;
138         }
139
140         /* With PAWS, it is safe from the viewpoint
141            of data integrity. Even without PAWS it is safe provided sequence
142            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
143
144            Actually, the idea is close to VJ's: only the timestamp cache is
145            held not per host but per port pair, and the TW bucket is used as
146            the state holder.
147
148            If the TW bucket has already been destroyed we fall back to VJ's
149            scheme and use the initial timestamp retrieved from the peer table.
150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (!twp || (reuse && time_after32(ktime_get_seconds(),
153                                             tcptw->tw_ts_recent_stamp)))) {
154                 /* In case of repair and re-using TIME-WAIT sockets we still
155                  * want to be sure that it is safe as above but honor the
156                  * sequence numbers and time stamps set as part of the repair
157                  * process.
158                  *
159                  * Without this check re-using a TIME-WAIT socket with TCP
160                  * repair would accumulate a -1 on the repair assigned
161                  * sequence number. The first time it is reused the sequence
162                  * is -1, the second time -2, etc. This fixes that issue
163                  * without appearing to create any others.
164                  */
165                 if (likely(!tp->repair)) {
166                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168                         if (!seq)
169                                 seq = 1;
170                         WRITE_ONCE(tp->write_seq, seq);
171                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
172                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173                 }
174                 sock_hold(sktw);
175                 return 1;
176         }
177
178         return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183                               int addr_len)
184 {
185         /* This check is replicated from tcp_v4_connect() and intended to
186          * prevent BPF program called below from accessing bytes that are out
187          * of the bound specified by user in addr_len.
188          */
189         if (addr_len < sizeof(struct sockaddr_in))
190                 return -EINVAL;
191
192         sock_owned_by_me(sk);
193
194         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201         struct inet_sock *inet = inet_sk(sk);
202         struct tcp_sock *tp = tcp_sk(sk);
203         __be16 orig_sport, orig_dport;
204         __be32 daddr, nexthop;
205         struct flowi4 *fl4;
206         struct rtable *rt;
207         int err;
208         struct ip_options_rcu *inet_opt;
209         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211         if (addr_len < sizeof(struct sockaddr_in))
212                 return -EINVAL;
213
214         if (usin->sin_family != AF_INET)
215                 return -EAFNOSUPPORT;
216
217         nexthop = daddr = usin->sin_addr.s_addr;
218         inet_opt = rcu_dereference_protected(inet->inet_opt,
219                                              lockdep_sock_is_held(sk));
220         if (inet_opt && inet_opt->opt.srr) {
221                 if (!daddr)
222                         return -EINVAL;
223                 nexthop = inet_opt->opt.faddr;
224         }
225
226         orig_sport = inet->inet_sport;
227         orig_dport = usin->sin_port;
228         fl4 = &inet->cork.fl.u.ip4;
229         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231                               IPPROTO_TCP,
232                               orig_sport, orig_dport, sk);
233         if (IS_ERR(rt)) {
234                 err = PTR_ERR(rt);
235                 if (err == -ENETUNREACH)
236                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237                 return err;
238         }
239
240         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241                 ip_rt_put(rt);
242                 return -ENETUNREACH;
243         }
244
245         if (!inet_opt || !inet_opt->opt.srr)
246                 daddr = fl4->daddr;
247
248         if (!inet->inet_saddr)
249                 inet->inet_saddr = fl4->saddr;
250         sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253                 /* Reset inherited state */
254                 tp->rx_opt.ts_recent       = 0;
255                 tp->rx_opt.ts_recent_stamp = 0;
256                 if (likely(!tp->repair))
257                         WRITE_ONCE(tp->write_seq, 0);
258         }
259
260         inet->inet_dport = usin->sin_port;
261         sk_daddr_set(sk, daddr);
262
263         inet_csk(sk)->icsk_ext_hdr_len = 0;
264         if (inet_opt)
265                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269         /* Socket identity is still unknown (sport may be zero).
270          * However we set the state to SYN-SENT and, without releasing the
271          * socket lock, select a source port, enter ourselves into the hash
272          * tables and complete initialization after this.
273          */
274         tcp_set_state(sk, TCP_SYN_SENT);
275         err = inet_hash_connect(tcp_death_row, sk);
276         if (err)
277                 goto failure;
278
279         sk_set_txhash(sk);
280
281         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282                                inet->inet_sport, inet->inet_dport, sk);
283         if (IS_ERR(rt)) {
284                 err = PTR_ERR(rt);
285                 rt = NULL;
286                 goto failure;
287         }
288         /* OK, now commit destination to socket.  */
289         sk->sk_gso_type = SKB_GSO_TCPV4;
290         sk_setup_caps(sk, &rt->dst);
291         rt = NULL;
292
293         if (likely(!tp->repair)) {
294                 if (!tp->write_seq)
295                         WRITE_ONCE(tp->write_seq,
296                                    secure_tcp_seq(inet->inet_saddr,
297                                                   inet->inet_daddr,
298                                                   inet->inet_sport,
299                                                   usin->sin_port));
300                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301                                                  inet->inet_saddr,
302                                                  inet->inet_daddr);
303         }
304
305         inet->inet_id = prandom_u32();
306
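        /* With TCP_FASTOPEN_CONNECT the SYN (and its data) is deferred until
         * the first sendmsg(); in that case we return to the caller here.
         */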
307         if (tcp_fastopen_defer_connect(sk, &err))
308                 return err;
309         if (err)
310                 goto failure;
311
312         err = tcp_connect(sk);
313
314         if (err)
315                 goto failure;
316
317         return 0;
318
319 failure:
320         /*
321          * This unhashes the socket and releases the local port,
322          * if necessary.
323          */
324         tcp_set_state(sk, TCP_CLOSE);
325         ip_rt_put(rt);
326         sk->sk_route_caps = 0;
327         inet->inet_dport = 0;
328         return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339         struct inet_sock *inet = inet_sk(sk);
340         struct dst_entry *dst;
341         u32 mtu;
342
343         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344                 return;
345         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346         dst = inet_csk_update_pmtu(sk, mtu);
347         if (!dst)
348                 return;
349
350         /* Something is about to go wrong... Remember the soft error
351          * in case this connection is not able to recover.
352          */
353         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354                 sk->sk_err_soft = EMSGSIZE;
355
356         mtu = dst_mtu(dst);
357
358         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359             ip_sk_accept_pmtu(sk) &&
360             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361                 tcp_sync_mss(sk, mtu);
362
363                 /* Resend the TCP packet because it's
364                  * clear that the old packet has been
365                  * dropped. This is the new "fast" path mtu
366                  * discovery.
367                  */
368                 tcp_simple_retransmit(sk);
369         } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
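/* Propagate an ICMP redirect to this socket's cached route, if any. */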
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375         struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377         if (dst)
378                 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385         struct request_sock *req = inet_reqsk(sk);
386         struct net *net = sock_net(sk);
387
388         /* ICMPs are not backlogged, hence we cannot get
389          * an established socket here.
390          */
391         if (seq != tcp_rsk(req)->snt_isn) {
392                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393         } else if (abort) {
394                 /*
395                  * Still in SYN_RECV, just remove it silently.
396                  * There is no good way to pass the error to the newly
397                  * created socket, and POSIX does not want network
398                  * errors returned from accept().
399                  */
400                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401                 tcp_listendrop(req->rsk_listener);
402         }
403         reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410         struct inet_connection_sock *icsk = inet_csk(sk);
411         struct tcp_sock *tp = tcp_sk(sk);
412         struct sk_buff *skb;
413         s32 remaining;
414         u32 delta_us;
415
416         if (sock_owned_by_user(sk))
417                 return;
418
419         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420             !icsk->icsk_backoff)
421                 return;
422
423         skb = tcp_rtx_queue_head(sk);
424         if (WARN_ON_ONCE(!skb))
425                 return;
426
427         icsk->icsk_backoff--;
428         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431         tcp_mstamp_refresh(tp);
432         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435         if (remaining > 0) {
436                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437                                           remaining, TCP_RTO_MAX);
438         } else {
439                 /* RTO revert clocked out retransmission.
440                  * Will retransmit now.
441                  */
442                 tcp_retransmit_timer(sk);
443         }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue that stores errors for later handling
459  * would probably be better.
460  *
461  */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465         const struct iphdr *iph = (const struct iphdr *)skb->data;
466         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467         struct tcp_sock *tp;
468         struct inet_sock *inet;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct sock *sk;
472         struct request_sock *fastopen;
473         u32 seq, snd_una;
474         int err;
475         struct net *net = dev_net(skb->dev);
476
477         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478                                        th->dest, iph->saddr, ntohs(th->source),
479                                        inet_iif(skb), 0);
480         if (!sk) {
481                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482                 return -ENOENT;
483         }
484         if (sk->sk_state == TCP_TIME_WAIT) {
485                 inet_twsk_put(inet_twsk(sk));
486                 return 0;
487         }
488         seq = ntohl(th->seq);
489         if (sk->sk_state == TCP_NEW_SYN_RECV) {
490                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491                                      type == ICMP_TIME_EXCEEDED ||
492                                      (type == ICMP_DEST_UNREACH &&
493                                       (code == ICMP_NET_UNREACH ||
494                                        code == ICMP_HOST_UNREACH)));
495                 return 0;
496         }
497
498         bh_lock_sock(sk);
499         /* If too many ICMPs get dropped on busy
500          * servers this needs to be solved differently.
501          * We do take care of the PMTU discovery (RFC1191) special case:
502          * we can receive locally generated ICMP messages while the socket is held.
503          */
504         if (sock_owned_by_user(sk)) {
505                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507         }
508         if (sk->sk_state == TCP_CLOSE)
509                 goto out;
510
511         if (static_branch_unlikely(&ip4_min_ttl)) {
512                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
513                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
514                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
515                         goto out;
516                 }
517         }
518
519         tp = tcp_sk(sk);
520         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
521         fastopen = rcu_dereference(tp->fastopen_rsk);
522         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
523         if (sk->sk_state != TCP_LISTEN &&
524             !between(seq, snd_una, tp->snd_nxt)) {
525                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
526                 goto out;
527         }
528
529         switch (type) {
530         case ICMP_REDIRECT:
531                 if (!sock_owned_by_user(sk))
532                         do_redirect(skb, sk);
533                 goto out;
534         case ICMP_SOURCE_QUENCH:
535                 /* Just silently ignore these. */
536                 goto out;
537         case ICMP_PARAMETERPROB:
538                 err = EPROTO;
539                 break;
540         case ICMP_DEST_UNREACH:
541                 if (code > NR_ICMP_UNREACH)
542                         goto out;
543
544                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
545                         /* We are not interested in TCP_LISTEN and open_requests
546                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
547                          * they should go through unfragmented).
548                          */
549                         if (sk->sk_state == TCP_LISTEN)
550                                 goto out;
551
552                         WRITE_ONCE(tp->mtu_info, info);
553                         if (!sock_owned_by_user(sk)) {
554                                 tcp_v4_mtu_reduced(sk);
555                         } else {
556                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
557                                         sock_hold(sk);
558                         }
559                         goto out;
560                 }
561
562                 err = icmp_err_convert[code].errno;
563                 /* check if this ICMP message allows revert of backoff.
564                  * (see RFC 6069)
565                  */
566                 if (!fastopen &&
567                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
568                         tcp_ld_RTO_revert(sk, seq);
569                 break;
570         case ICMP_TIME_EXCEEDED:
571                 err = EHOSTUNREACH;
572                 break;
573         default:
574                 goto out;
575         }
576
577         switch (sk->sk_state) {
578         case TCP_SYN_SENT:
579         case TCP_SYN_RECV:
580                 /* Only in fast or simultaneous open. If a fast open socket is
581                  * already accepted it is treated as a connected one below.
582                  */
583                 if (fastopen && !fastopen->sk)
584                         break;
585
586                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
587
588                 if (!sock_owned_by_user(sk)) {
589                         sk->sk_err = err;
590
591                         sk_error_report(sk);
592
593                         tcp_done(sk);
594                 } else {
595                         sk->sk_err_soft = err;
596                 }
597                 goto out;
598         }
599
600         /* If we've already connected we will keep trying
601          * until we time out, or the user gives up.
602          *
603          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
604          * treated as hard errors (well, FRAG_FAILED too,
605          * but it is obsoleted by pmtu discovery).
606          *
607          * Note that in the modern internet, where routing is unreliable
608          * and broken firewalls sit in every dark corner sending random
609          * errors as ordered by their masters, even these two messages have
610          * finally lost their original meaning (even Linux sends invalid PORT_UNREACHs).
611          *
612          * Now we are in compliance with RFCs.
613          *                                                      --ANK (980905)
614          */
615
616         inet = inet_sk(sk);
617         if (!sock_owned_by_user(sk) && inet->recverr) {
618                 sk->sk_err = err;
619                 sk_error_report(sk);
620         } else  { /* Only an error on timeout */
621                 sk->sk_err_soft = err;
622         }
623
624 out:
625         bh_unlock_sock(sk);
626         sock_put(sk);
627         return 0;
628 }
629
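/* Prepare the TCP checksum for transmission with checksum offload: store the
 * pseudo-header sum in th->check and record where the final checksum goes.
 */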
630 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
631 {
632         struct tcphdr *th = tcp_hdr(skb);
633
634         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
635         skb->csum_start = skb_transport_header(skb) - skb->head;
636         skb->csum_offset = offsetof(struct tcphdr, check);
637 }
638
639 /* This routine computes an IPv4 TCP checksum. */
640 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
641 {
642         const struct inet_sock *inet = inet_sk(sk);
643
644         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
645 }
646 EXPORT_SYMBOL(tcp_v4_send_check);
647
648 /*
649  *      This routine will send an RST to the other tcp.
650  *
651  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
652  *                    for the reset?
653  *      Answer: if a packet caused the RST, it is not for a socket
654  *              existing in our system; if it does match a socket,
655  *              it is just a duplicate segment or a bug in the other side's TCP.
656  *              So we build the reply based only on the parameters
657  *              that arrived with the segment.
658  *      Exception: precedence violation. We do not implement it in any case.
659  */
660
661 #ifdef CONFIG_TCP_MD5SIG
662 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
663 #else
664 #define OPTION_BYTES sizeof(__be32)
665 #endif
666
667 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
668 {
669         const struct tcphdr *th = tcp_hdr(skb);
670         struct {
671                 struct tcphdr th;
672                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
673         } rep;
674         struct ip_reply_arg arg;
675 #ifdef CONFIG_TCP_MD5SIG
676         struct tcp_md5sig_key *key = NULL;
677         const __u8 *hash_location = NULL;
678         unsigned char newhash[16];
679         int genhash;
680         struct sock *sk1 = NULL;
681 #endif
682         u64 transmit_time = 0;
683         struct sock *ctl_sk;
684         struct net *net;
685
686         /* Never send a reset in response to a reset. */
687         if (th->rst)
688                 return;
689
690         /* If sk not NULL, it means we did a successful lookup and incoming
691          * route had to be correct. prequeue might have dropped our dst.
692          */
693         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
694                 return;
695
696         /* Swap the send and the receive. */
697         memset(&rep, 0, sizeof(rep));
698         rep.th.dest   = th->source;
699         rep.th.source = th->dest;
700         rep.th.doff   = sizeof(struct tcphdr) / 4;
701         rep.th.rst    = 1;
702
703         if (th->ack) {
704                 rep.th.seq = th->ack_seq;
705         } else {
706                 rep.th.ack = 1;
707                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
708                                        skb->len - (th->doff << 2));
709         }
710
711         memset(&arg, 0, sizeof(arg));
712         arg.iov[0].iov_base = (unsigned char *)&rep;
713         arg.iov[0].iov_len  = sizeof(rep.th);
714
715         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
716 #ifdef CONFIG_TCP_MD5SIG
717         rcu_read_lock();
718         hash_location = tcp_parse_md5sig_option(th);
719         if (sk && sk_fullsock(sk)) {
720                 const union tcp_md5_addr *addr;
721                 int l3index;
722
723                 /* sdif set, means packet ingressed via a device
724                  * in an L3 domain and inet_iif is set to it.
725                  */
726                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
727                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
728                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
729         } else if (hash_location) {
730                 const union tcp_md5_addr *addr;
731                 int sdif = tcp_v4_sdif(skb);
732                 int dif = inet_iif(skb);
733                 int l3index;
734
735                 /*
736                  * The active side is lost. Try to find the listening socket
737                  * through the source port, then find the md5 key through it.
738                  * We do not lose security here:
739                  * the incoming packet is checked against the md5 hash of the
740                  * key we find, and no RST is generated if the hash doesn't match.
741                  */
742                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
743                                              ip_hdr(skb)->saddr,
744                                              th->source, ip_hdr(skb)->daddr,
745                                              ntohs(th->source), dif, sdif);
746                 /* don't send an RST if we can't find a key */
747                 if (!sk1)
748                         goto out;
749
750                 /* sdif set, means packet ingressed via a device
751                  * in an L3 domain and dif is set to it.
752                  */
753                 l3index = sdif ? dif : 0;
754                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
755                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
756                 if (!key)
757                         goto out;
758
759
760                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
761                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
762                         goto out;
763
764         }
765
766         if (key) {
767                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
768                                    (TCPOPT_NOP << 16) |
769                                    (TCPOPT_MD5SIG << 8) |
770                                    TCPOLEN_MD5SIG);
771                 /* Update length and the length the header thinks exists */
772                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
773                 rep.th.doff = arg.iov[0].iov_len / 4;
774
775                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
776                                      key, ip_hdr(skb)->saddr,
777                                      ip_hdr(skb)->daddr, &rep.th);
778         }
779 #endif
780         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
781         if (rep.opt[0] == 0) {
782                 __be32 mrst = mptcp_reset_option(skb);
783
784                 if (mrst) {
785                         rep.opt[0] = mrst;
786                         arg.iov[0].iov_len += sizeof(mrst);
787                         rep.th.doff = arg.iov[0].iov_len / 4;
788                 }
789         }
790
791         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
792                                       ip_hdr(skb)->saddr, /* XXX */
793                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
794         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
795         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
796
797         /* When the socket is gone, all binding information is lost and
798          * routing might fail. There is no choice here: if we force the
799          * input interface, we will misroute in the case of an asymmetric route.
800          */
801         if (sk) {
802                 arg.bound_dev_if = sk->sk_bound_dev_if;
803                 if (sk_fullsock(sk))
804                         trace_tcp_send_reset(sk, skb);
805         }
806
807         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
808                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
809
810         arg.tos = ip_hdr(skb)->tos;
811         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
812         local_bh_disable();
813         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
814         if (sk) {
815                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
816                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
817                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
818                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
819                 transmit_time = tcp_transmit_time(sk);
820         }
821         ip_send_unicast_reply(ctl_sk,
822                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
823                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
824                               &arg, arg.iov[0].iov_len,
825                               transmit_time);
826
827         ctl_sk->sk_mark = 0;
828         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
829         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
830         local_bh_enable();
831
832 #ifdef CONFIG_TCP_MD5SIG
833 out:
834         rcu_read_unlock();
835 #endif
836 }
837
838 /* The code below, which sends ACKs in the SYN-RECV and TIME-WAIT states
839    outside of socket context, is certainly ugly. What can I do?
840  */
841
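/* Build a bare ACK on the stack (optionally carrying timestamp and MD5
 * options) and send it as a reply to the given skb without a full socket.
 */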
842 static void tcp_v4_send_ack(const struct sock *sk,
843                             struct sk_buff *skb, u32 seq, u32 ack,
844                             u32 win, u32 tsval, u32 tsecr, int oif,
845                             struct tcp_md5sig_key *key,
846                             int reply_flags, u8 tos)
847 {
848         const struct tcphdr *th = tcp_hdr(skb);
849         struct {
850                 struct tcphdr th;
851                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
852 #ifdef CONFIG_TCP_MD5SIG
853                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
854 #endif
855                         ];
856         } rep;
857         struct net *net = sock_net(sk);
858         struct ip_reply_arg arg;
859         struct sock *ctl_sk;
860         u64 transmit_time;
861
862         memset(&rep.th, 0, sizeof(struct tcphdr));
863         memset(&arg, 0, sizeof(arg));
864
865         arg.iov[0].iov_base = (unsigned char *)&rep;
866         arg.iov[0].iov_len  = sizeof(rep.th);
867         if (tsecr) {
868                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
869                                    (TCPOPT_TIMESTAMP << 8) |
870                                    TCPOLEN_TIMESTAMP);
871                 rep.opt[1] = htonl(tsval);
872                 rep.opt[2] = htonl(tsecr);
873                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
874         }
875
876         /* Swap the send and the receive. */
877         rep.th.dest    = th->source;
878         rep.th.source  = th->dest;
879         rep.th.doff    = arg.iov[0].iov_len / 4;
880         rep.th.seq     = htonl(seq);
881         rep.th.ack_seq = htonl(ack);
882         rep.th.ack     = 1;
883         rep.th.window  = htons(win);
884
885 #ifdef CONFIG_TCP_MD5SIG
886         if (key) {
887                 int offset = (tsecr) ? 3 : 0;
888
889                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
890                                           (TCPOPT_NOP << 16) |
891                                           (TCPOPT_MD5SIG << 8) |
892                                           TCPOLEN_MD5SIG);
893                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
894                 rep.th.doff = arg.iov[0].iov_len/4;
895
896                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
897                                     key, ip_hdr(skb)->saddr,
898                                     ip_hdr(skb)->daddr, &rep.th);
899         }
900 #endif
901         arg.flags = reply_flags;
902         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
903                                       ip_hdr(skb)->saddr, /* XXX */
904                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
905         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
906         if (oif)
907                 arg.bound_dev_if = oif;
908         arg.tos = tos;
909         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
910         local_bh_disable();
911         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
912         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
913                            inet_twsk(sk)->tw_mark : sk->sk_mark;
914         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
915                            inet_twsk(sk)->tw_priority : sk->sk_priority;
916         transmit_time = tcp_transmit_time(sk);
917         ip_send_unicast_reply(ctl_sk,
918                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
919                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
920                               &arg, arg.iov[0].iov_len,
921                               transmit_time);
922
923         ctl_sk->sk_mark = 0;
924         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
925         local_bh_enable();
926 }
927
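/* A segment hit a TIME-WAIT socket: re-assert our last advertised state
 * with an ACK built from the stored timewait information.
 */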
928 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
929 {
930         struct inet_timewait_sock *tw = inet_twsk(sk);
931         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
932
933         tcp_v4_send_ack(sk, skb,
934                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
935                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
936                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
937                         tcptw->tw_ts_recent,
938                         tw->tw_bound_dev_if,
939                         tcp_twsk_md5_key(tcptw),
940                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
941                         tw->tw_tos
942                         );
943
944         inet_twsk_put(tw);
945 }
946
947 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
948                                   struct request_sock *req)
949 {
950         const union tcp_md5_addr *addr;
951         int l3index;
952
953         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
954          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
955          */
956         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
957                                              tcp_sk(sk)->snd_nxt;
958
959         /* RFC 7323 2.3
960          * The window field (SEG.WND) of every outgoing segment, with the
961          * exception of <SYN> segments, MUST be right-shifted by
962          * Rcv.Wind.Shift bits:
963          */
964         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
965         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
966         tcp_v4_send_ack(sk, skb, seq,
967                         tcp_rsk(req)->rcv_nxt,
968                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
969                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
970                         req->ts_recent,
971                         0,
972                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
973                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
974                         ip_hdr(skb)->tos);
975 }
976
977 /*
978  *      Send a SYN-ACK after having received a SYN.
979  *      This still operates on a request_sock only, not on a big
980  *      socket.
981  */
982 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
983                               struct flowi *fl,
984                               struct request_sock *req,
985                               struct tcp_fastopen_cookie *foc,
986                               enum tcp_synack_type synack_type,
987                               struct sk_buff *syn_skb)
988 {
989         const struct inet_request_sock *ireq = inet_rsk(req);
990         struct flowi4 fl4;
991         int err = -1;
992         struct sk_buff *skb;
993         u8 tos;
994
995         /* First, grab a route. */
996         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
997                 return -1;
998
999         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1000
1001         if (skb) {
1002                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1003
1004                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1005                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1006                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1007                                 inet_sk(sk)->tos;
1008
1009                 if (!INET_ECN_is_capable(tos) &&
1010                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1011                         tos |= INET_ECN_ECT_0;
1012
1013                 rcu_read_lock();
1014                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1015                                             ireq->ir_rmt_addr,
1016                                             rcu_dereference(ireq->ireq_opt),
1017                                             tos);
1018                 rcu_read_unlock();
1019                 err = net_xmit_eval(err);
1020         }
1021
1022         return err;
1023 }
1024
1025 /*
1026  *      IPv4 request_sock destructor.
1027  */
1028 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1029 {
1030         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1031 }
1032
1033 #ifdef CONFIG_TCP_MD5SIG
1034 /*
1035  * RFC2385 MD5 checksumming requires a mapping of
1036  * IP address->MD5 Key.
1037  * We need to maintain these in the sk structure.
1038  */
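/* Keys are configured from userspace per peer address via
 * setsockopt(IPPROTO_TCP, TCP_MD5SIG or TCP_MD5SIG_EXT) with a
 * struct tcp_md5sig; see tcp_v4_parse_md5_keys() below.
 */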
1039
1040 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1041 EXPORT_SYMBOL(tcp_md5_needed);
1042
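/* A key bound to an L3 device beats an unbound key; otherwise the key with
 * the longer (more specific) prefix wins.
 */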
1043 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1044 {
1045         if (!old)
1046                 return true;
1047
1048         /* l3index always overrides non-l3index */
1049         if (old->l3index && new->l3index == 0)
1050                 return false;
1051         if (old->l3index == 0 && new->l3index)
1052                 return true;
1053
1054         return old->prefixlen < new->prefixlen;
1055 }
1056
1057 /* Find the Key structure for an address.  */
1058 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1059                                            const union tcp_md5_addr *addr,
1060                                            int family)
1061 {
1062         const struct tcp_sock *tp = tcp_sk(sk);
1063         struct tcp_md5sig_key *key;
1064         const struct tcp_md5sig_info *md5sig;
1065         __be32 mask;
1066         struct tcp_md5sig_key *best_match = NULL;
1067         bool match;
1068
1069         /* caller either holds rcu_read_lock() or socket lock */
1070         md5sig = rcu_dereference_check(tp->md5sig_info,
1071                                        lockdep_sock_is_held(sk));
1072         if (!md5sig)
1073                 return NULL;
1074
1075         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1076                                  lockdep_sock_is_held(sk)) {
1077                 if (key->family != family)
1078                         continue;
1079                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1080                         continue;
1081                 if (family == AF_INET) {
1082                         mask = inet_make_mask(key->prefixlen);
1083                         match = (key->addr.a4.s_addr & mask) ==
1084                                 (addr->a4.s_addr & mask);
1085 #if IS_ENABLED(CONFIG_IPV6)
1086                 } else if (family == AF_INET6) {
1087                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1088                                                   key->prefixlen);
1089 #endif
1090                 } else {
1091                         match = false;
1092                 }
1093
1094                 if (match && better_md5_match(best_match, key))
1095                         best_match = key;
1096         }
1097         return best_match;
1098 }
1099 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1100
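/* Exact-match lookup: family, address, prefix length, L3 index and the
 * ifindex flag must all match, unlike the best-match lookup above.
 */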
1101 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1102                                                       const union tcp_md5_addr *addr,
1103                                                       int family, u8 prefixlen,
1104                                                       int l3index, u8 flags)
1105 {
1106         const struct tcp_sock *tp = tcp_sk(sk);
1107         struct tcp_md5sig_key *key;
1108         unsigned int size = sizeof(struct in_addr);
1109         const struct tcp_md5sig_info *md5sig;
1110
1111         /* caller either holds rcu_read_lock() or socket lock */
1112         md5sig = rcu_dereference_check(tp->md5sig_info,
1113                                        lockdep_sock_is_held(sk));
1114         if (!md5sig)
1115                 return NULL;
1116 #if IS_ENABLED(CONFIG_IPV6)
1117         if (family == AF_INET6)
1118                 size = sizeof(struct in6_addr);
1119 #endif
1120         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1121                                  lockdep_sock_is_held(sk)) {
1122                 if (key->family != family)
1123                         continue;
1124                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1125                         continue;
1126                 if (key->l3index != l3index)
1127                         continue;
1128                 if (!memcmp(&key->addr, addr, size) &&
1129                     key->prefixlen == prefixlen)
1130                         return key;
1131         }
1132         return NULL;
1133 }
1134
1135 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1136                                          const struct sock *addr_sk)
1137 {
1138         const union tcp_md5_addr *addr;
1139         int l3index;
1140
1141         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1142                                                  addr_sk->sk_bound_dev_if);
1143         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1144         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1145 }
1146 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1147
1148 /* This can be called on a newly created socket, from other files */
1149 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1150                    int family, u8 prefixlen, int l3index, u8 flags,
1151                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1152 {
1153         /* Add Key to the list */
1154         struct tcp_md5sig_key *key;
1155         struct tcp_sock *tp = tcp_sk(sk);
1156         struct tcp_md5sig_info *md5sig;
1157
1158         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1159         if (key) {
1160                 /* Pre-existing entry - just update that one.
1161                  * Note that the key might be used concurrently.
1162                  * data_race() tells KCSAN that we do not care about
1163                  * key mismatches, since changing the MD5 key on live flows
1164                  * can lead to packet drops.
1165                  */
1166                 data_race(memcpy(key->key, newkey, newkeylen));
1167
1168                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1169                  * Also note that a reader could catch new key->keylen value
1170                  * but old key->key[], this is the reason we use __GFP_ZERO
1171                  * at sock_kmalloc() time below these lines.
1172                  */
1173                 WRITE_ONCE(key->keylen, newkeylen);
1174
1175                 return 0;
1176         }
1177
1178         md5sig = rcu_dereference_protected(tp->md5sig_info,
1179                                            lockdep_sock_is_held(sk));
1180         if (!md5sig) {
1181                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1182                 if (!md5sig)
1183                         return -ENOMEM;
1184
1185                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1186                 INIT_HLIST_HEAD(&md5sig->head);
1187                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1188         }
1189
1190         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1191         if (!key)
1192                 return -ENOMEM;
1193         if (!tcp_alloc_md5sig_pool()) {
1194                 sock_kfree_s(sk, key, sizeof(*key));
1195                 return -ENOMEM;
1196         }
1197
1198         memcpy(key->key, newkey, newkeylen);
1199         key->keylen = newkeylen;
1200         key->family = family;
1201         key->prefixlen = prefixlen;
1202         key->l3index = l3index;
1203         key->flags = flags;
1204         memcpy(&key->addr, addr,
1205                (family == AF_INET6) ? sizeof(struct in6_addr) :
1206                                       sizeof(struct in_addr));
1207         hlist_add_head_rcu(&key->node, &md5sig->head);
1208         return 0;
1209 }
1210 EXPORT_SYMBOL(tcp_md5_do_add);
1211
1212 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1213                    u8 prefixlen, int l3index, u8 flags)
1214 {
1215         struct tcp_md5sig_key *key;
1216
1217         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1218         if (!key)
1219                 return -ENOENT;
1220         hlist_del_rcu(&key->node);
1221         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1222         kfree_rcu(key, rcu);
1223         return 0;
1224 }
1225 EXPORT_SYMBOL(tcp_md5_do_del);
1226
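/* Release every configured MD5 key; used when the socket itself goes away. */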
1227 static void tcp_clear_md5_list(struct sock *sk)
1228 {
1229         struct tcp_sock *tp = tcp_sk(sk);
1230         struct tcp_md5sig_key *key;
1231         struct hlist_node *n;
1232         struct tcp_md5sig_info *md5sig;
1233
1234         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1235
1236         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1237                 hlist_del_rcu(&key->node);
1238                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1239                 kfree_rcu(key, rcu);
1240         }
1241 }
1242
1243 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1244                                  sockptr_t optval, int optlen)
1245 {
1246         struct tcp_md5sig cmd;
1247         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1248         const union tcp_md5_addr *addr;
1249         u8 prefixlen = 32;
1250         int l3index = 0;
1251         u8 flags;
1252
1253         if (optlen < sizeof(cmd))
1254                 return -EINVAL;
1255
1256         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1257                 return -EFAULT;
1258
1259         if (sin->sin_family != AF_INET)
1260                 return -EINVAL;
1261
1262         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1263
1264         if (optname == TCP_MD5SIG_EXT &&
1265             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1266                 prefixlen = cmd.tcpm_prefixlen;
1267                 if (prefixlen > 32)
1268                         return -EINVAL;
1269         }
1270
1271         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1272             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1273                 struct net_device *dev;
1274
1275                 rcu_read_lock();
1276                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1277                 if (dev && netif_is_l3_master(dev))
1278                         l3index = dev->ifindex;
1279
1280                 rcu_read_unlock();
1281
1282                 /* ok to reference set/not set outside of rcu;
1283                  * right now device MUST be an L3 master
1284                  */
1285                 if (!dev || !l3index)
1286                         return -EINVAL;
1287         }
1288
1289         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1290
1291         if (!cmd.tcpm_keylen)
1292                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1293
1294         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1295                 return -EINVAL;
1296
1297         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1298                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1299 }
1300
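/* Feed the IPv4 pseudo-header followed by the TCP header (with its checksum
 * field zeroed) into the MD5 hash request.
 */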
1301 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1302                                    __be32 daddr, __be32 saddr,
1303                                    const struct tcphdr *th, int nbytes)
1304 {
1305         struct tcp4_pseudohdr *bp;
1306         struct scatterlist sg;
1307         struct tcphdr *_th;
1308
1309         bp = hp->scratch;
1310         bp->saddr = saddr;
1311         bp->daddr = daddr;
1312         bp->pad = 0;
1313         bp->protocol = IPPROTO_TCP;
1314         bp->len = cpu_to_be16(nbytes);
1315
1316         _th = (struct tcphdr *)(bp + 1);
1317         memcpy(_th, th, sizeof(*th));
1318         _th->check = 0;
1319
1320         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1321         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1322                                 sizeof(*bp) + sizeof(*th));
1323         return crypto_ahash_update(hp->md5_req);
1324 }
1325
1326 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1327                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1328 {
1329         struct tcp_md5sig_pool *hp;
1330         struct ahash_request *req;
1331
1332         hp = tcp_get_md5sig_pool();
1333         if (!hp)
1334                 goto clear_hash_noput;
1335         req = hp->md5_req;
1336
1337         if (crypto_ahash_init(req))
1338                 goto clear_hash;
1339         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1340                 goto clear_hash;
1341         if (tcp_md5_hash_key(hp, key))
1342                 goto clear_hash;
1343         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1344         if (crypto_ahash_final(req))
1345                 goto clear_hash;
1346
1347         tcp_put_md5sig_pool();
1348         return 0;
1349
1350 clear_hash:
1351         tcp_put_md5sig_pool();
1352 clear_hash_noput:
1353         memset(md5_hash, 0, 16);
1354         return 1;
1355 }
1356
1357 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1358                         const struct sock *sk,
1359                         const struct sk_buff *skb)
1360 {
1361         struct tcp_md5sig_pool *hp;
1362         struct ahash_request *req;
1363         const struct tcphdr *th = tcp_hdr(skb);
1364         __be32 saddr, daddr;
1365
1366         if (sk) { /* valid for establish/request sockets */
1367                 saddr = sk->sk_rcv_saddr;
1368                 daddr = sk->sk_daddr;
1369         } else {
1370                 const struct iphdr *iph = ip_hdr(skb);
1371                 saddr = iph->saddr;
1372                 daddr = iph->daddr;
1373         }
1374
1375         hp = tcp_get_md5sig_pool();
1376         if (!hp)
1377                 goto clear_hash_noput;
1378         req = hp->md5_req;
1379
1380         if (crypto_ahash_init(req))
1381                 goto clear_hash;
1382
1383         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1384                 goto clear_hash;
1385         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1386                 goto clear_hash;
1387         if (tcp_md5_hash_key(hp, key))
1388                 goto clear_hash;
1389         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1390         if (crypto_ahash_final(req))
1391                 goto clear_hash;
1392
1393         tcp_put_md5sig_pool();
1394         return 0;
1395
1396 clear_hash:
1397         tcp_put_md5sig_pool();
1398 clear_hash_noput:
1399         memset(md5_hash, 0, 16);
1400         return 1;
1401 }
1402 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1403
1404 #endif
1405
1406 /* Called with rcu_read_lock() */
1407 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1408                                     const struct sk_buff *skb,
1409                                     int dif, int sdif)
1410 {
1411 #ifdef CONFIG_TCP_MD5SIG
1412         /*
1413          * This gets called for each TCP segment that arrives
1414          * so we want to be efficient.
1415          * We have 3 drop cases:
1416          * o No MD5 hash and one expected.
1417          * o MD5 hash and we're not expecting one.
1418          * o MD5 hash and it's wrong.
1419          */
1420         const __u8 *hash_location = NULL;
1421         struct tcp_md5sig_key *hash_expected;
1422         const struct iphdr *iph = ip_hdr(skb);
1423         const struct tcphdr *th = tcp_hdr(skb);
1424         const union tcp_md5_addr *addr;
1425         unsigned char newhash[16];
1426         int genhash, l3index;
1427
1428         /* If sdif is set, the packet ingressed via a device
1429          * in an L3 domain and dif is set to the l3mdev.
1430          */
1431         l3index = sdif ? dif : 0;
1432
1433         addr = (union tcp_md5_addr *)&iph->saddr;
1434         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1435         hash_location = tcp_parse_md5sig_option(th);
1436
1437         /* We've parsed the options - do we have a hash? */
1438         if (!hash_expected && !hash_location)
1439                 return false;
1440
1441         if (hash_expected && !hash_location) {
1442                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1443                 return true;
1444         }
1445
1446         if (!hash_expected && hash_location) {
1447                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1448                 return true;
1449         }
1450
1451         /* Both hash_expected and hash_location are present,
1452          * so we need to compute and compare the MD5 hash.
1453          */
1454         genhash = tcp_v4_md5_hash_skb(newhash,
1455                                       hash_expected,
1456                                       NULL, skb);
1457
1458         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1459                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1460                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1461                                      &iph->saddr, ntohs(th->source),
1462                                      &iph->daddr, ntohs(th->dest),
1463                                      genhash ? " tcp_v4_calc_md5_hash failed"
1464                                      : "", l3index);
1465                 return true;
1466         }
1467         return false;
1468 #endif
1469         return false;
1470 }
1471
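     /* Record the peer/local addresses from the incoming SYN in the
      * request sock and save any IP options for use on the SYN-ACK.
      */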
1472 static void tcp_v4_init_req(struct request_sock *req,
1473                             const struct sock *sk_listener,
1474                             struct sk_buff *skb)
1475 {
1476         struct inet_request_sock *ireq = inet_rsk(req);
1477         struct net *net = sock_net(sk_listener);
1478
1479         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1480         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1481         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1482 }
1483
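     /* Initialise the request sock from the SYN, run the LSM connection
      * request hook, and return the route the SYN-ACK will take
      * (NULL if either step fails).
      */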
1484 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1485                                           struct sk_buff *skb,
1486                                           struct flowi *fl,
1487                                           struct request_sock *req)
1488 {
1489         tcp_v4_init_req(req, sk, skb);
1490
1491         if (security_inet_conn_request(sk, skb, req))
1492                 return NULL;
1493
1494         return inet_csk_route_req(sk, &fl->u.ip4, req);
1495 }
1496
1497 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1498         .family         =       PF_INET,
1499         .obj_size       =       sizeof(struct tcp_request_sock),
1500         .rtx_syn_ack    =       tcp_rtx_synack,
1501         .send_ack       =       tcp_v4_reqsk_send_ack,
1502         .destructor     =       tcp_v4_reqsk_destructor,
1503         .send_reset     =       tcp_v4_send_reset,
1504         .syn_ack_timeout =      tcp_syn_ack_timeout,
1505 };
1506
1507 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1508         .mss_clamp      =       TCP_MSS_DEFAULT,
1509 #ifdef CONFIG_TCP_MD5SIG
1510         .req_md5_lookup =       tcp_v4_md5_lookup,
1511         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1512 #endif
1513 #ifdef CONFIG_SYN_COOKIES
1514         .cookie_init_seq =      cookie_v4_init_sequence,
1515 #endif
1516         .route_req      =       tcp_v4_route_req,
1517         .init_seq       =       tcp_v4_init_seq,
1518         .init_ts_off    =       tcp_v4_init_ts_off,
1519         .send_synack    =       tcp_v4_send_synack,
1520 };
1521
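     /* Entry point for SYNs arriving on an IPv4 listener: drop SYNs sent
      * to broadcast/multicast addresses, then hand the rest to the
      * generic tcp_conn_request() with the IPv4 ops above.
      */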
1522 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1523 {
1524         /* Never answer SYNs sent to broadcast or multicast addresses */
1525         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1526                 goto drop;
1527
1528         return tcp_conn_request(&tcp_request_sock_ops,
1529                                 &tcp_request_sock_ipv4_ops, sk, skb);
1530
1531 drop:
1532         tcp_listendrop(sk);
1533         return 0;
1534 }
1535 EXPORT_SYMBOL(tcp_v4_conn_request);
1536
1537
1538 /*
1539  * The three way handshake has completed - we received the final ACK -
1540  * now create the new socket.
1541  */
1542 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1543                                   struct request_sock *req,
1544                                   struct dst_entry *dst,
1545                                   struct request_sock *req_unhash,
1546                                   bool *own_req)
1547 {
1548         struct inet_request_sock *ireq;
1549         bool found_dup_sk = false;
1550         struct inet_sock *newinet;
1551         struct tcp_sock *newtp;
1552         struct sock *newsk;
1553 #ifdef CONFIG_TCP_MD5SIG
1554         const union tcp_md5_addr *addr;
1555         struct tcp_md5sig_key *key;
1556         int l3index;
1557 #endif
1558         struct ip_options_rcu *inet_opt;
1559
1560         if (sk_acceptq_is_full(sk))
1561                 goto exit_overflow;
1562
1563         newsk = tcp_create_openreq_child(sk, req, skb);
1564         if (!newsk)
1565                 goto exit_nonewsk;
1566
1567         newsk->sk_gso_type = SKB_GSO_TCPV4;
1568         inet_sk_rx_dst_set(newsk, skb);
1569
1570         newtp                 = tcp_sk(newsk);
1571         newinet               = inet_sk(newsk);
1572         ireq                  = inet_rsk(req);
1573         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1574         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1575         newsk->sk_bound_dev_if = ireq->ir_iif;
1576         newinet->inet_saddr   = ireq->ir_loc_addr;
1577         inet_opt              = rcu_dereference(ireq->ireq_opt);
1578         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1579         newinet->mc_index     = inet_iif(skb);
1580         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1581         newinet->rcv_tos      = ip_hdr(skb)->tos;
1582         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1583         if (inet_opt)
1584                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1585         newinet->inet_id = prandom_u32();
1586
1587         /* Set ToS of the new socket based upon the value of the incoming SYN.
1588          * ECT bits are set later in tcp_init_transfer().
1589          */
1590         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1591                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1592
1593         if (!dst) {
1594                 dst = inet_csk_route_child_sock(sk, newsk, req);
1595                 if (!dst)
1596                         goto put_and_exit;
1597         } else {
1598                 /* syncookie case: see end of cookie_v4_check() */
1599         }
1600         sk_setup_caps(newsk, dst);
1601
1602         tcp_ca_openreq_child(newsk, dst);
1603
1604         tcp_sync_mss(newsk, dst_mtu(dst));
1605         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1606
1607         tcp_initialize_rcv_mss(newsk);
1608
1609 #ifdef CONFIG_TCP_MD5SIG
1610         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1611         /* Copy over the MD5 key from the original socket */
1612         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1613         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1614         if (key) {
1615                 /*
1616                  * We're using one, so create a matching key
1617                  * on the newsk structure. If we fail to get
1618                  * memory, then we end up not copying the key
1619                  * across. Shucks.
1620                  */
1621                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1622                                key->key, key->keylen, GFP_ATOMIC);
1623                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1624         }
1625 #endif
1626
1627         if (__inet_inherit_port(sk, newsk) < 0)
1628                 goto put_and_exit;
1629         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1630                                        &found_dup_sk);
1631         if (likely(*own_req)) {
1632                 tcp_move_syn(newtp, req);
1633                 ireq->ireq_opt = NULL;
1634         } else {
1635                 newinet->inet_opt = NULL;
1636
1637                 if (!req_unhash && found_dup_sk) {
1638                         /* This code path should only be executed in the
1639                          * syncookie case.
1640                          */
1641                         bh_unlock_sock(newsk);
1642                         sock_put(newsk);
1643                         newsk = NULL;
1644                 }
1645         }
1646         return newsk;
1647
1648 exit_overflow:
1649         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1650 exit_nonewsk:
1651         dst_release(dst);
1652 exit:
1653         tcp_listendrop(sk);
1654         return NULL;
1655 put_and_exit:
1656         newinet->inet_opt = NULL;
1657         inet_csk_prepare_forced_close(newsk);
1658         tcp_done(newsk);
1659         goto exit;
1660 }
1661 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1662
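     /* On a listener, a non-SYN segment may be the ACK completing a
      * syncookie handshake; let cookie_v4_check() validate it and build
      * the child socket.  Without CONFIG_SYN_COOKIES this is a no-op.
      */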
1663 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1664 {
1665 #ifdef CONFIG_SYN_COOKIES
1666         const struct tcphdr *th = tcp_hdr(skb);
1667
1668         if (!th->syn)
1669                 sk = cookie_v4_check(sk, skb);
1670 #endif
1671         return sk;
1672 }
1673
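     /* Compute the MSS and initial sequence number for a syncookie
      * without creating any request state; used by BPF helpers such as
      * bpf_tcp_gen_syncookie().
      */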
1674 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1675                          struct tcphdr *th, u32 *cookie)
1676 {
1677         u16 mss = 0;
1678 #ifdef CONFIG_SYN_COOKIES
1679         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1680                                     &tcp_request_sock_ipv4_ops, sk, th);
1681         if (mss) {
1682                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1683                 tcp_synq_overflow(sk);
1684         }
1685 #endif
1686         return mss;
1687 }
1688
1689 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1690                                                            u32));
1691 /* The socket must have its spinlock held when we get
1692  * here, unless it is a TCP_LISTEN socket.
1693  *
1694  * We have a potential double-lock case here, so even when
1695  * doing backlog processing we use the BH locking scheme.
1696  * This is because we cannot sleep with the original spinlock
1697  * held.
1698  */
1699 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1700 {
1701         struct sock *rsk;
1702
1703         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1704                 struct dst_entry *dst;
1705
1706                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1707                                                 lockdep_sock_is_held(sk));
1708
1709                 sock_rps_save_rxhash(sk, skb);
1710                 sk_mark_napi_id(sk, skb);
1711                 if (dst) {
1712                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1713                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1714                                              dst, 0)) {
1715                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1716                                 dst_release(dst);
1717                         }
1718                 }
1719                 tcp_rcv_established(sk, skb);
1720                 return 0;
1721         }
1722
1723         if (tcp_checksum_complete(skb))
1724                 goto csum_err;
1725
1726         if (sk->sk_state == TCP_LISTEN) {
1727                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1728
1729                 if (!nsk)
1730                         goto discard;
1731                 if (nsk != sk) {
1732                         if (tcp_child_process(sk, nsk, skb)) {
1733                                 rsk = nsk;
1734                                 goto reset;
1735                         }
1736                         return 0;
1737                 }
1738         } else
1739                 sock_rps_save_rxhash(sk, skb);
1740
1741         if (tcp_rcv_state_process(sk, skb)) {
1742                 rsk = sk;
1743                 goto reset;
1744         }
1745         return 0;
1746
1747 reset:
1748         tcp_v4_send_reset(rsk, skb);
1749 discard:
1750         kfree_skb(skb);
1751         /* Be careful here. If this function gets more complicated and
1752          * gcc suffers from register pressure on the x86, sk (in %ebx)
1753          * might be destroyed here. This current version compiles correctly,
1754          * but you have been warned.
1755          */
1756         return 0;
1757
1758 csum_err:
1759         trace_tcp_bad_csum(skb);
1760         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1761         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1762         goto discard;
1763 }
1764 EXPORT_SYMBOL(tcp_v4_do_rcv);
1765
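     /* Early demux: look up an established socket before routing so the
      * socket's cached rx dst, if still valid for this ifindex, can be
      * attached to the skb and the route lookup skipped.
      */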
1766 int tcp_v4_early_demux(struct sk_buff *skb)
1767 {
1768         const struct iphdr *iph;
1769         const struct tcphdr *th;
1770         struct sock *sk;
1771
1772         if (skb->pkt_type != PACKET_HOST)
1773                 return 0;
1774
1775         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1776                 return 0;
1777
1778         iph = ip_hdr(skb);
1779         th = tcp_hdr(skb);
1780
1781         if (th->doff < sizeof(struct tcphdr) / 4)
1782                 return 0;
1783
1784         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1785                                        iph->saddr, th->source,
1786                                        iph->daddr, ntohs(th->dest),
1787                                        skb->skb_iif, inet_sdif(skb));
1788         if (sk) {
1789                 skb->sk = sk;
1790                 skb->destructor = sock_edemux;
1791                 if (sk_fullsock(sk)) {
1792                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1793
1794                         if (dst)
1795                                 dst = dst_check(dst, 0);
1796                         if (dst &&
1797                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1798                                 skb_dst_set_noref(skb, dst);
1799                 }
1800         }
1801         return 0;
1802 }
1803
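     /* Queue a segment on the backlog of a socket currently owned by
      * user context.  First try to coalesce it with the backlog tail to
      * limit memory use.  Returns true if the segment could not be
      * queued (bad checksum or over the limit); in that case the socket
      * has already been unlocked and the caller drops the skb.
      */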
1804 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1805 {
1806         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1807         u32 tail_gso_size, tail_gso_segs;
1808         struct skb_shared_info *shinfo;
1809         const struct tcphdr *th;
1810         struct tcphdr *thtail;
1811         struct sk_buff *tail;
1812         unsigned int hdrlen;
1813         bool fragstolen;
1814         u32 gso_segs;
1815         u32 gso_size;
1816         int delta;
1817
1818         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1819          * we can fix skb->truesize to its real value to avoid future drops.
1820          * This is valid because skb is not yet charged to the socket.
1821          * It has been noticed that pure SACK packets were sometimes dropped
1822          * (if cooked by drivers without the copybreak feature).
1823          */
1824         skb_condense(skb);
1825
1826         skb_dst_drop(skb);
1827
1828         if (unlikely(tcp_checksum_complete(skb))) {
1829                 bh_unlock_sock(sk);
1830                 trace_tcp_bad_csum(skb);
1831                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1832                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1833                 return true;
1834         }
1835
1836         /* Attempt coalescing to last skb in backlog, even if we are
1837          * above the limits.
1838          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1839          */
1840         th = (const struct tcphdr *)skb->data;
1841         hdrlen = th->doff * 4;
1842
1843         tail = sk->sk_backlog.tail;
1844         if (!tail)
1845                 goto no_coalesce;
1846         thtail = (struct tcphdr *)tail->data;
1847
1848         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1849             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1850             ((TCP_SKB_CB(tail)->tcp_flags |
1851               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1852             !((TCP_SKB_CB(tail)->tcp_flags &
1853               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1854             ((TCP_SKB_CB(tail)->tcp_flags ^
1855               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1856 #ifdef CONFIG_TLS_DEVICE
1857             tail->decrypted != skb->decrypted ||
1858 #endif
1859             thtail->doff != th->doff ||
1860             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1861                 goto no_coalesce;
1862
1863         __skb_pull(skb, hdrlen);
1864
1865         shinfo = skb_shinfo(skb);
1866         gso_size = shinfo->gso_size ?: skb->len;
1867         gso_segs = shinfo->gso_segs ?: 1;
1868
1869         shinfo = skb_shinfo(tail);
1870         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1871         tail_gso_segs = shinfo->gso_segs ?: 1;
1872
1873         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1874                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1875
1876                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1877                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1878                         thtail->window = th->window;
1879                 }
1880
1881                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1882                  * thtail->fin, so that the fast path in tcp_rcv_established()
1883                  * is not entered if we append a packet with a FIN.
1884                  * SYN, RST, URG are not present.
1885                  * ACK is set on both packets.
1886                  * PSH: we do not really care in the TCP stack,
1887                  *      at least for 'GRO' packets.
1888                  */
1889                 thtail->fin |= th->fin;
1890                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1891
1892                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1893                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1894                         tail->tstamp = skb->tstamp;
1895                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1896                 }
1897
1898                 /* Not as strict as GRO. We only need to carry mss max value */
1899                 shinfo->gso_size = max(gso_size, tail_gso_size);
1900                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1901
1902                 sk->sk_backlog.len += delta;
1903                 __NET_INC_STATS(sock_net(sk),
1904                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1905                 kfree_skb_partial(skb, fragstolen);
1906                 return false;
1907         }
1908         __skb_push(skb, hdrlen);
1909
1910 no_coalesce:
1911         /* Only socket owner can try to collapse/prune rx queues
1912          * to reduce memory overhead, so add a little headroom here.
1913          * Only a few socket backlogs are likely to be non-empty concurrently.
1914          */
1915         limit += 64*1024;
1916
1917         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1918                 bh_unlock_sock(sk);
1919                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1920                 return true;
1921         }
1922         return false;
1923 }
1924 EXPORT_SYMBOL(tcp_add_backlog);
1925
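     /* Run the attached socket filter, trimming the skb to no less than
      * the TCP header length so later header accesses remain valid.
      */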
1926 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1927 {
1928         struct tcphdr *th = (struct tcphdr *)skb->data;
1929
1930         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1931 }
1932 EXPORT_SYMBOL(tcp_filter);
1933
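     /* Undo tcp_v4_fill_cb(): move the IP control block back to its
      * usual place in skb->cb before handing the skb elsewhere.
      */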
1934 static void tcp_v4_restore_cb(struct sk_buff *skb)
1935 {
1936         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1937                 sizeof(struct inet_skb_parm));
1938 }
1939
1940 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1941                            const struct tcphdr *th)
1942 {
1943         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1944          * barrier() makes sure the compiler won't play fool^Waliasing games.
1945          */
1946         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1947                 sizeof(struct inet_skb_parm));
1948         barrier();
1949
1950         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1951         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1952                                     skb->len - th->doff * 4);
1953         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1954         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1955         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1956         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1957         TCP_SKB_CB(skb)->sacked  = 0;
1958         TCP_SKB_CB(skb)->has_rxtstamp =
1959                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1960 }
1961
1962 /*
1963  *      From tcp_input.c
1964  */
1965
1966 int tcp_v4_rcv(struct sk_buff *skb)
1967 {
1968         struct net *net = dev_net(skb->dev);
1969         int sdif = inet_sdif(skb);
1970         int dif = inet_iif(skb);
1971         const struct iphdr *iph;
1972         const struct tcphdr *th;
1973         bool refcounted;
1974         struct sock *sk;
1975         int ret;
1976
1977         if (skb->pkt_type != PACKET_HOST)
1978                 goto discard_it;
1979
1980         /* Count it even if it's bad */
1981         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1982
1983         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1984                 goto discard_it;
1985
1986         th = (const struct tcphdr *)skb->data;
1987
1988         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1989                 goto bad_packet;
1990         if (!pskb_may_pull(skb, th->doff * 4))
1991                 goto discard_it;
1992
1993         /* An explanation is required here, I think.
1994          * Packet length and doff are validated by header prediction,
1995          * provided the case of th->doff == 0 is eliminated.
1996          * So, we defer the checks. */
1997
1998         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1999                 goto csum_error;
2000
2001         th = (const struct tcphdr *)skb->data;
2002         iph = ip_hdr(skb);
2003 lookup:
2004         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2005                                th->dest, sdif, &refcounted);
2006         if (!sk)
2007                 goto no_tcp_socket;
2008
2009 process:
2010         if (sk->sk_state == TCP_TIME_WAIT)
2011                 goto do_time_wait;
2012
2013         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2014                 struct request_sock *req = inet_reqsk(sk);
2015                 bool req_stolen = false;
2016                 struct sock *nsk;
2017
2018                 sk = req->rsk_listener;
2019                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2020                         sk_drops_add(sk, skb);
2021                         reqsk_put(req);
2022                         goto discard_it;
2023                 }
2024                 if (tcp_checksum_complete(skb)) {
2025                         reqsk_put(req);
2026                         goto csum_error;
2027                 }
2028                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2029                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2030                         if (!nsk) {
2031                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2032                                 goto lookup;
2033                         }
2034                         sk = nsk;
2035                         /* reuseport_migrate_sock() has already held one sk_refcnt
2036                          * before returning.
2037                          */
2038                 } else {
2039                         /* We own a reference on the listener, increase it again
2040                          * as we might lose it too soon.
2041                          */
2042                         sock_hold(sk);
2043                 }
2044                 refcounted = true;
2045                 nsk = NULL;
2046                 if (!tcp_filter(sk, skb)) {
2047                         th = (const struct tcphdr *)skb->data;
2048                         iph = ip_hdr(skb);
2049                         tcp_v4_fill_cb(skb, iph, th);
2050                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2051                 }
2052                 if (!nsk) {
2053                         reqsk_put(req);
2054                         if (req_stolen) {
2055                                 /* Another cpu got exclusive access to req
2056                                  * and created a full blown socket.
2057                                  * Try to feed this packet to this socket
2058                                  * instead of discarding it.
2059                                  */
2060                                 tcp_v4_restore_cb(skb);
2061                                 sock_put(sk);
2062                                 goto lookup;
2063                         }
2064                         goto discard_and_relse;
2065                 }
2066                 if (nsk == sk) {
2067                         reqsk_put(req);
2068                         tcp_v4_restore_cb(skb);
2069                 } else if (tcp_child_process(sk, nsk, skb)) {
2070                         tcp_v4_send_reset(nsk, skb);
2071                         goto discard_and_relse;
2072                 } else {
2073                         sock_put(sk);
2074                         return 0;
2075                 }
2076         }
2077
2078         if (static_branch_unlikely(&ip4_min_ttl)) {
2079                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2080                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2081                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2082                         goto discard_and_relse;
2083                 }
2084         }
2085
2086         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2087                 goto discard_and_relse;
2088
2089         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2090                 goto discard_and_relse;
2091
2092         nf_reset_ct(skb);
2093
2094         if (tcp_filter(sk, skb))
2095                 goto discard_and_relse;
2096         th = (const struct tcphdr *)skb->data;
2097         iph = ip_hdr(skb);
2098         tcp_v4_fill_cb(skb, iph, th);
2099
2100         skb->dev = NULL;
2101
2102         if (sk->sk_state == TCP_LISTEN) {
2103                 ret = tcp_v4_do_rcv(sk, skb);
2104                 goto put_and_return;
2105         }
2106
2107         sk_incoming_cpu_update(sk);
2108
2109         bh_lock_sock_nested(sk);
2110         tcp_segs_in(tcp_sk(sk), skb);
2111         ret = 0;
2112         if (!sock_owned_by_user(sk)) {
2113                 ret = tcp_v4_do_rcv(sk, skb);
2114         } else {
2115                 if (tcp_add_backlog(sk, skb))
2116                         goto discard_and_relse;
2117         }
2118         bh_unlock_sock(sk);
2119
2120 put_and_return:
2121         if (refcounted)
2122                 sock_put(sk);
2123
2124         return ret;
2125
2126 no_tcp_socket:
2127         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2128                 goto discard_it;
2129
2130         tcp_v4_fill_cb(skb, iph, th);
2131
2132         if (tcp_checksum_complete(skb)) {
2133 csum_error:
2134                 trace_tcp_bad_csum(skb);
2135                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2136 bad_packet:
2137                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2138         } else {
2139                 tcp_v4_send_reset(NULL, skb);
2140         }
2141
2142 discard_it:
2143         /* Discard frame. */
2144         kfree_skb(skb);
2145         return 0;
2146
2147 discard_and_relse:
2148         sk_drops_add(sk, skb);
2149         if (refcounted)
2150                 sock_put(sk);
2151         goto discard_it;
2152
2153 do_time_wait:
2154         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2155                 inet_twsk_put(inet_twsk(sk));
2156                 goto discard_it;
2157         }
2158
2159         tcp_v4_fill_cb(skb, iph, th);
2160
2161         if (tcp_checksum_complete(skb)) {
2162                 inet_twsk_put(inet_twsk(sk));
2163                 goto csum_error;
2164         }
2165         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2166         case TCP_TW_SYN: {
2167                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2168                                                         &tcp_hashinfo, skb,
2169                                                         __tcp_hdrlen(th),
2170                                                         iph->saddr, th->source,
2171                                                         iph->daddr, th->dest,
2172                                                         inet_iif(skb),
2173                                                         sdif);
2174                 if (sk2) {
2175                         inet_twsk_deschedule_put(inet_twsk(sk));
2176                         sk = sk2;
2177                         tcp_v4_restore_cb(skb);
2178                         refcounted = false;
2179                         goto process;
2180                 }
2181         }
2182                 /* to ACK */
2183                 fallthrough;
2184         case TCP_TW_ACK:
2185                 tcp_v4_timewait_ack(sk, skb);
2186                 break;
2187         case TCP_TW_RST:
2188                 tcp_v4_send_reset(sk, skb);
2189                 inet_twsk_deschedule_put(inet_twsk(sk));
2190                 goto discard_it;
2191         case TCP_TW_SUCCESS:;
2192         }
2193         goto discard_it;
2194 }
2195
2196 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2197         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2198         .twsk_unique    = tcp_twsk_unique,
2199         .twsk_destructor= tcp_twsk_destructor,
2200 };
2201
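     /* Cache the input route of this skb on the socket so the receive
      * fast path can reuse it instead of doing a route lookup.
      */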
2202 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2203 {
2204         struct dst_entry *dst = skb_dst(skb);
2205
2206         if (dst && dst_hold_safe(dst)) {
2207                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2208                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2209         }
2210 }
2211 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2212
2213 const struct inet_connection_sock_af_ops ipv4_specific = {
2214         .queue_xmit        = ip_queue_xmit,
2215         .send_check        = tcp_v4_send_check,
2216         .rebuild_header    = inet_sk_rebuild_header,
2217         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2218         .conn_request      = tcp_v4_conn_request,
2219         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2220         .net_header_len    = sizeof(struct iphdr),
2221         .setsockopt        = ip_setsockopt,
2222         .getsockopt        = ip_getsockopt,
2223         .addr2sockaddr     = inet_csk_addr2sockaddr,
2224         .sockaddr_len      = sizeof(struct sockaddr_in),
2225         .mtu_reduced       = tcp_v4_mtu_reduced,
2226 };
2227 EXPORT_SYMBOL(ipv4_specific);
2228
2229 #ifdef CONFIG_TCP_MD5SIG
2230 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2231         .md5_lookup             = tcp_v4_md5_lookup,
2232         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2233         .md5_parse              = tcp_v4_parse_md5_keys,
2234 };
2235 #endif
2236
2237 /* NOTE: A lot of things are set to zero explicitly by the call to
2238  *       sk_alloc(), so they need not be done here.
2239  */
2240 static int tcp_v4_init_sock(struct sock *sk)
2241 {
2242         struct inet_connection_sock *icsk = inet_csk(sk);
2243
2244         tcp_init_sock(sk);
2245
2246         icsk->icsk_af_ops = &ipv4_specific;
2247
2248 #ifdef CONFIG_TCP_MD5SIG
2249         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2250 #endif
2251
2252         return 0;
2253 }
2254
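     /* Final cleanup when a TCP socket is destroyed: timers, congestion
      * control, ULP, queues, MD5 keys, the bound port and fastopen state
      * are all released here.
      */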
2255 void tcp_v4_destroy_sock(struct sock *sk)
2256 {
2257         struct tcp_sock *tp = tcp_sk(sk);
2258
2259         trace_tcp_destroy_sock(sk);
2260
2261         tcp_clear_xmit_timers(sk);
2262
2263         tcp_cleanup_congestion_control(sk);
2264
2265         tcp_cleanup_ulp(sk);
2266
2267         /* Clean up the write buffer. */
2268         tcp_write_queue_purge(sk);
2269
2270         /* Check if we want to disable active TFO */
2271         tcp_fastopen_active_disable_ofo_check(sk);
2272
2273         /* Cleans up our, hopefully empty, out_of_order_queue. */
2274         skb_rbtree_purge(&tp->out_of_order_queue);
2275
2276 #ifdef CONFIG_TCP_MD5SIG
2277         /* Clean up the MD5 key list, if any */
2278         if (tp->md5sig_info) {
2279                 tcp_clear_md5_list(sk);
2280                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2281                 tp->md5sig_info = NULL;
2282         }
2283 #endif
2284
2285         /* Clean up a referenced TCP bind bucket. */
2286         if (inet_csk(sk)->icsk_bind_hash)
2287                 inet_put_port(sk);
2288
2289         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2290
2291         /* If socket is aborted during connect operation */
2292         tcp_free_fastopen_req(tp);
2293         tcp_fastopen_destroy_cipher(sk);
2294         tcp_saved_syn_free(tp);
2295
2296         sk_sockets_allocated_dec(sk);
2297 }
2298 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2299
2300 #ifdef CONFIG_PROC_FS
2301 /* Proc filesystem TCP sock list dumping. */
2302
2303 static unsigned short seq_file_family(const struct seq_file *seq);
2304
2305 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2306 {
2307         unsigned short family = seq_file_family(seq);
2308
2309         /* AF_UNSPEC is used as a match all */
2310         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2311                 net_eq(sock_net(sk), seq_file_net(seq)));
2312 }
2313
2314 /* Find a non-empty bucket (starting from st->bucket)
2315  * and return the first sk from it.
2316  */
2317 static void *listening_get_first(struct seq_file *seq)
2318 {
2319         struct tcp_iter_state *st = seq->private;
2320
2321         st->offset = 0;
2322         for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2323                 struct inet_listen_hashbucket *ilb2;
2324                 struct inet_connection_sock *icsk;
2325                 struct sock *sk;
2326
2327                 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2328                 if (hlist_empty(&ilb2->head))
2329                         continue;
2330
2331                 spin_lock(&ilb2->lock);
2332                 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2333                         sk = (struct sock *)icsk;
2334                         if (seq_sk_match(seq, sk))
2335                                 return sk;
2336                 }
2337                 spin_unlock(&ilb2->lock);
2338         }
2339
2340         return NULL;
2341 }
2342
2343 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2344  * If "cur" is the last one in the st->bucket,
2345  * call listening_get_first() to return the first sk of the next
2346  * non-empty bucket.
2347  */
2348 static void *listening_get_next(struct seq_file *seq, void *cur)
2349 {
2350         struct tcp_iter_state *st = seq->private;
2351         struct inet_listen_hashbucket *ilb2;
2352         struct inet_connection_sock *icsk;
2353         struct sock *sk = cur;
2354
2355         ++st->num;
2356         ++st->offset;
2357
2358         icsk = inet_csk(sk);
2359         inet_lhash2_for_each_icsk_continue(icsk) {
2360                 sk = (struct sock *)icsk;
2361                 if (seq_sk_match(seq, sk))
2362                         return sk;
2363         }
2364
2365         ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2366         spin_unlock(&ilb2->lock);
2367         ++st->bucket;
2368         return listening_get_first(seq);
2369 }
2370
2371 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2372 {
2373         struct tcp_iter_state *st = seq->private;
2374         void *rc;
2375
2376         st->bucket = 0;
2377         st->offset = 0;
2378         rc = listening_get_first(seq);
2379
2380         while (rc && *pos) {
2381                 rc = listening_get_next(seq, rc);
2382                 --*pos;
2383         }
2384         return rc;
2385 }
2386
2387 static inline bool empty_bucket(const struct tcp_iter_state *st)
2388 {
2389         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2390 }
2391
2392 /*
2393  * Get first established socket starting from bucket given in st->bucket.
2394  * If st->bucket is zero, the very first socket in the hash is returned.
2395  */
2396 static void *established_get_first(struct seq_file *seq)
2397 {
2398         struct tcp_iter_state *st = seq->private;
2399
2400         st->offset = 0;
2401         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2402                 struct sock *sk;
2403                 struct hlist_nulls_node *node;
2404                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2405
2406                 /* Lockless fast path for the common case of empty buckets */
2407                 if (empty_bucket(st))
2408                         continue;
2409
2410                 spin_lock_bh(lock);
2411                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2412                         if (seq_sk_match(seq, sk))
2413                                 return sk;
2414                 }
2415                 spin_unlock_bh(lock);
2416         }
2417
2418         return NULL;
2419 }
2420
2421 static void *established_get_next(struct seq_file *seq, void *cur)
2422 {
2423         struct sock *sk = cur;
2424         struct hlist_nulls_node *node;
2425         struct tcp_iter_state *st = seq->private;
2426
2427         ++st->num;
2428         ++st->offset;
2429
2430         sk = sk_nulls_next(sk);
2431
2432         sk_nulls_for_each_from(sk, node) {
2433                 if (seq_sk_match(seq, sk))
2434                         return sk;
2435         }
2436
2437         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2438         ++st->bucket;
2439         return established_get_first(seq);
2440 }
2441
2442 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2443 {
2444         struct tcp_iter_state *st = seq->private;
2445         void *rc;
2446
2447         st->bucket = 0;
2448         rc = established_get_first(seq);
2449
2450         while (rc && pos) {
2451                 rc = established_get_next(seq, rc);
2452                 --pos;
2453         }
2454         return rc;
2455 }
2456
2457 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2458 {
2459         void *rc;
2460         struct tcp_iter_state *st = seq->private;
2461
2462         st->state = TCP_SEQ_STATE_LISTENING;
2463         rc        = listening_get_idx(seq, &pos);
2464
2465         if (!rc) {
2466                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2467                 rc        = established_get_idx(seq, pos);
2468         }
2469
2470         return rc;
2471 }
2472
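     /* Re-find the position recorded in st (bucket/offset) after a
      * seq_file stop/start cycle, so iteration resumes where it left off
      * instead of rescanning from the beginning.
      */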
2473 static void *tcp_seek_last_pos(struct seq_file *seq)
2474 {
2475         struct tcp_iter_state *st = seq->private;
2476         int bucket = st->bucket;
2477         int offset = st->offset;
2478         int orig_num = st->num;
2479         void *rc = NULL;
2480
2481         switch (st->state) {
2482         case TCP_SEQ_STATE_LISTENING:
2483                 if (st->bucket > tcp_hashinfo.lhash2_mask)
2484                         break;
2485                 st->state = TCP_SEQ_STATE_LISTENING;
2486                 rc = listening_get_first(seq);
2487                 while (offset-- && rc && bucket == st->bucket)
2488                         rc = listening_get_next(seq, rc);
2489                 if (rc)
2490                         break;
2491                 st->bucket = 0;
2492                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2493                 fallthrough;
2494         case TCP_SEQ_STATE_ESTABLISHED:
2495                 if (st->bucket > tcp_hashinfo.ehash_mask)
2496                         break;
2497                 rc = established_get_first(seq);
2498                 while (offset-- && rc && bucket == st->bucket)
2499                         rc = established_get_next(seq, rc);
2500         }
2501
2502         st->num = orig_num;
2503
2504         return rc;
2505 }
2506
2507 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2508 {
2509         struct tcp_iter_state *st = seq->private;
2510         void *rc;
2511
2512         if (*pos && *pos == st->last_pos) {
2513                 rc = tcp_seek_last_pos(seq);
2514                 if (rc)
2515                         goto out;
2516         }
2517
2518         st->state = TCP_SEQ_STATE_LISTENING;
2519         st->num = 0;
2520         st->bucket = 0;
2521         st->offset = 0;
2522         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2523
2524 out:
2525         st->last_pos = *pos;
2526         return rc;
2527 }
2528 EXPORT_SYMBOL(tcp_seq_start);
2529
2530 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2531 {
2532         struct tcp_iter_state *st = seq->private;
2533         void *rc = NULL;
2534
2535         if (v == SEQ_START_TOKEN) {
2536                 rc = tcp_get_idx(seq, 0);
2537                 goto out;
2538         }
2539
2540         switch (st->state) {
2541         case TCP_SEQ_STATE_LISTENING:
2542                 rc = listening_get_next(seq, v);
2543                 if (!rc) {
2544                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2545                         st->bucket = 0;
2546                         st->offset = 0;
2547                         rc        = established_get_first(seq);
2548                 }
2549                 break;
2550         case TCP_SEQ_STATE_ESTABLISHED:
2551                 rc = established_get_next(seq, v);
2552                 break;
2553         }
2554 out:
2555         ++*pos;
2556         st->last_pos = *pos;
2557         return rc;
2558 }
2559 EXPORT_SYMBOL(tcp_seq_next);
2560
2561 void tcp_seq_stop(struct seq_file *seq, void *v)
2562 {
2563         struct tcp_iter_state *st = seq->private;
2564
2565         switch (st->state) {
2566         case TCP_SEQ_STATE_LISTENING:
2567                 if (v != SEQ_START_TOKEN)
2568                         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2569                 break;
2570         case TCP_SEQ_STATE_ESTABLISHED:
2571                 if (v)
2572                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2573                 break;
2574         }
2575 }
2576 EXPORT_SYMBOL(tcp_seq_stop);
2577
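     /* Format one request socket (SYN_RECV) as a /proc/net/tcp line. */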
2578 static void get_openreq4(const struct request_sock *req,
2579                          struct seq_file *f, int i)
2580 {
2581         const struct inet_request_sock *ireq = inet_rsk(req);
2582         long delta = req->rsk_timer.expires - jiffies;
2583
2584         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2585                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2586                 i,
2587                 ireq->ir_loc_addr,
2588                 ireq->ir_num,
2589                 ireq->ir_rmt_addr,
2590                 ntohs(ireq->ir_rmt_port),
2591                 TCP_SYN_RECV,
2592                 0, 0, /* could print option size, but that is af dependent. */
2593                 1,    /* timers active (only the expire timer) */
2594                 jiffies_delta_to_clock_t(delta),
2595                 req->num_timeout,
2596                 from_kuid_munged(seq_user_ns(f),
2597                                  sock_i_uid(req->rsk_listener)),
2598                 0,  /* non standard timer */
2599                 0, /* open_requests have no inode */
2600                 0,
2601                 req);
2602 }
2603
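     /* Format one full socket as a /proc/net/tcp line, reporting the
      * currently pending timer and an unlocked estimate of the rx queue.
      */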
2604 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2605 {
2606         int timer_active;
2607         unsigned long timer_expires;
2608         const struct tcp_sock *tp = tcp_sk(sk);
2609         const struct inet_connection_sock *icsk = inet_csk(sk);
2610         const struct inet_sock *inet = inet_sk(sk);
2611         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2612         __be32 dest = inet->inet_daddr;
2613         __be32 src = inet->inet_rcv_saddr;
2614         __u16 destp = ntohs(inet->inet_dport);
2615         __u16 srcp = ntohs(inet->inet_sport);
2616         int rx_queue;
2617         int state;
2618
2619         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2620             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2621             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2622                 timer_active    = 1;
2623                 timer_expires   = icsk->icsk_timeout;
2624         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2625                 timer_active    = 4;
2626                 timer_expires   = icsk->icsk_timeout;
2627         } else if (timer_pending(&sk->sk_timer)) {
2628                 timer_active    = 2;
2629                 timer_expires   = sk->sk_timer.expires;
2630         } else {
2631                 timer_active    = 0;
2632                 timer_expires = jiffies;
2633         }
2634
2635         state = inet_sk_state_load(sk);
2636         if (state == TCP_LISTEN)
2637                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2638         else
2639                 /* Because we don't lock the socket,
2640                  * we might find a transient negative value.
2641                  */
2642                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2643                                       READ_ONCE(tp->copied_seq), 0);
2644
2645         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2646                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2647                 i, src, srcp, dest, destp, state,
2648                 READ_ONCE(tp->write_seq) - tp->snd_una,
2649                 rx_queue,
2650                 timer_active,
2651                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2652                 icsk->icsk_retransmits,
2653                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2654                 icsk->icsk_probes_out,
2655                 sock_i_ino(sk),
2656                 refcount_read(&sk->sk_refcnt), sk,
2657                 jiffies_to_clock_t(icsk->icsk_rto),
2658                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2659                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2660                 tp->snd_cwnd,
2661                 state == TCP_LISTEN ?
2662                     fastopenq->max_qlen :
2663                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2664 }
2665
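     /* Format one timewait socket as a /proc/net/tcp line. */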
2666 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2667                                struct seq_file *f, int i)
2668 {
2669         long delta = tw->tw_timer.expires - jiffies;
2670         __be32 dest, src;
2671         __u16 destp, srcp;
2672
2673         dest  = tw->tw_daddr;
2674         src   = tw->tw_rcv_saddr;
2675         destp = ntohs(tw->tw_dport);
2676         srcp  = ntohs(tw->tw_sport);
2677
2678         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2679                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2680                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2681                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2682                 refcount_read(&tw->tw_refcnt), tw);
2683 }
2684
2685 #define TMPSZ 150
2686
2687 static int tcp4_seq_show(struct seq_file *seq, void *v)
2688 {
2689         struct tcp_iter_state *st;
2690         struct sock *sk = v;
2691
2692         seq_setwidth(seq, TMPSZ - 1);
2693         if (v == SEQ_START_TOKEN) {
2694                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2695                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2696                            "inode");
2697                 goto out;
2698         }
2699         st = seq->private;
2700
2701         if (sk->sk_state == TCP_TIME_WAIT)
2702                 get_timewait4_sock(v, seq, st->num);
2703         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2704                 get_openreq4(v, seq, st->num);
2705         else
2706                 get_tcp4_sock(v, seq, st->num);
2707 out:
2708         seq_pad(seq, '\n');
2709         return 0;
2710 }
2711
2712 #ifdef CONFIG_BPF_SYSCALL
2713 struct bpf_tcp_iter_state {
2714         struct tcp_iter_state state;
2715         unsigned int cur_sk;
2716         unsigned int end_sk;
2717         unsigned int max_sk;
2718         struct sock **batch;
2719         bool st_bucket_done;
2720 };
2721
2722 struct bpf_iter__tcp {
2723         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2724         __bpf_md_ptr(struct sock_common *, sk_common);
2725         uid_t uid __aligned(8);
2726 };
2727
2728 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2729                              struct sock_common *sk_common, uid_t uid)
2730 {
2731         struct bpf_iter__tcp ctx;
2732
2733         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2734         ctx.meta = meta;
2735         ctx.sk_common = sk_common;
2736         ctx.uid = uid;
2737         return bpf_iter_run_prog(prog, &ctx);
2738 }
2739
2740 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2741 {
2742         while (iter->cur_sk < iter->end_sk)
2743                 sock_put(iter->batch[iter->cur_sk++]);
2744 }
2745
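     /* Grow the batch array to new_batch_sz entries.  References still
      * held in the old batch are dropped first, so the caller re-batches
      * the current bucket afterwards.
      */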
2746 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2747                                       unsigned int new_batch_sz)
2748 {
2749         struct sock **new_batch;
2750
2751         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2752                              GFP_USER | __GFP_NOWARN);
2753         if (!new_batch)
2754                 return -ENOMEM;
2755
2756         bpf_iter_tcp_put_batch(iter);
2757         kvfree(iter->batch);
2758         iter->batch = new_batch;
2759         iter->max_sk = new_batch_sz;
2760
2761         return 0;
2762 }
2763
2764 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2765                                                  struct sock *start_sk)
2766 {
2767         struct bpf_tcp_iter_state *iter = seq->private;
2768         struct tcp_iter_state *st = &iter->state;
2769         struct inet_connection_sock *icsk;
2770         unsigned int expected = 1;
2771         struct sock *sk;
2772
2773         sock_hold(start_sk);
2774         iter->batch[iter->end_sk++] = start_sk;
2775
2776         icsk = inet_csk(start_sk);
2777         inet_lhash2_for_each_icsk_continue(icsk) {
2778                 sk = (struct sock *)icsk;
2779                 if (seq_sk_match(seq, sk)) {
2780                         if (iter->end_sk < iter->max_sk) {
2781                                 sock_hold(sk);
2782                                 iter->batch[iter->end_sk++] = sk;
2783                         }
2784                         expected++;
2785                 }
2786         }
2787         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2788
2789         return expected;
2790 }
2791
2792 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2793                                                    struct sock *start_sk)
2794 {
2795         struct bpf_tcp_iter_state *iter = seq->private;
2796         struct tcp_iter_state *st = &iter->state;
2797         struct hlist_nulls_node *node;
2798         unsigned int expected = 1;
2799         struct sock *sk;
2800
2801         sock_hold(start_sk);
2802         iter->batch[iter->end_sk++] = start_sk;
2803
2804         sk = sk_nulls_next(start_sk);
2805         sk_nulls_for_each_from(sk, node) {
2806                 if (seq_sk_match(seq, sk)) {
2807                         if (iter->end_sk < iter->max_sk) {
2808                                 sock_hold(sk);
2809                                 iter->batch[iter->end_sk++] = sk;
2810                         }
2811                         expected++;
2812                 }
2813         }
2814         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2815
2816         return expected;
2817 }
2818
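     /* Take a reference on every matching socket in the current bucket
      * and store them in iter->batch, so the bucket lock can be dropped
      * while the BPF program runs on each socket.  If the array is too
      * small, grow it once and retry the bucket.
      */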
2819 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2820 {
2821         struct bpf_tcp_iter_state *iter = seq->private;
2822         struct tcp_iter_state *st = &iter->state;
2823         unsigned int expected;
2824         bool resized = false;
2825         struct sock *sk;
2826
2827         /* The st->bucket is done.  Directly advance to the next
2828          * bucket instead of having tcp_seek_last_pos() skip sockets
2829          * one by one in the current bucket and eventually find out
2830          * it has to advance to the next bucket.
2831          */
2832         if (iter->st_bucket_done) {
2833                 st->offset = 0;
2834                 st->bucket++;
2835                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2836                     st->bucket > tcp_hashinfo.lhash2_mask) {
2837                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2838                         st->bucket = 0;
2839                 }
2840         }
2841
2842 again:
2843         /* Get a new batch */
2844         iter->cur_sk = 0;
2845         iter->end_sk = 0;
2846         iter->st_bucket_done = false;
2847
2848         sk = tcp_seek_last_pos(seq);
2849         if (!sk)
2850                 return NULL; /* Done */
2851
2852         if (st->state == TCP_SEQ_STATE_LISTENING)
2853                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2854         else
2855                 expected = bpf_iter_tcp_established_batch(seq, sk);
2856
2857         if (iter->end_sk == expected) {
2858                 iter->st_bucket_done = true;
2859                 return sk;
2860         }
2861
2862         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2863                 resized = true;
2864                 goto again;
2865         }
2866
2867         return sk;
2868 }
2869
2870 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2871 {
2872         /* bpf iter does not support lseek, so it always
2873          * continues from where it was stop()-ped.
2874          */
2875         if (*pos)
2876                 return bpf_iter_tcp_batch(seq);
2877
2878         return SEQ_START_TOKEN;
2879 }
2880
2881 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2882 {
2883         struct bpf_tcp_iter_state *iter = seq->private;
2884         struct tcp_iter_state *st = &iter->state;
2885         struct sock *sk;
2886
2887         /* Whenever seq_next() is called, the sk at iter->cur_sk
2888          * has already been shown by seq_show(), so advance to the
2889          * next sk in the batch.
2890          */
2891         if (iter->cur_sk < iter->end_sk) {
2892                 /* Keeping st->num consistent in tcp_iter_state.
2893                  * bpf_iter_tcp does not use st->num.
2894                  * meta.seq_num is used instead.
2895                  */
2896                 st->num++;
2897                 /* Move st->offset to the next sk in the bucket such that
2898                  * the future start() will resume at st->offset in
2899                  * st->bucket.  See tcp_seek_last_pos().
2900                  */
2901                 st->offset++;
2902                 sock_put(iter->batch[iter->cur_sk++]);
2903         }
2904
2905         if (iter->cur_sk < iter->end_sk)
2906                 sk = iter->batch[iter->cur_sk];
2907         else
2908                 sk = bpf_iter_tcp_batch(seq);
2909
2910         ++*pos;
2911         /* Keeping st->last_pos consistent in tcp_iter_state.
2912          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2913          */
2914         st->last_pos = *pos;
2915         return sk;
2916 }
2917
2918 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2919 {
2920         struct bpf_iter_meta meta;
2921         struct bpf_prog *prog;
2922         struct sock *sk = v;
2923         bool slow;
2924         uid_t uid;
2925         int ret;
2926
2927         if (v == SEQ_START_TOKEN)
2928                 return 0;
2929
2930         if (sk_fullsock(sk))
2931                 slow = lock_sock_fast(sk);
2932
2933         if (unlikely(sk_unhashed(sk))) {
2934                 ret = SEQ_SKIP;
2935                 goto unlock;
2936         }
2937
2938         if (sk->sk_state == TCP_TIME_WAIT) {
2939                 uid = 0;
2940         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2941                 const struct request_sock *req = v;
2942
2943                 uid = from_kuid_munged(seq_user_ns(seq),
2944                                        sock_i_uid(req->rsk_listener));
2945         } else {
2946                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2947         }
2948
2949         meta.seq = seq;
2950         prog = bpf_iter_get_info(&meta, false);
2951         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2952
2953 unlock:
2954         if (sk_fullsock(sk))
2955                 unlock_sock_fast(sk, slow);
2956         return ret;
2957
2958 }
2959
2960 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2961 {
2962         struct bpf_tcp_iter_state *iter = seq->private;
2963         struct bpf_iter_meta meta;
2964         struct bpf_prog *prog;
2965
2966         if (!v) {
2967                 meta.seq = seq;
2968                 prog = bpf_iter_get_info(&meta, true);
2969                 if (prog)
2970                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2971         }
2972
2973         if (iter->cur_sk < iter->end_sk) {
2974                 bpf_iter_tcp_put_batch(iter);
2975                 iter->st_bucket_done = false;
2976         }
2977 }
2978
2979 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2980         .show           = bpf_iter_tcp_seq_show,
2981         .start          = bpf_iter_tcp_seq_start,
2982         .next           = bpf_iter_tcp_seq_next,
2983         .stop           = bpf_iter_tcp_seq_stop,
2984 };
2985 #endif
2986 static unsigned short seq_file_family(const struct seq_file *seq)
2987 {
2988         const struct tcp_seq_afinfo *afinfo;
2989
2990 #ifdef CONFIG_BPF_SYSCALL
2991         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2992         if (seq->op == &bpf_iter_tcp_seq_ops)
2993                 return AF_UNSPEC;
2994 #endif
2995
2996         /* Iterated from proc fs */
2997         afinfo = PDE_DATA(file_inode(seq->file));
2998         return afinfo->family;
2999 }
3000
3001 static const struct seq_operations tcp4_seq_ops = {
3002         .show           = tcp4_seq_show,
3003         .start          = tcp_seq_start,
3004         .next           = tcp_seq_next,
3005         .stop           = tcp_seq_stop,
3006 };
3007
3008 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3009         .family         = AF_INET,
3010 };
3011
3012 static int __net_init tcp4_proc_init_net(struct net *net)
3013 {
3014         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3015                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3016                 return -ENOMEM;
3017         return 0;
3018 }
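/*
 * The table registered above appears as /proc/net/tcp in each network
 * namespace, with one line per socket rendered by tcp4_seq_show().
 * Hypothetical, abridged excerpt:
 *
 *     $ cat /proc/net/tcp
 *       sl  local_address rem_address   st tx_queue rx_queue ...
 *        0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 ...
 *
 * where 0100007F:0CEA decodes to 127.0.0.1:3306 and state 0A is
 * TCP_LISTEN (all values are hex).
 */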
3019
3020 static void __net_exit tcp4_proc_exit_net(struct net *net)
3021 {
3022         remove_proc_entry("tcp", net->proc_net);
3023 }
3024
3025 static struct pernet_operations tcp4_net_ops = {
3026         .init = tcp4_proc_init_net,
3027         .exit = tcp4_proc_exit_net,
3028 };
3029
3030 int __init tcp4_proc_init(void)
3031 {
3032         return register_pernet_subsys(&tcp4_net_ops);
3033 }
3034
3035 void tcp4_proc_exit(void)
3036 {
3037         unregister_pernet_subsys(&tcp4_net_ops);
3038 }
3039 #endif /* CONFIG_PROC_FS */
3040
3041 /* @wake is one when sk_stream_write_space() calls us.
3042  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3043  * This mimics the strategy used in sock_def_write_space().
3044  */
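/* Worked example with illustrative numbers: with tcp_notsent_lowat set to
 * 131072 bytes and write_seq - snd_nxt == 70000 bytes still unsent,
 *
 *    wake == 0 (poll):         70000      <  131072  -> writable
 *    wake == 1 (write_space):  70000 << 1 >= 131072  -> EPOLLOUT is held
 *                              back until the backlog drops below 65536,
 *                              i.e. half the limit.
 */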
3045 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3046 {
3047         const struct tcp_sock *tp = tcp_sk(sk);
3048         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3049                             READ_ONCE(tp->snd_nxt);
3050
3051         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3052 }
3053 EXPORT_SYMBOL(tcp_stream_memory_free);
3054
3055 struct proto tcp_prot = {
3056         .name                   = "TCP",
3057         .owner                  = THIS_MODULE,
3058         .close                  = tcp_close,
3059         .pre_connect            = tcp_v4_pre_connect,
3060         .connect                = tcp_v4_connect,
3061         .disconnect             = tcp_disconnect,
3062         .accept                 = inet_csk_accept,
3063         .ioctl                  = tcp_ioctl,
3064         .init                   = tcp_v4_init_sock,
3065         .destroy                = tcp_v4_destroy_sock,
3066         .shutdown               = tcp_shutdown,
3067         .setsockopt             = tcp_setsockopt,
3068         .getsockopt             = tcp_getsockopt,
3069         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3070         .keepalive              = tcp_set_keepalive,
3071         .recvmsg                = tcp_recvmsg,
3072         .sendmsg                = tcp_sendmsg,
3073         .sendpage               = tcp_sendpage,
3074         .backlog_rcv            = tcp_v4_do_rcv,
3075         .release_cb             = tcp_release_cb,
3076         .hash                   = inet_hash,
3077         .unhash                 = inet_unhash,
3078         .get_port               = inet_csk_get_port,
3079 #ifdef CONFIG_BPF_SYSCALL
3080         .psock_update_sk_prot   = tcp_bpf_update_proto,
3081 #endif
3082         .enter_memory_pressure  = tcp_enter_memory_pressure,
3083         .leave_memory_pressure  = tcp_leave_memory_pressure,
3084         .stream_memory_free     = tcp_stream_memory_free,
3085         .sockets_allocated      = &tcp_sockets_allocated,
3086         .orphan_count           = &tcp_orphan_count,
3087         .memory_allocated       = &tcp_memory_allocated,
3088         .memory_pressure        = &tcp_memory_pressure,
3089         .sysctl_mem             = sysctl_tcp_mem,
3090         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3091         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3092         .max_header             = MAX_TCP_HEADER,
3093         .obj_size               = sizeof(struct tcp_sock),
3094         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3095         .twsk_prot              = &tcp_timewait_sock_ops,
3096         .rsk_prot               = &tcp_request_sock_ops,
3097         .h.hashinfo             = &tcp_hashinfo,
3098         .no_autobind            = true,
3099         .diag_destroy           = tcp_abort,
3100 };
3101 EXPORT_SYMBOL(tcp_prot);
3102
3103 static void __net_exit tcp_sk_exit(struct net *net)
3104 {
3105         int cpu;
3106
3107         if (net->ipv4.tcp_congestion_control)
3108                 bpf_module_put(net->ipv4.tcp_congestion_control,
3109                                net->ipv4.tcp_congestion_control->owner);
3110
3111         for_each_possible_cpu(cpu)
3112                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3113         free_percpu(net->ipv4.tcp_sk);
3114 }
3115
3116 static int __net_init tcp_sk_init(struct net *net)
3117 {
3118         int res, cpu, cnt;
3119
3120         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3121         if (!net->ipv4.tcp_sk)
3122                 return -ENOMEM;
3123
3124         for_each_possible_cpu(cpu) {
3125                 struct sock *sk;
3126
3127                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3128                                            IPPROTO_TCP, net);
3129                 if (res)
3130                         goto fail;
3131                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3132
3133                 /* Enforce IP_DF and IPID==0 for the RST and ACK
3134                  * packets sent in SYN-RECV and TIME-WAIT states.
3135                  */
3136                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3137
3138                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3139         }
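        /*
         * These per-cpu kernel sockets are the control sockets fetched
         * with this_cpu_read(*net->ipv4.tcp_sk) by tcp_v4_send_reset()
         * and tcp_v4_send_ack() earlier in this file, so RSTs and
         * stateless ACKs can be sent without a full socket for the flow.
         */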
3140
3141         net->ipv4.sysctl_tcp_ecn = 2;
3142         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3143
3144         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3145         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3146         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3147         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3148         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3149
3150         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3151         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3152         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3153
3154         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3155         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3156         net->ipv4.sysctl_tcp_syncookies = 1;
3157         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3158         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3159         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3160         net->ipv4.sysctl_tcp_orphan_retries = 0;
3161         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3162         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3163         net->ipv4.sysctl_tcp_tw_reuse = 2;
3164         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3165
3166         cnt = tcp_hashinfo.ehash_mask + 1;
3167         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3168         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3169
3170         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3171         net->ipv4.sysctl_tcp_sack = 1;
3172         net->ipv4.sysctl_tcp_window_scaling = 1;
3173         net->ipv4.sysctl_tcp_timestamps = 1;
3174         net->ipv4.sysctl_tcp_early_retrans = 3;
3175         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3176         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3177         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3178         net->ipv4.sysctl_tcp_max_reordering = 300;
3179         net->ipv4.sysctl_tcp_dsack = 1;
3180         net->ipv4.sysctl_tcp_app_win = 31;
3181         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3182         net->ipv4.sysctl_tcp_frto = 2;
3183         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3184         /* This limits the percentage of the congestion window which we
3185          * will allow a single TSO frame to consume.  Building TSO frames
3186          * which are too large can cause TCP streams to be bursty.
3187          */
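        /* For example, with the default divisor of 3 below and a congestion
         * window of 45 packets, a single TSO burst is capped at roughly 15
         * packets, i.e. one third of the window.
         */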
3188         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3189         /* Default TSQ limit of 16 TSO segments */
3190         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3191         /* rfc5961 challenge ack rate limiting */
3192         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3193         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3194         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3195         net->ipv4.sysctl_tcp_autocorking = 1;
3196         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3197         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3198         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3199         if (net != &init_net) {
3200                 memcpy(net->ipv4.sysctl_tcp_rmem,
3201                        init_net.ipv4.sysctl_tcp_rmem,
3202                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3203                 memcpy(net->ipv4.sysctl_tcp_wmem,
3204                        init_net.ipv4.sysctl_tcp_wmem,
3205                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3206         }
3207         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3208         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3209         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3210         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3211         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3212         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3213
3214         /* Reno is always built in */
3215         if (!net_eq(net, &init_net) &&
3216             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3217                                init_net.ipv4.tcp_congestion_control->owner))
3218                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3219         else
3220                 net->ipv4.tcp_congestion_control = &tcp_reno;
3221
3222         return 0;
3223 fail:
3224         tcp_sk_exit(net);
3225
3226         return res;
3227 }
3228
3229 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3230 {
3231         struct net *net;
3232
3233         inet_twsk_purge(&tcp_hashinfo, AF_INET);
3234
3235         list_for_each_entry(net, net_exit_list, exit_list)
3236                 tcp_fastopen_ctx_destroy(net);
3237 }
3238
3239 static struct pernet_operations __net_initdata tcp_sk_ops = {
3240        .init       = tcp_sk_init,
3241        .exit       = tcp_sk_exit,
3242        .exit_batch = tcp_sk_exit_batch,
3243 };
3244
3245 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3246 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3247                      struct sock_common *sk_common, uid_t uid)
3248
3249 #define INIT_BATCH_SZ 16
3250
3251 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3252 {
3253         struct bpf_tcp_iter_state *iter = priv_data;
3254         int err;
3255
3256         err = bpf_iter_init_seq_net(priv_data, aux);
3257         if (err)
3258                 return err;
3259
3260         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3261         if (err) {
3262                 bpf_iter_fini_seq_net(priv_data);
3263                 return err;
3264         }
3265
3266         return 0;
3267 }
3268
3269 static void bpf_iter_fini_tcp(void *priv_data)
3270 {
3271         struct bpf_tcp_iter_state *iter = priv_data;
3272
3273         bpf_iter_fini_seq_net(priv_data);
3274         kvfree(iter->batch);
3275 }
3276
3277 static const struct bpf_iter_seq_info tcp_seq_info = {
3278         .seq_ops                = &bpf_iter_tcp_seq_ops,
3279         .init_seq_private       = bpf_iter_init_tcp,
3280         .fini_seq_private       = bpf_iter_fini_tcp,
3281         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3282 };
3283
3284 static const struct bpf_func_proto *
3285 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3286                             const struct bpf_prog *prog)
3287 {
3288         switch (func_id) {
3289         case BPF_FUNC_setsockopt:
3290                 return &bpf_sk_setsockopt_proto;
3291         case BPF_FUNC_getsockopt:
3292                 return &bpf_sk_getsockopt_proto;
3293         default:
3294                 return NULL;
3295         }
3296 }
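/*
 * With these two protos an iter/tcp program can tune every socket it
 * visits while bpf_iter_tcp_seq_show() holds the socket lock.  Hedged
 * sketch (illustrative only; see the bpf_iter_setsockopt selftest for a
 * complete version):
 *
 *     SEC("iter/tcp")
 *     int change_cc(struct bpf_iter__tcp *ctx)
 *     {
 *             struct sock_common *sk_common = ctx->sk_common;
 *             char cc[] = "cubic";
 *
 *             if (sk_common && sk_common->skc_state == TCP_ESTABLISHED)
 *                     bpf_setsockopt(sk_common, SOL_TCP, TCP_CONGESTION,
 *                                    cc, sizeof(cc));
 *             return 0;
 *     }
 */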
3297
3298 static struct bpf_iter_reg tcp_reg_info = {
3299         .target                 = "tcp",
3300         .ctx_arg_info_size      = 1,
3301         .ctx_arg_info           = {
3302                 { offsetof(struct bpf_iter__tcp, sk_common),
3303                   PTR_TO_BTF_ID_OR_NULL },
3304         },
3305         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3306         .seq_info               = &tcp_seq_info,
3307 };
3308
3309 static void __init bpf_iter_register(void)
3310 {
3311         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3312         if (bpf_iter_reg_target(&tcp_reg_info))
3313                 pr_warn("Warning: could not register bpf iterator tcp\n");
3314 }
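/*
 * Usage sketch for the iterator registered above (names and fields are
 * illustrative; a complete program lives in
 * tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c):
 *
 *     SEC("iter/tcp")
 *     int dump_tcp(struct bpf_iter__tcp *ctx)
 *     {
 *             struct sock_common *sk_common = ctx->sk_common;
 *             struct seq_file *seq = ctx->meta->seq;
 *
 *             if (!sk_common)         // NULL once at the final stop()
 *                     return 0;
 *             if (sk_common->skc_family == AF_INET)
 *                     BPF_SEQ_PRINTF(seq, "%pI4:%u\n",
 *                                    &sk_common->skc_rcv_saddr,
 *                                    sk_common->skc_num);
 *             return 0;
 *     }
 *
 * Userspace attaches the program with bpf_program__attach_iter(), creates
 * an iterator fd with bpf_iter_create() and read()s the text written to
 * seq, or pins the link in bpffs and simply cat's the pinned file.
 */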
3315
3316 #endif
3317
3318 void __init tcp_v4_init(void)
3319 {
3320         if (register_pernet_subsys(&tcp_sk_ops))
3321                 panic("Failed to create the TCP control socket.\n");
3322
3323 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3324         bpf_iter_register();
3325 #endif
3326 }