net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87
88 #include <trace/events/tcp.h>
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
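/* The initial sequence number for a passively opened connection is derived
 * from the incoming segment's address/port 4-tuple via the keyed hash in
 * secure_tcp_seq(), in the spirit of RFC 6528, so it is hard to predict
 * off-path.
 */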
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100         return secure_tcp_seq(ip_hdr(skb)->daddr,
101                               ip_hdr(skb)->saddr,
102                               tcp_hdr(skb)->dest,
103                               tcp_hdr(skb)->source);
104 }
105
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct inet_timewait_sock *tw = inet_twsk(sktw);
114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115         struct tcp_sock *tp = tcp_sk(sk);
116         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117
118         if (reuse == 2) {
119                 /* Still does not detect *everything* that goes through
120                  * lo, since we require a loopback src or dst address
121                  * or direct binding to 'lo' interface.
122                  */
123                 bool loopback = false;
124                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125                         loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127                 if (tw->tw_family == AF_INET6) {
128                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134                                 loopback = true;
135                 } else
136 #endif
137                 {
138                         if (ipv4_is_loopback(tw->tw_daddr) ||
139                             ipv4_is_loopback(tw->tw_rcv_saddr))
140                                 loopback = true;
141                 }
142                 if (!loopback)
143                         reuse = 0;
144         }
145
146         /* With PAWS, it is safe from the viewpoint
147            of data integrity. Even without PAWS it is safe provided sequence
148            spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
149 
150            Actually, the idea is close to VJ's: only the timestamp cache is
151            held not per host but per port pair, and the TW bucket is used as
152            the state holder.
153 
154            If the TW bucket has already been destroyed we fall back to VJ's
155            scheme and use the initial timestamp retrieved from the peer table.
156          */
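        /* Start the new connection's sequence numbers just above anything the
         * old TIME-WAIT peer could still accept: the previous snd_nxt plus the
         * maximum unscaled window (65535) plus 2.
         */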
157         if (tcptw->tw_ts_recent_stamp &&
158             (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
159                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
160                 if (tp->write_seq == 0)
161                         tp->write_seq = 1;
162                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
163                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
164                 sock_hold(sktw);
165                 return 1;
166         }
167
168         return 0;
169 }
170 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
171
172 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
173                               int addr_len)
174 {
175         /* This check is replicated from tcp_v4_connect() and is intended to
176          * prevent the BPF program called below from accessing bytes outside
177          * the bound specified by the user in addr_len.
178          */
179         if (addr_len < sizeof(struct sockaddr_in))
180                 return -EINVAL;
181
182         sock_owned_by_me(sk);
183
184         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
185 }
186
187 /* This will initiate an outgoing connection. */
188 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
189 {
190         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
191         struct inet_sock *inet = inet_sk(sk);
192         struct tcp_sock *tp = tcp_sk(sk);
193         __be16 orig_sport, orig_dport;
194         __be32 daddr, nexthop;
195         struct flowi4 *fl4;
196         struct rtable *rt;
197         int err;
198         struct ip_options_rcu *inet_opt;
199         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
200
201         if (addr_len < sizeof(struct sockaddr_in))
202                 return -EINVAL;
203
204         if (usin->sin_family != AF_INET)
205                 return -EAFNOSUPPORT;
206
207         nexthop = daddr = usin->sin_addr.s_addr;
208         inet_opt = rcu_dereference_protected(inet->inet_opt,
209                                              lockdep_sock_is_held(sk));
210         if (inet_opt && inet_opt->opt.srr) {
211                 if (!daddr)
212                         return -EINVAL;
213                 nexthop = inet_opt->opt.faddr;
214         }
215
216         orig_sport = inet->inet_sport;
217         orig_dport = usin->sin_port;
218         fl4 = &inet->cork.fl.u.ip4;
219         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
220                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
221                               IPPROTO_TCP,
222                               orig_sport, orig_dport, sk);
223         if (IS_ERR(rt)) {
224                 err = PTR_ERR(rt);
225                 if (err == -ENETUNREACH)
226                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
227                 return err;
228         }
229
230         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
231                 ip_rt_put(rt);
232                 return -ENETUNREACH;
233         }
234
235         if (!inet_opt || !inet_opt->opt.srr)
236                 daddr = fl4->daddr;
237
238         if (!inet->inet_saddr)
239                 inet->inet_saddr = fl4->saddr;
240         sk_rcv_saddr_set(sk, inet->inet_saddr);
241
242         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
243                 /* Reset inherited state */
244                 tp->rx_opt.ts_recent       = 0;
245                 tp->rx_opt.ts_recent_stamp = 0;
246                 if (likely(!tp->repair))
247                         tp->write_seq      = 0;
248         }
249
250         inet->inet_dport = usin->sin_port;
251         sk_daddr_set(sk, daddr);
252
253         inet_csk(sk)->icsk_ext_hdr_len = 0;
254         if (inet_opt)
255                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
256
257         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
258
259         /* Socket identity is still unknown (sport may be zero).
260          * However we set the state to SYN-SENT and, without releasing the
261          * socket lock, select a source port, enter ourselves into the hash
262          * tables and complete initialization after this.
263          */
264         tcp_set_state(sk, TCP_SYN_SENT);
265         err = inet_hash_connect(tcp_death_row, sk);
266         if (err)
267                 goto failure;
268
269         sk_set_txhash(sk);
270
271         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
272                                inet->inet_sport, inet->inet_dport, sk);
273         if (IS_ERR(rt)) {
274                 err = PTR_ERR(rt);
275                 rt = NULL;
276                 goto failure;
277         }
278         /* OK, now commit destination to socket.  */
279         sk->sk_gso_type = SKB_GSO_TCPV4;
280         sk_setup_caps(sk, &rt->dst);
281         rt = NULL;
282
283         if (likely(!tp->repair)) {
284                 if (!tp->write_seq)
285                         tp->write_seq = secure_tcp_seq(inet->inet_saddr,
286                                                        inet->inet_daddr,
287                                                        inet->inet_sport,
288                                                        usin->sin_port);
289                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
290                                                  inet->inet_saddr,
291                                                  inet->inet_daddr);
292         }
293
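        /* Seed the per-socket IP ID counter from the initial sequence number
         * and the current jiffies so it does not start at a fixed value.
         */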
294         inet->inet_id = tp->write_seq ^ jiffies;
295
296         if (tcp_fastopen_defer_connect(sk, &err))
297                 return err;
298         if (err)
299                 goto failure;
300
301         err = tcp_connect(sk);
302
303         if (err)
304                 goto failure;
305
306         return 0;
307
308 failure:
309         /*
310          * This unhashes the socket and releases the local port,
311          * if necessary.
312          */
313         tcp_set_state(sk, TCP_CLOSE);
314         ip_rt_put(rt);
315         sk->sk_route_caps = 0;
316         inet->inet_dport = 0;
317         return err;
318 }
319 EXPORT_SYMBOL(tcp_v4_connect);
320
321 /*
322  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
323  * It can be called through tcp_release_cb() if the socket was owned by the user
324  * at the time tcp_v4_err() was called to handle the ICMP message.
325  */
326 void tcp_v4_mtu_reduced(struct sock *sk)
327 {
328         struct inet_sock *inet = inet_sk(sk);
329         struct dst_entry *dst;
330         u32 mtu;
331
332         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
333                 return;
334         mtu = tcp_sk(sk)->mtu_info;
335         dst = inet_csk_update_pmtu(sk, mtu);
336         if (!dst)
337                 return;
338
339         /* Something is about to go wrong... Remember the soft error
340          * in case this connection is not able to recover.
341          */
342         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
343                 sk->sk_err_soft = EMSGSIZE;
344
345         mtu = dst_mtu(dst);
346
347         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
348             ip_sk_accept_pmtu(sk) &&
349             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
350                 tcp_sync_mss(sk, mtu);
351
352                 /* Resend the TCP packet because it's
353                  * clear that the old packet has been
354                  * dropped. This is the new "fast" path mtu
355                  * discovery.
356                  */
357                 tcp_simple_retransmit(sk);
358         } /* else let the usual retransmit timer handle it */
359 }
360 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
361
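/* Handle an ICMP redirect: if the socket still has a valid cached route,
 * let that dst's redirect handler update it.
 */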
362 static void do_redirect(struct sk_buff *skb, struct sock *sk)
363 {
364         struct dst_entry *dst = __sk_dst_check(sk, 0);
365
366         if (dst)
367                 dst->ops->redirect(dst, sk, skb);
368 }
369
370
371 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
372 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
373 {
374         struct request_sock *req = inet_reqsk(sk);
375         struct net *net = sock_net(sk);
376
377         /* ICMPs are not backlogged, hence we cannot get
378          * an established socket here.
379          */
380         if (seq != tcp_rsk(req)->snt_isn) {
381                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
382         } else if (abort) {
383                 /*
384                  * Still in SYN_RECV, just remove it silently.
385                  * There is no good way to pass the error to the newly
386                  * created socket, and POSIX does not want network
387                  * errors returned from accept().
388                  */
389                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
390                 tcp_listendrop(req->rsk_listener);
391         }
392         reqsk_put(req);
393 }
394 EXPORT_SYMBOL(tcp_req_err);
395
396 /*
397  * This routine is called by the ICMP module when it gets some
398  * sort of error condition.  If err < 0 then the socket should
399  * be closed and the error returned to the user.  If err > 0
400  * it's just the icmp type << 8 | icmp code.  After adjustment
401  * header points to the first 8 bytes of the tcp header.  We need
402  * to find the appropriate port.
403  *
404  * The locking strategy used here is very "optimistic". When
405  * someone else accesses the socket the ICMP is just dropped
406  * and for some paths there is no check at all.
407  * A more general error queue to queue errors for later handling
408  * is probably better.
409  *
410  */
411
412 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
413 {
414         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
415         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
416         struct inet_connection_sock *icsk;
417         struct tcp_sock *tp;
418         struct inet_sock *inet;
419         const int type = icmp_hdr(icmp_skb)->type;
420         const int code = icmp_hdr(icmp_skb)->code;
421         struct sock *sk;
422         struct sk_buff *skb;
423         struct request_sock *fastopen;
424         u32 seq, snd_una;
425         s32 remaining;
426         u32 delta_us;
427         int err;
428         struct net *net = dev_net(icmp_skb->dev);
429
430         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
431                                        th->dest, iph->saddr, ntohs(th->source),
432                                        inet_iif(icmp_skb), 0);
433         if (!sk) {
434                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
435                 return;
436         }
437         if (sk->sk_state == TCP_TIME_WAIT) {
438                 inet_twsk_put(inet_twsk(sk));
439                 return;
440         }
441         seq = ntohl(th->seq);
442         if (sk->sk_state == TCP_NEW_SYN_RECV)
443                 return tcp_req_err(sk, seq,
444                                   type == ICMP_PARAMETERPROB ||
445                                   type == ICMP_TIME_EXCEEDED ||
446                                   (type == ICMP_DEST_UNREACH &&
447                                    (code == ICMP_NET_UNREACH ||
448                                     code == ICMP_HOST_UNREACH)));
449
450         bh_lock_sock(sk);
451         /* If too many ICMPs get dropped on busy
452          * servers this needs to be solved differently.
453          * We do take care of the PMTU discovery (RFC1191) special case:
454          * we can receive locally generated ICMP messages while the socket is held.
455          */
456         if (sock_owned_by_user(sk)) {
457                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
458                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
459         }
460         if (sk->sk_state == TCP_CLOSE)
461                 goto out;
462
463         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
464                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
465                 goto out;
466         }
467
468         icsk = inet_csk(sk);
469         tp = tcp_sk(sk);
470         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
471         fastopen = tp->fastopen_rsk;
472         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
473         if (sk->sk_state != TCP_LISTEN &&
474             !between(seq, snd_una, tp->snd_nxt)) {
475                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
476                 goto out;
477         }
478
479         switch (type) {
480         case ICMP_REDIRECT:
481                 if (!sock_owned_by_user(sk))
482                         do_redirect(icmp_skb, sk);
483                 goto out;
484         case ICMP_SOURCE_QUENCH:
485                 /* Just silently ignore these. */
486                 goto out;
487         case ICMP_PARAMETERPROB:
488                 err = EPROTO;
489                 break;
490         case ICMP_DEST_UNREACH:
491                 if (code > NR_ICMP_UNREACH)
492                         goto out;
493
494                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
495                         /* We are not interested in TCP_LISTEN and open_requests
496                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
497                          * they should go through unfragmented).
498                          */
499                         if (sk->sk_state == TCP_LISTEN)
500                                 goto out;
501
502                         tp->mtu_info = info;
503                         if (!sock_owned_by_user(sk)) {
504                                 tcp_v4_mtu_reduced(sk);
505                         } else {
506                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
507                                         sock_hold(sk);
508                         }
509                         goto out;
510                 }
511
512                 err = icmp_err_convert[code].errno;
513                 /* check if icmp_skb allows revert of backoff
514                  * (see draft-zimmermann-tcp-lcd) */
515                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
516                         break;
517                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
518                     !icsk->icsk_backoff || fastopen)
519                         break;
520
521                 if (sock_owned_by_user(sk))
522                         break;
523
524                 icsk->icsk_backoff--;
525                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
526                                                TCP_TIMEOUT_INIT;
527                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
528
529                 skb = tcp_rtx_queue_head(sk);
530                 BUG_ON(!skb);
531
532                 tcp_mstamp_refresh(tp);
533                 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
534                 remaining = icsk->icsk_rto -
535                             usecs_to_jiffies(delta_us);
536
537                 if (remaining > 0) {
538                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
539                                                   remaining, TCP_RTO_MAX);
540                 } else {
541                         /* RTO revert clocked out retransmission.
542                          * Will retransmit now */
543                         tcp_retransmit_timer(sk);
544                 }
545
546                 break;
547         case ICMP_TIME_EXCEEDED:
548                 err = EHOSTUNREACH;
549                 break;
550         default:
551                 goto out;
552         }
553
554         switch (sk->sk_state) {
555         case TCP_SYN_SENT:
556         case TCP_SYN_RECV:
557                 /* Only in fast or simultaneous open. If a fast open socket
558                  * is already accepted it is treated as a connected one below.
559                  */
560                 if (fastopen && !fastopen->sk)
561                         break;
562
563                 if (!sock_owned_by_user(sk)) {
564                         sk->sk_err = err;
565
566                         sk->sk_error_report(sk);
567
568                         tcp_done(sk);
569                 } else {
570                         sk->sk_err_soft = err;
571                 }
572                 goto out;
573         }
574
575         /* If we've already connected we will keep trying
576          * until we time out, or the user gives up.
577          *
578          * RFC 1122 4.2.3.9 allows us to treat as hard errors
579          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
580          * but that is obsoleted by PMTU discovery).
581          *
582          * Note that in the modern internet, where routing is unreliable
583          * and broken firewalls sit in every dark corner sending random
584          * errors ordered by their masters, even these two messages finally
585          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
586          *
587          * Now we are in compliance with RFCs.
588          *                                                      --ANK (980905)
589          */
590
591         inet = inet_sk(sk);
592         if (!sock_owned_by_user(sk) && inet->recverr) {
593                 sk->sk_err = err;
594                 sk->sk_error_report(sk);
595         } else  { /* Only an error on timeout */
596                 sk->sk_err_soft = err;
597         }
598
599 out:
600         bh_unlock_sock(sk);
601         sock_put(sk);
602 }
603
604 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
605 {
606         struct tcphdr *th = tcp_hdr(skb);
607
608         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
609         skb->csum_start = skb_transport_header(skb) - skb->head;
610         skb->csum_offset = offsetof(struct tcphdr, check);
611 }
612
613 /* This routine computes an IPv4 TCP checksum. */
614 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
615 {
616         const struct inet_sock *inet = inet_sk(sk);
617
618         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
619 }
620 EXPORT_SYMBOL(tcp_v4_send_check);
621
622 /*
623  *      This routine will send an RST to the other TCP endpoint.
624  *
625  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
626  *                    for the reset?
627  *      Answer: if a packet caused an RST, it is not for a socket
628  *              existing in our system; if it is matched to a socket,
629  *              it is just a duplicate segment or a bug in the other side's TCP.
630  *              So we build the reply based only on parameters
631  *              that arrived with the segment.
632  *      Exception: precedence violation. We do not implement it in any case.
633  */
634
635 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
636 {
637         const struct tcphdr *th = tcp_hdr(skb);
638         struct {
639                 struct tcphdr th;
640 #ifdef CONFIG_TCP_MD5SIG
641                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
642 #endif
643         } rep;
644         struct ip_reply_arg arg;
645 #ifdef CONFIG_TCP_MD5SIG
646         struct tcp_md5sig_key *key = NULL;
647         const __u8 *hash_location = NULL;
648         unsigned char newhash[16];
649         int genhash;
650         struct sock *sk1 = NULL;
651 #endif
652         struct net *net;
653         struct sock *ctl_sk;
654
655         /* Never send a reset in response to a reset. */
656         if (th->rst)
657                 return;
658
659         /* If sk is not NULL, it means we did a successful lookup and the
660          * incoming route had to be correct. prequeue might have dropped our dst.
661          */
662         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
663                 return;
664
665         /* Swap the send and the receive. */
666         memset(&rep, 0, sizeof(rep));
667         rep.th.dest   = th->source;
668         rep.th.source = th->dest;
669         rep.th.doff   = sizeof(struct tcphdr) / 4;
670         rep.th.rst    = 1;
671
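        /* Sequence numbers follow RFC 793 reset generation: if the offending
         * segment carried an ACK, use its ack_seq as our SEQ; otherwise send
         * an ACK covering everything the segment occupied in sequence space
         * (SYN and FIN each count for one).
         */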
672         if (th->ack) {
673                 rep.th.seq = th->ack_seq;
674         } else {
675                 rep.th.ack = 1;
676                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
677                                        skb->len - (th->doff << 2));
678         }
679
680         memset(&arg, 0, sizeof(arg));
681         arg.iov[0].iov_base = (unsigned char *)&rep;
682         arg.iov[0].iov_len  = sizeof(rep.th);
683
684         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
685 #ifdef CONFIG_TCP_MD5SIG
686         rcu_read_lock();
687         hash_location = tcp_parse_md5sig_option(th);
688         if (sk && sk_fullsock(sk)) {
689                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
690                                         &ip_hdr(skb)->saddr, AF_INET);
691         } else if (hash_location) {
692                 /*
693                  * The active side is lost. Try to find the listening socket through
694                  * the source port, and then find the md5 key through the listening socket.
695                  * We do not lose security here:
696                  * the incoming packet is checked with the md5 hash of the found key,
697                  * and no RST is generated if the md5 hash doesn't match.
698                  */
699                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
700                                              ip_hdr(skb)->saddr,
701                                              th->source, ip_hdr(skb)->daddr,
702                                              ntohs(th->source), inet_iif(skb),
703                                              tcp_v4_sdif(skb));
704                 /* don't send an RST if we can't find a key */
705                 if (!sk1)
706                         goto out;
707
708                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
709                                         &ip_hdr(skb)->saddr, AF_INET);
710                 if (!key)
711                         goto out;
712
713
714                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
715                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
716                         goto out;
717
718         }
719
720         if (key) {
721                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
722                                    (TCPOPT_NOP << 16) |
723                                    (TCPOPT_MD5SIG << 8) |
724                                    TCPOLEN_MD5SIG);
725                 /* Update length and the length the header thinks exists */
726                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
727                 rep.th.doff = arg.iov[0].iov_len / 4;
728
729                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
730                                      key, ip_hdr(skb)->saddr,
731                                      ip_hdr(skb)->daddr, &rep.th);
732         }
733 #endif
734         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
735                                       ip_hdr(skb)->saddr, /* XXX */
736                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
737         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
738         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
739
740         /* When the socket is gone, all binding information is lost.
741          * Routing might fail in this case. No choice here: if we choose to force
742          * the input interface, we will misroute in the case of an asymmetric route.
743          */
744         if (sk) {
745                 arg.bound_dev_if = sk->sk_bound_dev_if;
746                 if (sk_fullsock(sk))
747                         trace_tcp_send_reset(sk, skb);
748         }
749
750         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
751                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
752
753         arg.tos = ip_hdr(skb)->tos;
754         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
755         local_bh_disable();
756         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
757         if (sk)
758                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
759                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
760         ip_send_unicast_reply(ctl_sk,
761                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
762                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
763                               &arg, arg.iov[0].iov_len);
764
765         ctl_sk->sk_mark = 0;
766         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
767         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
768         local_bh_enable();
769
770 #ifdef CONFIG_TCP_MD5SIG
771 out:
772         rcu_read_unlock();
773 #endif
774 }
775
776 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
777    outside socket context, is certainly ugly. What can I do?
778  */
779
780 static void tcp_v4_send_ack(const struct sock *sk,
781                             struct sk_buff *skb, u32 seq, u32 ack,
782                             u32 win, u32 tsval, u32 tsecr, int oif,
783                             struct tcp_md5sig_key *key,
784                             int reply_flags, u8 tos)
785 {
786         const struct tcphdr *th = tcp_hdr(skb);
787         struct {
788                 struct tcphdr th;
789                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
790 #ifdef CONFIG_TCP_MD5SIG
791                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
792 #endif
793                         ];
794         } rep;
795         struct net *net = sock_net(sk);
796         struct ip_reply_arg arg;
797         struct sock *ctl_sk;
798
799         memset(&rep.th, 0, sizeof(struct tcphdr));
800         memset(&arg, 0, sizeof(arg));
801
802         arg.iov[0].iov_base = (unsigned char *)&rep;
803         arg.iov[0].iov_len  = sizeof(rep.th);
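        /* If we have a timestamp to echo, emit it as NOP, NOP, TIMESTAMP so
         * the option block stays 4-byte aligned (TCPOLEN_TSTAMP_ALIGNED bytes).
         */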
804         if (tsecr) {
805                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
806                                    (TCPOPT_TIMESTAMP << 8) |
807                                    TCPOLEN_TIMESTAMP);
808                 rep.opt[1] = htonl(tsval);
809                 rep.opt[2] = htonl(tsecr);
810                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
811         }
812
813         /* Swap the send and the receive. */
814         rep.th.dest    = th->source;
815         rep.th.source  = th->dest;
816         rep.th.doff    = arg.iov[0].iov_len / 4;
817         rep.th.seq     = htonl(seq);
818         rep.th.ack_seq = htonl(ack);
819         rep.th.ack     = 1;
820         rep.th.window  = htons(win);
821
822 #ifdef CONFIG_TCP_MD5SIG
823         if (key) {
824                 int offset = (tsecr) ? 3 : 0;
825
826                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
827                                           (TCPOPT_NOP << 16) |
828                                           (TCPOPT_MD5SIG << 8) |
829                                           TCPOLEN_MD5SIG);
830                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
831                 rep.th.doff = arg.iov[0].iov_len/4;
832
833                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
834                                     key, ip_hdr(skb)->saddr,
835                                     ip_hdr(skb)->daddr, &rep.th);
836         }
837 #endif
838         arg.flags = reply_flags;
839         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
840                                       ip_hdr(skb)->saddr, /* XXX */
841                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
842         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
843         if (oif)
844                 arg.bound_dev_if = oif;
845         arg.tos = tos;
846         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
847         local_bh_disable();
848         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
849         if (sk)
850                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
851                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
852         ip_send_unicast_reply(ctl_sk,
853                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
854                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
855                               &arg, arg.iov[0].iov_len);
856
857         ctl_sk->sk_mark = 0;
858         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
859         local_bh_enable();
860 }
861
862 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
863 {
864         struct inet_timewait_sock *tw = inet_twsk(sk);
865         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
866
867         tcp_v4_send_ack(sk, skb,
868                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
869                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
870                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
871                         tcptw->tw_ts_recent,
872                         tw->tw_bound_dev_if,
873                         tcp_twsk_md5_key(tcptw),
874                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
875                         tw->tw_tos
876                         );
877
878         inet_twsk_put(tw);
879 }
880
881 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
882                                   struct request_sock *req)
883 {
884         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
885          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
886          */
887         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
888                                              tcp_sk(sk)->snd_nxt;
889
890         /* RFC 7323 2.3
891          * The window field (SEG.WND) of every outgoing segment, with the
892          * exception of <SYN> segments, MUST be right-shifted by
893          * Rcv.Wind.Shift bits:
894          */
895         tcp_v4_send_ack(sk, skb, seq,
896                         tcp_rsk(req)->rcv_nxt,
897                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
898                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
899                         req->ts_recent,
900                         0,
901                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
902                                           AF_INET),
903                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
904                         ip_hdr(skb)->tos);
905 }
906
907 /*
908  *      Send a SYN-ACK after having received a SYN.
909  *      This still operates on a request_sock only, not on a big
910  *      socket.
911  */
912 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
913                               struct flowi *fl,
914                               struct request_sock *req,
915                               struct tcp_fastopen_cookie *foc,
916                               enum tcp_synack_type synack_type)
917 {
918         const struct inet_request_sock *ireq = inet_rsk(req);
919         struct flowi4 fl4;
920         int err = -1;
921         struct sk_buff *skb;
922
923         /* First, grab a route. */
924         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
925                 return -1;
926
927         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
928
929         if (skb) {
930                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
931
932                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
933                                             ireq->ir_rmt_addr,
934                                             ireq_opt_deref(ireq));
935                 err = net_xmit_eval(err);
936         }
937
938         return err;
939 }
940
941 /*
942  *      IPv4 request_sock destructor.
943  */
944 static void tcp_v4_reqsk_destructor(struct request_sock *req)
945 {
946         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
947 }
948
949 #ifdef CONFIG_TCP_MD5SIG
950 /*
951  * RFC2385 MD5 checksumming requires a mapping of
952  * IP address->MD5 Key.
953  * We need to maintain these in the sk structure.
954  */
955
956 /* Find the Key structure for an address.  */
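/* Keys added via TCP_MD5SIG_EXT may cover an address prefix; when several
 * keys match, the one with the longest prefix wins.
 */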
957 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
958                                          const union tcp_md5_addr *addr,
959                                          int family)
960 {
961         const struct tcp_sock *tp = tcp_sk(sk);
962         struct tcp_md5sig_key *key;
963         const struct tcp_md5sig_info *md5sig;
964         __be32 mask;
965         struct tcp_md5sig_key *best_match = NULL;
966         bool match;
967
968         /* caller either holds rcu_read_lock() or socket lock */
969         md5sig = rcu_dereference_check(tp->md5sig_info,
970                                        lockdep_sock_is_held(sk));
971         if (!md5sig)
972                 return NULL;
973
974         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
975                 if (key->family != family)
976                         continue;
977
978                 if (family == AF_INET) {
979                         mask = inet_make_mask(key->prefixlen);
980                         match = (key->addr.a4.s_addr & mask) ==
981                                 (addr->a4.s_addr & mask);
982 #if IS_ENABLED(CONFIG_IPV6)
983                 } else if (family == AF_INET6) {
984                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
985                                                   key->prefixlen);
986 #endif
987                 } else {
988                         match = false;
989                 }
990
991                 if (match && (!best_match ||
992                               key->prefixlen > best_match->prefixlen))
993                         best_match = key;
994         }
995         return best_match;
996 }
997 EXPORT_SYMBOL(tcp_md5_do_lookup);
998
999 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1000                                                       const union tcp_md5_addr *addr,
1001                                                       int family, u8 prefixlen)
1002 {
1003         const struct tcp_sock *tp = tcp_sk(sk);
1004         struct tcp_md5sig_key *key;
1005         unsigned int size = sizeof(struct in_addr);
1006         const struct tcp_md5sig_info *md5sig;
1007
1008         /* caller either holds rcu_read_lock() or socket lock */
1009         md5sig = rcu_dereference_check(tp->md5sig_info,
1010                                        lockdep_sock_is_held(sk));
1011         if (!md5sig)
1012                 return NULL;
1013 #if IS_ENABLED(CONFIG_IPV6)
1014         if (family == AF_INET6)
1015                 size = sizeof(struct in6_addr);
1016 #endif
1017         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1018                 if (key->family != family)
1019                         continue;
1020                 if (!memcmp(&key->addr, addr, size) &&
1021                     key->prefixlen == prefixlen)
1022                         return key;
1023         }
1024         return NULL;
1025 }
1026
1027 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1028                                          const struct sock *addr_sk)
1029 {
1030         const union tcp_md5_addr *addr;
1031
1032         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1033         return tcp_md5_do_lookup(sk, addr, AF_INET);
1034 }
1035 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1036
1037 /* This can be called on a newly created socket, from other files */
1038 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1039                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1040                    gfp_t gfp)
1041 {
1042         /* Add Key to the list */
1043         struct tcp_md5sig_key *key;
1044         struct tcp_sock *tp = tcp_sk(sk);
1045         struct tcp_md5sig_info *md5sig;
1046
1047         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1048         if (key) {
1049                 /* Pre-existing entry - just update that one. */
1050                 memcpy(key->key, newkey, newkeylen);
1051                 key->keylen = newkeylen;
1052                 return 0;
1053         }
1054
1055         md5sig = rcu_dereference_protected(tp->md5sig_info,
1056                                            lockdep_sock_is_held(sk));
1057         if (!md5sig) {
1058                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1059                 if (!md5sig)
1060                         return -ENOMEM;
1061
1062                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1063                 INIT_HLIST_HEAD(&md5sig->head);
1064                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1065         }
1066
1067         key = sock_kmalloc(sk, sizeof(*key), gfp);
1068         if (!key)
1069                 return -ENOMEM;
1070         if (!tcp_alloc_md5sig_pool()) {
1071                 sock_kfree_s(sk, key, sizeof(*key));
1072                 return -ENOMEM;
1073         }
1074
1075         memcpy(key->key, newkey, newkeylen);
1076         key->keylen = newkeylen;
1077         key->family = family;
1078         key->prefixlen = prefixlen;
1079         memcpy(&key->addr, addr,
1080                (family == AF_INET6) ? sizeof(struct in6_addr) :
1081                                       sizeof(struct in_addr));
1082         hlist_add_head_rcu(&key->node, &md5sig->head);
1083         return 0;
1084 }
1085 EXPORT_SYMBOL(tcp_md5_do_add);
1086
1087 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1088                    u8 prefixlen)
1089 {
1090         struct tcp_md5sig_key *key;
1091
1092         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1093         if (!key)
1094                 return -ENOENT;
1095         hlist_del_rcu(&key->node);
1096         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1097         kfree_rcu(key, rcu);
1098         return 0;
1099 }
1100 EXPORT_SYMBOL(tcp_md5_do_del);
1101
1102 static void tcp_clear_md5_list(struct sock *sk)
1103 {
1104         struct tcp_sock *tp = tcp_sk(sk);
1105         struct tcp_md5sig_key *key;
1106         struct hlist_node *n;
1107         struct tcp_md5sig_info *md5sig;
1108
1109         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1110
1111         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1112                 hlist_del_rcu(&key->node);
1113                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1114                 kfree_rcu(key, rcu);
1115         }
1116 }
1117
1118 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1119                                  char __user *optval, int optlen)
1120 {
1121         struct tcp_md5sig cmd;
1122         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1123         u8 prefixlen = 32;
1124
1125         if (optlen < sizeof(cmd))
1126                 return -EINVAL;
1127
1128         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1129                 return -EFAULT;
1130
1131         if (sin->sin_family != AF_INET)
1132                 return -EINVAL;
1133
1134         if (optname == TCP_MD5SIG_EXT &&
1135             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1136                 prefixlen = cmd.tcpm_prefixlen;
1137                 if (prefixlen > 32)
1138                         return -EINVAL;
1139         }
1140
1141         if (!cmd.tcpm_keylen)
1142                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1143                                       AF_INET, prefixlen);
1144
1145         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1146                 return -EINVAL;
1147
1148         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1149                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1150                               GFP_KERNEL);
1151 }
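/* A minimal userspace sketch of the option handled above, assuming the
 * struct tcp_md5sig layout from the uapi <linux/tcp.h> (field names below
 * come from that header, not from this file):
 *
 *     struct tcp_md5sig md5 = { 0 };
 *     struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *     sin->sin_family = AF_INET;
 *     sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *     md5.tcpm_keylen = 6;
 *     memcpy(md5.tcpm_key, "secret", 6);
 *     setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */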
1152
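/* Feed the RFC 2385 input block into the MD5 hash: the IPv4 pseudo-header
 * (saddr, daddr, zero pad, protocol, TCP length) followed by the TCP header
 * with its checksum field zeroed.
 */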
1153 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1154                                    __be32 daddr, __be32 saddr,
1155                                    const struct tcphdr *th, int nbytes)
1156 {
1157         struct tcp4_pseudohdr *bp;
1158         struct scatterlist sg;
1159         struct tcphdr *_th;
1160
1161         bp = hp->scratch;
1162         bp->saddr = saddr;
1163         bp->daddr = daddr;
1164         bp->pad = 0;
1165         bp->protocol = IPPROTO_TCP;
1166         bp->len = cpu_to_be16(nbytes);
1167
1168         _th = (struct tcphdr *)(bp + 1);
1169         memcpy(_th, th, sizeof(*th));
1170         _th->check = 0;
1171
1172         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1173         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1174                                 sizeof(*bp) + sizeof(*th));
1175         return crypto_ahash_update(hp->md5_req);
1176 }
1177
1178 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1179                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1180 {
1181         struct tcp_md5sig_pool *hp;
1182         struct ahash_request *req;
1183
1184         hp = tcp_get_md5sig_pool();
1185         if (!hp)
1186                 goto clear_hash_noput;
1187         req = hp->md5_req;
1188
1189         if (crypto_ahash_init(req))
1190                 goto clear_hash;
1191         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1192                 goto clear_hash;
1193         if (tcp_md5_hash_key(hp, key))
1194                 goto clear_hash;
1195         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1196         if (crypto_ahash_final(req))
1197                 goto clear_hash;
1198
1199         tcp_put_md5sig_pool();
1200         return 0;
1201
1202 clear_hash:
1203         tcp_put_md5sig_pool();
1204 clear_hash_noput:
1205         memset(md5_hash, 0, 16);
1206         return 1;
1207 }
1208
1209 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1210                         const struct sock *sk,
1211                         const struct sk_buff *skb)
1212 {
1213         struct tcp_md5sig_pool *hp;
1214         struct ahash_request *req;
1215         const struct tcphdr *th = tcp_hdr(skb);
1216         __be32 saddr, daddr;
1217
1218         if (sk) { /* valid for establish/request sockets */
1219                 saddr = sk->sk_rcv_saddr;
1220                 daddr = sk->sk_daddr;
1221         } else {
1222                 const struct iphdr *iph = ip_hdr(skb);
1223                 saddr = iph->saddr;
1224                 daddr = iph->daddr;
1225         }
1226
1227         hp = tcp_get_md5sig_pool();
1228         if (!hp)
1229                 goto clear_hash_noput;
1230         req = hp->md5_req;
1231
1232         if (crypto_ahash_init(req))
1233                 goto clear_hash;
1234
1235         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1236                 goto clear_hash;
1237         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1238                 goto clear_hash;
1239         if (tcp_md5_hash_key(hp, key))
1240                 goto clear_hash;
1241         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1242         if (crypto_ahash_final(req))
1243                 goto clear_hash;
1244
1245         tcp_put_md5sig_pool();
1246         return 0;
1247
1248 clear_hash:
1249         tcp_put_md5sig_pool();
1250 clear_hash_noput:
1251         memset(md5_hash, 0, 16);
1252         return 1;
1253 }
1254 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1255
1256 #endif
1257
1258 /* Called with rcu_read_lock() */
1259 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1260                                     const struct sk_buff *skb)
1261 {
1262 #ifdef CONFIG_TCP_MD5SIG
1263         /*
1264          * This gets called for each TCP segment that arrives
1265          * so we want to be efficient.
1266          * We have 3 drop cases:
1267          * o No MD5 hash and one expected.
1268          * o MD5 hash and we're not expecting one.
1269          * o MD5 hash and it's wrong.
1270          */
1271         const __u8 *hash_location = NULL;
1272         struct tcp_md5sig_key *hash_expected;
1273         const struct iphdr *iph = ip_hdr(skb);
1274         const struct tcphdr *th = tcp_hdr(skb);
1275         int genhash;
1276         unsigned char newhash[16];
1277
1278         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1279                                           AF_INET);
1280         hash_location = tcp_parse_md5sig_option(th);
1281
1282         /* We've parsed the options - do we have a hash? */
1283         if (!hash_expected && !hash_location)
1284                 return false;
1285
1286         if (hash_expected && !hash_location) {
1287                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1288                 return true;
1289         }
1290
1291         if (!hash_expected && hash_location) {
1292                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1293                 return true;
1294         }
1295
1296         /* Okay, so this is hash_expected and hash_location -
1297          * so we need to calculate the checksum.
1298          */
1299         genhash = tcp_v4_md5_hash_skb(newhash,
1300                                       hash_expected,
1301                                       NULL, skb);
1302
1303         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1304                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1305                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1306                                      &iph->saddr, ntohs(th->source),
1307                                      &iph->daddr, ntohs(th->dest),
1308                                      genhash ? " tcp_v4_calc_md5_hash failed"
1309                                      : "");
1310                 return true;
1311         }
1312         return false;
1313 #endif
1314         return false;
1315 }
1316
1317 static void tcp_v4_init_req(struct request_sock *req,
1318                             const struct sock *sk_listener,
1319                             struct sk_buff *skb)
1320 {
1321         struct inet_request_sock *ireq = inet_rsk(req);
1322         struct net *net = sock_net(sk_listener);
1323
1324         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1325         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1326         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1327 }
1328
1329 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1330                                           struct flowi *fl,
1331                                           const struct request_sock *req)
1332 {
1333         return inet_csk_route_req(sk, &fl->u.ip4, req);
1334 }
1335
1336 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1337         .family         =       PF_INET,
1338         .obj_size       =       sizeof(struct tcp_request_sock),
1339         .rtx_syn_ack    =       tcp_rtx_synack,
1340         .send_ack       =       tcp_v4_reqsk_send_ack,
1341         .destructor     =       tcp_v4_reqsk_destructor,
1342         .send_reset     =       tcp_v4_send_reset,
1343         .syn_ack_timeout =      tcp_syn_ack_timeout,
1344 };
1345
1346 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1347         .mss_clamp      =       TCP_MSS_DEFAULT,
1348 #ifdef CONFIG_TCP_MD5SIG
1349         .req_md5_lookup =       tcp_v4_md5_lookup,
1350         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1351 #endif
1352         .init_req       =       tcp_v4_init_req,
1353 #ifdef CONFIG_SYN_COOKIES
1354         .cookie_init_seq =      cookie_v4_init_sequence,
1355 #endif
1356         .route_req      =       tcp_v4_route_req,
1357         .init_seq       =       tcp_v4_init_seq,
1358         .init_ts_off    =       tcp_v4_init_ts_off,
1359         .send_synack    =       tcp_v4_send_synack,
1360 };
1361
1362 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1363 {
1364         /* Never answer SYNs sent to broadcast or multicast addresses */
1365         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1366                 goto drop;
1367
1368         return tcp_conn_request(&tcp_request_sock_ops,
1369                                 &tcp_request_sock_ipv4_ops, sk, skb);
1370
1371 drop:
1372         tcp_listendrop(sk);
1373         return 0;
1374 }
1375 EXPORT_SYMBOL(tcp_v4_conn_request);
1376
1377
1378 /*
1379  * The three-way handshake has completed - we got a valid synack -
1380  * now create the new socket.
1381  */
1382 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1383                                   struct request_sock *req,
1384                                   struct dst_entry *dst,
1385                                   struct request_sock *req_unhash,
1386                                   bool *own_req)
1387 {
1388         struct inet_request_sock *ireq;
1389         struct inet_sock *newinet;
1390         struct tcp_sock *newtp;
1391         struct sock *newsk;
1392 #ifdef CONFIG_TCP_MD5SIG
1393         struct tcp_md5sig_key *key;
1394 #endif
1395         struct ip_options_rcu *inet_opt;
1396
1397         if (sk_acceptq_is_full(sk))
1398                 goto exit_overflow;
1399
1400         newsk = tcp_create_openreq_child(sk, req, skb);
1401         if (!newsk)
1402                 goto exit_nonewsk;
1403
1404         newsk->sk_gso_type = SKB_GSO_TCPV4;
1405         inet_sk_rx_dst_set(newsk, skb);
1406
1407         newtp                 = tcp_sk(newsk);
1408         newinet               = inet_sk(newsk);
1409         ireq                  = inet_rsk(req);
1410         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1411         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1412         newsk->sk_bound_dev_if = ireq->ir_iif;
1413         newinet->inet_saddr   = ireq->ir_loc_addr;
1414         inet_opt              = rcu_dereference(ireq->ireq_opt);
1415         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1416         newinet->mc_index     = inet_iif(skb);
1417         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1418         newinet->rcv_tos      = ip_hdr(skb)->tos;
1419         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1420         if (inet_opt)
1421                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1422         newinet->inet_id = newtp->write_seq ^ jiffies;
1423
1424         if (!dst) {
1425                 dst = inet_csk_route_child_sock(sk, newsk, req);
1426                 if (!dst)
1427                         goto put_and_exit;
1428         } else {
1429                 /* syncookie case : see end of cookie_v4_check() */
1430         }
1431         sk_setup_caps(newsk, dst);
1432
1433         tcp_ca_openreq_child(newsk, dst);
1434
1435         tcp_sync_mss(newsk, dst_mtu(dst));
1436         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1437
1438         tcp_initialize_rcv_mss(newsk);
1439
1440 #ifdef CONFIG_TCP_MD5SIG
1441         /* Copy over the MD5 key from the original socket */
1442         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1443                                 AF_INET);
1444         if (key) {
1445                 /*
1446                  * We're using one, so create a matching key
1447                  * on the newsk structure. If we fail to get
1448                  * memory, then we end up not copying the key
1449                  * across. Shucks.
1450                  */
1451                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1452                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1453                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1454         }
1455 #endif
1456
1457         if (__inet_inherit_port(sk, newsk) < 0)
1458                 goto put_and_exit;
1459         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1460         if (likely(*own_req)) {
1461                 tcp_move_syn(newtp, req);
1462                 ireq->ireq_opt = NULL;
1463         } else {
1464                 newinet->inet_opt = NULL;
1465         }
1466         return newsk;
1467
1468 exit_overflow:
1469         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1470 exit_nonewsk:
1471         dst_release(dst);
1472 exit:
1473         tcp_listendrop(sk);
1474         return NULL;
1475 put_and_exit:
1476         newinet->inet_opt = NULL;
1477         inet_csk_prepare_forced_close(newsk);
1478         tcp_done(newsk);
1479         goto exit;
1480 }
1481 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1482
1483 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1484 {
1485 #ifdef CONFIG_SYN_COOKIES
1486         const struct tcphdr *th = tcp_hdr(skb);
1487
1488         if (!th->syn)
1489                 sk = cookie_v4_check(sk, skb);
1490 #endif
1491         return sk;
1492 }
1493
1494 /* The socket must have its spinlock held when we get
1495  * here, unless it is a TCP_LISTEN socket.
1496  *
1497  * We have a potential double-lock case here, so even when
1498  * doing backlog processing we use the BH locking scheme.
1499  * This is because we cannot sleep with the original spinlock
1500  * held.
1501  */
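/*
 * In short: an ESTABLISHED socket takes the fast path straight into
 * tcp_rcv_established(); a LISTEN socket first runs the syncookie check and,
 * if that yields a child socket, feeds the skb to it via tcp_child_process();
 * every other state goes through tcp_rcv_state_process().  A failure in the
 * child or state processing answers with a RST.
 */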
1502 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1503 {
1504         struct sock *rsk;
1505
1506         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1507                 struct dst_entry *dst = sk->sk_rx_dst;
1508
1509                 sock_rps_save_rxhash(sk, skb);
1510                 sk_mark_napi_id(sk, skb);
1511                 if (dst) {
1512                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1513                             !dst->ops->check(dst, 0)) {
1514                                 dst_release(dst);
1515                                 sk->sk_rx_dst = NULL;
1516                         }
1517                 }
1518                 tcp_rcv_established(sk, skb);
1519                 return 0;
1520         }
1521
1522         if (tcp_checksum_complete(skb))
1523                 goto csum_err;
1524
1525         if (sk->sk_state == TCP_LISTEN) {
1526                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1527
1528                 if (!nsk)
1529                         goto discard;
1530                 if (nsk != sk) {
1531                         if (tcp_child_process(sk, nsk, skb)) {
1532                                 rsk = nsk;
1533                                 goto reset;
1534                         }
1535                         return 0;
1536                 }
1537         } else
1538                 sock_rps_save_rxhash(sk, skb);
1539
1540         if (tcp_rcv_state_process(sk, skb)) {
1541                 rsk = sk;
1542                 goto reset;
1543         }
1544         return 0;
1545
1546 reset:
1547         tcp_v4_send_reset(rsk, skb);
1548 discard:
1549         kfree_skb(skb);
1550         /* Be careful here. If this function gets more complicated and
1551          * gcc suffers from register pressure on the x86, sk (in %ebx)
1552          * might be destroyed here. This current version compiles correctly,
1553          * but you have been warned.
1554          */
1555         return 0;
1556
1557 csum_err:
1558         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1559         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1560         goto discard;
1561 }
1562 EXPORT_SYMBOL(tcp_v4_do_rcv);
1563
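/*
 * Early demux runs from the IP input path, before the route lookup, for
 * PACKET_HOST TCP segments.  It only considers ESTABLISHED sockets; on a hit
 * it attaches the socket to the skb (so the full lookup in tcp_v4_rcv() is
 * skipped) and, when the cached rx dst is still valid for the incoming
 * interface, reuses that dst so the routing decision is skipped as well.
 */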
1564 int tcp_v4_early_demux(struct sk_buff *skb)
1565 {
1566         const struct iphdr *iph;
1567         const struct tcphdr *th;
1568         struct sock *sk;
1569
1570         if (skb->pkt_type != PACKET_HOST)
1571                 return 0;
1572
1573         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1574                 return 0;
1575
1576         iph = ip_hdr(skb);
1577         th = tcp_hdr(skb);
1578
1579         if (th->doff < sizeof(struct tcphdr) / 4)
1580                 return 0;
1581
1582         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1583                                        iph->saddr, th->source,
1584                                        iph->daddr, ntohs(th->dest),
1585                                        skb->skb_iif, inet_sdif(skb));
1586         if (sk) {
1587                 skb->sk = sk;
1588                 skb->destructor = sock_edemux;
1589                 if (sk_fullsock(sk)) {
1590                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1591
1592                         if (dst)
1593                                 dst = dst_check(dst, 0);
1594                         if (dst &&
1595                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1596                                 skb_dst_set_noref(skb, dst);
1597                 }
1598         }
1599         return 0;
1600 }
1601
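/*
 * A worked example of the admission limit computed below, assuming this
 * kernel's default buffer sizes (tcp_rmem[1] = 87380, tcp_wmem[1] = 16384):
 * limit = 87380 + 16384 + 64K of headroom, i.e. roughly 169300 bytes of skb
 * truesize may sit in the backlog while the owner holds the socket lock.
 */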
1602 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1603 {
1604         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1605
1606         /* Only the socket owner can try to collapse/prune rx queues
1607          * to reduce memory overhead, so add a little headroom here.
1608          * Only a few socket backlogs are likely to be non-empty at the same time.
1609          */
1610         limit += 64*1024;
1611
1612         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1613          * we can fix skb->truesize to its real value to avoid future drops.
1614          * This is valid because skb is not yet charged to the socket.
1615          * It has been noticed that pure SACK packets were sometimes dropped
1616          * (when cooked by drivers without the copybreak feature).
1617          */
1618         skb_condense(skb);
1619
1620         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1621                 bh_unlock_sock(sk);
1622                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1623                 return true;
1624         }
1625         return false;
1626 }
1627 EXPORT_SYMBOL(tcp_add_backlog);
1628
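/*
 * sk_filter_trim_cap() below runs the socket's BPF filter and may trim the
 * payload, but never below the cap (the TCP header length, th->doff * 4).
 * 'eaten' records how many bytes were removed so end_seq can be shrunk to
 * stay consistent with the shorter skb.
 */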
1629 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1630 {
1631         struct tcphdr *th = (struct tcphdr *)skb->data;
1632         unsigned int eaten = skb->len;
1633         int err;
1634
1635         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1636         if (!err) {
1637                 eaten -= skb->len;
1638                 TCP_SKB_CB(skb)->end_seq -= eaten;
1639         }
1640         return err;
1641 }
1642 EXPORT_SYMBOL(tcp_filter);
1643
1644 static void tcp_v4_restore_cb(struct sk_buff *skb)
1645 {
1646         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1647                 sizeof(struct inet_skb_parm));
1648 }
1649
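/*
 * In the helper below, SYN and FIN each count as one unit of sequence space:
 * e.g. a bare SYN yields end_seq = seq + 1, while a segment carrying 100
 * bytes of data and no SYN/FIN yields end_seq = seq + 100.
 */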
1650 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1651                            const struct tcphdr *th)
1652 {
1653         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1654          * barrier() makes sure the compiler won't play fool^Waliasing games.
1655          */
1656         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1657                 sizeof(struct inet_skb_parm));
1658         barrier();
1659
1660         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1661         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1662                                     skb->len - th->doff * 4);
1663         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1664         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1665         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1666         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1667         TCP_SKB_CB(skb)->sacked  = 0;
1668         TCP_SKB_CB(skb)->has_rxtstamp =
1669                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1670 }
1671
1672 /*
1673  *      From tcp_input.c
1674  */
1675
1676 int tcp_v4_rcv(struct sk_buff *skb)
1677 {
1678         struct net *net = dev_net(skb->dev);
1679         int sdif = inet_sdif(skb);
1680         const struct iphdr *iph;
1681         const struct tcphdr *th;
1682         bool refcounted;
1683         struct sock *sk;
1684         int ret;
1685
1686         if (skb->pkt_type != PACKET_HOST)
1687                 goto discard_it;
1688
1689         /* Count it even if it's bad */
1690         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1691
1692         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1693                 goto discard_it;
1694
1695         th = (const struct tcphdr *)skb->data;
1696
1697         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1698                 goto bad_packet;
1699         if (!pskb_may_pull(skb, th->doff * 4))
1700                 goto discard_it;
1701
1702         /* An explanation is required here, I think.
1703          * Packet length and doff are validated by header prediction,
1704          * provided the case of th->doff == 0 is eliminated.
1705          * So, we defer the checks. */
1706
1707         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1708                 goto csum_error;
1709
1710         th = (const struct tcphdr *)skb->data;
1711         iph = ip_hdr(skb);
1712 lookup:
1713         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1714                                th->dest, sdif, &refcounted);
1715         if (!sk)
1716                 goto no_tcp_socket;
1717
1718 process:
1719         if (sk->sk_state == TCP_TIME_WAIT)
1720                 goto do_time_wait;
1721
1722         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1723                 struct request_sock *req = inet_reqsk(sk);
1724                 bool req_stolen = false;
1725                 struct sock *nsk;
1726
1727                 sk = req->rsk_listener;
1728                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1729                         sk_drops_add(sk, skb);
1730                         reqsk_put(req);
1731                         goto discard_it;
1732                 }
1733                 if (tcp_checksum_complete(skb)) {
1734                         reqsk_put(req);
1735                         goto csum_error;
1736                 }
1737                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1738                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1739                         goto lookup;
1740                 }
1741                 /* We own a reference on the listener, increase it again
1742                  * as we might lose it too soon.
1743                  */
1744                 sock_hold(sk);
1745                 refcounted = true;
1746                 nsk = NULL;
1747                 if (!tcp_filter(sk, skb)) {
1748                         th = (const struct tcphdr *)skb->data;
1749                         iph = ip_hdr(skb);
1750                         tcp_v4_fill_cb(skb, iph, th);
1751                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1752                 }
1753                 if (!nsk) {
1754                         reqsk_put(req);
1755                         if (req_stolen) {
1756                                 /* Another cpu got exclusive access to req
1757                                  * and created a full blown socket.
1758                                  * Try to feed this packet to this socket
1759                                  * instead of discarding it.
1760                                  */
1761                                 tcp_v4_restore_cb(skb);
1762                                 sock_put(sk);
1763                                 goto lookup;
1764                         }
1765                         goto discard_and_relse;
1766                 }
1767                 if (nsk == sk) {
1768                         reqsk_put(req);
1769                         tcp_v4_restore_cb(skb);
1770                 } else if (tcp_child_process(sk, nsk, skb)) {
1771                         tcp_v4_send_reset(nsk, skb);
1772                         goto discard_and_relse;
1773                 } else {
1774                         sock_put(sk);
1775                         return 0;
1776                 }
1777         }
1778         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1779                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1780                 goto discard_and_relse;
1781         }
1782
1783         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1784                 goto discard_and_relse;
1785
1786         if (tcp_v4_inbound_md5_hash(sk, skb))
1787                 goto discard_and_relse;
1788
1789         nf_reset(skb);
1790
1791         if (tcp_filter(sk, skb))
1792                 goto discard_and_relse;
1793         th = (const struct tcphdr *)skb->data;
1794         iph = ip_hdr(skb);
1795         tcp_v4_fill_cb(skb, iph, th);
1796
1797         skb->dev = NULL;
1798
1799         if (sk->sk_state == TCP_LISTEN) {
1800                 ret = tcp_v4_do_rcv(sk, skb);
1801                 goto put_and_return;
1802         }
1803
1804         sk_incoming_cpu_update(sk);
1805
1806         bh_lock_sock_nested(sk);
1807         tcp_segs_in(tcp_sk(sk), skb);
1808         ret = 0;
1809         if (!sock_owned_by_user(sk)) {
1810                 ret = tcp_v4_do_rcv(sk, skb);
1811         } else if (tcp_add_backlog(sk, skb)) {
1812                 goto discard_and_relse;
1813         }
1814         bh_unlock_sock(sk);
1815
1816 put_and_return:
1817         if (refcounted)
1818                 sock_put(sk);
1819
1820         return ret;
1821
1822 no_tcp_socket:
1823         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1824                 goto discard_it;
1825
1826         tcp_v4_fill_cb(skb, iph, th);
1827
1828         if (tcp_checksum_complete(skb)) {
1829 csum_error:
1830                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1831 bad_packet:
1832                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1833         } else {
1834                 tcp_v4_send_reset(NULL, skb);
1835         }
1836
1837 discard_it:
1838         /* Discard frame. */
1839         kfree_skb(skb);
1840         return 0;
1841
1842 discard_and_relse:
1843         sk_drops_add(sk, skb);
1844         if (refcounted)
1845                 sock_put(sk);
1846         goto discard_it;
1847
1848 do_time_wait:
1849         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1850                 inet_twsk_put(inet_twsk(sk));
1851                 goto discard_it;
1852         }
1853
1854         tcp_v4_fill_cb(skb, iph, th);
1855
1856         if (tcp_checksum_complete(skb)) {
1857                 inet_twsk_put(inet_twsk(sk));
1858                 goto csum_error;
1859         }
1860         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1861         case TCP_TW_SYN: {
1862                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1863                                                         &tcp_hashinfo, skb,
1864                                                         __tcp_hdrlen(th),
1865                                                         iph->saddr, th->source,
1866                                                         iph->daddr, th->dest,
1867                                                         inet_iif(skb),
1868                                                         sdif);
1869                 if (sk2) {
1870                         inet_twsk_deschedule_put(inet_twsk(sk));
1871                         sk = sk2;
1872                         tcp_v4_restore_cb(skb);
1873                         refcounted = false;
1874                         goto process;
1875                 }
1876         }
1877                 /* to ACK */
1878                 /* fall through */
1879         case TCP_TW_ACK:
1880                 tcp_v4_timewait_ack(sk, skb);
1881                 break;
1882         case TCP_TW_RST:
1883                 tcp_v4_send_reset(sk, skb);
1884                 inet_twsk_deschedule_put(inet_twsk(sk));
1885                 goto discard_it;
1886         case TCP_TW_SUCCESS:;
1887         }
1888         goto discard_it;
1889 }
1890
1891 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1892         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1893         .twsk_unique    = tcp_twsk_unique,
1894         .twsk_destructor= tcp_twsk_destructor,
1895 };
1896
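/*
 * The helper below caches the input route and ingress ifindex on the socket.
 * tcp_v4_do_rcv() and tcp_v4_early_demux() revalidate this cache (via
 * dst_check()/dst->ops->check() and rx_dst_ifindex) before trusting it.
 */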
1897 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1898 {
1899         struct dst_entry *dst = skb_dst(skb);
1900
1901         if (dst && dst_hold_safe(dst)) {
1902                 sk->sk_rx_dst = dst;
1903                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1904         }
1905 }
1906 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1907
1908 const struct inet_connection_sock_af_ops ipv4_specific = {
1909         .queue_xmit        = ip_queue_xmit,
1910         .send_check        = tcp_v4_send_check,
1911         .rebuild_header    = inet_sk_rebuild_header,
1912         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1913         .conn_request      = tcp_v4_conn_request,
1914         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1915         .net_header_len    = sizeof(struct iphdr),
1916         .setsockopt        = ip_setsockopt,
1917         .getsockopt        = ip_getsockopt,
1918         .addr2sockaddr     = inet_csk_addr2sockaddr,
1919         .sockaddr_len      = sizeof(struct sockaddr_in),
1920 #ifdef CONFIG_COMPAT
1921         .compat_setsockopt = compat_ip_setsockopt,
1922         .compat_getsockopt = compat_ip_getsockopt,
1923 #endif
1924         .mtu_reduced       = tcp_v4_mtu_reduced,
1925 };
1926 EXPORT_SYMBOL(ipv4_specific);
1927
1928 #ifdef CONFIG_TCP_MD5SIG
1929 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1930         .md5_lookup             = tcp_v4_md5_lookup,
1931         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1932         .md5_parse              = tcp_v4_parse_md5_keys,
1933 };
1934 #endif
1935
1936 /* NOTE: A lot of things are set to zero explicitly by the call to
1937  *       sk_alloc(), so they need not be done here.
1938  */
1939 static int tcp_v4_init_sock(struct sock *sk)
1940 {
1941         struct inet_connection_sock *icsk = inet_csk(sk);
1942
1943         tcp_init_sock(sk);
1944
1945         icsk->icsk_af_ops = &ipv4_specific;
1946
1947 #ifdef CONFIG_TCP_MD5SIG
1948         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1949 #endif
1950
1951         return 0;
1952 }
1953
1954 void tcp_v4_destroy_sock(struct sock *sk)
1955 {
1956         struct tcp_sock *tp = tcp_sk(sk);
1957
1958         trace_tcp_destroy_sock(sk);
1959
1960         tcp_clear_xmit_timers(sk);
1961
1962         tcp_cleanup_congestion_control(sk);
1963
1964         tcp_cleanup_ulp(sk);
1965
1966         /* Clean up the write buffer. */
1967         tcp_write_queue_purge(sk);
1968
1969         /* Check if we want to disable active TFO */
1970         tcp_fastopen_active_disable_ofo_check(sk);
1971
1972         /* Cleans up our, hopefully empty, out_of_order_queue. */
1973         skb_rbtree_purge(&tp->out_of_order_queue);
1974
1975 #ifdef CONFIG_TCP_MD5SIG
1976         /* Clean up the MD5 key list, if any */
1977         if (tp->md5sig_info) {
1978                 tcp_clear_md5_list(sk);
1979                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1980                 tp->md5sig_info = NULL;
1981         }
1982 #endif
1983
1984         /* Clean up a referenced TCP bind bucket. */
1985         if (inet_csk(sk)->icsk_bind_hash)
1986                 inet_put_port(sk);
1987
1988         BUG_ON(tp->fastopen_rsk);
1989
1990         /* If socket is aborted during connect operation */
1991         tcp_free_fastopen_req(tp);
1992         tcp_fastopen_destroy_cipher(sk);
1993         tcp_saved_syn_free(tp);
1994
1995         sk_sockets_allocated_dec(sk);
1996 }
1997 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1998
1999 #ifdef CONFIG_PROC_FS
2000 /* Proc filesystem TCP sock list dumping. */
2001
2002 /*
2003  * Get the next listener socket following cur.  If cur is NULL, get the first
2004  * socket starting from the bucket given in st->bucket; when st->bucket is zero
2005  * the very first socket in the hash table is returned.
2006  */
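/*
 * st->bucket, st->offset and st->num are what let tcp_seek_last_pos() (below)
 * resume a partially read /proc/net/tcp dump close to where the previous read
 * stopped, instead of rescanning the hash tables from the start.
 */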
2007 static void *listening_get_next(struct seq_file *seq, void *cur)
2008 {
2009         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2010         struct tcp_iter_state *st = seq->private;
2011         struct net *net = seq_file_net(seq);
2012         struct inet_listen_hashbucket *ilb;
2013         struct sock *sk = cur;
2014
2015         if (!sk) {
2016 get_head:
2017                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2018                 spin_lock(&ilb->lock);
2019                 sk = sk_head(&ilb->head);
2020                 st->offset = 0;
2021                 goto get_sk;
2022         }
2023         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2024         ++st->num;
2025         ++st->offset;
2026
2027         sk = sk_next(sk);
2028 get_sk:
2029         sk_for_each_from(sk) {
2030                 if (!net_eq(sock_net(sk), net))
2031                         continue;
2032                 if (sk->sk_family == afinfo->family)
2033                         return sk;
2034         }
2035         spin_unlock(&ilb->lock);
2036         st->offset = 0;
2037         if (++st->bucket < INET_LHTABLE_SIZE)
2038                 goto get_head;
2039         return NULL;
2040 }
2041
2042 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2043 {
2044         struct tcp_iter_state *st = seq->private;
2045         void *rc;
2046
2047         st->bucket = 0;
2048         st->offset = 0;
2049         rc = listening_get_next(seq, NULL);
2050
2051         while (rc && *pos) {
2052                 rc = listening_get_next(seq, rc);
2053                 --*pos;
2054         }
2055         return rc;
2056 }
2057
2058 static inline bool empty_bucket(const struct tcp_iter_state *st)
2059 {
2060         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2061 }
2062
2063 /*
2064  * Get first established socket starting from bucket given in st->bucket.
2065  * If st->bucket is zero, the very first socket in the hash is returned.
2066  */
2067 static void *established_get_first(struct seq_file *seq)
2068 {
2069         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2070         struct tcp_iter_state *st = seq->private;
2071         struct net *net = seq_file_net(seq);
2072         void *rc = NULL;
2073
2074         st->offset = 0;
2075         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2076                 struct sock *sk;
2077                 struct hlist_nulls_node *node;
2078                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2079
2080                 /* Lockless fast path for the common case of empty buckets */
2081                 if (empty_bucket(st))
2082                         continue;
2083
2084                 spin_lock_bh(lock);
2085                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2086                         if (sk->sk_family != afinfo->family ||
2087                             !net_eq(sock_net(sk), net)) {
2088                                 continue;
2089                         }
2090                         rc = sk;
2091                         goto out;
2092                 }
2093                 spin_unlock_bh(lock);
2094         }
2095 out:
2096         return rc;
2097 }
2098
2099 static void *established_get_next(struct seq_file *seq, void *cur)
2100 {
2101         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2102         struct sock *sk = cur;
2103         struct hlist_nulls_node *node;
2104         struct tcp_iter_state *st = seq->private;
2105         struct net *net = seq_file_net(seq);
2106
2107         ++st->num;
2108         ++st->offset;
2109
2110         sk = sk_nulls_next(sk);
2111
2112         sk_nulls_for_each_from(sk, node) {
2113                 if (sk->sk_family == afinfo->family &&
2114                     net_eq(sock_net(sk), net))
2115                         return sk;
2116         }
2117
2118         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2119         ++st->bucket;
2120         return established_get_first(seq);
2121 }
2122
2123 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2124 {
2125         struct tcp_iter_state *st = seq->private;
2126         void *rc;
2127
2128         st->bucket = 0;
2129         rc = established_get_first(seq);
2130
2131         while (rc && pos) {
2132                 rc = established_get_next(seq, rc);
2133                 --pos;
2134         }
2135         return rc;
2136 }
2137
2138 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2139 {
2140         void *rc;
2141         struct tcp_iter_state *st = seq->private;
2142
2143         st->state = TCP_SEQ_STATE_LISTENING;
2144         rc        = listening_get_idx(seq, &pos);
2145
2146         if (!rc) {
2147                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2148                 rc        = established_get_idx(seq, pos);
2149         }
2150
2151         return rc;
2152 }
2153
2154 static void *tcp_seek_last_pos(struct seq_file *seq)
2155 {
2156         struct tcp_iter_state *st = seq->private;
2157         int offset = st->offset;
2158         int orig_num = st->num;
2159         void *rc = NULL;
2160
2161         switch (st->state) {
2162         case TCP_SEQ_STATE_LISTENING:
2163                 if (st->bucket >= INET_LHTABLE_SIZE)
2164                         break;
2165                 st->state = TCP_SEQ_STATE_LISTENING;
2166                 rc = listening_get_next(seq, NULL);
2167                 while (offset-- && rc)
2168                         rc = listening_get_next(seq, rc);
2169                 if (rc)
2170                         break;
2171                 st->bucket = 0;
2172                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2173                 /* Fallthrough */
2174         case TCP_SEQ_STATE_ESTABLISHED:
2175                 if (st->bucket > tcp_hashinfo.ehash_mask)
2176                         break;
2177                 rc = established_get_first(seq);
2178                 while (offset-- && rc)
2179                         rc = established_get_next(seq, rc);
2180         }
2181
2182         st->num = orig_num;
2183
2184         return rc;
2185 }
2186
2187 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2188 {
2189         struct tcp_iter_state *st = seq->private;
2190         void *rc;
2191
2192         if (*pos && *pos == st->last_pos) {
2193                 rc = tcp_seek_last_pos(seq);
2194                 if (rc)
2195                         goto out;
2196         }
2197
2198         st->state = TCP_SEQ_STATE_LISTENING;
2199         st->num = 0;
2200         st->bucket = 0;
2201         st->offset = 0;
2202         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2203
2204 out:
2205         st->last_pos = *pos;
2206         return rc;
2207 }
2208 EXPORT_SYMBOL(tcp_seq_start);
2209
2210 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2211 {
2212         struct tcp_iter_state *st = seq->private;
2213         void *rc = NULL;
2214
2215         if (v == SEQ_START_TOKEN) {
2216                 rc = tcp_get_idx(seq, 0);
2217                 goto out;
2218         }
2219
2220         switch (st->state) {
2221         case TCP_SEQ_STATE_LISTENING:
2222                 rc = listening_get_next(seq, v);
2223                 if (!rc) {
2224                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2225                         st->bucket = 0;
2226                         st->offset = 0;
2227                         rc        = established_get_first(seq);
2228                 }
2229                 break;
2230         case TCP_SEQ_STATE_ESTABLISHED:
2231                 rc = established_get_next(seq, v);
2232                 break;
2233         }
2234 out:
2235         ++*pos;
2236         st->last_pos = *pos;
2237         return rc;
2238 }
2239 EXPORT_SYMBOL(tcp_seq_next);
2240
2241 void tcp_seq_stop(struct seq_file *seq, void *v)
2242 {
2243         struct tcp_iter_state *st = seq->private;
2244
2245         switch (st->state) {
2246         case TCP_SEQ_STATE_LISTENING:
2247                 if (v != SEQ_START_TOKEN)
2248                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2249                 break;
2250         case TCP_SEQ_STATE_ESTABLISHED:
2251                 if (v)
2252                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2253                 break;
2254         }
2255 }
2256 EXPORT_SYMBOL(tcp_seq_stop);
2257
2258 static void get_openreq4(const struct request_sock *req,
2259                          struct seq_file *f, int i)
2260 {
2261         const struct inet_request_sock *ireq = inet_rsk(req);
2262         long delta = req->rsk_timer.expires - jiffies;
2263
2264         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2265                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2266                 i,
2267                 ireq->ir_loc_addr,
2268                 ireq->ir_num,
2269                 ireq->ir_rmt_addr,
2270                 ntohs(ireq->ir_rmt_port),
2271                 TCP_SYN_RECV,
2272                 0, 0, /* could print option size, but that is af dependent. */
2273                 1,    /* timers active (only the expire timer) */
2274                 jiffies_delta_to_clock_t(delta),
2275                 req->num_timeout,
2276                 from_kuid_munged(seq_user_ns(f),
2277                                  sock_i_uid(req->rsk_listener)),
2278                 0,  /* non standard timer */
2279                 0, /* open_requests have no inode */
2280                 0,
2281                 req);
2282 }
2283
2284 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2285 {
2286         int timer_active;
2287         unsigned long timer_expires;
2288         const struct tcp_sock *tp = tcp_sk(sk);
2289         const struct inet_connection_sock *icsk = inet_csk(sk);
2290         const struct inet_sock *inet = inet_sk(sk);
2291         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2292         __be32 dest = inet->inet_daddr;
2293         __be32 src = inet->inet_rcv_saddr;
2294         __u16 destp = ntohs(inet->inet_dport);
2295         __u16 srcp = ntohs(inet->inet_sport);
2296         int rx_queue;
2297         int state;
2298
2299         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2300             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2301             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2302                 timer_active    = 1;
2303                 timer_expires   = icsk->icsk_timeout;
2304         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2305                 timer_active    = 4;
2306                 timer_expires   = icsk->icsk_timeout;
2307         } else if (timer_pending(&sk->sk_timer)) {
2308                 timer_active    = 2;
2309                 timer_expires   = sk->sk_timer.expires;
2310         } else {
2311                 timer_active    = 0;
2312                 timer_expires = jiffies;
2313         }
2314
2315         state = inet_sk_state_load(sk);
2316         if (state == TCP_LISTEN)
2317                 rx_queue = sk->sk_ack_backlog;
2318         else
2319                 /* Because we don't lock the socket,
2320                  * we might find a transient negative value.
2321                  */
2322                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2323
2324         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2325                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2326                 i, src, srcp, dest, destp, state,
2327                 tp->write_seq - tp->snd_una,
2328                 rx_queue,
2329                 timer_active,
2330                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2331                 icsk->icsk_retransmits,
2332                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2333                 icsk->icsk_probes_out,
2334                 sock_i_ino(sk),
2335                 refcount_read(&sk->sk_refcnt), sk,
2336                 jiffies_to_clock_t(icsk->icsk_rto),
2337                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2338                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2339                 tp->snd_cwnd,
2340                 state == TCP_LISTEN ?
2341                     fastopenq->max_qlen :
2342                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2343 }
2344
2345 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2346                                struct seq_file *f, int i)
2347 {
2348         long delta = tw->tw_timer.expires - jiffies;
2349         __be32 dest, src;
2350         __u16 destp, srcp;
2351
2352         dest  = tw->tw_daddr;
2353         src   = tw->tw_rcv_saddr;
2354         destp = ntohs(tw->tw_dport);
2355         srcp  = ntohs(tw->tw_sport);
2356
2357         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2358                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2359                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2360                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2361                 refcount_read(&tw->tw_refcnt), tw);
2362 }
2363
2364 #define TMPSZ 150
2365
2366 static int tcp4_seq_show(struct seq_file *seq, void *v)
2367 {
2368         struct tcp_iter_state *st;
2369         struct sock *sk = v;
2370
2371         seq_setwidth(seq, TMPSZ - 1);
2372         if (v == SEQ_START_TOKEN) {
2373                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2374                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2375                            "inode");
2376                 goto out;
2377         }
2378         st = seq->private;
2379
2380         if (sk->sk_state == TCP_TIME_WAIT)
2381                 get_timewait4_sock(v, seq, st->num);
2382         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2383                 get_openreq4(v, seq, st->num);
2384         else
2385                 get_tcp4_sock(v, seq, st->num);
2386 out:
2387         seq_pad(seq, '\n');
2388         return 0;
2389 }
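/*
 * For illustration only (not output from a live system), an idle listener on
 * 0.0.0.0:22 would appear in /proc/net/tcp roughly as:
 *
 *    0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 <inode> ...
 *
 * with addresses and ports in hex (0016 == 22) and st 0A == TCP_LISTEN.
 */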
2390
2391 static const struct seq_operations tcp4_seq_ops = {
2392         .show           = tcp4_seq_show,
2393         .start          = tcp_seq_start,
2394         .next           = tcp_seq_next,
2395         .stop           = tcp_seq_stop,
2396 };
2397
2398 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2399         .family         = AF_INET,
2400 };
2401
2402 static int __net_init tcp4_proc_init_net(struct net *net)
2403 {
2404         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2405                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2406                 return -ENOMEM;
2407         return 0;
2408 }
2409
2410 static void __net_exit tcp4_proc_exit_net(struct net *net)
2411 {
2412         remove_proc_entry("tcp", net->proc_net);
2413 }
2414
2415 static struct pernet_operations tcp4_net_ops = {
2416         .init = tcp4_proc_init_net,
2417         .exit = tcp4_proc_exit_net,
2418 };
2419
2420 int __init tcp4_proc_init(void)
2421 {
2422         return register_pernet_subsys(&tcp4_net_ops);
2423 }
2424
2425 void tcp4_proc_exit(void)
2426 {
2427         unregister_pernet_subsys(&tcp4_net_ops);
2428 }
2429 #endif /* CONFIG_PROC_FS */
2430
2431 struct proto tcp_prot = {
2432         .name                   = "TCP",
2433         .owner                  = THIS_MODULE,
2434         .close                  = tcp_close,
2435         .pre_connect            = tcp_v4_pre_connect,
2436         .connect                = tcp_v4_connect,
2437         .disconnect             = tcp_disconnect,
2438         .accept                 = inet_csk_accept,
2439         .ioctl                  = tcp_ioctl,
2440         .init                   = tcp_v4_init_sock,
2441         .destroy                = tcp_v4_destroy_sock,
2442         .shutdown               = tcp_shutdown,
2443         .setsockopt             = tcp_setsockopt,
2444         .getsockopt             = tcp_getsockopt,
2445         .keepalive              = tcp_set_keepalive,
2446         .recvmsg                = tcp_recvmsg,
2447         .sendmsg                = tcp_sendmsg,
2448         .sendpage               = tcp_sendpage,
2449         .backlog_rcv            = tcp_v4_do_rcv,
2450         .release_cb             = tcp_release_cb,
2451         .hash                   = inet_hash,
2452         .unhash                 = inet_unhash,
2453         .get_port               = inet_csk_get_port,
2454         .enter_memory_pressure  = tcp_enter_memory_pressure,
2455         .leave_memory_pressure  = tcp_leave_memory_pressure,
2456         .stream_memory_free     = tcp_stream_memory_free,
2457         .sockets_allocated      = &tcp_sockets_allocated,
2458         .orphan_count           = &tcp_orphan_count,
2459         .memory_allocated       = &tcp_memory_allocated,
2460         .memory_pressure        = &tcp_memory_pressure,
2461         .sysctl_mem             = sysctl_tcp_mem,
2462         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2463         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2464         .max_header             = MAX_TCP_HEADER,
2465         .obj_size               = sizeof(struct tcp_sock),
2466         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2467         .twsk_prot              = &tcp_timewait_sock_ops,
2468         .rsk_prot               = &tcp_request_sock_ops,
2469         .h.hashinfo             = &tcp_hashinfo,
2470         .no_autobind            = true,
2471 #ifdef CONFIG_COMPAT
2472         .compat_setsockopt      = compat_tcp_setsockopt,
2473         .compat_getsockopt      = compat_tcp_getsockopt,
2474 #endif
2475         .diag_destroy           = tcp_abort,
2476 };
2477 EXPORT_SYMBOL(tcp_prot);
2478
2479 static void __net_exit tcp_sk_exit(struct net *net)
2480 {
2481         int cpu;
2482
2483         module_put(net->ipv4.tcp_congestion_control->owner);
2484
2485         for_each_possible_cpu(cpu)
2486                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2487         free_percpu(net->ipv4.tcp_sk);
2488 }
2489
2490 static int __net_init tcp_sk_init(struct net *net)
2491 {
2492         int res, cpu, cnt;
2493
2494         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2495         if (!net->ipv4.tcp_sk)
2496                 return -ENOMEM;
2497
2498         for_each_possible_cpu(cpu) {
2499                 struct sock *sk;
2500
2501                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2502                                            IPPROTO_TCP, net);
2503                 if (res)
2504                         goto fail;
2505                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2506                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2507         }
2508
2509         net->ipv4.sysctl_tcp_ecn = 2;
2510         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2511
2512         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2513         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2514         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2515
2516         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2517         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2518         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2519
2520         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2521         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2522         net->ipv4.sysctl_tcp_syncookies = 1;
2523         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2524         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2525         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2526         net->ipv4.sysctl_tcp_orphan_retries = 0;
2527         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2528         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2529         net->ipv4.sysctl_tcp_tw_reuse = 2;
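	/* The per-netns defaults above surface as the usual knobs under
	 * /proc/sys/net/ipv4/, e.g. tcp_syn_retries, tcp_syncookies,
	 * tcp_fin_timeout and tcp_tw_reuse.
	 */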
2530
2531         cnt = tcp_hashinfo.ehash_mask + 1;
2532         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2533         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2534
2535         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2536         net->ipv4.sysctl_tcp_sack = 1;
2537         net->ipv4.sysctl_tcp_window_scaling = 1;
2538         net->ipv4.sysctl_tcp_timestamps = 1;
2539         net->ipv4.sysctl_tcp_early_retrans = 3;
2540         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2541         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2542         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2543         net->ipv4.sysctl_tcp_max_reordering = 300;
2544         net->ipv4.sysctl_tcp_dsack = 1;
2545         net->ipv4.sysctl_tcp_app_win = 31;
2546         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2547         net->ipv4.sysctl_tcp_frto = 2;
2548         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2549         /* This limits the percentage of the congestion window which we
2550          * will allow a single TSO frame to consume.  Building TSO frames
2551          * which are too large can cause TCP streams to be bursty.
2552          */
2553         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2554         /* Default TSQ limit of four TSO segments */
2555         net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2556         /* rfc5961 challenge ack rate limiting */
2557         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2558         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2559         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2560         net->ipv4.sysctl_tcp_autocorking = 1;
2561         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2562         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2563         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2564         if (net != &init_net) {
2565                 memcpy(net->ipv4.sysctl_tcp_rmem,
2566                        init_net.ipv4.sysctl_tcp_rmem,
2567                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2568                 memcpy(net->ipv4.sysctl_tcp_wmem,
2569                        init_net.ipv4.sysctl_tcp_wmem,
2570                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2571         }
2572         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2573         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2574         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2575         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2576         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2577         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2578
2579         /* Reno is always built in */
2580         if (!net_eq(net, &init_net) &&
2581             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2582                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2583         else
2584                 net->ipv4.tcp_congestion_control = &tcp_reno;
2585
2586         return 0;
2587 fail:
2588         tcp_sk_exit(net);
2589
2590         return res;
2591 }
2592
2593 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2594 {
2595         struct net *net;
2596
2597         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2598
2599         list_for_each_entry(net, net_exit_list, exit_list)
2600                 tcp_fastopen_ctx_destroy(net);
2601 }
2602
2603 static struct pernet_operations __net_initdata tcp_sk_ops = {
2604        .init       = tcp_sk_init,
2605        .exit       = tcp_sk_exit,
2606        .exit_batch = tcp_sk_exit_batch,
2607 };
2608
2609 void __init tcp_v4_init(void)
2610 {
2611         if (register_pernet_subsys(&tcp_sk_ops))
2612                 panic("Failed to create the TCP control socket.\n");
2613 }