/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *		Code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	Various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support the IPV6_V6ONLY socket option,
 *	Alexey Kuznetsov		which allows both IPv4 and IPv6 sockets
 *					to bind a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>
int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source, tsoff);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint of data integrity.
	   Even without PAWS it is safe provided sequence spaces do not
	   overlap, i.e. at data rates <= 80 Mbit/sec.

	   Actually, the idea is close to VJ's: the timestamp cache is
	   held not per host but per port pair, and the TW bucket is used
	   as the state holder.

	   If the TW bucket has already been destroyed we fall back to
	   VJ's scheme and use the initial timestamp retrieved from the
	   peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
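/*
 * Usage sketch (not part of this file): the reuse path above only runs
 * when the per-netns sysctl net.ipv4.tcp_tw_reuse is set.  A minimal
 * user-space toggle, assuming root and the usual /proc layout:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");
 *
 *		if (!f)
 *			return 1;
 *		fputs("1", f);	// let connect() reuse TIME-WAIT ports
 *		return fclose(f) ? 1 : 0;
 *	}
 */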
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row->sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port,
							   &tp->tsoffset);

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
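/*
 * Usage sketch (not part of this file): tcp_v4_connect() is what backs
 * connect(2) on an IPv4 TCP socket.  A minimal, self-contained caller,
 * with the loopback address and port 80 chosen purely for illustration:
 *
 *	#include <arpa/inet.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct sockaddr_in sin;
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return 1;
 *		memset(&sin, 0, sizeof(sin));
 *		sin.sin_family = AF_INET;	// anything else gets -EAFNOSUPPORT above
 *		sin.sin_port = htons(80);
 *		inet_pton(AF_INET, "127.0.0.1", &sin.sin_addr);
 *		if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)))
 *			perror("connect");
 *		close(fd);
 *		return 0;
 *	}
 */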
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if the socket was owned by user
 * space at the time tcp_v4_err() was called to handle the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection does not manage to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
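/*
 * Usage sketch (not part of this file): the per-socket pmtudisc mode
 * consulted above is set from user space with IP_MTU_DISCOVER, e.g. to
 * force DF-based RFC 1191 discovery on an existing socket fd:
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	static int force_pmtu_discovery(int fd)
 *	{
 *		int val = IP_PMTUDISC_DO;	// always set DF, never fragment
 *
 *		return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
 *				  &val, sizeof(val));
 *	}
 */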
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC 1191) as a special case:
	 * we can receive locally generated ICMP messages while the socket
	 * is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC 1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now. */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
	 * to be considered hard errors (well, FRAG_FAILED too, but it
	 * is obsoleted by PMTU discovery).
	 *
	 * Note that on the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors as ordered by their masters, even these two messages
	 * have finally lost their original sense (even Linux sends
	 * invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with the RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
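/*
 * Illustration (not kernel code): tcp_v4_check() folds the standard
 * 16-bit one's-complement Internet checksum over the pseudo-header and
 * TCP segment.  The same arithmetic in portable, big-endian-order C:
 *
 *	static unsigned short csum16(const unsigned char *p, unsigned long len)
 *	{
 *		unsigned long sum = 0;
 *
 *		while (len > 1) {
 *			sum += (p[0] << 8) | p[1];	// 16-bit words
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)
 *			sum += p[0] << 8;		// odd trailing byte
 *		while (sum >> 16)			// fold the carries
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return ~sum & 0xffff;			// one's complement
 *	}
 *
 * The kernel versions differ only in per-arch optimized code and in
 * carrying incremental csum_partial() state.
 */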
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk is not NULL, it means we did a successful lookup and the
	 * incoming route had to be correct. prequeue might have dropped
	 * our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We are not losing security here:
		 * the incoming packet is checked against the md5 hash with
		 * the found key; no RST is generated if the md5 hash doesn't
		 * match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to
	 * force the input interface, we will misroute in case of an
	 * asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
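/*
 * Worked example (illustrative): if req->rsk_rcv_wnd is 65536 and the
 * negotiated rcv_wscale is 7, the ACK above advertises 65536 >> 7 = 512
 * in SEG.WND, and the peer reconstructs 512 << 7 = 65536, per RFC 7323.
 */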
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC 2385 MD5 checksumming requires a mapping of
 * IP address -> MD5 key.
 * We need to maintain these in the sk structure.
 */

/* Find the key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add the key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
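/*
 * Usage sketch (not part of this file): the parser above backs the
 * TCP_MD5SIG socket option.  A hedged user-space example installing an
 * RFC 2385 key for a given peer before connect()/listen():
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int install_md5_key(int fd, const struct sockaddr_in *peer,
 *				   const char *secret)
 *	{
 *		struct tcp_md5sig md5;
 *
 *		memset(&md5, 0, sizeof(md5));
 *		memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
 *		md5.tcpm_keylen = strlen(secret);	// <= TCP_MD5SIG_MAXKEYLEN
 *		memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
 *				  &md5, sizeof(md5));
 *	}
 */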
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash present and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
#endif
	return false;
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req))
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_close(newsk, 0);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			sk_mark_napi_id(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				--ANK
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping the RCU protected region, we need to take care
	 * of the skb dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set
	 * sk->sk_rx_dst. Instead of doing a full sk_rx_dst validity check
	 * here, let's perform an optimistic one.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force_safe(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));
		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
				skb_queue_len(&tp->ucopy.prequeue));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk_backlog_rcv(sk, skb1);

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
						POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only the socket owner can try to collapse/prune the rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few sockets' backlogs are likely to be non-empty
	 * concurrently.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
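/*
 * Illustration (assumption-laden, not kernel code): the backlog budget
 * above tracks what user space configured via SO_RCVBUF/SO_SNDBUF, e.g.:
 *
 *	#include <sys/socket.h>
 *
 *	static void size_buffers(int fd)
 *	{
 *		int rcv = 1 << 20, snd = 1 << 20;	// 1 MiB each, illustrative
 *
 *		setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv));
 *		setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd));
 *		// The kernel doubles the requested values; the backlog limit
 *		// here then becomes sk_rcvbuf + sk_sndbuf + 64 KB headroom.
 *	}
 */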
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	/* This is tricky: we move IPCB to its correct location into
	 * TCP_SKB_CB(); barrier() makes sure the compiler won't play
	 * fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			refcounted = false;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	 = sizeof(struct tcp_timewait_sock),
	.twsk_unique	 = tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc() so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean the prequeue; it must really be empty. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If the socket was aborted during a connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket following cur. If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket
 * is zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family	= afinfo->family;
	s->last_pos	= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start	= tcp_seq_start;
	afinfo->seq_ops.next	= tcp_seq_next;
	afinfo->seq_ops.stop	= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
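/*
 * Usage sketch (not part of this file): the seq_file machinery registered
 * above is what produces /proc/net/tcp.  A minimal user-space reader:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[512];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return 1;
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);	// header row, then one socket per line
 *		return fclose(f) ? 1 : 0;
 *	}
 */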
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
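/*
 * Usage sketch (not part of this file): the .keepalive hook above is what
 * SO_KEEPALIVE reaches; the per-connection knobs are TCP-level options.
 * The interval values below are illustrative only:
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static int enable_keepalive(int fd)
 *	{
 *		int on = 1, idle = 60, intvl = 10, cnt = 5;
 *
 *		if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)))
 *			return -1;
 *		setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *		setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *		setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 *		return 0;
 *	}
 */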
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 0;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}