net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/secure_seq.h>
  76 #include <net/busy_poll.h>
  77
  78 #include <linux/inet.h>
  79 #include <linux/ipv6.h>
  80 #include <linux/stddef.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/seq_file.h>
  83
  84 #include <crypto/hash.h>
  85 #include <linux/scatterlist.h>
  86
  87 int sysctl_tcp_low_latency __read_mostly;
  88
  89 #ifdef CONFIG_TCP_MD5SIG
  90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  91                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  92 #endif
  93
  94 struct inet_hashinfo tcp_hashinfo;
  95 EXPORT_SYMBOL(tcp_hashinfo);
  96
  97 static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
  98 {
  99         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 100                                           ip_hdr(skb)->saddr,
 101                                           tcp_hdr(skb)->dest,
 102                                           tcp_hdr(skb)->source, tsoff);
 103 }
 104
 105 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 106 {
 107         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 108         struct tcp_sock *tp = tcp_sk(sk);
 109
 110         /* With PAWS, it is safe from the viewpoint
 111            of data integrity. Even without PAWS it is safe provided sequence
 112            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 113
 114            Actually, the idea is close to VJ's one, only timestamp cache is
 115            held not per host, but per port pair and TW bucket is used as state
 116            holder.
 117
 118            If TW bucket has been already destroyed we fall back to VJ's scheme
 119            and use initial timestamp retrieved from peer table.
 120          */
 121         if (tcptw->tw_ts_recent_stamp &&
 122             (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
 123                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 124                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 125                 if (tp->write_seq == 0)
 126                         tp->write_seq = 1;
 127                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 128                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 129                 sock_hold(sktw);
 130                 return 1;
 131         }
 132
 133         return 0;
 134 }
 135 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 136
 137 /* This will initiate an outgoing connection. */
 138 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 139 {
 140         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 141         struct inet_sock *inet = inet_sk(sk);
 142         struct tcp_sock *tp = tcp_sk(sk);
 143         __be16 orig_sport, orig_dport;
 144         __be32 daddr, nexthop;
 145         struct flowi4 *fl4;
 146         struct rtable *rt;
 147         int err;
 148         struct ip_options_rcu *inet_opt;
 149         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 150
 151         if (addr_len < sizeof(struct sockaddr_in))
 152                 return -EINVAL;
 153
 154         if (usin->sin_family != AF_INET)
 155                 return -EAFNOSUPPORT;
 156
 157         nexthop = daddr = usin->sin_addr.s_addr;
 158         inet_opt = rcu_dereference_protected(inet->inet_opt,
 159                                              lockdep_sock_is_held(sk));
 160         if (inet_opt && inet_opt->opt.srr) {
 161                 if (!daddr)
 162                         return -EINVAL;
 163                 nexthop = inet_opt->opt.faddr;
 164         }
 165
 166         orig_sport = inet->inet_sport;
 167         orig_dport = usin->sin_port;
 168         fl4 = &inet->cork.fl.u.ip4;
 169         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 170                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 171                               IPPROTO_TCP,
 172                               orig_sport, orig_dport, sk);
 173         if (IS_ERR(rt)) {
 174                 err = PTR_ERR(rt);
 175                 if (err == -ENETUNREACH)
 176                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 177                 return err;
 178         }
 179
 180         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 181                 ip_rt_put(rt);
 182                 return -ENETUNREACH;
 183         }
 184
 185         if (!inet_opt || !inet_opt->opt.srr)
 186                 daddr = fl4->daddr;
 187
 188         if (!inet->inet_saddr)
 189                 inet->inet_saddr = fl4->saddr;
 190         sk_rcv_saddr_set(sk, inet->inet_saddr);
 191
 192         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 193                 /* Reset inherited state */
 194                 tp->rx_opt.ts_recent       = 0;
 195                 tp->rx_opt.ts_recent_stamp = 0;
 196                 if (likely(!tp->repair))
 197                         tp->write_seq      = 0;
 198         }
 199
 200         if (tcp_death_row->sysctl_tw_recycle &&
 201             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 202                 tcp_fetch_timewait_stamp(sk, &rt->dst);
 203
 204         inet->inet_dport = usin->sin_port;
 205         sk_daddr_set(sk, daddr);
 206
 207         inet_csk(sk)->icsk_ext_hdr_len = 0;
 208         if (inet_opt)
 209                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 210
 211         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 212
 213         /* Socket identity is still unknown (sport may be zero).
 214          * However we set state to SYN-SENT and not releasing socket
 215          * lock select source port, enter ourselves into the hash tables and
 216          * complete initialization after this.
 217          */
 218         tcp_set_state(sk, TCP_SYN_SENT);
 219         err = inet_hash_connect(tcp_death_row, sk);
 220         if (err)
 221                 goto failure;
 222
 223         sk_set_txhash(sk);
 224
 225         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 226                                inet->inet_sport, inet->inet_dport, sk);
 227         if (IS_ERR(rt)) {
 228                 err = PTR_ERR(rt);
 229                 rt = NULL;
 230                 goto failure;
 231         }
 232         /* OK, now commit destination to socket.  */
 233         sk->sk_gso_type = SKB_GSO_TCPV4;
 234         sk_setup_caps(sk, &rt->dst);
 235         rt = NULL;
 236
 237         if (!tp->write_seq && likely(!tp->repair))
 238                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 239                                                            inet->inet_daddr,
 240                                                            inet->inet_sport,
 241                                                            usin->sin_port,
 242                                                            &tp->tsoffset);
 243
 244         inet->inet_id = tp->write_seq ^ jiffies;
 245
 246         if (tcp_fastopen_defer_connect(sk, &err))
 247                 return err;
 248         if (err)
 249                 goto failure;
 250
 251         err = tcp_connect(sk);
 252
 253         if (err)
 254                 goto failure;
 255
 256         return 0;
 257
 258 failure:
 259         /*
 260          * This unhashes the socket and releases the local port,
 261          * if necessary.
 262          */
 263         tcp_set_state(sk, TCP_CLOSE);
 264         ip_rt_put(rt);
 265         sk->sk_route_caps = 0;
 266         inet->inet_dport = 0;
 267         return err;
 268 }
 269 EXPORT_SYMBOL(tcp_v4_connect);
 270
 271 /*
 272  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 273  * It can be called through tcp_release_cb() if socket was owned by user
 274  * at the time tcp_v4_err() was called to handle ICMP message.
 275  */
 276 void tcp_v4_mtu_reduced(struct sock *sk)
 277 {
 278         struct dst_entry *dst;
 279         struct inet_sock *inet = inet_sk(sk);
 280         u32 mtu = tcp_sk(sk)->mtu_info;
 281
 282         dst = inet_csk_update_pmtu(sk, mtu);
 283         if (!dst)
 284                 return;
 285
 286         /* Something is about to be wrong... Remember soft error
 287          * for the case, if this connection will not able to recover.
 288          */
 289         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 290                 sk->sk_err_soft = EMSGSIZE;
 291
 292         mtu = dst_mtu(dst);
 293
 294         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 295             ip_sk_accept_pmtu(sk) &&
 296             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 297                 tcp_sync_mss(sk, mtu);
 298
 299                 /* Resend the TCP packet because it's
 300                  * clear that the old packet has been
 301                  * dropped. This is the new "fast" path mtu
 302                  * discovery.
 303                  */
 304                 tcp_simple_retransmit(sk);
 305         } /* else let the usual retransmit timer handle it */
 306 }
 307 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 308
 309 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 310 {
 311         struct dst_entry *dst = __sk_dst_check(sk, 0);
 312
 313         if (dst)
 314                 dst->ops->redirect(dst, sk, skb);
 315 }
 316
 317
 318 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 319 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 320 {
 321         struct request_sock *req = inet_reqsk(sk);
 322         struct net *net = sock_net(sk);
 323
 324         /* ICMPs are not backlogged, hence we cannot get
 325          * an established socket here.
 326          */
 327         if (seq != tcp_rsk(req)->snt_isn) {
 328                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 329         } else if (abort) {
 330                 /*
 331                  * Still in SYN_RECV, just remove it silently.
 332                  * There is no good way to pass the error to the newly
 333                  * created socket, and POSIX does not want network
 334                  * errors returned from accept().
 335                  */
 336                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 337                 tcp_listendrop(req->rsk_listener);
 338         }
 339         reqsk_put(req);
 340 }
 341 EXPORT_SYMBOL(tcp_req_err);
 342
 343 /*
 344  * This routine is called by the ICMP module when it gets some
 345  * sort of error condition.  If err < 0 then the socket should
 346  * be closed and the error returned to the user.  If err > 0
 347  * it's just the icmp type << 8 | icmp code.  After adjustment
 348  * header points to the first 8 bytes of the tcp header.  We need
 349  * to find the appropriate port.
 350  *
 351  * The locking strategy used here is very "optimistic". When
 352  * someone else accesses the socket the ICMP is just dropped
 353  * and for some paths there is no check at all.
 354  * A more general error queue to queue errors for later handling
 355  * is probably better.
 356  *
 357  */
 358
 359 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 360 {
 361         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 362         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 363         struct inet_connection_sock *icsk;
 364         struct tcp_sock *tp;
 365         struct inet_sock *inet;
 366         const int type = icmp_hdr(icmp_skb)->type;
 367         const int code = icmp_hdr(icmp_skb)->code;
 368         struct sock *sk;
 369         struct sk_buff *skb;
 370         struct request_sock *fastopen;
 371         __u32 seq, snd_una;
 372         __u32 remaining;
 373         int err;
 374         struct net *net = dev_net(icmp_skb->dev);
 375
 376         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 377                                        th->dest, iph->saddr, ntohs(th->source),
 378                                        inet_iif(icmp_skb));
 379         if (!sk) {
 380                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 381                 return;
 382         }
 383         if (sk->sk_state == TCP_TIME_WAIT) {
 384                 inet_twsk_put(inet_twsk(sk));
 385                 return;
 386         }
 387         seq = ntohl(th->seq);
 388         if (sk->sk_state == TCP_NEW_SYN_RECV)
 389                 return tcp_req_err(sk, seq,
 390                                   type == ICMP_PARAMETERPROB ||
 391                                   type == ICMP_TIME_EXCEEDED ||
 392                                   (type == ICMP_DEST_UNREACH &&
 393                                    (code == ICMP_NET_UNREACH ||
 394                                     code == ICMP_HOST_UNREACH)));
 395
 396         bh_lock_sock(sk);
 397         /* If too many ICMPs get dropped on busy
 398          * servers this needs to be solved differently.
 399          * We do take care of PMTU discovery (RFC1191) special case :
 400          * we can receive locally generated ICMP messages while socket is held.
 401          */
 402         if (sock_owned_by_user(sk)) {
 403                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 404                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 405         }
 406         if (sk->sk_state == TCP_CLOSE)
 407                 goto out;
 408
 409         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 410                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 411                 goto out;
 412         }
 413
 414         icsk = inet_csk(sk);
 415         tp = tcp_sk(sk);
 416         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 417         fastopen = tp->fastopen_rsk;
 418         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 419         if (sk->sk_state != TCP_LISTEN &&
 420             !between(seq, snd_una, tp->snd_nxt)) {
 421                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 422                 goto out;
 423         }
 424
 425         switch (type) {
 426         case ICMP_REDIRECT:
 427                 do_redirect(icmp_skb, sk);
 428                 goto out;
 429         case ICMP_SOURCE_QUENCH:
 430                 /* Just silently ignore these. */
 431                 goto out;
 432         case ICMP_PARAMETERPROB:
 433                 err = EPROTO;
 434                 break;
 435         case ICMP_DEST_UNREACH:
 436                 if (code > NR_ICMP_UNREACH)
 437                         goto out;
 438
 439                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 440                         /* We are not interested in TCP_LISTEN and open_requests
 441                          * (SYN-ACKs send out by Linux are always <576bytes so
 442                          * they should go through unfragmented).
 443                          */
 444                         if (sk->sk_state == TCP_LISTEN)
 445                                 goto out;
 446
 447                         tp->mtu_info = info;
 448                         if (!sock_owned_by_user(sk)) {
 449                                 tcp_v4_mtu_reduced(sk);
 450                         } else {
 451                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 452                                         sock_hold(sk);
 453                         }
 454                         goto out;
 455                 }
 456
 457                 err = icmp_err_convert[code].errno;
 458                 /* check if icmp_skb allows revert of backoff
 459                  * (see draft-zimmermann-tcp-lcd) */
 460                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 461                         break;
 462                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 463                     !icsk->icsk_backoff || fastopen)
 464                         break;
 465
 466                 if (sock_owned_by_user(sk))
 467                         break;
 468
 469                 icsk->icsk_backoff--;
 470                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 471                                                TCP_TIMEOUT_INIT;
 472                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 473
 474                 skb = tcp_write_queue_head(sk);
 475                 BUG_ON(!skb);
 476
 477                 remaining = icsk->icsk_rto -
 478                             min(icsk->icsk_rto,
 479                                 tcp_time_stamp - tcp_skb_timestamp(skb));
 480
 481                 if (remaining) {
 482                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 483                                                   remaining, TCP_RTO_MAX);
 484                 } else {
 485                         /* RTO revert clocked out retransmission.
 486                          * Will retransmit now */
 487                         tcp_retransmit_timer(sk);
 488                 }
 489
 490                 break;
 491         case ICMP_TIME_EXCEEDED:
 492                 err = EHOSTUNREACH;
 493                 break;
 494         default:
 495                 goto out;
 496         }
 497
 498         switch (sk->sk_state) {
 499         case TCP_SYN_SENT:
 500         case TCP_SYN_RECV:
 501                 /* Only in fast or simultaneous open. If a fast open socket is
 502                  * is already accepted it is treated as a connected one below.
 503                  */
 504                 if (fastopen && !fastopen->sk)
 505                         break;
 506
 507                 if (!sock_owned_by_user(sk)) {
 508                         sk->sk_err = err;
 509
 510                         sk->sk_error_report(sk);
 511
 512                         tcp_done(sk);
 513                 } else {
 514                         sk->sk_err_soft = err;
 515                 }
 516                 goto out;
 517         }
 518
 519         /* If we've already connected we will keep trying
 520          * until we time out, or the user gives up.
 521          *
 522          * rfc1122 4.2.3.9 allows to consider as hard errors
 523          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 524          * but it is obsoleted by pmtu discovery).
 525          *
 526          * Note, that in modern internet, where routing is unreliable
 527          * and in each dark corner broken firewalls sit, sending random
 528          * errors ordered by their masters even this two messages finally lose
 529          * their original sense (even Linux sends invalid PORT_UNREACHs)
 530          *
 531          * Now we are in compliance with RFCs.
 532          *                                                      --ANK (980905)
 533          */
 534
 535         inet = inet_sk(sk);
 536         if (!sock_owned_by_user(sk) && inet->recverr) {
 537                 sk->sk_err = err;
 538                 sk->sk_error_report(sk);
 539         } else  { /* Only an error on timeout */
 540                 sk->sk_err_soft = err;
 541         }
 542
 543 out:
 544         bh_unlock_sock(sk);
 545         sock_put(sk);
 546 }
 547
 548 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 549 {
 550         struct tcphdr *th = tcp_hdr(skb);
 551
 552         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 553                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 554                 skb->csum_start = skb_transport_header(skb) - skb->head;
 555                 skb->csum_offset = offsetof(struct tcphdr, check);
 556         } else {
 557                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 558                                          csum_partial(th,
 559                                                       th->doff << 2,
 560                                                       skb->csum));
 561         }
 562 }
 563
 564 /* This routine computes an IPv4 TCP checksum. */
 565 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 566 {
 567         const struct inet_sock *inet = inet_sk(sk);
 568
 569         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 570 }
 571 EXPORT_SYMBOL(tcp_v4_send_check);
 572
 573 /*
 574  *      This routine will send an RST to the other tcp.
 575  *
 576  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 577  *                    for reset.
 578  *      Answer: if a packet caused RST, it is not for a socket
 579  *              existing in our system, if it is matched to a socket,
 580  *              it is just duplicate segment or bug in other side's TCP.
 581  *              So that we build reply only basing on parameters
 582  *              arrived with segment.
 583  *      Exception: precedence violation. We do not implement it in any case.
 584  */
 585
 586 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 587 {
 588         const struct tcphdr *th = tcp_hdr(skb);
 589         struct {
 590                 struct tcphdr th;
 591 #ifdef CONFIG_TCP_MD5SIG
 592                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 593 #endif
 594         } rep;
 595         struct ip_reply_arg arg;
 596 #ifdef CONFIG_TCP_MD5SIG
 597         struct tcp_md5sig_key *key = NULL;
 598         const __u8 *hash_location = NULL;
 599         unsigned char newhash[16];
 600         int genhash;
 601         struct sock *sk1 = NULL;
 602 #endif
 603         struct net *net;
 604
 605         /* Never send a reset in response to a reset. */
 606         if (th->rst)
 607                 return;
 608
 609         /* If sk not NULL, it means we did a successful lookup and incoming
 610          * route had to be correct. prequeue might have dropped our dst.
 611          */
 612         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 613                 return;
 614
 615         /* Swap the send and the receive. */
 616         memset(&rep, 0, sizeof(rep));
 617         rep.th.dest   = th->source;
 618         rep.th.source = th->dest;
 619         rep.th.doff   = sizeof(struct tcphdr) / 4;
 620         rep.th.rst    = 1;
 621
 622         if (th->ack) {
 623                 rep.th.seq = th->ack_seq;
 624         } else {
 625                 rep.th.ack = 1;
 626                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 627                                        skb->len - (th->doff << 2));
 628         }
 629
 630         memset(&arg, 0, sizeof(arg));
 631         arg.iov[0].iov_base = (unsigned char *)&rep;
 632         arg.iov[0].iov_len  = sizeof(rep.th);
 633
 634         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 635 #ifdef CONFIG_TCP_MD5SIG
 636         rcu_read_lock();
 637         hash_location = tcp_parse_md5sig_option(th);
 638         if (sk && sk_fullsock(sk)) {
 639                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 640                                         &ip_hdr(skb)->saddr, AF_INET);
 641         } else if (hash_location) {
 642                 /*
 643                  * active side is lost. Try to find listening socket through
 644                  * source port, and then find md5 key through listening socket.
 645                  * we are not loose security here:
 646                  * Incoming packet is checked with md5 hash with finding key,
 647                  * no RST generated if md5 hash doesn't match.
 648                  */
 649                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 650                                              ip_hdr(skb)->saddr,
 651                                              th->source, ip_hdr(skb)->daddr,
 652                                              ntohs(th->source), inet_iif(skb));
 653                 /* don't send rst if it can't find key */
 654                 if (!sk1)
 655                         goto out;
 656
 657                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 658                                         &ip_hdr(skb)->saddr, AF_INET);
 659                 if (!key)
 660                         goto out;
 661
 662
 663                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 664                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 665                         goto out;
 666
 667         }
 668
 669         if (key) {
 670                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 671                                    (TCPOPT_NOP << 16) |
 672                                    (TCPOPT_MD5SIG << 8) |
 673                                    TCPOLEN_MD5SIG);
 674                 /* Update length and the length the header thinks exists */
 675                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 676                 rep.th.doff = arg.iov[0].iov_len / 4;
 677
 678                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 679                                      key, ip_hdr(skb)->saddr,
 680                                      ip_hdr(skb)->daddr, &rep.th);
 681         }
 682 #endif
 683         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 684                                       ip_hdr(skb)->saddr, /* XXX */
 685                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 686         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 687         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 688
 689         /* When socket is gone, all binding information is lost.
 690          * routing might fail in this case. No choice here, if we choose to force
 691          * input interface, we will misroute in case of asymmetric route.
 692          */
 693         if (sk)
 694                 arg.bound_dev_if = sk->sk_bound_dev_if;
 695
 696         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 697                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 698
 699         arg.tos = ip_hdr(skb)->tos;
 700         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 701         local_bh_disable();
 702         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 703                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 704                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 705                               &arg, arg.iov[0].iov_len);
 706
 707         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 708         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 709         local_bh_enable();
 710
 711 #ifdef CONFIG_TCP_MD5SIG
 712 out:
 713         rcu_read_unlock();
 714 #endif
 715 }
 716
 717 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 718    outside socket context is ugly, certainly. What can I do?
 719  */
 720
 721 static void tcp_v4_send_ack(const struct sock *sk,
 722                             struct sk_buff *skb, u32 seq, u32 ack,
 723                             u32 win, u32 tsval, u32 tsecr, int oif,
 724                             struct tcp_md5sig_key *key,
 725                             int reply_flags, u8 tos)
 726 {
 727         const struct tcphdr *th = tcp_hdr(skb);
 728         struct {
 729                 struct tcphdr th;
 730                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 731 #ifdef CONFIG_TCP_MD5SIG
 732                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 733 #endif
 734                         ];
 735         } rep;
 736         struct net *net = sock_net(sk);
 737         struct ip_reply_arg arg;
 738
 739         memset(&rep.th, 0, sizeof(struct tcphdr));
 740         memset(&arg, 0, sizeof(arg));
 741
 742         arg.iov[0].iov_base = (unsigned char *)&rep;
 743         arg.iov[0].iov_len  = sizeof(rep.th);
 744         if (tsecr) {
 745                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 746                                    (TCPOPT_TIMESTAMP << 8) |
 747                                    TCPOLEN_TIMESTAMP);
 748                 rep.opt[1] = htonl(tsval);
 749                 rep.opt[2] = htonl(tsecr);
 750                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 751         }
 752
 753         /* Swap the send and the receive. */
 754         rep.th.dest    = th->source;
 755         rep.th.source  = th->dest;
 756         rep.th.doff    = arg.iov[0].iov_len / 4;
 757         rep.th.seq     = htonl(seq);
 758         rep.th.ack_seq = htonl(ack);
 759         rep.th.ack     = 1;
 760         rep.th.window  = htons(win);
 761
 762 #ifdef CONFIG_TCP_MD5SIG
 763         if (key) {
 764                 int offset = (tsecr) ? 3 : 0;
 765
 766                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 767                                           (TCPOPT_NOP << 16) |
 768                                           (TCPOPT_MD5SIG << 8) |
 769                                           TCPOLEN_MD5SIG);
 770                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 771                 rep.th.doff = arg.iov[0].iov_len/4;
 772
 773                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 774                                     key, ip_hdr(skb)->saddr,
 775                                     ip_hdr(skb)->daddr, &rep.th);
 776         }
 777 #endif
 778         arg.flags = reply_flags;
 779         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 780                                       ip_hdr(skb)->saddr, /* XXX */
 781                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 782         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 783         if (oif)
 784                 arg.bound_dev_if = oif;
 785         arg.tos = tos;
 786         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 787         local_bh_disable();
 788         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 789                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 790                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 791                               &arg, arg.iov[0].iov_len);
 792
 793         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 794         local_bh_enable();
 795 }
 796
 797 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 798 {
 799         struct inet_timewait_sock *tw = inet_twsk(sk);
 800         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 801
 802         tcp_v4_send_ack(sk, skb,
 803                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 804                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 805                         tcp_time_stamp + tcptw->tw_ts_offset,
 806                         tcptw->tw_ts_recent,
 807                         tw->tw_bound_dev_if,
 808                         tcp_twsk_md5_key(tcptw),
 809                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 810                         tw->tw_tos
 811                         );
 812
 813         inet_twsk_put(tw);
 814 }
 815
 816 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 817                                   struct request_sock *req)
 818 {
 819         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 820          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 821          */
 822         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 823                                              tcp_sk(sk)->snd_nxt;
 824
 825         /* RFC 7323 2.3
 826          * The window field (SEG.WND) of every outgoing segment, with the
 827          * exception of <SYN> segments, MUST be right-shifted by
 828          * Rcv.Wind.Shift bits:
 829          */
 830         tcp_v4_send_ack(sk, skb, seq,
 831                         tcp_rsk(req)->rcv_nxt,
 832                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 833                         tcp_time_stamp + tcp_rsk(req)->ts_off,
 834                         req->ts_recent,
 835                         0,
 836                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 837                                           AF_INET),
 838                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 839                         ip_hdr(skb)->tos);
 840 }
 841
 842 /*
 843  *      Send a SYN-ACK after having received a SYN.
 844  *      This still operates on a request_sock only, not on a big
 845  *      socket.
 846  */
 847 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 848                               struct flowi *fl,
 849                               struct request_sock *req,
 850                               struct tcp_fastopen_cookie *foc,
 851                               enum tcp_synack_type synack_type)
 852 {
 853         const struct inet_request_sock *ireq = inet_rsk(req);
 854         struct flowi4 fl4;
 855         int err = -1;
 856         struct sk_buff *skb;
 857
 858         /* First, grab a route. */
 859         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 860                 return -1;
 861
 862         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 863
 864         if (skb) {
 865                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 866
 867                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 868                                             ireq->ir_rmt_addr,
 869                                             ireq->opt);
 870                 err = net_xmit_eval(err);
 871         }
 872
 873         return err;
 874 }
 875
 876 /*
 877  *      IPv4 request_sock destructor.
 878  */
 879 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 880 {
 881         kfree(inet_rsk(req)->opt);
 882 }
 883
 884 #ifdef CONFIG_TCP_MD5SIG
 885 /*
 886  * RFC2385 MD5 checksumming requires a mapping of
 887  * IP address->MD5 Key.
 888  * We need to maintain these in the sk structure.
 889  */
 890
 891 /* Find the Key structure for an address.  */
 892 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 893                                          const union tcp_md5_addr *addr,
 894                                          int family)
 895 {
 896         const struct tcp_sock *tp = tcp_sk(sk);
 897         struct tcp_md5sig_key *key;
 898         unsigned int size = sizeof(struct in_addr);
 899         const struct tcp_md5sig_info *md5sig;
 900
 901         /* caller either holds rcu_read_lock() or socket lock */
 902         md5sig = rcu_dereference_check(tp->md5sig_info,
 903                                        lockdep_sock_is_held(sk));
 904         if (!md5sig)
 905                 return NULL;
 906 #if IS_ENABLED(CONFIG_IPV6)
 907         if (family == AF_INET6)
 908                 size = sizeof(struct in6_addr);
 909 #endif
 910         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 911                 if (key->family != family)
 912                         continue;
 913                 if (!memcmp(&key->addr, addr, size))
 914                         return key;
 915         }
 916         return NULL;
 917 }
 918 EXPORT_SYMBOL(tcp_md5_do_lookup);
 919
 920 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 921                                          const struct sock *addr_sk)
 922 {
 923         const union tcp_md5_addr *addr;
 924
 925         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
 926         return tcp_md5_do_lookup(sk, addr, AF_INET);
 927 }
 928 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 929
 930 /* This can be called on a newly created socket, from other files */
 931 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 932                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 933 {
 934         /* Add Key to the list */
 935         struct tcp_md5sig_key *key;
 936         struct tcp_sock *tp = tcp_sk(sk);
 937         struct tcp_md5sig_info *md5sig;
 938
 939         key = tcp_md5_do_lookup(sk, addr, family);
 940         if (key) {
 941                 /* Pre-existing entry - just update that one. */
 942                 memcpy(key->key, newkey, newkeylen);
 943                 key->keylen = newkeylen;
 944                 return 0;
 945         }
 946
 947         md5sig = rcu_dereference_protected(tp->md5sig_info,
 948                                            lockdep_sock_is_held(sk));
 949         if (!md5sig) {
 950                 md5sig = kmalloc(sizeof(*md5sig), gfp);
 951                 if (!md5sig)
 952                         return -ENOMEM;
 953
 954                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 955                 INIT_HLIST_HEAD(&md5sig->head);
 956                 rcu_assign_pointer(tp->md5sig_info, md5sig);
 957         }
 958
 959         key = sock_kmalloc(sk, sizeof(*key), gfp);
 960         if (!key)
 961                 return -ENOMEM;
 962         if (!tcp_alloc_md5sig_pool()) {
 963                 sock_kfree_s(sk, key, sizeof(*key));
 964                 return -ENOMEM;
 965         }
 966
 967         memcpy(key->key, newkey, newkeylen);
 968         key->keylen = newkeylen;
 969         key->family = family;
 970         memcpy(&key->addr, addr,
 971                (family == AF_INET6) ? sizeof(struct in6_addr) :
 972                                       sizeof(struct in_addr));
 973         hlist_add_head_rcu(&key->node, &md5sig->head);
 974         return 0;
 975 }
 976 EXPORT_SYMBOL(tcp_md5_do_add);
 977
 978 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
 979 {
 980         struct tcp_md5sig_key *key;
 981
 982         key = tcp_md5_do_lookup(sk, addr, family);
 983         if (!key)
 984                 return -ENOENT;
 985         hlist_del_rcu(&key->node);
 986         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 987         kfree_rcu(key, rcu);
 988         return 0;
 989 }
 990 EXPORT_SYMBOL(tcp_md5_do_del);
 991
 992 static void tcp_clear_md5_list(struct sock *sk)
 993 {
 994         struct tcp_sock *tp = tcp_sk(sk);
 995         struct tcp_md5sig_key *key;
 996         struct hlist_node *n;
 997         struct tcp_md5sig_info *md5sig;
 998
 999         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1000
1001         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1002                 hlist_del_rcu(&key->node);
1003                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1004                 kfree_rcu(key, rcu);
1005         }
1006 }
1007
1008 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1009                                  int optlen)
1010 {
1011         struct tcp_md5sig cmd;
1012         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1013
1014         if (optlen < sizeof(cmd))
1015                 return -EINVAL;
1016
1017         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1018                 return -EFAULT;
1019
1020         if (sin->sin_family != AF_INET)
1021                 return -EINVAL;
1022
1023         if (!cmd.tcpm_keylen)
1024                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1025                                       AF_INET);
1026
1027         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1028                 return -EINVAL;
1029
1030         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1031                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1032                               GFP_KERNEL);
1033 }
1034
1035 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1036                                    __be32 daddr, __be32 saddr,
1037                                    const struct tcphdr *th, int nbytes)
1038 {
1039         struct tcp4_pseudohdr *bp;
1040         struct scatterlist sg;
1041         struct tcphdr *_th;
1042
1043         bp = hp->scratch;
1044         bp->saddr = saddr;
1045         bp->daddr = daddr;
1046         bp->pad = 0;
1047         bp->protocol = IPPROTO_TCP;
1048         bp->len = cpu_to_be16(nbytes);
1049
1050         _th = (struct tcphdr *)(bp + 1);
1051         memcpy(_th, th, sizeof(*th));
1052         _th->check = 0;
1053
1054         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1055         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1056                                 sizeof(*bp) + sizeof(*th));
1057         return crypto_ahash_update(hp->md5_req);
1058 }
1059
1060 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1061                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1062 {
1063         struct tcp_md5sig_pool *hp;
1064         struct ahash_request *req;
1065
1066         hp = tcp_get_md5sig_pool();
1067         if (!hp)
1068                 goto clear_hash_noput;
1069         req = hp->md5_req;
1070
1071         if (crypto_ahash_init(req))
1072                 goto clear_hash;
1073         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1074                 goto clear_hash;
1075         if (tcp_md5_hash_key(hp, key))
1076                 goto clear_hash;
1077         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1078         if (crypto_ahash_final(req))
1079                 goto clear_hash;
1080
1081         tcp_put_md5sig_pool();
1082         return 0;
1083
1084 clear_hash:
1085         tcp_put_md5sig_pool();
1086 clear_hash_noput:
1087         memset(md5_hash, 0, 16);
1088         return 1;
1089 }
1090
1091 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1092                         const struct sock *sk,
1093                         const struct sk_buff *skb)
1094 {
1095         struct tcp_md5sig_pool *hp;
1096         struct ahash_request *req;
1097         const struct tcphdr *th = tcp_hdr(skb);
1098         __be32 saddr, daddr;
1099
1100         if (sk) { /* valid for establish/request sockets */
1101                 saddr = sk->sk_rcv_saddr;
1102                 daddr = sk->sk_daddr;
1103         } else {
1104                 const struct iphdr *iph = ip_hdr(skb);
1105                 saddr = iph->saddr;
1106                 daddr = iph->daddr;
1107         }
1108
1109         hp = tcp_get_md5sig_pool();
1110         if (!hp)
1111                 goto clear_hash_noput;
1112         req = hp->md5_req;
1113
1114         if (crypto_ahash_init(req))
1115                 goto clear_hash;
1116
1117         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1118                 goto clear_hash;
1119         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1120                 goto clear_hash;
1121         if (tcp_md5_hash_key(hp, key))
1122                 goto clear_hash;
1123         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1124         if (crypto_ahash_final(req))
1125                 goto clear_hash;
1126
1127         tcp_put_md5sig_pool();
1128         return 0;
1129
1130 clear_hash:
1131         tcp_put_md5sig_pool();
1132 clear_hash_noput:
1133         memset(md5_hash, 0, 16);
1134         return 1;
1135 }
1136 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1137
1138 #endif
1139
1140 /* Called with rcu_read_lock() */
1141 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1142                                     const struct sk_buff *skb)
1143 {
1144 #ifdef CONFIG_TCP_MD5SIG
1145         /*
1146          * This gets called for each TCP segment that arrives
1147          * so we want to be efficient.
1148          * We have 3 drop cases:
1149          * o No MD5 hash and one expected.
1150          * o MD5 hash and we're not expecting one.
1151          * o MD5 hash and its wrong.
1152          */
1153         const __u8 *hash_location = NULL;
1154         struct tcp_md5sig_key *hash_expected;
1155         const struct iphdr *iph = ip_hdr(skb);
1156         const struct tcphdr *th = tcp_hdr(skb);
1157         int genhash;
1158         unsigned char newhash[16];
1159
1160         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1161                                           AF_INET);
1162         hash_location = tcp_parse_md5sig_option(th);
1163
1164         /* We've parsed the options - do we have a hash? */
1165         if (!hash_expected && !hash_location)
1166                 return false;
1167
1168         if (hash_expected && !hash_location) {
1169                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1170                 return true;
1171         }
1172
1173         if (!hash_expected && hash_location) {
1174                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1175                 return true;
1176         }
1177
1178         /* Okay, so this is hash_expected and hash_location -
1179          * so we need to calculate the checksum.
1180          */
1181         genhash = tcp_v4_md5_hash_skb(newhash,
1182                                       hash_expected,
1183                                       NULL, skb);
1184
1185         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1186                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1187                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1188                                      &iph->saddr, ntohs(th->source),
1189                                      &iph->daddr, ntohs(th->dest),
1190                                      genhash ? " tcp_v4_calc_md5_hash failed"
1191                                      : "");
1192                 return true;
1193         }
1194         return false;
1195 #endif
1196         return false;
1197 }
1198
1199 static void tcp_v4_init_req(struct request_sock *req,
1200                             const struct sock *sk_listener,
1201                             struct sk_buff *skb)
1202 {
1203         struct inet_request_sock *ireq = inet_rsk(req);
1204
1205         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1206         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1207         ireq->opt = tcp_v4_save_options(skb);
1208 }
1209
1210 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1211                                           struct flowi *fl,
1212                                           const struct request_sock *req,
1213                                           bool *strict)
1214 {
1215         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1216
1217         if (strict) {
1218                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1219                         *strict = true;
1220                 else
1221                         *strict = false;
1222         }
1223
1224         return dst;
1225 }
1226
1227 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1228         .family         =       PF_INET,
1229         .obj_size       =       sizeof(struct tcp_request_sock),
1230         .rtx_syn_ack    =       tcp_rtx_synack,
1231         .send_ack       =       tcp_v4_reqsk_send_ack,
1232         .destructor     =       tcp_v4_reqsk_destructor,
1233         .send_reset     =       tcp_v4_send_reset,
1234         .syn_ack_timeout =      tcp_syn_ack_timeout,
1235 };
1236
1237 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1238         .mss_clamp      =       TCP_MSS_DEFAULT,
1239 #ifdef CONFIG_TCP_MD5SIG
1240         .req_md5_lookup =       tcp_v4_md5_lookup,
1241         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1242 #endif
1243         .init_req       =       tcp_v4_init_req,
1244 #ifdef CONFIG_SYN_COOKIES
1245         .cookie_init_seq =      cookie_v4_init_sequence,
1246 #endif
1247         .route_req      =       tcp_v4_route_req,
1248         .init_seq       =       tcp_v4_init_sequence,
1249         .send_synack    =       tcp_v4_send_synack,
1250 };
1251
1252 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1253 {
1254         /* Never answer to SYNs send to broadcast or multicast */
1255         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1256                 goto drop;
1257
1258         return tcp_conn_request(&tcp_request_sock_ops,
1259                                 &tcp_request_sock_ipv4_ops, sk, skb);
1260
1261 drop:
1262         tcp_listendrop(sk);
1263         return 0;
1264 }
1265 EXPORT_SYMBOL(tcp_v4_conn_request);
1266
1267
1268 /*
1269  * The three way handshake has completed - we got a valid synack -
1270  * now create the new socket.
1271  */
1272 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1273                                   struct request_sock *req,
1274                                   struct dst_entry *dst,
1275                                   struct request_sock *req_unhash,
1276                                   bool *own_req)
1277 {
1278         struct inet_request_sock *ireq;
1279         struct inet_sock *newinet;
1280         struct tcp_sock *newtp;
1281         struct sock *newsk;
1282 #ifdef CONFIG_TCP_MD5SIG
1283         struct tcp_md5sig_key *key;
1284 #endif
1285         struct ip_options_rcu *inet_opt;
1286
1287         if (sk_acceptq_is_full(sk))
1288                 goto exit_overflow;
1289
1290         newsk = tcp_create_openreq_child(sk, req, skb);
1291         if (!newsk)
1292                 goto exit_nonewsk;
1293
1294         newsk->sk_gso_type = SKB_GSO_TCPV4;
1295         inet_sk_rx_dst_set(newsk, skb);
1296
1297         newtp                 = tcp_sk(newsk);
1298         newinet               = inet_sk(newsk);
1299         ireq                  = inet_rsk(req);
1300         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1301         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1302         newsk->sk_bound_dev_if = ireq->ir_iif;
1303         newinet->inet_saddr           = ireq->ir_loc_addr;
1304         inet_opt              = ireq->opt;
1305         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1306         ireq->opt             = NULL;
1307         newinet->mc_index     = inet_iif(skb);
1308         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1309         newinet->rcv_tos      = ip_hdr(skb)->tos;
1310         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1311         if (inet_opt)
1312                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1313         newinet->inet_id = newtp->write_seq ^ jiffies;
1314
1315         if (!dst) {
1316                 dst = inet_csk_route_child_sock(sk, newsk, req);
1317                 if (!dst)
1318                         goto put_and_exit;
1319         } else {
1320                 /* syncookie case : see end of cookie_v4_check() */
1321         }
1322         sk_setup_caps(newsk, dst);
1323
1324         tcp_ca_openreq_child(newsk, dst);
1325
1326         tcp_sync_mss(newsk, dst_mtu(dst));
1327         newtp->advmss = dst_metric_advmss(dst);
1328         if (tcp_sk(sk)->rx_opt.user_mss &&
1329             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1330                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1331
1332         tcp_initialize_rcv_mss(newsk);
1333
1334 #ifdef CONFIG_TCP_MD5SIG
1335         /* Copy over the MD5 key from the original socket */
1336         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1337                                 AF_INET);
1338         if (key) {
1339                 /*
1340                  * We're using one, so create a matching key
1341                  * on the newsk structure. If we fail to get
1342                  * memory, then we end up not copying the key
1343                  * across. Shucks.
1344                  */
1345                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1346                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1347                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1348         }
1349 #endif
1350
1351         if (__inet_inherit_port(sk, newsk) < 0)
1352                 goto put_and_exit;
1353         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1354         if (*own_req)
1355                 tcp_move_syn(newtp, req);
1356
1357         return newsk;
1358
1359 exit_overflow:
1360         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1361 exit_nonewsk:
1362         dst_release(dst);
1363 exit:
1364         tcp_listendrop(sk);
1365         return NULL;
1366 put_and_exit:
1367         inet_csk_prepare_forced_close(newsk);
1368         tcp_done(newsk);
1369         goto exit;
1370 }
1371 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1372
1373 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1374 {
1375 #ifdef CONFIG_SYN_COOKIES
1376         const struct tcphdr *th = tcp_hdr(skb);
1377
1378         if (!th->syn)
1379                 sk = cookie_v4_check(sk, skb);
1380 #endif
1381         return sk;
1382 }
1383
1384 /* The socket must have it's spinlock held when we get
1385  * here, unless it is a TCP_LISTEN socket.
1386  *
1387  * We have a potential double-lock case here, so even when
1388  * doing backlog processing we use the BH locking scheme.
1389  * This is because we cannot sleep with the original spinlock
1390  * held.
1391  */
1392 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1393 {
1394         struct sock *rsk;
1395
1396         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1397                 struct dst_entry *dst = sk->sk_rx_dst;
1398
1399                 sock_rps_save_rxhash(sk, skb);
1400                 sk_mark_napi_id(sk, skb);
1401                 if (dst) {
1402                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1403                             !dst->ops->check(dst, 0)) {
1404                                 dst_release(dst);
1405                                 sk->sk_rx_dst = NULL;
1406                         }
1407                 }
1408                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1409                 return 0;
1410         }
1411
1412         if (tcp_checksum_complete(skb))
1413                 goto csum_err;
1414
1415         if (sk->sk_state == TCP_LISTEN) {
1416                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1417
1418                 if (!nsk)
1419                         goto discard;
1420                 if (nsk != sk) {
1421                         sock_rps_save_rxhash(nsk, skb);
1422                         sk_mark_napi_id(nsk, skb);
1423                         if (tcp_child_process(sk, nsk, skb)) {
1424                                 rsk = nsk;
1425                                 goto reset;
1426                         }
1427                         return 0;
1428                 }
1429         } else
1430                 sock_rps_save_rxhash(sk, skb);
1431
1432         if (tcp_rcv_state_process(sk, skb)) {
1433                 rsk = sk;
1434                 goto reset;
1435         }
1436         return 0;
1437
1438 reset:
1439         tcp_v4_send_reset(rsk, skb);
1440 discard:
1441         kfree_skb(skb);
1442         /* Be careful here. If this function gets more complicated and
1443          * gcc suffers from register pressure on the x86, sk (in %ebx)
1444          * might be destroyed here. This current version compiles correctly,
1445          * but you have been warned.
1446          */
1447         return 0;
1448
1449 csum_err:
1450         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1451         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1452         goto discard;
1453 }
1454 EXPORT_SYMBOL(tcp_v4_do_rcv);
1455
1456 void tcp_v4_early_demux(struct sk_buff *skb)
1457 {
1458         const struct iphdr *iph;
1459         const struct tcphdr *th;
1460         struct sock *sk;
1461
1462         if (skb->pkt_type != PACKET_HOST)
1463                 return;
1464
1465         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1466                 return;
1467
1468         iph = ip_hdr(skb);
1469         th = tcp_hdr(skb);
1470
1471         if (th->doff < sizeof(struct tcphdr) / 4)
1472                 return;
1473
1474         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1475                                        iph->saddr, th->source,
1476                                        iph->daddr, ntohs(th->dest),
1477                                        skb->skb_iif);
1478         if (sk) {
1479                 skb->sk = sk;
1480                 skb->destructor = sock_edemux;
1481                 if (sk_fullsock(sk)) {
1482                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1483
1484                         if (dst)
1485                                 dst = dst_check(dst, 0);
1486                         if (dst &&
1487                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1488                                 skb_dst_set_noref(skb, dst);
1489                 }
1490         }
1491 }
1492
1493 /* Packet is added to VJ-style prequeue for processing in process
1494  * context, if a reader task is waiting. Apparently, this exciting
1495  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1496  * failed somewhere. Latency? Burstiness? Well, at least now we will
1497  * see, why it failed. 8)8)                               --ANK
1498  *
1499  */
1500 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1501 {
1502         struct tcp_sock *tp = tcp_sk(sk);
1503
1504         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1505                 return false;
1506
1507         if (skb->len <= tcp_hdrlen(skb) &&
1508             skb_queue_len(&tp->ucopy.prequeue) == 0)
1509                 return false;
1510
1511         /* Before escaping RCU protected region, we need to take care of skb
1512          * dst. Prequeue is only enabled for established sockets.
1513          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1514          * Instead of doing full sk_rx_dst validity here, let's perform
1515          * an optimistic check.
1516          */
1517         if (likely(sk->sk_rx_dst))
1518                 skb_dst_drop(skb);
1519         else
1520                 skb_dst_force_safe(skb);
1521
1522         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1523         tp->ucopy.memory += skb->truesize;
1524         if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1525             tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1526                 struct sk_buff *skb1;
1527
1528                 BUG_ON(sock_owned_by_user(sk));
1529                 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1530                                 skb_queue_len(&tp->ucopy.prequeue));
1531
1532                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1533                         sk_backlog_rcv(sk, skb1);
1534
1535                 tp->ucopy.memory = 0;
1536         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1537                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1538                                            POLLIN | POLLRDNORM | POLLRDBAND);
1539                 if (!inet_csk_ack_scheduled(sk))
1540                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1541                                                   (3 * tcp_rto_min(sk)) / 4,
1542                                                   TCP_RTO_MAX);
1543         }
1544         return true;
1545 }
1546 EXPORT_SYMBOL(tcp_prequeue);
1547
1548 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1549 {
1550         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1551
1552         /* Only socket owner can try to collapse/prune rx queues
1553          * to reduce memory overhead, so add a little headroom here.
1554          * Few sockets backlog are possibly concurrently non empty.
1555          */
1556         limit += 64*1024;
1557
1558         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1559          * we can fix skb->truesize to its real value to avoid future drops.
1560          * This is valid because skb is not yet charged to the socket.
1561          * It has been noticed pure SACK packets were sometimes dropped
1562          * (if cooked by drivers without copybreak feature).
1563          */
1564         skb_condense(skb);
1565
1566         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1567                 bh_unlock_sock(sk);
1568                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1569                 return true;
1570         }
1571         return false;
1572 }
1573 EXPORT_SYMBOL(tcp_add_backlog);
1574
1575 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1576 {
1577         struct tcphdr *th = (struct tcphdr *)skb->data;
1578         unsigned int eaten = skb->len;
1579         int err;
1580
1581         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1582         if (!err) {
1583                 eaten -= skb->len;
1584                 TCP_SKB_CB(skb)->end_seq -= eaten;
1585         }
1586         return err;
1587 }
1588 EXPORT_SYMBOL(tcp_filter);
1589
1590 /*
1591  *      From tcp_input.c
1592  */
1593
1594 int tcp_v4_rcv(struct sk_buff *skb)
1595 {
1596         struct net *net = dev_net(skb->dev);
1597         const struct iphdr *iph;
1598         const struct tcphdr *th;
1599         bool refcounted;
1600         struct sock *sk;
1601         int ret;
1602
1603         if (skb->pkt_type != PACKET_HOST)
1604                 goto discard_it;
1605
1606         /* Count it even if it's bad */
1607         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1608
1609         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1610                 goto discard_it;
1611
1612         th = (const struct tcphdr *)skb->data;
1613
1614         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1615                 goto bad_packet;
1616         if (!pskb_may_pull(skb, th->doff * 4))
1617                 goto discard_it;
1618
1619         /* An explanation is required here, I think.
1620          * Packet length and doff are validated by header prediction,
1621          * provided case of th->doff==0 is eliminated.
1622          * So, we defer the checks. */
1623
1624         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1625                 goto csum_error;
1626
1627         th = (const struct tcphdr *)skb->data;
1628         iph = ip_hdr(skb);
1629         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1630          * barrier() makes sure compiler wont play fool^Waliasing games.
1631          */
1632         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1633                 sizeof(struct inet_skb_parm));
1634         barrier();
1635
1636         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1637         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1638                                     skb->len - th->doff * 4);
1639         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1640         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1641         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1642         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1643         TCP_SKB_CB(skb)->sacked  = 0;
1644
1645 lookup:
1646         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1647                                th->dest, &refcounted);
1648         if (!sk)
1649                 goto no_tcp_socket;
1650
1651 process:
1652         if (sk->sk_state == TCP_TIME_WAIT)
1653                 goto do_time_wait;
1654
1655         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1656                 struct request_sock *req = inet_reqsk(sk);
1657                 struct sock *nsk;
1658
1659                 sk = req->rsk_listener;
1660                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1661                         sk_drops_add(sk, skb);
1662                         reqsk_put(req);
1663                         goto discard_it;
1664                 }
1665                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1666                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1667                         goto lookup;
1668                 }
1669                 /* We own a reference on the listener, increase it again
1670                  * as we might lose it too soon.
1671                  */
1672                 sock_hold(sk);
1673                 refcounted = true;
1674                 nsk = tcp_check_req(sk, skb, req, false);
1675                 if (!nsk) {
1676                         reqsk_put(req);
1677                         goto discard_and_relse;
1678                 }
1679                 if (nsk == sk) {
1680                         reqsk_put(req);
1681                 } else if (tcp_child_process(sk, nsk, skb)) {
1682                         tcp_v4_send_reset(nsk, skb);
1683                         goto discard_and_relse;
1684                 } else {
1685                         sock_put(sk);
1686                         return 0;
1687                 }
1688         }
1689         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1690                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1691                 goto discard_and_relse;
1692         }
1693
1694         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1695                 goto discard_and_relse;
1696
1697         if (tcp_v4_inbound_md5_hash(sk, skb))
1698                 goto discard_and_relse;
1699
1700         nf_reset(skb);
1701
1702         if (tcp_filter(sk, skb))
1703                 goto discard_and_relse;
1704         th = (const struct tcphdr *)skb->data;
1705         iph = ip_hdr(skb);
1706
1707         skb->dev = NULL;
1708
1709         if (sk->sk_state == TCP_LISTEN) {
1710                 ret = tcp_v4_do_rcv(sk, skb);
1711                 goto put_and_return;
1712         }
1713
1714         sk_incoming_cpu_update(sk);
1715
1716         bh_lock_sock_nested(sk);
1717         tcp_segs_in(tcp_sk(sk), skb);
1718         ret = 0;
1719         if (!sock_owned_by_user(sk)) {
1720                 if (!tcp_prequeue(sk, skb))
1721                         ret = tcp_v4_do_rcv(sk, skb);
1722         } else if (tcp_add_backlog(sk, skb)) {
1723                 goto discard_and_relse;
1724         }
1725         bh_unlock_sock(sk);
1726
1727 put_and_return:
1728         if (refcounted)
1729                 sock_put(sk);
1730
1731         return ret;
1732
1733 no_tcp_socket:
1734         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1735                 goto discard_it;
1736
1737         if (tcp_checksum_complete(skb)) {
1738 csum_error:
1739                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1740 bad_packet:
1741                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1742         } else {
1743                 tcp_v4_send_reset(NULL, skb);
1744         }
1745
1746 discard_it:
1747         /* Discard frame. */
1748         kfree_skb(skb);
1749         return 0;
1750
1751 discard_and_relse:
1752         sk_drops_add(sk, skb);
1753         if (refcounted)
1754                 sock_put(sk);
1755         goto discard_it;
1756
1757 do_time_wait:
1758         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1759                 inet_twsk_put(inet_twsk(sk));
1760                 goto discard_it;
1761         }
1762
1763         if (tcp_checksum_complete(skb)) {
1764                 inet_twsk_put(inet_twsk(sk));
1765                 goto csum_error;
1766         }
1767         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1768         case TCP_TW_SYN: {
1769                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1770                                                         &tcp_hashinfo, skb,
1771                                                         __tcp_hdrlen(th),
1772                                                         iph->saddr, th->source,
1773                                                         iph->daddr, th->dest,
1774                                                         inet_iif(skb));
1775                 if (sk2) {
1776                         inet_twsk_deschedule_put(inet_twsk(sk));
1777                         sk = sk2;
1778                         refcounted = false;
1779                         goto process;
1780                 }
1781                 /* Fall through to ACK */
1782         }
1783         case TCP_TW_ACK:
1784                 tcp_v4_timewait_ack(sk, skb);
1785                 break;
1786         case TCP_TW_RST:
1787                 tcp_v4_send_reset(sk, skb);
1788                 inet_twsk_deschedule_put(inet_twsk(sk));
1789                 goto discard_it;
1790         case TCP_TW_SUCCESS:;
1791         }
1792         goto discard_it;
1793 }
1794
1795 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1796         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1797         .twsk_unique    = tcp_twsk_unique,
1798         .twsk_destructor= tcp_twsk_destructor,
1799 };
1800
1801 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1802 {
1803         struct dst_entry *dst = skb_dst(skb);
1804
1805         if (dst && dst_hold_safe(dst)) {
1806                 sk->sk_rx_dst = dst;
1807                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1808         }
1809 }
1810 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1811
1812 const struct inet_connection_sock_af_ops ipv4_specific = {
1813         .queue_xmit        = ip_queue_xmit,
1814         .send_check        = tcp_v4_send_check,
1815         .rebuild_header    = inet_sk_rebuild_header,
1816         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1817         .conn_request      = tcp_v4_conn_request,
1818         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1819         .net_header_len    = sizeof(struct iphdr),
1820         .setsockopt        = ip_setsockopt,
1821         .getsockopt        = ip_getsockopt,
1822         .addr2sockaddr     = inet_csk_addr2sockaddr,
1823         .sockaddr_len      = sizeof(struct sockaddr_in),
1824 #ifdef CONFIG_COMPAT
1825         .compat_setsockopt = compat_ip_setsockopt,
1826         .compat_getsockopt = compat_ip_getsockopt,
1827 #endif
1828         .mtu_reduced       = tcp_v4_mtu_reduced,
1829 };
1830 EXPORT_SYMBOL(ipv4_specific);
1831
1832 #ifdef CONFIG_TCP_MD5SIG
1833 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1834         .md5_lookup             = tcp_v4_md5_lookup,
1835         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1836         .md5_parse              = tcp_v4_parse_md5_keys,
1837 };
1838 #endif
1839
1840 /* NOTE: A lot of things set to zero explicitly by call to
1841  *       sk_alloc() so need not be done here.
1842  */
1843 static int tcp_v4_init_sock(struct sock *sk)
1844 {
1845         struct inet_connection_sock *icsk = inet_csk(sk);
1846
1847         tcp_init_sock(sk);
1848
1849         icsk->icsk_af_ops = &ipv4_specific;
1850
1851 #ifdef CONFIG_TCP_MD5SIG
1852         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1853 #endif
1854
1855         return 0;
1856 }
1857
1858 void tcp_v4_destroy_sock(struct sock *sk)
1859 {
1860         struct tcp_sock *tp = tcp_sk(sk);
1861
1862         tcp_clear_xmit_timers(sk);
1863
1864         tcp_cleanup_congestion_control(sk);
1865
1866         /* Cleanup up the write buffer. */
1867         tcp_write_queue_purge(sk);
1868
1869         /* Cleans up our, hopefully empty, out_of_order_queue. */
1870         skb_rbtree_purge(&tp->out_of_order_queue);
1871
1872 #ifdef CONFIG_TCP_MD5SIG
1873         /* Clean up the MD5 key list, if any */
1874         if (tp->md5sig_info) {
1875                 tcp_clear_md5_list(sk);
1876                 kfree_rcu(tp->md5sig_info, rcu);
1877                 tp->md5sig_info = NULL;
1878         }
1879 #endif
1880
1881         /* Clean prequeue, it must be empty really */
1882         __skb_queue_purge(&tp->ucopy.prequeue);
1883
1884         /* Clean up a referenced TCP bind bucket. */
1885         if (inet_csk(sk)->icsk_bind_hash)
1886                 inet_put_port(sk);
1887
1888         BUG_ON(tp->fastopen_rsk);
1889
1890         /* If socket is aborted during connect operation */
1891         tcp_free_fastopen_req(tp);
1892         tcp_saved_syn_free(tp);
1893
1894         sk_sockets_allocated_dec(sk);
1895 }
1896 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1897
1898 #ifdef CONFIG_PROC_FS
1899 /* Proc filesystem TCP sock list dumping. */
1900
1901 /*
1902  * Get next listener socket follow cur.  If cur is NULL, get first socket
1903  * starting from bucket given in st->bucket; when st->bucket is zero the
1904  * very first socket in the hash table is returned.
1905  */
1906 static void *listening_get_next(struct seq_file *seq, void *cur)
1907 {
1908         struct tcp_iter_state *st = seq->private;
1909         struct net *net = seq_file_net(seq);
1910         struct inet_listen_hashbucket *ilb;
1911         struct sock *sk = cur;
1912
1913         if (!sk) {
1914 get_head:
1915                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1916                 spin_lock(&ilb->lock);
1917                 sk = sk_head(&ilb->head);
1918                 st->offset = 0;
1919                 goto get_sk;
1920         }
1921         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1922         ++st->num;
1923         ++st->offset;
1924
1925         sk = sk_next(sk);
1926 get_sk:
1927         sk_for_each_from(sk) {
1928                 if (!net_eq(sock_net(sk), net))
1929                         continue;
1930                 if (sk->sk_family == st->family)
1931                         return sk;
1932         }
1933         spin_unlock(&ilb->lock);
1934         st->offset = 0;
1935         if (++st->bucket < INET_LHTABLE_SIZE)
1936                 goto get_head;
1937         return NULL;
1938 }
1939
1940 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1941 {
1942         struct tcp_iter_state *st = seq->private;
1943         void *rc;
1944
1945         st->bucket = 0;
1946         st->offset = 0;
1947         rc = listening_get_next(seq, NULL);
1948
1949         while (rc && *pos) {
1950                 rc = listening_get_next(seq, rc);
1951                 --*pos;
1952         }
1953         return rc;
1954 }
1955
1956 static inline bool empty_bucket(const struct tcp_iter_state *st)
1957 {
1958         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1959 }
1960
1961 /*
1962  * Get first established socket starting from bucket given in st->bucket.
1963  * If st->bucket is zero, the very first socket in the hash is returned.
1964  */
1965 static void *established_get_first(struct seq_file *seq)
1966 {
1967         struct tcp_iter_state *st = seq->private;
1968         struct net *net = seq_file_net(seq);
1969         void *rc = NULL;
1970
1971         st->offset = 0;
1972         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1973                 struct sock *sk;
1974                 struct hlist_nulls_node *node;
1975                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1976
1977                 /* Lockless fast path for the common case of empty buckets */
1978                 if (empty_bucket(st))
1979                         continue;
1980
1981                 spin_lock_bh(lock);
1982                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1983                         if (sk->sk_family != st->family ||
1984                             !net_eq(sock_net(sk), net)) {
1985                                 continue;
1986                         }
1987                         rc = sk;
1988                         goto out;
1989                 }
1990                 spin_unlock_bh(lock);
1991         }
1992 out:
1993         return rc;
1994 }
1995
1996 static void *established_get_next(struct seq_file *seq, void *cur)
1997 {
1998         struct sock *sk = cur;
1999         struct hlist_nulls_node *node;
2000         struct tcp_iter_state *st = seq->private;
2001         struct net *net = seq_file_net(seq);
2002
2003         ++st->num;
2004         ++st->offset;
2005
2006         sk = sk_nulls_next(sk);
2007
2008         sk_nulls_for_each_from(sk, node) {
2009                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2010                         return sk;
2011         }
2012
2013         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2014         ++st->bucket;
2015         return established_get_first(seq);
2016 }
2017
2018 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2019 {
2020         struct tcp_iter_state *st = seq->private;
2021         void *rc;
2022
2023         st->bucket = 0;
2024         rc = established_get_first(seq);
2025
2026         while (rc && pos) {
2027                 rc = established_get_next(seq, rc);
2028                 --pos;
2029         }
2030         return rc;
2031 }
2032
2033 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2034 {
2035         void *rc;
2036         struct tcp_iter_state *st = seq->private;
2037
2038         st->state = TCP_SEQ_STATE_LISTENING;
2039         rc        = listening_get_idx(seq, &pos);
2040
2041         if (!rc) {
2042                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2043                 rc        = established_get_idx(seq, pos);
2044         }
2045
2046         return rc;
2047 }
2048
2049 static void *tcp_seek_last_pos(struct seq_file *seq)
2050 {
2051         struct tcp_iter_state *st = seq->private;
2052         int offset = st->offset;
2053         int orig_num = st->num;
2054         void *rc = NULL;
2055
2056         switch (st->state) {
2057         case TCP_SEQ_STATE_LISTENING:
2058                 if (st->bucket >= INET_LHTABLE_SIZE)
2059                         break;
2060                 st->state = TCP_SEQ_STATE_LISTENING;
2061                 rc = listening_get_next(seq, NULL);
2062                 while (offset-- && rc)
2063                         rc = listening_get_next(seq, rc);
2064                 if (rc)
2065                         break;
2066                 st->bucket = 0;
2067                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2068                 /* Fallthrough */
2069         case TCP_SEQ_STATE_ESTABLISHED:
2070                 if (st->bucket > tcp_hashinfo.ehash_mask)
2071                         break;
2072                 rc = established_get_first(seq);
2073                 while (offset-- && rc)
2074                         rc = established_get_next(seq, rc);
2075         }
2076
2077         st->num = orig_num;
2078
2079         return rc;
2080 }
2081
2082 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2083 {
2084         struct tcp_iter_state *st = seq->private;
2085         void *rc;
2086
2087         if (*pos && *pos == st->last_pos) {
2088                 rc = tcp_seek_last_pos(seq);
2089                 if (rc)
2090                         goto out;
2091         }
2092
2093         st->state = TCP_SEQ_STATE_LISTENING;
2094         st->num = 0;
2095         st->bucket = 0;
2096         st->offset = 0;
2097         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2098
2099 out:
2100         st->last_pos = *pos;
2101         return rc;
2102 }
2103
2104 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2105 {
2106         struct tcp_iter_state *st = seq->private;
2107         void *rc = NULL;
2108
2109         if (v == SEQ_START_TOKEN) {
2110                 rc = tcp_get_idx(seq, 0);
2111                 goto out;
2112         }
2113
2114         switch (st->state) {
2115         case TCP_SEQ_STATE_LISTENING:
2116                 rc = listening_get_next(seq, v);
2117                 if (!rc) {
2118                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2119                         st->bucket = 0;
2120                         st->offset = 0;
2121                         rc        = established_get_first(seq);
2122                 }
2123                 break;
2124         case TCP_SEQ_STATE_ESTABLISHED:
2125                 rc = established_get_next(seq, v);
2126                 break;
2127         }
2128 out:
2129         ++*pos;
2130         st->last_pos = *pos;
2131         return rc;
2132 }
2133
2134 static void tcp_seq_stop(struct seq_file *seq, void *v)
2135 {
2136         struct tcp_iter_state *st = seq->private;
2137
2138         switch (st->state) {
2139         case TCP_SEQ_STATE_LISTENING:
2140                 if (v != SEQ_START_TOKEN)
2141                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2142                 break;
2143         case TCP_SEQ_STATE_ESTABLISHED:
2144                 if (v)
2145                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2146                 break;
2147         }
2148 }
2149
2150 int tcp_seq_open(struct inode *inode, struct file *file)
2151 {
2152         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2153         struct tcp_iter_state *s;
2154         int err;
2155
2156         err = seq_open_net(inode, file, &afinfo->seq_ops,
2157                           sizeof(struct tcp_iter_state));
2158         if (err < 0)
2159                 return err;
2160
2161         s = ((struct seq_file *)file->private_data)->private;
2162         s->family               = afinfo->family;
2163         s->last_pos             = 0;
2164         return 0;
2165 }
2166 EXPORT_SYMBOL(tcp_seq_open);
2167
2168 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2169 {
2170         int rc = 0;
2171         struct proc_dir_entry *p;
2172
2173         afinfo->seq_ops.start           = tcp_seq_start;
2174         afinfo->seq_ops.next            = tcp_seq_next;
2175         afinfo->seq_ops.stop            = tcp_seq_stop;
2176
2177         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2178                              afinfo->seq_fops, afinfo);
2179         if (!p)
2180                 rc = -ENOMEM;
2181         return rc;
2182 }
2183 EXPORT_SYMBOL(tcp_proc_register);
2184
2185 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2186 {
2187         remove_proc_entry(afinfo->name, net->proc_net);
2188 }
2189 EXPORT_SYMBOL(tcp_proc_unregister);
2190
2191 static void get_openreq4(const struct request_sock *req,
2192                          struct seq_file *f, int i)
2193 {
2194         const struct inet_request_sock *ireq = inet_rsk(req);
2195         long delta = req->rsk_timer.expires - jiffies;
2196
2197         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2198                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2199                 i,
2200                 ireq->ir_loc_addr,
2201                 ireq->ir_num,
2202                 ireq->ir_rmt_addr,
2203                 ntohs(ireq->ir_rmt_port),
2204                 TCP_SYN_RECV,
2205                 0, 0, /* could print option size, but that is af dependent. */
2206                 1,    /* timers active (only the expire timer) */
2207                 jiffies_delta_to_clock_t(delta),
2208                 req->num_timeout,
2209                 from_kuid_munged(seq_user_ns(f),
2210                                  sock_i_uid(req->rsk_listener)),
2211                 0,  /* non standard timer */
2212                 0, /* open_requests have no inode */
2213                 0,
2214                 req);
2215 }
2216
2217 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2218 {
2219         int timer_active;
2220         unsigned long timer_expires;
2221         const struct tcp_sock *tp = tcp_sk(sk);
2222         const struct inet_connection_sock *icsk = inet_csk(sk);
2223         const struct inet_sock *inet = inet_sk(sk);
2224         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2225         __be32 dest = inet->inet_daddr;
2226         __be32 src = inet->inet_rcv_saddr;
2227         __u16 destp = ntohs(inet->inet_dport);
2228         __u16 srcp = ntohs(inet->inet_sport);
2229         int rx_queue;
2230         int state;
2231
2232         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2233             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2234             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2235                 timer_active    = 1;
2236                 timer_expires   = icsk->icsk_timeout;
2237         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2238                 timer_active    = 4;
2239                 timer_expires   = icsk->icsk_timeout;
2240         } else if (timer_pending(&sk->sk_timer)) {
2241                 timer_active    = 2;
2242                 timer_expires   = sk->sk_timer.expires;
2243         } else {
2244                 timer_active    = 0;
2245                 timer_expires = jiffies;
2246         }
2247
2248         state = sk_state_load(sk);
2249         if (state == TCP_LISTEN)
2250                 rx_queue = sk->sk_ack_backlog;
2251         else
2252                 /* Because we don't lock the socket,
2253                  * we might find a transient negative value.
2254                  */
2255                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2256
2257         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2258                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2259                 i, src, srcp, dest, destp, state,
2260                 tp->write_seq - tp->snd_una,
2261                 rx_queue,
2262                 timer_active,
2263                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2264                 icsk->icsk_retransmits,
2265                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2266                 icsk->icsk_probes_out,
2267                 sock_i_ino(sk),
2268                 atomic_read(&sk->sk_refcnt), sk,
2269                 jiffies_to_clock_t(icsk->icsk_rto),
2270                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2271                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2272                 tp->snd_cwnd,
2273                 state == TCP_LISTEN ?
2274                     fastopenq->max_qlen :
2275                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2276 }
2277
2278 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2279                                struct seq_file *f, int i)
2280 {
2281         long delta = tw->tw_timer.expires - jiffies;
2282         __be32 dest, src;
2283         __u16 destp, srcp;
2284
2285         dest  = tw->tw_daddr;
2286         src   = tw->tw_rcv_saddr;
2287         destp = ntohs(tw->tw_dport);
2288         srcp  = ntohs(tw->tw_sport);
2289
2290         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2291                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2292                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2293                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2294                 atomic_read(&tw->tw_refcnt), tw);
2295 }
2296
2297 #define TMPSZ 150
2298
2299 static int tcp4_seq_show(struct seq_file *seq, void *v)
2300 {
2301         struct tcp_iter_state *st;
2302         struct sock *sk = v;
2303
2304         seq_setwidth(seq, TMPSZ - 1);
2305         if (v == SEQ_START_TOKEN) {
2306                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2307                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2308                            "inode");
2309                 goto out;
2310         }
2311         st = seq->private;
2312
2313         if (sk->sk_state == TCP_TIME_WAIT)
2314                 get_timewait4_sock(v, seq, st->num);
2315         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2316                 get_openreq4(v, seq, st->num);
2317         else
2318                 get_tcp4_sock(v, seq, st->num);
2319 out:
2320         seq_pad(seq, '\n');
2321         return 0;
2322 }
2323
2324 static const struct file_operations tcp_afinfo_seq_fops = {
2325         .owner   = THIS_MODULE,
2326         .open    = tcp_seq_open,
2327         .read    = seq_read,
2328         .llseek  = seq_lseek,
2329         .release = seq_release_net
2330 };
2331
2332 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2333         .name           = "tcp",
2334         .family         = AF_INET,
2335         .seq_fops       = &tcp_afinfo_seq_fops,
2336         .seq_ops        = {
2337                 .show           = tcp4_seq_show,
2338         },
2339 };
2340
2341 static int __net_init tcp4_proc_init_net(struct net *net)
2342 {
2343         return tcp_proc_register(net, &tcp4_seq_afinfo);
2344 }
2345
2346 static void __net_exit tcp4_proc_exit_net(struct net *net)
2347 {
2348         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2349 }
2350
2351 static struct pernet_operations tcp4_net_ops = {
2352         .init = tcp4_proc_init_net,
2353         .exit = tcp4_proc_exit_net,
2354 };
2355
2356 int __init tcp4_proc_init(void)
2357 {
2358         return register_pernet_subsys(&tcp4_net_ops);
2359 }
2360
2361 void tcp4_proc_exit(void)
2362 {
2363         unregister_pernet_subsys(&tcp4_net_ops);
2364 }
2365 #endif /* CONFIG_PROC_FS */
2366
2367 struct proto tcp_prot = {
2368         .name                   = "TCP",
2369         .owner                  = THIS_MODULE,
2370         .close                  = tcp_close,
2371         .connect                = tcp_v4_connect,
2372         .disconnect             = tcp_disconnect,
2373         .accept                 = inet_csk_accept,
2374         .ioctl                  = tcp_ioctl,
2375         .init                   = tcp_v4_init_sock,
2376         .destroy                = tcp_v4_destroy_sock,
2377         .shutdown               = tcp_shutdown,
2378         .setsockopt             = tcp_setsockopt,
2379         .getsockopt             = tcp_getsockopt,
2380         .keepalive              = tcp_set_keepalive,
2381         .recvmsg                = tcp_recvmsg,
2382         .sendmsg                = tcp_sendmsg,
2383         .sendpage               = tcp_sendpage,
2384         .backlog_rcv            = tcp_v4_do_rcv,
2385         .release_cb             = tcp_release_cb,
2386         .hash                   = inet_hash,
2387         .unhash                 = inet_unhash,
2388         .get_port               = inet_csk_get_port,
2389         .enter_memory_pressure  = tcp_enter_memory_pressure,
2390         .stream_memory_free     = tcp_stream_memory_free,
2391         .sockets_allocated      = &tcp_sockets_allocated,
2392         .orphan_count           = &tcp_orphan_count,
2393         .memory_allocated       = &tcp_memory_allocated,
2394         .memory_pressure        = &tcp_memory_pressure,
2395         .sysctl_mem             = sysctl_tcp_mem,
2396         .sysctl_wmem            = sysctl_tcp_wmem,
2397         .sysctl_rmem            = sysctl_tcp_rmem,
2398         .max_header             = MAX_TCP_HEADER,
2399         .obj_size               = sizeof(struct tcp_sock),
2400         .slab_flags             = SLAB_DESTROY_BY_RCU,
2401         .twsk_prot              = &tcp_timewait_sock_ops,
2402         .rsk_prot               = &tcp_request_sock_ops,
2403         .h.hashinfo             = &tcp_hashinfo,
2404         .no_autobind            = true,
2405 #ifdef CONFIG_COMPAT
2406         .compat_setsockopt      = compat_tcp_setsockopt,
2407         .compat_getsockopt      = compat_tcp_getsockopt,
2408 #endif
2409         .diag_destroy           = tcp_abort,
2410 };
2411 EXPORT_SYMBOL(tcp_prot);
2412
2413 static void __net_exit tcp_sk_exit(struct net *net)
2414 {
2415         int cpu;
2416
2417         for_each_possible_cpu(cpu)
2418                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2419         free_percpu(net->ipv4.tcp_sk);
2420 }
2421
2422 static int __net_init tcp_sk_init(struct net *net)
2423 {
2424         int res, cpu, cnt;
2425
2426         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2427         if (!net->ipv4.tcp_sk)
2428                 return -ENOMEM;
2429
2430         for_each_possible_cpu(cpu) {
2431                 struct sock *sk;
2432
2433                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2434                                            IPPROTO_TCP, net);
2435                 if (res)
2436                         goto fail;
2437                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2438                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2439         }
2440
2441         net->ipv4.sysctl_tcp_ecn = 2;
2442         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2443
2444         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2445         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2446         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2447
2448         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2449         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2450         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2451
2452         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2453         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2454         net->ipv4.sysctl_tcp_syncookies = 1;
2455         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2456         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2457         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2458         net->ipv4.sysctl_tcp_orphan_retries = 0;
2459         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2460         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2461         net->ipv4.sysctl_tcp_tw_reuse = 0;
2462
2463         cnt = tcp_hashinfo.ehash_mask + 1;
2464         net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
2465         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2466         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2467
2468         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2469
2470         return 0;
2471 fail:
2472         tcp_sk_exit(net);
2473
2474         return res;
2475 }
2476
2477 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2478 {
2479         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2480 }
2481
2482 static struct pernet_operations __net_initdata tcp_sk_ops = {
2483        .init       = tcp_sk_init,
2484        .exit       = tcp_sk_exit,
2485        .exit_batch = tcp_sk_exit_batch,
2486 };
2487
2488 void __init tcp_v4_init(void)
2489 {
2490         if (register_pernet_subsys(&tcp_sk_ops))
2491                 panic("Failed to create the TCP control socket.\n");
2492 }