1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96         return secure_tcp_seq(ip_hdr(skb)->daddr,
97                               ip_hdr(skb)->saddr,
98                               tcp_hdr(skb)->dest,
99                               tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct inet_timewait_sock *tw = inet_twsk(sktw);
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
114         if (reuse == 2) {
115                 /* Still does not detect *everything* that goes through
116                  * lo, since we require a loopback src or dst address
117                  * or direct binding to 'lo' interface.
118                  */
119                 bool loopback = false;
120                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121                         loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123                 if (tw->tw_family == AF_INET6) {
124                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128                                 loopback = true;
129                 } else
130 #endif
131                 {
132                         if (ipv4_is_loopback(tw->tw_daddr) ||
133                             ipv4_is_loopback(tw->tw_rcv_saddr))
134                                 loopback = true;
135                 }
136                 if (!loopback)
137                         reuse = 0;
138         }
139
140         /* With PAWS, it is safe from the viewpoint
141            of data integrity. Even without PAWS it is safe provided sequence
142            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144            Actually, the idea is close to VJ's, only the timestamp cache is
145            held not per host but per port pair, and the TW bucket is used as
146            the state holder.
147
148            If the TW bucket has already been destroyed we fall back to VJ's
149            scheme and use the initial timestamp retrieved from the peer table.
150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (!twp || (reuse && time_after32(ktime_get_seconds(),
153                                             tcptw->tw_ts_recent_stamp)))) {
154                 /* In case of repair and re-using TIME-WAIT sockets we still
155                  * want to be sure that it is safe as above but honor the
156                  * sequence numbers and time stamps set as part of the repair
157                  * process.
158                  *
159                  * Without this check re-using a TIME-WAIT socket with TCP
160                  * repair would accumulate a -1 on the repair assigned
161                  * sequence number. The first time it is reused the sequence
162                  * is -1, the second time -2, etc. This fixes that issue
163                  * without appearing to create any others.
164                  */
165                 if (likely(!tp->repair)) {
166                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168                         if (!seq)
169                                 seq = 1;
170                         WRITE_ONCE(tp->write_seq, seq);
171                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
172                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173                 }
174                 sock_hold(sktw);
175                 return 1;
176         }
177
178         return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
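
/* Editorial illustration, not part of the upstream file: a worked example of
 * the write_seq seeding above. If the TIME-WAIT socket being reused left off
 * at tw_snd_nxt = 1,000,000, the new connection starts sending at
 * 1,000,000 + 65535 + 2 = 1,065,537, i.e. beyond anything the old peer could
 * still accept inside a maximal 64K window. The "if (!seq) seq = 1" guard
 * matters because tcp_v4_connect() treats a zero write_seq as "pick a fresh
 * secure ISN instead".
 */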
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183                               int addr_len)
184 {
185         /* This check is replicated from tcp_v4_connect() and intended to
186          * prevent BPF program called below from accessing bytes that are out
187          * of the bound specified by user in addr_len.
188          */
189         if (addr_len < sizeof(struct sockaddr_in))
190                 return -EINVAL;
191
192         sock_owned_by_me(sk);
193
194         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201         struct inet_sock *inet = inet_sk(sk);
202         struct tcp_sock *tp = tcp_sk(sk);
203         __be16 orig_sport, orig_dport;
204         __be32 daddr, nexthop;
205         struct flowi4 *fl4;
206         struct rtable *rt;
207         int err;
208         struct ip_options_rcu *inet_opt;
209         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211         if (addr_len < sizeof(struct sockaddr_in))
212                 return -EINVAL;
213
214         if (usin->sin_family != AF_INET)
215                 return -EAFNOSUPPORT;
216
217         nexthop = daddr = usin->sin_addr.s_addr;
218         inet_opt = rcu_dereference_protected(inet->inet_opt,
219                                              lockdep_sock_is_held(sk));
220         if (inet_opt && inet_opt->opt.srr) {
221                 if (!daddr)
222                         return -EINVAL;
223                 nexthop = inet_opt->opt.faddr;
224         }
225
226         orig_sport = inet->inet_sport;
227         orig_dport = usin->sin_port;
228         fl4 = &inet->cork.fl.u.ip4;
229         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231                               IPPROTO_TCP,
232                               orig_sport, orig_dport, sk);
233         if (IS_ERR(rt)) {
234                 err = PTR_ERR(rt);
235                 if (err == -ENETUNREACH)
236                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237                 return err;
238         }
239
240         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241                 ip_rt_put(rt);
242                 return -ENETUNREACH;
243         }
244
245         if (!inet_opt || !inet_opt->opt.srr)
246                 daddr = fl4->daddr;
247
248         if (!inet->inet_saddr)
249                 inet->inet_saddr = fl4->saddr;
250         sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253                 /* Reset inherited state */
254                 tp->rx_opt.ts_recent       = 0;
255                 tp->rx_opt.ts_recent_stamp = 0;
256                 if (likely(!tp->repair))
257                         WRITE_ONCE(tp->write_seq, 0);
258         }
259
260         inet->inet_dport = usin->sin_port;
261         sk_daddr_set(sk, daddr);
262
263         inet_csk(sk)->icsk_ext_hdr_len = 0;
264         if (inet_opt)
265                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269         /* Socket identity is still unknown (sport may be zero).
270          * However we set the state to SYN-SENT and, without releasing the
271          * socket lock, select a source port, enter ourselves into the hash
272          * tables and complete initialization after this.
273          */
274         tcp_set_state(sk, TCP_SYN_SENT);
275         err = inet_hash_connect(tcp_death_row, sk);
276         if (err)
277                 goto failure;
278
279         sk_set_txhash(sk);
280
281         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282                                inet->inet_sport, inet->inet_dport, sk);
283         if (IS_ERR(rt)) {
284                 err = PTR_ERR(rt);
285                 rt = NULL;
286                 goto failure;
287         }
288         /* OK, now commit destination to socket.  */
289         sk->sk_gso_type = SKB_GSO_TCPV4;
290         sk_setup_caps(sk, &rt->dst);
291         rt = NULL;
292
293         if (likely(!tp->repair)) {
294                 if (!tp->write_seq)
295                         WRITE_ONCE(tp->write_seq,
296                                    secure_tcp_seq(inet->inet_saddr,
297                                                   inet->inet_daddr,
298                                                   inet->inet_sport,
299                                                   usin->sin_port));
300                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301                                                  inet->inet_saddr,
302                                                  inet->inet_daddr);
303         }
304
305         inet->inet_id = prandom_u32();
306
307         if (tcp_fastopen_defer_connect(sk, &err))
308                 return err;
309         if (err)
310                 goto failure;
311
312         err = tcp_connect(sk);
313
314         if (err)
315                 goto failure;
316
317         return 0;
318
319 failure:
320         /*
321          * This unhashes the socket and releases the local port,
322          * if necessary.
323          */
324         tcp_set_state(sk, TCP_CLOSE);
325         ip_rt_put(rt);
326         sk->sk_route_caps = 0;
327         inet->inet_dport = 0;
328         return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
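
/* Editorial sketch, not part of the upstream file: tcp_v4_connect() runs,
 * under the socket lock, when user space issues connect(2) on an unconnected
 * AF_INET SOCK_STREAM socket. A minimal, hypothetical user-space caller
 * (the address 192.0.2.1 is an example) looks roughly like this:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");
 *
 * The addr_len < sizeof(struct sockaddr_in) and sin_family checks at the top
 * of tcp_v4_connect() correspond directly to the arguments passed here.
 */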
331
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339         struct inet_sock *inet = inet_sk(sk);
340         struct dst_entry *dst;
341         u32 mtu;
342
343         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344                 return;
345         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346         dst = inet_csk_update_pmtu(sk, mtu);
347         if (!dst)
348                 return;
349
350         /* Something is about to go wrong... Remember the soft error
351          * in case this connection is not able to recover.
352          */
353         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354                 sk->sk_err_soft = EMSGSIZE;
355
356         mtu = dst_mtu(dst);
357
358         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359             ip_sk_accept_pmtu(sk) &&
360             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361                 tcp_sync_mss(sk, mtu);
362
363                 /* Resend the TCP packet because it's
364                  * clear that the old packet has been
365                  * dropped. This is the new "fast" path mtu
366                  * discovery.
367                  */
368                 tcp_simple_retransmit(sk);
369         } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
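
/* Editorial illustration, not part of the upstream file: assume the cached
 * path MTU (icsk_pmtu_cookie) is 1500 and an ICMP_FRAG_NEEDED arrives
 * advertising an MTU of 1400. inet_csk_update_pmtu() lowers the cached path
 * MTU, tcp_sync_mss(sk, 1400) shrinks the MSS accordingly (roughly 1400 minus
 * IP and TCP header/option overhead), and tcp_simple_retransmit() immediately
 * resends the segments that were too large instead of waiting for the
 * retransmit timer.
 */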
372
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375         struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377         if (dst)
378                 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385         struct request_sock *req = inet_reqsk(sk);
386         struct net *net = sock_net(sk);
387
388         /* ICMPs are not backlogged, hence we cannot get
389          * an established socket here.
390          */
391         if (seq != tcp_rsk(req)->snt_isn) {
392                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393         } else if (abort) {
394                 /*
395                  * Still in SYN_RECV, just remove it silently.
396                  * There is no good way to pass the error to the newly
397                  * created socket, and POSIX does not want network
398                  * errors returned from accept().
399                  */
400                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401                 tcp_listendrop(req->rsk_listener);
402         }
403         reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410         struct inet_connection_sock *icsk = inet_csk(sk);
411         struct tcp_sock *tp = tcp_sk(sk);
412         struct sk_buff *skb;
413         s32 remaining;
414         u32 delta_us;
415
416         if (sock_owned_by_user(sk))
417                 return;
418
419         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420             !icsk->icsk_backoff)
421                 return;
422
423         skb = tcp_rtx_queue_head(sk);
424         if (WARN_ON_ONCE(!skb))
425                 return;
426
427         icsk->icsk_backoff--;
428         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431         tcp_mstamp_refresh(tp);
432         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435         if (remaining > 0) {
436                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437                                           remaining, TCP_RTO_MAX);
438         } else {
439                 /* RTO revert clocked out retransmission.
440                  * Will retransmit now.
441                  */
442                 tcp_retransmit_timer(sk);
443         }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
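
/* Editorial illustration, not part of the upstream file: suppose srtt yields
 * a base RTO of 200ms and two backed-off retransmissions have already fired,
 * so icsk_backoff = 2 and the pending timer would be 800ms. When an ICMP
 * net/host-unreachable matching snd_una arrives, the code above drops the
 * backoff to 1, recomputes the RTO as roughly 200ms << 1 = 400ms, and re-arms
 * the timer for whatever part of those 400ms has not yet elapsed since the
 * head of the retransmit queue was sent; if nothing remains, it retransmits
 * immediately.
 */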
446
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465         const struct iphdr *iph = (const struct iphdr *)skb->data;
466         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467         struct tcp_sock *tp;
468         struct inet_sock *inet;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct sock *sk;
472         struct request_sock *fastopen;
473         u32 seq, snd_una;
474         int err;
475         struct net *net = dev_net(skb->dev);
476
477         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478                                        th->dest, iph->saddr, ntohs(th->source),
479                                        inet_iif(skb), 0);
480         if (!sk) {
481                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482                 return -ENOENT;
483         }
484         if (sk->sk_state == TCP_TIME_WAIT) {
485                 inet_twsk_put(inet_twsk(sk));
486                 return 0;
487         }
488         seq = ntohl(th->seq);
489         if (sk->sk_state == TCP_NEW_SYN_RECV) {
490                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491                                      type == ICMP_TIME_EXCEEDED ||
492                                      (type == ICMP_DEST_UNREACH &&
493                                       (code == ICMP_NET_UNREACH ||
494                                        code == ICMP_HOST_UNREACH)));
495                 return 0;
496         }
497
498         bh_lock_sock(sk);
499         /* If too many ICMPs get dropped on busy
500          * servers this needs to be solved differently.
501          * We do take care of PMTU discovery (RFC1191) special case:
502          * we can receive locally generated ICMP messages while socket is held.
503          */
504         if (sock_owned_by_user(sk)) {
505                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507         }
508         if (sk->sk_state == TCP_CLOSE)
509                 goto out;
510
511         if (static_branch_unlikely(&ip4_min_ttl)) {
512                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
513                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
514                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
515                         goto out;
516                 }
517         }
518
519         tp = tcp_sk(sk);
520         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
521         fastopen = rcu_dereference(tp->fastopen_rsk);
522         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
523         if (sk->sk_state != TCP_LISTEN &&
524             !between(seq, snd_una, tp->snd_nxt)) {
525                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
526                 goto out;
527         }
528
529         switch (type) {
530         case ICMP_REDIRECT:
531                 if (!sock_owned_by_user(sk))
532                         do_redirect(skb, sk);
533                 goto out;
534         case ICMP_SOURCE_QUENCH:
535                 /* Just silently ignore these. */
536                 goto out;
537         case ICMP_PARAMETERPROB:
538                 err = EPROTO;
539                 break;
540         case ICMP_DEST_UNREACH:
541                 if (code > NR_ICMP_UNREACH)
542                         goto out;
543
544                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
545                         /* We are not interested in TCP_LISTEN and open_requests
546                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
547                          * they should go through unfragmented).
548                          */
549                         if (sk->sk_state == TCP_LISTEN)
550                                 goto out;
551
552                         WRITE_ONCE(tp->mtu_info, info);
553                         if (!sock_owned_by_user(sk)) {
554                                 tcp_v4_mtu_reduced(sk);
555                         } else {
556                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
557                                         sock_hold(sk);
558                         }
559                         goto out;
560                 }
561
562                 err = icmp_err_convert[code].errno;
563                 /* check if this ICMP message allows revert of backoff.
564                  * (see RFC 6069)
565                  */
566                 if (!fastopen &&
567                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
568                         tcp_ld_RTO_revert(sk, seq);
569                 break;
570         case ICMP_TIME_EXCEEDED:
571                 err = EHOSTUNREACH;
572                 break;
573         default:
574                 goto out;
575         }
576
577         switch (sk->sk_state) {
578         case TCP_SYN_SENT:
579         case TCP_SYN_RECV:
580                 /* Only in fast or simultaneous open. If a fast open socket is
581                  * already accepted it is treated as a connected one below.
582                  */
583                 if (fastopen && !fastopen->sk)
584                         break;
585
586                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
587
588                 if (!sock_owned_by_user(sk)) {
589                         sk->sk_err = err;
590
591                         sk_error_report(sk);
592
593                         tcp_done(sk);
594                 } else {
595                         sk->sk_err_soft = err;
596                 }
597                 goto out;
598         }
599
600         /* If we've already connected we will keep trying
601          * until we time out, or the user gives up.
602          *
603          * rfc1122 4.2.3.9 allows us to consider as hard errors
604          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
605          * but it is obsoleted by pmtu discovery).
606          *
607          * Note that in the modern internet, where routing is unreliable
608          * and broken firewalls sit in every dark corner, sending random
609          * errors as ordered by their masters, even these two messages have
610          * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
611          *
612          * Now we are in compliance with RFCs.
613          *                                                      --ANK (980905)
614          */
615
616         inet = inet_sk(sk);
617         if (!sock_owned_by_user(sk) && inet->recverr) {
618                 sk->sk_err = err;
619                 sk_error_report(sk);
620         } else  { /* Only an error on timeout */
621                 sk->sk_err_soft = err;
622         }
623
624 out:
625         bh_unlock_sock(sk);
626         sock_put(sk);
627         return 0;
628 }
629
630 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
631 {
632         struct tcphdr *th = tcp_hdr(skb);
633
634         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
635         skb->csum_start = skb_transport_header(skb) - skb->head;
636         skb->csum_offset = offsetof(struct tcphdr, check);
637 }
638
639 /* This routine computes an IPv4 TCP checksum. */
640 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
641 {
642         const struct inet_sock *inet = inet_sk(sk);
643
644         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
645 }
646 EXPORT_SYMBOL(tcp_v4_send_check);
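
/* Editorial note, not part of the upstream file: __tcp_v4_send_check() does
 * not compute the full TCP checksum. It stores only the complemented
 * pseudo-header sum (saddr, daddr, protocol, length) in th->check and records
 * csum_start/csum_offset, which is the CHECKSUM_PARTIAL contract: the NIC, or
 * the software fallback on the transmit path, finishes the sum over the TCP
 * header and payload starting at csum_start and writes the result at
 * csum_offset.
 */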
647
648 /*
649  *      This routine will send an RST to the other tcp.
650  *
651  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
652  *                    for the reset?
653  *      Answer: if a packet caused a RST, it is not for a socket
654  *              existing in our system; if it is matched to a socket,
655  *              it is just a duplicate segment or a bug in the other side's TCP.
656  *              So we build the reply based only on the parameters that
657  *              arrived with the segment.
658  *      Exception: precedence violation. We do not implement it in any case.
659  */
660
661 #ifdef CONFIG_TCP_MD5SIG
662 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
663 #else
664 #define OPTION_BYTES sizeof(__be32)
665 #endif
666
667 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
668 {
669         const struct tcphdr *th = tcp_hdr(skb);
670         struct {
671                 struct tcphdr th;
672                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
673         } rep;
674         struct ip_reply_arg arg;
675 #ifdef CONFIG_TCP_MD5SIG
676         struct tcp_md5sig_key *key = NULL;
677         const __u8 *hash_location = NULL;
678         unsigned char newhash[16];
679         int genhash;
680         struct sock *sk1 = NULL;
681 #endif
682         u64 transmit_time = 0;
683         struct sock *ctl_sk;
684         struct net *net;
685
686         /* Never send a reset in response to a reset. */
687         if (th->rst)
688                 return;
689
690         /* If sk is not NULL, it means we did a successful lookup and the
691          * incoming route had to be correct. prequeue might have dropped our dst.
692          */
693         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
694                 return;
695
696         /* Swap the send and the receive. */
697         memset(&rep, 0, sizeof(rep));
698         rep.th.dest   = th->source;
699         rep.th.source = th->dest;
700         rep.th.doff   = sizeof(struct tcphdr) / 4;
701         rep.th.rst    = 1;
702
703         if (th->ack) {
704                 rep.th.seq = th->ack_seq;
705         } else {
706                 rep.th.ack = 1;
707                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
708                                        skb->len - (th->doff << 2));
709         }
710
711         memset(&arg, 0, sizeof(arg));
712         arg.iov[0].iov_base = (unsigned char *)&rep;
713         arg.iov[0].iov_len  = sizeof(rep.th);
714
715         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
716 #ifdef CONFIG_TCP_MD5SIG
717         rcu_read_lock();
718         hash_location = tcp_parse_md5sig_option(th);
719         if (sk && sk_fullsock(sk)) {
720                 const union tcp_md5_addr *addr;
721                 int l3index;
722
723                 /* sdif set, means packet ingressed via a device
724                  * in an L3 domain and inet_iif is set to it.
725                  */
726                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
727                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
728                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
729         } else if (hash_location) {
730                 const union tcp_md5_addr *addr;
731                 int sdif = tcp_v4_sdif(skb);
732                 int dif = inet_iif(skb);
733                 int l3index;
734
735                 /*
736                  * The active side is lost. Try to find the listening socket
737                  * through the source port, and then find the md5 key through the
738                  * listening socket. We are not losing security here:
739                  * the incoming packet is checked with the md5 hash of the found
740                  * key, and no RST is generated if the md5 hash doesn't match.
741                  */
742                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
743                                              ip_hdr(skb)->saddr,
744                                              th->source, ip_hdr(skb)->daddr,
745                                              ntohs(th->source), dif, sdif);
746                 /* don't send rst if it can't find key */
747                 if (!sk1)
748                         goto out;
749
750                 /* sdif set, means packet ingressed via a device
751                  * in an L3 domain and dif is set to it.
752                  */
753                 l3index = sdif ? dif : 0;
754                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
755                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
756                 if (!key)
757                         goto out;
758
759
760                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
761                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
762                         goto out;
763
764         }
765
766         if (key) {
767                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
768                                    (TCPOPT_NOP << 16) |
769                                    (TCPOPT_MD5SIG << 8) |
770                                    TCPOLEN_MD5SIG);
771                 /* Update length and the length the header thinks exists */
772                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
773                 rep.th.doff = arg.iov[0].iov_len / 4;
774
775                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
776                                      key, ip_hdr(skb)->saddr,
777                                      ip_hdr(skb)->daddr, &rep.th);
778         }
779 #endif
780         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
781         if (rep.opt[0] == 0) {
782                 __be32 mrst = mptcp_reset_option(skb);
783
784                 if (mrst) {
785                         rep.opt[0] = mrst;
786                         arg.iov[0].iov_len += sizeof(mrst);
787                         rep.th.doff = arg.iov[0].iov_len / 4;
788                 }
789         }
790
791         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
792                                       ip_hdr(skb)->saddr, /* XXX */
793                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
794         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
795         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
796
797         /* When the socket is gone, all binding information is lost and
798          * routing might fail in this case. No choice here: if we chose to force
799          * the input interface, we would misroute in the case of an asymmetric route.
800          */
801         if (sk) {
802                 arg.bound_dev_if = sk->sk_bound_dev_if;
803                 if (sk_fullsock(sk))
804                         trace_tcp_send_reset(sk, skb);
805         }
806
807         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
808                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
809
810         arg.tos = ip_hdr(skb)->tos;
811         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
812         local_bh_disable();
813         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
814         if (sk) {
815                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
816                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
817                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
818                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
819                 transmit_time = tcp_transmit_time(sk);
820         }
821         ip_send_unicast_reply(ctl_sk,
822                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
823                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
824                               &arg, arg.iov[0].iov_len,
825                               transmit_time);
826
827         ctl_sk->sk_mark = 0;
828         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
829         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
830         local_bh_enable();
831
832 #ifdef CONFIG_TCP_MD5SIG
833 out:
834         rcu_read_unlock();
835 #endif
836 }
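
/* Editorial illustration, not part of the upstream file: in the non-ACK
 * branch above, the RST acknowledges everything the offending segment
 * occupied in sequence space. For example, a segment with seq = 5000 carrying
 * 100 bytes of payload and no SYN/FIN draws a RST with ack_seq = 5100, while
 * a bare SYN with seq = 1000 draws ack_seq = 1001, since SYN and FIN each
 * consume one sequence number.
 */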
837
838 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
839    outside socket context, is certainly ugly. What can I do?
840  */
841
842 static void tcp_v4_send_ack(const struct sock *sk,
843                             struct sk_buff *skb, u32 seq, u32 ack,
844                             u32 win, u32 tsval, u32 tsecr, int oif,
845                             struct tcp_md5sig_key *key,
846                             int reply_flags, u8 tos)
847 {
848         const struct tcphdr *th = tcp_hdr(skb);
849         struct {
850                 struct tcphdr th;
851                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
852 #ifdef CONFIG_TCP_MD5SIG
853                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
854 #endif
855                         ];
856         } rep;
857         struct net *net = sock_net(sk);
858         struct ip_reply_arg arg;
859         struct sock *ctl_sk;
860         u64 transmit_time;
861
862         memset(&rep.th, 0, sizeof(struct tcphdr));
863         memset(&arg, 0, sizeof(arg));
864
865         arg.iov[0].iov_base = (unsigned char *)&rep;
866         arg.iov[0].iov_len  = sizeof(rep.th);
867         if (tsecr) {
868                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
869                                    (TCPOPT_TIMESTAMP << 8) |
870                                    TCPOLEN_TIMESTAMP);
871                 rep.opt[1] = htonl(tsval);
872                 rep.opt[2] = htonl(tsecr);
873                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
874         }
875
876         /* Swap the send and the receive. */
877         rep.th.dest    = th->source;
878         rep.th.source  = th->dest;
879         rep.th.doff    = arg.iov[0].iov_len / 4;
880         rep.th.seq     = htonl(seq);
881         rep.th.ack_seq = htonl(ack);
882         rep.th.ack     = 1;
883         rep.th.window  = htons(win);
884
885 #ifdef CONFIG_TCP_MD5SIG
886         if (key) {
887                 int offset = (tsecr) ? 3 : 0;
888
889                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
890                                           (TCPOPT_NOP << 16) |
891                                           (TCPOPT_MD5SIG << 8) |
892                                           TCPOLEN_MD5SIG);
893                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
894                 rep.th.doff = arg.iov[0].iov_len/4;
895
896                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
897                                     key, ip_hdr(skb)->saddr,
898                                     ip_hdr(skb)->daddr, &rep.th);
899         }
900 #endif
901         arg.flags = reply_flags;
902         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
903                                       ip_hdr(skb)->saddr, /* XXX */
904                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
905         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
906         if (oif)
907                 arg.bound_dev_if = oif;
908         arg.tos = tos;
909         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
910         local_bh_disable();
911         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
912         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
913                            inet_twsk(sk)->tw_mark : sk->sk_mark;
914         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
915                            inet_twsk(sk)->tw_priority : sk->sk_priority;
916         transmit_time = tcp_transmit_time(sk);
917         ip_send_unicast_reply(ctl_sk,
918                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
919                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
920                               &arg, arg.iov[0].iov_len,
921                               transmit_time);
922
923         ctl_sk->sk_mark = 0;
924         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
925         local_bh_enable();
926 }
927
928 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
929 {
930         struct inet_timewait_sock *tw = inet_twsk(sk);
931         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
932
933         tcp_v4_send_ack(sk, skb,
934                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
935                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
936                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
937                         tcptw->tw_ts_recent,
938                         tw->tw_bound_dev_if,
939                         tcp_twsk_md5_key(tcptw),
940                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
941                         tw->tw_tos
942                         );
943
944         inet_twsk_put(tw);
945 }
946
947 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
948                                   struct request_sock *req)
949 {
950         const union tcp_md5_addr *addr;
951         int l3index;
952
953         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
954          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
955          */
956         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
957                                              tcp_sk(sk)->snd_nxt;
958
959         /* RFC 7323 2.3
960          * The window field (SEG.WND) of every outgoing segment, with the
961          * exception of <SYN> segments, MUST be right-shifted by
962          * Rcv.Wind.Shift bits:
963          */
964         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
965         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
966         tcp_v4_send_ack(sk, skb, seq,
967                         tcp_rsk(req)->rcv_nxt,
968                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
969                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
970                         req->ts_recent,
971                         0,
972                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
973                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
974                         ip_hdr(skb)->tos);
975 }
976
977 /*
978  *      Send a SYN-ACK after having received a SYN.
979  *      This still operates on a request_sock only, not on a big
980  *      socket.
981  */
982 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
983                               struct flowi *fl,
984                               struct request_sock *req,
985                               struct tcp_fastopen_cookie *foc,
986                               enum tcp_synack_type synack_type,
987                               struct sk_buff *syn_skb)
988 {
989         const struct inet_request_sock *ireq = inet_rsk(req);
990         struct flowi4 fl4;
991         int err = -1;
992         struct sk_buff *skb;
993         u8 tos;
994
995         /* First, grab a route. */
996         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
997                 return -1;
998
999         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1000
1001         if (skb) {
1002                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1003
1004                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1005                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1006                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1007                                 inet_sk(sk)->tos;
1008
1009                 if (!INET_ECN_is_capable(tos) &&
1010                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1011                         tos |= INET_ECN_ECT_0;
1012
1013                 rcu_read_lock();
1014                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1015                                             ireq->ir_rmt_addr,
1016                                             rcu_dereference(ireq->ireq_opt),
1017                                             tos);
1018                 rcu_read_unlock();
1019                 err = net_xmit_eval(err);
1020         }
1021
1022         return err;
1023 }
1024
1025 /*
1026  *      IPv4 request_sock destructor.
1027  */
1028 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1029 {
1030         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1031 }
1032
1033 #ifdef CONFIG_TCP_MD5SIG
1034 /*
1035  * RFC2385 MD5 checksumming requires a mapping of
1036  * IP address->MD5 Key.
1037  * We need to maintain these in the sk structure.
1038  */
1039
1040 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1041 EXPORT_SYMBOL(tcp_md5_needed);
1042
1043 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1044 {
1045         if (!old)
1046                 return true;
1047
1048         /* l3index always overrides non-l3index */
1049         if (old->l3index && new->l3index == 0)
1050                 return false;
1051         if (old->l3index == 0 && new->l3index)
1052                 return true;
1053
1054         return old->prefixlen < new->prefixlen;
1055 }
1056
1057 /* Find the Key structure for an address.  */
1058 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1059                                            const union tcp_md5_addr *addr,
1060                                            int family)
1061 {
1062         const struct tcp_sock *tp = tcp_sk(sk);
1063         struct tcp_md5sig_key *key;
1064         const struct tcp_md5sig_info *md5sig;
1065         __be32 mask;
1066         struct tcp_md5sig_key *best_match = NULL;
1067         bool match;
1068
1069         /* caller either holds rcu_read_lock() or socket lock */
1070         md5sig = rcu_dereference_check(tp->md5sig_info,
1071                                        lockdep_sock_is_held(sk));
1072         if (!md5sig)
1073                 return NULL;
1074
1075         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1076                                  lockdep_sock_is_held(sk)) {
1077                 if (key->family != family)
1078                         continue;
1079                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1080                         continue;
1081                 if (family == AF_INET) {
1082                         mask = inet_make_mask(key->prefixlen);
1083                         match = (key->addr.a4.s_addr & mask) ==
1084                                 (addr->a4.s_addr & mask);
1085 #if IS_ENABLED(CONFIG_IPV6)
1086                 } else if (family == AF_INET6) {
1087                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1088                                                   key->prefixlen);
1089 #endif
1090                 } else {
1091                         match = false;
1092                 }
1093
1094                 if (match && better_md5_match(best_match, key))
1095                         best_match = key;
1096         }
1097         return best_match;
1098 }
1099 EXPORT_SYMBOL(__tcp_md5_do_lookup);
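
/* Editorial illustration, not part of the upstream file: with the ordering
 * implemented by better_md5_match(), a key bound to an L3 master device
 * (non-zero l3index) always beats a device-less key, and among keys at the
 * same level the longer prefix wins. For example, with keys configured for
 * 10.0.0.0/8 and 10.0.0.0/24, a lookup for peer 10.0.0.5 selects the /24 key;
 * if a third key for the same prefix is bound to the VRF the packet arrived
 * on, that key is preferred over both.
 */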
1100
1101 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1102                                                       const union tcp_md5_addr *addr,
1103                                                       int family, u8 prefixlen,
1104                                                       int l3index, u8 flags)
1105 {
1106         const struct tcp_sock *tp = tcp_sk(sk);
1107         struct tcp_md5sig_key *key;
1108         unsigned int size = sizeof(struct in_addr);
1109         const struct tcp_md5sig_info *md5sig;
1110
1111         /* caller either holds rcu_read_lock() or socket lock */
1112         md5sig = rcu_dereference_check(tp->md5sig_info,
1113                                        lockdep_sock_is_held(sk));
1114         if (!md5sig)
1115                 return NULL;
1116 #if IS_ENABLED(CONFIG_IPV6)
1117         if (family == AF_INET6)
1118                 size = sizeof(struct in6_addr);
1119 #endif
1120         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1121                                  lockdep_sock_is_held(sk)) {
1122                 if (key->family != family)
1123                         continue;
1124                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1125                         continue;
1126                 if (key->l3index != l3index)
1127                         continue;
1128                 if (!memcmp(&key->addr, addr, size) &&
1129                     key->prefixlen == prefixlen)
1130                         return key;
1131         }
1132         return NULL;
1133 }
1134
1135 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1136                                          const struct sock *addr_sk)
1137 {
1138         const union tcp_md5_addr *addr;
1139         int l3index;
1140
1141         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1142                                                  addr_sk->sk_bound_dev_if);
1143         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1144         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1145 }
1146 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1147
1148 /* This can be called on a newly created socket, from other files */
1149 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1150                    int family, u8 prefixlen, int l3index, u8 flags,
1151                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1152 {
1153         /* Add Key to the list */
1154         struct tcp_md5sig_key *key;
1155         struct tcp_sock *tp = tcp_sk(sk);
1156         struct tcp_md5sig_info *md5sig;
1157
1158         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1159         if (key) {
1160                 /* Pre-existing entry - just update that one.
1161                  * Note that the key might be used concurrently.
1162                  * data_race() is telling kcsan that we do not care about
1163                  * key mismatches, since changing the MD5 key on live flows
1164                  * can lead to packet drops.
1165                  */
1166                 data_race(memcpy(key->key, newkey, newkeylen));
1167
1168                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1169                  * Also note that a reader could catch new key->keylen value
1170                  * but old key->key[], this is the reason we use __GFP_ZERO
1171                  * at sock_kmalloc() time below these lines.
1172                  */
1173                 WRITE_ONCE(key->keylen, newkeylen);
1174
1175                 return 0;
1176         }
1177
1178         md5sig = rcu_dereference_protected(tp->md5sig_info,
1179                                            lockdep_sock_is_held(sk));
1180         if (!md5sig) {
1181                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1182                 if (!md5sig)
1183                         return -ENOMEM;
1184
1185                 sk_gso_disable(sk);
1186                 INIT_HLIST_HEAD(&md5sig->head);
1187                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1188         }
1189
1190         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1191         if (!key)
1192                 return -ENOMEM;
1193         if (!tcp_alloc_md5sig_pool()) {
1194                 sock_kfree_s(sk, key, sizeof(*key));
1195                 return -ENOMEM;
1196         }
1197
1198         memcpy(key->key, newkey, newkeylen);
1199         key->keylen = newkeylen;
1200         key->family = family;
1201         key->prefixlen = prefixlen;
1202         key->l3index = l3index;
1203         key->flags = flags;
1204         memcpy(&key->addr, addr,
1205                (family == AF_INET6) ? sizeof(struct in6_addr) :
1206                                       sizeof(struct in_addr));
1207         hlist_add_head_rcu(&key->node, &md5sig->head);
1208         return 0;
1209 }
1210 EXPORT_SYMBOL(tcp_md5_do_add);
1211
1212 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1213                    u8 prefixlen, int l3index, u8 flags)
1214 {
1215         struct tcp_md5sig_key *key;
1216
1217         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1218         if (!key)
1219                 return -ENOENT;
1220         hlist_del_rcu(&key->node);
1221         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1222         kfree_rcu(key, rcu);
1223         return 0;
1224 }
1225 EXPORT_SYMBOL(tcp_md5_do_del);
1226
1227 static void tcp_clear_md5_list(struct sock *sk)
1228 {
1229         struct tcp_sock *tp = tcp_sk(sk);
1230         struct tcp_md5sig_key *key;
1231         struct hlist_node *n;
1232         struct tcp_md5sig_info *md5sig;
1233
1234         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1235
1236         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1237                 hlist_del_rcu(&key->node);
1238                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1239                 kfree_rcu(key, rcu);
1240         }
1241 }
1242
1243 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1244                                  sockptr_t optval, int optlen)
1245 {
1246         struct tcp_md5sig cmd;
1247         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1248         const union tcp_md5_addr *addr;
1249         u8 prefixlen = 32;
1250         int l3index = 0;
1251         u8 flags;
1252
1253         if (optlen < sizeof(cmd))
1254                 return -EINVAL;
1255
1256         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1257                 return -EFAULT;
1258
1259         if (sin->sin_family != AF_INET)
1260                 return -EINVAL;
1261
1262         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1263
1264         if (optname == TCP_MD5SIG_EXT &&
1265             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1266                 prefixlen = cmd.tcpm_prefixlen;
1267                 if (prefixlen > 32)
1268                         return -EINVAL;
1269         }
1270
1271         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1272             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1273                 struct net_device *dev;
1274
1275                 rcu_read_lock();
1276                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1277                 if (dev && netif_is_l3_master(dev))
1278                         l3index = dev->ifindex;
1279
1280                 rcu_read_unlock();
1281
1282                 /* ok to reference set/not set outside of rcu;
1283                  * right now device MUST be an L3 master
1284                  */
1285                 if (!dev || !l3index)
1286                         return -EINVAL;
1287         }
1288
1289         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1290
1291         if (!cmd.tcpm_keylen)
1292                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1293
1294         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1295                 return -EINVAL;
1296
1297         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1298                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1299 }
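
/* Editorial sketch, not part of the upstream file: the setsockopt() path
 * handled above is driven from user space roughly as follows (peer address
 * and key are hypothetical):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "198.51.100.7", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * TCP_MD5SIG_EXT with TCP_MD5SIG_FLAG_PREFIX or TCP_MD5SIG_FLAG_IFINDEX set
 * in tcpm_flags additionally fills tcpm_prefixlen / tcpm_ifindex, matching
 * the extra checks in this function. A zero tcpm_keylen deletes the key.
 */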
1300
1301 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1302                                    __be32 daddr, __be32 saddr,
1303                                    const struct tcphdr *th, int nbytes)
1304 {
1305         struct tcp4_pseudohdr *bp;
1306         struct scatterlist sg;
1307         struct tcphdr *_th;
1308
1309         bp = hp->scratch;
1310         bp->saddr = saddr;
1311         bp->daddr = daddr;
1312         bp->pad = 0;
1313         bp->protocol = IPPROTO_TCP;
1314         bp->len = cpu_to_be16(nbytes);
1315
1316         _th = (struct tcphdr *)(bp + 1);
1317         memcpy(_th, th, sizeof(*th));
1318         _th->check = 0;
1319
1320         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1321         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1322                                 sizeof(*bp) + sizeof(*th));
1323         return crypto_ahash_update(hp->md5_req);
1324 }
1325
1326 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1327                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1328 {
1329         struct tcp_md5sig_pool *hp;
1330         struct ahash_request *req;
1331
1332         hp = tcp_get_md5sig_pool();
1333         if (!hp)
1334                 goto clear_hash_noput;
1335         req = hp->md5_req;
1336
1337         if (crypto_ahash_init(req))
1338                 goto clear_hash;
1339         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1340                 goto clear_hash;
1341         if (tcp_md5_hash_key(hp, key))
1342                 goto clear_hash;
1343         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1344         if (crypto_ahash_final(req))
1345                 goto clear_hash;
1346
1347         tcp_put_md5sig_pool();
1348         return 0;
1349
1350 clear_hash:
1351         tcp_put_md5sig_pool();
1352 clear_hash_noput:
1353         memset(md5_hash, 0, 16);
1354         return 1;
1355 }
1356
1357 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1358                         const struct sock *sk,
1359                         const struct sk_buff *skb)
1360 {
1361         struct tcp_md5sig_pool *hp;
1362         struct ahash_request *req;
1363         const struct tcphdr *th = tcp_hdr(skb);
1364         __be32 saddr, daddr;
1365
1366         if (sk) { /* valid for establish/request sockets */
1367                 saddr = sk->sk_rcv_saddr;
1368                 daddr = sk->sk_daddr;
1369         } else {
1370                 const struct iphdr *iph = ip_hdr(skb);
1371                 saddr = iph->saddr;
1372                 daddr = iph->daddr;
1373         }
1374
1375         hp = tcp_get_md5sig_pool();
1376         if (!hp)
1377                 goto clear_hash_noput;
1378         req = hp->md5_req;
1379
1380         if (crypto_ahash_init(req))
1381                 goto clear_hash;
1382
1383         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1384                 goto clear_hash;
1385         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1386                 goto clear_hash;
1387         if (tcp_md5_hash_key(hp, key))
1388                 goto clear_hash;
1389         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1390         if (crypto_ahash_final(req))
1391                 goto clear_hash;
1392
1393         tcp_put_md5sig_pool();
1394         return 0;
1395
1396 clear_hash:
1397         tcp_put_md5sig_pool();
1398 clear_hash_noput:
1399         memset(md5_hash, 0, 16);
1400         return 1;
1401 }
1402 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1403
1404 #endif
1405
1406 /* Called with rcu_read_lock() */
1407 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1408                                     const struct sk_buff *skb,
1409                                     int dif, int sdif)
1410 {
1411 #ifdef CONFIG_TCP_MD5SIG
1412         /*
1413          * This gets called for each TCP segment that arrives
1414          * so we want to be efficient.
1415          * We have 3 drop cases:
1416          * o No MD5 hash and one expected.
1417          * o MD5 hash and we're not expecting one.
1418          * o MD5 hash and it's wrong.
1419          */
1420         const __u8 *hash_location = NULL;
1421         struct tcp_md5sig_key *hash_expected;
1422         const struct iphdr *iph = ip_hdr(skb);
1423         const struct tcphdr *th = tcp_hdr(skb);
1424         const union tcp_md5_addr *addr;
1425         unsigned char newhash[16];
1426         int genhash, l3index;
1427
1428         /* sdif set means the packet ingressed via a device
1429          * in an L3 domain, and dif is set to the l3mdev
1430          */
1431         l3index = sdif ? dif : 0;
1432
1433         addr = (union tcp_md5_addr *)&iph->saddr;
1434         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1435         hash_location = tcp_parse_md5sig_option(th);
1436
1437         /* We've parsed the options - do we have a hash? */
1438         if (!hash_expected && !hash_location)
1439                 return false;
1440
1441         if (hash_expected && !hash_location) {
1442                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1443                 return true;
1444         }
1445
1446         if (!hash_expected && hash_location) {
1447                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1448                 return true;
1449         }
1450
1451         /* Okay, so we have both hash_expected and hash_location -
1452          * we need to calculate the hash and compare it.
1453          */
1454         genhash = tcp_v4_md5_hash_skb(newhash,
1455                                       hash_expected,
1456                                       NULL, skb);
1457
1458         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1459                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1460                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1461                                      &iph->saddr, ntohs(th->source),
1462                                      &iph->daddr, ntohs(th->dest),
1463                                      genhash ? " tcp_v4_calc_md5_hash failed"
1464                                      : "", l3index);
1465                 return true;
1466         }
1467         return false;
1468 #endif
1469         return false;
1470 }
1471
1472 static void tcp_v4_init_req(struct request_sock *req,
1473                             const struct sock *sk_listener,
1474                             struct sk_buff *skb)
1475 {
1476         struct inet_request_sock *ireq = inet_rsk(req);
1477         struct net *net = sock_net(sk_listener);
1478
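        /* The request sock stores addresses from the local point of view:
         * its local address is the SYN's destination and its remote
         * address is the SYN's source.
         */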
1479         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1480         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1481         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1482 }
1483
1484 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1485                                           struct sk_buff *skb,
1486                                           struct flowi *fl,
1487                                           struct request_sock *req)
1488 {
1489         tcp_v4_init_req(req, sk, skb);
1490
1491         if (security_inet_conn_request(sk, skb, req))
1492                 return NULL;
1493
1494         return inet_csk_route_req(sk, &fl->u.ip4, req);
1495 }
1496
1497 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1498         .family         =       PF_INET,
1499         .obj_size       =       sizeof(struct tcp_request_sock),
1500         .rtx_syn_ack    =       tcp_rtx_synack,
1501         .send_ack       =       tcp_v4_reqsk_send_ack,
1502         .destructor     =       tcp_v4_reqsk_destructor,
1503         .send_reset     =       tcp_v4_send_reset,
1504         .syn_ack_timeout =      tcp_syn_ack_timeout,
1505 };
1506
1507 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1508         .mss_clamp      =       TCP_MSS_DEFAULT,
1509 #ifdef CONFIG_TCP_MD5SIG
1510         .req_md5_lookup =       tcp_v4_md5_lookup,
1511         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1512 #endif
1513 #ifdef CONFIG_SYN_COOKIES
1514         .cookie_init_seq =      cookie_v4_init_sequence,
1515 #endif
1516         .route_req      =       tcp_v4_route_req,
1517         .init_seq       =       tcp_v4_init_seq,
1518         .init_ts_off    =       tcp_v4_init_ts_off,
1519         .send_synack    =       tcp_v4_send_synack,
1520 };
1521
1522 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1523 {
1524         /* Never answer SYNs sent to broadcast or multicast addresses */
1525         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1526                 goto drop;
1527
1528         return tcp_conn_request(&tcp_request_sock_ops,
1529                                 &tcp_request_sock_ipv4_ops, sk, skb);
1530
1531 drop:
1532         tcp_listendrop(sk);
1533         return 0;
1534 }
1535 EXPORT_SYMBOL(tcp_v4_conn_request);
1536
1537
1538 /*
1539  * The three-way handshake has completed - we got a valid ACK to our
1540  * SYN-ACK - now create the new socket.
1541  */
1542 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1543                                   struct request_sock *req,
1544                                   struct dst_entry *dst,
1545                                   struct request_sock *req_unhash,
1546                                   bool *own_req)
1547 {
1548         struct inet_request_sock *ireq;
1549         bool found_dup_sk = false;
1550         struct inet_sock *newinet;
1551         struct tcp_sock *newtp;
1552         struct sock *newsk;
1553 #ifdef CONFIG_TCP_MD5SIG
1554         const union tcp_md5_addr *addr;
1555         struct tcp_md5sig_key *key;
1556         int l3index;
1557 #endif
1558         struct ip_options_rcu *inet_opt;
1559
1560         if (sk_acceptq_is_full(sk))
1561                 goto exit_overflow;
1562
1563         newsk = tcp_create_openreq_child(sk, req, skb);
1564         if (!newsk)
1565                 goto exit_nonewsk;
1566
1567         newsk->sk_gso_type = SKB_GSO_TCPV4;
1568         inet_sk_rx_dst_set(newsk, skb);
1569
1570         newtp                 = tcp_sk(newsk);
1571         newinet               = inet_sk(newsk);
1572         ireq                  = inet_rsk(req);
1573         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1574         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1575         newsk->sk_bound_dev_if = ireq->ir_iif;
1576         newinet->inet_saddr   = ireq->ir_loc_addr;
1577         inet_opt              = rcu_dereference(ireq->ireq_opt);
1578         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1579         newinet->mc_index     = inet_iif(skb);
1580         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1581         newinet->rcv_tos      = ip_hdr(skb)->tos;
1582         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1583         if (inet_opt)
1584                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1585         newinet->inet_id = prandom_u32();
1586
1587         /* Set ToS of the new socket based upon the value of incoming SYN.
1588          * ECT bits are set later in tcp_init_transfer().
1589          */
1590         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1591                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1592
1593         if (!dst) {
1594                 dst = inet_csk_route_child_sock(sk, newsk, req);
1595                 if (!dst)
1596                         goto put_and_exit;
1597         } else {
1598                 /* syncookie case : see end of cookie_v4_check() */
1599         }
1600         sk_setup_caps(newsk, dst);
1601
1602         tcp_ca_openreq_child(newsk, dst);
1603
1604         tcp_sync_mss(newsk, dst_mtu(dst));
1605         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1606
1607         tcp_initialize_rcv_mss(newsk);
1608
1609 #ifdef CONFIG_TCP_MD5SIG
1610         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1611         /* Copy over the MD5 key from the original socket */
1612         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1613         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1614         if (key) {
1615                 /*
1616                  * We're using one, so create a matching key
1617                  * on the newsk structure. If we fail to get
1618                  * memory, then we end up not copying the key
1619                  * across. Shucks.
1620                  */
1621                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1622                                key->key, key->keylen, GFP_ATOMIC);
1623                 sk_gso_disable(newsk);
1624         }
1625 #endif
1626
1627         if (__inet_inherit_port(sk, newsk) < 0)
1628                 goto put_and_exit;
1629         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1630                                        &found_dup_sk);
1631         if (likely(*own_req)) {
1632                 tcp_move_syn(newtp, req);
1633                 ireq->ireq_opt = NULL;
1634         } else {
1635                 newinet->inet_opt = NULL;
1636
1637                 if (!req_unhash && found_dup_sk) {
1638                         /* This code path should only be executed in the
1639                          * syncookie case
1640                          */
1641                         bh_unlock_sock(newsk);
1642                         sock_put(newsk);
1643                         newsk = NULL;
1644                 }
1645         }
1646         return newsk;
1647
1648 exit_overflow:
1649         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1650 exit_nonewsk:
1651         dst_release(dst);
1652 exit:
1653         tcp_listendrop(sk);
1654         return NULL;
1655 put_and_exit:
1656         newinet->inet_opt = NULL;
1657         inet_csk_prepare_forced_close(newsk);
1658         tcp_done(newsk);
1659         goto exit;
1660 }
1661 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1662
1663 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1664 {
1665 #ifdef CONFIG_SYN_COOKIES
1666         const struct tcphdr *th = tcp_hdr(skb);
1667
1668         if (!th->syn)
1669                 sk = cookie_v4_check(sk, skb);
1670 #endif
1671         return sk;
1672 }
1673
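/* Generate a syncookie outside the regular receive path.  Used by
 * out-of-line callers such as the BPF tcp_gen_syncookie() helper;
 * returns the MSS encoded in the cookie, or 0 if no cookie was made.
 */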
1674 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1675                          struct tcphdr *th, u32 *cookie)
1676 {
1677         u16 mss = 0;
1678 #ifdef CONFIG_SYN_COOKIES
1679         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1680                                     &tcp_request_sock_ipv4_ops, sk, th);
1681         if (mss) {
1682                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1683                 tcp_synq_overflow(sk);
1684         }
1685 #endif
1686         return mss;
1687 }
1688
1689 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1690                                                            u32));
1691 /* The socket must have its spinlock held when we get
1692  * here, unless it is a TCP_LISTEN socket.
1693  *
1694  * We have a potential double-lock case here, so even when
1695  * doing backlog processing we use the BH locking scheme.
1696  * This is because we cannot sleep with the original spinlock
1697  * held.
1698  */
1699 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1700 {
1701         struct sock *rsk;
1702
1703         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1704                 struct dst_entry *dst;
1705
1706                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1707                                                 lockdep_sock_is_held(sk));
1708
1709                 sock_rps_save_rxhash(sk, skb);
1710                 sk_mark_napi_id(sk, skb);
1711                 if (dst) {
1712                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1713                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1714                                              dst, 0)) {
1715                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1716                                 dst_release(dst);
1717                         }
1718                 }
1719                 tcp_rcv_established(sk, skb);
1720                 return 0;
1721         }
1722
1723         if (tcp_checksum_complete(skb))
1724                 goto csum_err;
1725
1726         if (sk->sk_state == TCP_LISTEN) {
1727                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1728
1729                 if (!nsk)
1730                         goto discard;
1731                 if (nsk != sk) {
1732                         if (tcp_child_process(sk, nsk, skb)) {
1733                                 rsk = nsk;
1734                                 goto reset;
1735                         }
1736                         return 0;
1737                 }
1738         } else
1739                 sock_rps_save_rxhash(sk, skb);
1740
1741         if (tcp_rcv_state_process(sk, skb)) {
1742                 rsk = sk;
1743                 goto reset;
1744         }
1745         return 0;
1746
1747 reset:
1748         tcp_v4_send_reset(rsk, skb);
1749 discard:
1750         kfree_skb(skb);
1751         /* Be careful here. If this function gets more complicated and
1752          * gcc suffers from register pressure on the x86, sk (in %ebx)
1753          * might be destroyed here. This current version compiles correctly,
1754          * but you have been warned.
1755          */
1756         return 0;
1757
1758 csum_err:
1759         trace_tcp_bad_csum(skb);
1760         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1761         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1762         goto discard;
1763 }
1764 EXPORT_SYMBOL(tcp_v4_do_rcv);
1765
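/* Early demux: called from the IP input path before the routing decision.
 * Look up an established socket for this segment so the socket's cached
 * rx dst can be attached to the skb and the full socket lookup in
 * tcp_v4_rcv() can be skipped.
 */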
1766 int tcp_v4_early_demux(struct sk_buff *skb)
1767 {
1768         const struct iphdr *iph;
1769         const struct tcphdr *th;
1770         struct sock *sk;
1771
1772         if (skb->pkt_type != PACKET_HOST)
1773                 return 0;
1774
1775         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1776                 return 0;
1777
1778         iph = ip_hdr(skb);
1779         th = tcp_hdr(skb);
1780
1781         if (th->doff < sizeof(struct tcphdr) / 4)
1782                 return 0;
1783
1784         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1785                                        iph->saddr, th->source,
1786                                        iph->daddr, ntohs(th->dest),
1787                                        skb->skb_iif, inet_sdif(skb));
1788         if (sk) {
1789                 skb->sk = sk;
1790                 skb->destructor = sock_edemux;
1791                 if (sk_fullsock(sk)) {
1792                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1793
1794                         if (dst)
1795                                 dst = dst_check(dst, 0);
1796                         if (dst &&
1797                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1798                                 skb_dst_set_noref(skb, dst);
1799                 }
1800         }
1801         return 0;
1802 }
1803
1804 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1805 {
1806         u32 limit, tail_gso_size, tail_gso_segs;
1807         struct skb_shared_info *shinfo;
1808         const struct tcphdr *th;
1809         struct tcphdr *thtail;
1810         struct sk_buff *tail;
1811         unsigned int hdrlen;
1812         bool fragstolen;
1813         u32 gso_segs;
1814         u32 gso_size;
1815         int delta;
1816
1817         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1818          * we can fix skb->truesize to its real value to avoid future drops.
1819          * This is valid because skb is not yet charged to the socket.
1820          * It has been noticed that pure SACK packets were sometimes dropped
1821          * (if cooked by drivers without the copybreak feature).
1822          */
1823         skb_condense(skb);
1824
1825         skb_dst_drop(skb);
1826
1827         if (unlikely(tcp_checksum_complete(skb))) {
1828                 bh_unlock_sock(sk);
1829                 trace_tcp_bad_csum(skb);
1830                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1831                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1832                 return true;
1833         }
1834
1835         /* Attempt coalescing to last skb in backlog, even if we are
1836          * above the limits.
1837          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1838          */
1839         th = (const struct tcphdr *)skb->data;
1840         hdrlen = th->doff * 4;
1841
1842         tail = sk->sk_backlog.tail;
1843         if (!tail)
1844                 goto no_coalesce;
1845         thtail = (struct tcphdr *)tail->data;
1846
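        /* Coalescing is only attempted when the new segment directly
         * follows the tail (tail->end_seq == skb->seq), the DSCP matches,
         * neither segment carries SYN/RST/URG, both carry ACK, ECE/CWR
         * agree, the (TLS) decrypted state matches and the TCP options
         * are byte-identical.
         */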
1847         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1848             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1849             ((TCP_SKB_CB(tail)->tcp_flags |
1850               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1851             !((TCP_SKB_CB(tail)->tcp_flags &
1852               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1853             ((TCP_SKB_CB(tail)->tcp_flags ^
1854               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1855 #ifdef CONFIG_TLS_DEVICE
1856             tail->decrypted != skb->decrypted ||
1857 #endif
1858             thtail->doff != th->doff ||
1859             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1860                 goto no_coalesce;
1861
1862         __skb_pull(skb, hdrlen);
1863
1864         shinfo = skb_shinfo(skb);
1865         gso_size = shinfo->gso_size ?: skb->len;
1866         gso_segs = shinfo->gso_segs ?: 1;
1867
1868         shinfo = skb_shinfo(tail);
1869         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1870         tail_gso_segs = shinfo->gso_segs ?: 1;
1871
1872         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1873                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1874
1875                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1876                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1877                         thtail->window = th->window;
1878                 }
1879
1880                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1881                  * thtail->fin, so that the fast path in tcp_rcv_established()
1882                  * is not entered if we append a packet with a FIN.
1883                  * SYN, RST, URG are not present.
1884                  * ACK is set on both packets.
1885                  * PSH : we do not really care in TCP stack,
1886                  *       at least for 'GRO' packets.
1887                  */
1888                 thtail->fin |= th->fin;
1889                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1890
1891                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1892                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1893                         tail->tstamp = skb->tstamp;
1894                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1895                 }
1896
1897                 /* Not as strict as GRO. We only need to carry the max mss value */
1898                 shinfo->gso_size = max(gso_size, tail_gso_size);
1899                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1900
1901                 sk->sk_backlog.len += delta;
1902                 __NET_INC_STATS(sock_net(sk),
1903                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1904                 kfree_skb_partial(skb, fragstolen);
1905                 return false;
1906         }
1907         __skb_push(skb, hdrlen);
1908
1909 no_coalesce:
1910         /* Only the socket owner can try to collapse/prune rx queues
1911          * to reduce memory overhead, so add a little headroom here.
1912          * Only a few socket backlogs are likely to be non-empty concurrently.
1913          */
1914         limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1915
1916         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1917                 bh_unlock_sock(sk);
1918                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1919                 return true;
1920         }
1921         return false;
1922 }
1923 EXPORT_SYMBOL(tcp_add_backlog);
1924
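/* Run the attached socket filter (if any) on the skb.  The cap of
 * th->doff * 4 guarantees the filter cannot trim the packet below the
 * TCP header.
 */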
1925 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1926 {
1927         struct tcphdr *th = (struct tcphdr *)skb->data;
1928
1929         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1930 }
1931 EXPORT_SYMBOL(tcp_filter);
1932
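/* Undo tcp_v4_fill_cb(): restore the IP control block it overwrote, so
 * the skb can be fed to another socket that may still need the original
 * IPCB (e.g. to read IP options).
 */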
1933 static void tcp_v4_restore_cb(struct sk_buff *skb)
1934 {
1935         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1936                 sizeof(struct inet_skb_parm));
1937 }
1938
1939 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1940                            const struct tcphdr *th)
1941 {
1942         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1943          * barrier() makes sure the compiler won't play fool^Waliasing games.
1944          */
1945         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1946                 sizeof(struct inet_skb_parm));
1947         barrier();
1948
1949         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1950         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
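        /* end_seq counts SYN and FIN as one unit of sequence space each,
         * in addition to the payload length.
         */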
1951                                     skb->len - th->doff * 4);
1952         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1953         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1954         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1955         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1956         TCP_SKB_CB(skb)->sacked  = 0;
1957         TCP_SKB_CB(skb)->has_rxtstamp =
1958                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1959 }
1960
1961 /*
1962  *      From tcp_input.c
1963  */
1964
1965 int tcp_v4_rcv(struct sk_buff *skb)
1966 {
1967         struct net *net = dev_net(skb->dev);
1968         int sdif = inet_sdif(skb);
1969         int dif = inet_iif(skb);
1970         const struct iphdr *iph;
1971         const struct tcphdr *th;
1972         bool refcounted;
1973         struct sock *sk;
1974         int drop_reason;
1975         int ret;
1976
1977         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1978         if (skb->pkt_type != PACKET_HOST)
1979                 goto discard_it;
1980
1981         /* Count it even if it's bad */
1982         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1983
1984         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1985                 goto discard_it;
1986
1987         th = (const struct tcphdr *)skb->data;
1988
1989         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1990                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1991                 goto bad_packet;
1992         }
1993         if (!pskb_may_pull(skb, th->doff * 4))
1994                 goto discard_it;
1995
1996         /* An explanation is required here, I think.
1997          * Packet length and doff are validated by header prediction,
1998          * provided the case of th->doff==0 is eliminated.
1999          * So, we defer the checks. */
2000
2001         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2002                 goto csum_error;
2003
2004         th = (const struct tcphdr *)skb->data;
2005         iph = ip_hdr(skb);
2006 lookup:
2007         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2008                                th->dest, sdif, &refcounted);
2009         if (!sk)
2010                 goto no_tcp_socket;
2011
2012 process:
2013         if (sk->sk_state == TCP_TIME_WAIT)
2014                 goto do_time_wait;
2015
2016         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2017                 struct request_sock *req = inet_reqsk(sk);
2018                 bool req_stolen = false;
2019                 struct sock *nsk;
2020
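                /* The lookup hit a request sock: the 3WHS is not complete
                 * yet.  Process the segment on behalf of the listener (or
                 * its reuseport-migrated successor); tcp_check_req() below
                 * either creates the child socket or reports that another
                 * CPU stole the req.
                 */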
2021                 sk = req->rsk_listener;
2022                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2023                         sk_drops_add(sk, skb);
2024                         reqsk_put(req);
2025                         goto discard_it;
2026                 }
2027                 if (tcp_checksum_complete(skb)) {
2028                         reqsk_put(req);
2029                         goto csum_error;
2030                 }
2031                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2032                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2033                         if (!nsk) {
2034                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2035                                 goto lookup;
2036                         }
2037                         sk = nsk;
2038                         /* reuseport_migrate_sock() has already held one sk_refcnt
2039                          * before returning.
2040                          */
2041                 } else {
2042                         /* We own a reference on the listener, increase it again
2043                          * as we might lose it too soon.
2044                          */
2045                         sock_hold(sk);
2046                 }
2047                 refcounted = true;
2048                 nsk = NULL;
2049                 if (!tcp_filter(sk, skb)) {
2050                         th = (const struct tcphdr *)skb->data;
2051                         iph = ip_hdr(skb);
2052                         tcp_v4_fill_cb(skb, iph, th);
2053                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2054                 }
2055                 if (!nsk) {
2056                         reqsk_put(req);
2057                         if (req_stolen) {
2058                                 /* Another cpu got exclusive access to req
2059                                  * and created a full-blown socket.
2060                                  * Try to feed this packet to this socket
2061                                  * instead of discarding it.
2062                                  */
2063                                 tcp_v4_restore_cb(skb);
2064                                 sock_put(sk);
2065                                 goto lookup;
2066                         }
2067                         goto discard_and_relse;
2068                 }
2069                 if (nsk == sk) {
2070                         reqsk_put(req);
2071                         tcp_v4_restore_cb(skb);
2072                 } else if (tcp_child_process(sk, nsk, skb)) {
2073                         tcp_v4_send_reset(nsk, skb);
2074                         goto discard_and_relse;
2075                 } else {
2076                         sock_put(sk);
2077                         return 0;
2078                 }
2079         }
2080
2081         if (static_branch_unlikely(&ip4_min_ttl)) {
2082                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2083                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2084                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2085                         goto discard_and_relse;
2086                 }
2087         }
2088
2089         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2090                 goto discard_and_relse;
2091
2092         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2093                 goto discard_and_relse;
2094
2095         nf_reset_ct(skb);
2096
2097         if (tcp_filter(sk, skb)) {
2098                 drop_reason = SKB_DROP_REASON_TCP_FILTER;
2099                 goto discard_and_relse;
2100         }
2101         th = (const struct tcphdr *)skb->data;
2102         iph = ip_hdr(skb);
2103         tcp_v4_fill_cb(skb, iph, th);
2104
2105         skb->dev = NULL;
2106
2107         if (sk->sk_state == TCP_LISTEN) {
2108                 ret = tcp_v4_do_rcv(sk, skb);
2109                 goto put_and_return;
2110         }
2111
2112         sk_incoming_cpu_update(sk);
2113
2114         sk_defer_free_flush(sk);
2115         bh_lock_sock_nested(sk);
2116         tcp_segs_in(tcp_sk(sk), skb);
2117         ret = 0;
2118         if (!sock_owned_by_user(sk)) {
2119                 ret = tcp_v4_do_rcv(sk, skb);
2120         } else {
2121                 if (tcp_add_backlog(sk, skb))
2122                         goto discard_and_relse;
2123         }
2124         bh_unlock_sock(sk);
2125
2126 put_and_return:
2127         if (refcounted)
2128                 sock_put(sk);
2129
2130         return ret;
2131
2132 no_tcp_socket:
2133         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2134         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2135                 goto discard_it;
2136
2137         tcp_v4_fill_cb(skb, iph, th);
2138
2139         if (tcp_checksum_complete(skb)) {
2140 csum_error:
2141                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2142                 trace_tcp_bad_csum(skb);
2143                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2144 bad_packet:
2145                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2146         } else {
2147                 tcp_v4_send_reset(NULL, skb);
2148         }
2149
2150 discard_it:
2151         /* Discard frame. */
2152         kfree_skb_reason(skb, drop_reason);
2153         return 0;
2154
2155 discard_and_relse:
2156         sk_drops_add(sk, skb);
2157         if (refcounted)
2158                 sock_put(sk);
2159         goto discard_it;
2160
2161 do_time_wait:
2162         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2163                 inet_twsk_put(inet_twsk(sk));
2164                 goto discard_it;
2165         }
2166
2167         tcp_v4_fill_cb(skb, iph, th);
2168
2169         if (tcp_checksum_complete(skb)) {
2170                 inet_twsk_put(inet_twsk(sk));
2171                 goto csum_error;
2172         }
2173         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2174         case TCP_TW_SYN: {
2175                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2176                                                         &tcp_hashinfo, skb,
2177                                                         __tcp_hdrlen(th),
2178                                                         iph->saddr, th->source,
2179                                                         iph->daddr, th->dest,
2180                                                         inet_iif(skb),
2181                                                         sdif);
2182                 if (sk2) {
2183                         inet_twsk_deschedule_put(inet_twsk(sk));
2184                         sk = sk2;
2185                         tcp_v4_restore_cb(skb);
2186                         refcounted = false;
2187                         goto process;
2188                 }
2189         }
2190                 /* to ACK */
2191                 fallthrough;
2192         case TCP_TW_ACK:
2193                 tcp_v4_timewait_ack(sk, skb);
2194                 break;
2195         case TCP_TW_RST:
2196                 tcp_v4_send_reset(sk, skb);
2197                 inet_twsk_deschedule_put(inet_twsk(sk));
2198                 goto discard_it;
2199         case TCP_TW_SUCCESS:;
2200         }
2201         goto discard_it;
2202 }
2203
2204 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2205         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2206         .twsk_unique    = tcp_twsk_unique,
2207         .twsk_destructor= tcp_twsk_destructor,
2208 };
2209
2210 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2211 {
2212         struct dst_entry *dst = skb_dst(skb);
2213
2214         if (dst && dst_hold_safe(dst)) {
2215                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2216                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2217         }
2218 }
2219 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2220
2221 const struct inet_connection_sock_af_ops ipv4_specific = {
2222         .queue_xmit        = ip_queue_xmit,
2223         .send_check        = tcp_v4_send_check,
2224         .rebuild_header    = inet_sk_rebuild_header,
2225         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2226         .conn_request      = tcp_v4_conn_request,
2227         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2228         .net_header_len    = sizeof(struct iphdr),
2229         .setsockopt        = ip_setsockopt,
2230         .getsockopt        = ip_getsockopt,
2231         .addr2sockaddr     = inet_csk_addr2sockaddr,
2232         .sockaddr_len      = sizeof(struct sockaddr_in),
2233         .mtu_reduced       = tcp_v4_mtu_reduced,
2234 };
2235 EXPORT_SYMBOL(ipv4_specific);
2236
2237 #ifdef CONFIG_TCP_MD5SIG
2238 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2239         .md5_lookup             = tcp_v4_md5_lookup,
2240         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2241         .md5_parse              = tcp_v4_parse_md5_keys,
2242 };
2243 #endif
2244
2245 /* NOTE: A lot of things are set to zero explicitly by the call to
2246  *       sk_alloc(), so they need not be done here.
2247  */
2248 static int tcp_v4_init_sock(struct sock *sk)
2249 {
2250         struct inet_connection_sock *icsk = inet_csk(sk);
2251
2252         tcp_init_sock(sk);
2253
2254         icsk->icsk_af_ops = &ipv4_specific;
2255
2256 #ifdef CONFIG_TCP_MD5SIG
2257         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2258 #endif
2259
2260         return 0;
2261 }
2262
2263 void tcp_v4_destroy_sock(struct sock *sk)
2264 {
2265         struct tcp_sock *tp = tcp_sk(sk);
2266
2267         trace_tcp_destroy_sock(sk);
2268
2269         tcp_clear_xmit_timers(sk);
2270
2271         tcp_cleanup_congestion_control(sk);
2272
2273         tcp_cleanup_ulp(sk);
2274
2275         /* Clean up the write buffer. */
2276         tcp_write_queue_purge(sk);
2277
2278         /* Check if we want to disable active TFO */
2279         tcp_fastopen_active_disable_ofo_check(sk);
2280
2281         /* Cleans up our, hopefully empty, out_of_order_queue. */
2282         skb_rbtree_purge(&tp->out_of_order_queue);
2283
2284 #ifdef CONFIG_TCP_MD5SIG
2285         /* Clean up the MD5 key list, if any */
2286         if (tp->md5sig_info) {
2287                 tcp_clear_md5_list(sk);
2288                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2289                 tp->md5sig_info = NULL;
2290         }
2291 #endif
2292
2293         /* Clean up a referenced TCP bind bucket. */
2294         if (inet_csk(sk)->icsk_bind_hash)
2295                 inet_put_port(sk);
2296
2297         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2298
2299         /* If socket is aborted during connect operation */
2300         tcp_free_fastopen_req(tp);
2301         tcp_fastopen_destroy_cipher(sk);
2302         tcp_saved_syn_free(tp);
2303
2304         sk_sockets_allocated_dec(sk);
2305 }
2306 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2307
2308 #ifdef CONFIG_PROC_FS
2309 /* Proc filesystem TCP sock list dumping. */
2310
2311 static unsigned short seq_file_family(const struct seq_file *seq);
2312
2313 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2314 {
2315         unsigned short family = seq_file_family(seq);
2316
2317         /* AF_UNSPEC is used as a match all */
2318         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2319                 net_eq(sock_net(sk), seq_file_net(seq)));
2320 }
2321
2322 /* Find a non-empty bucket (starting from st->bucket)
2323  * and return the first sk from it.
2324  */
2325 static void *listening_get_first(struct seq_file *seq)
2326 {
2327         struct tcp_iter_state *st = seq->private;
2328
2329         st->offset = 0;
2330         for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2331                 struct inet_listen_hashbucket *ilb2;
2332                 struct inet_connection_sock *icsk;
2333                 struct sock *sk;
2334
2335                 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2336                 if (hlist_empty(&ilb2->head))
2337                         continue;
2338
2339                 spin_lock(&ilb2->lock);
2340                 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2341                         sk = (struct sock *)icsk;
2342                         if (seq_sk_match(seq, sk))
2343                                 return sk;
2344                 }
2345                 spin_unlock(&ilb2->lock);
2346         }
2347
2348         return NULL;
2349 }
2350
2351 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2352  * If "cur" is the last one in the st->bucket,
2353  * call listening_get_first() to return the first sk of the next
2354  * non-empty bucket.
2355  */
2356 static void *listening_get_next(struct seq_file *seq, void *cur)
2357 {
2358         struct tcp_iter_state *st = seq->private;
2359         struct inet_listen_hashbucket *ilb2;
2360         struct inet_connection_sock *icsk;
2361         struct sock *sk = cur;
2362
2363         ++st->num;
2364         ++st->offset;
2365
2366         icsk = inet_csk(sk);
2367         inet_lhash2_for_each_icsk_continue(icsk) {
2368                 sk = (struct sock *)icsk;
2369                 if (seq_sk_match(seq, sk))
2370                         return sk;
2371         }
2372
2373         ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2374         spin_unlock(&ilb2->lock);
2375         ++st->bucket;
2376         return listening_get_first(seq);
2377 }
2378
2379 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2380 {
2381         struct tcp_iter_state *st = seq->private;
2382         void *rc;
2383
2384         st->bucket = 0;
2385         st->offset = 0;
2386         rc = listening_get_first(seq);
2387
2388         while (rc && *pos) {
2389                 rc = listening_get_next(seq, rc);
2390                 --*pos;
2391         }
2392         return rc;
2393 }
2394
2395 static inline bool empty_bucket(const struct tcp_iter_state *st)
2396 {
2397         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2398 }
2399
2400 /*
2401  * Get first established socket starting from bucket given in st->bucket.
2402  * If st->bucket is zero, the very first socket in the hash is returned.
2403  */
2404 static void *established_get_first(struct seq_file *seq)
2405 {
2406         struct tcp_iter_state *st = seq->private;
2407
2408         st->offset = 0;
2409         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2410                 struct sock *sk;
2411                 struct hlist_nulls_node *node;
2412                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2413
2414                 /* Lockless fast path for the common case of empty buckets */
2415                 if (empty_bucket(st))
2416                         continue;
2417
2418                 spin_lock_bh(lock);
2419                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2420                         if (seq_sk_match(seq, sk))
2421                                 return sk;
2422                 }
2423                 spin_unlock_bh(lock);
2424         }
2425
2426         return NULL;
2427 }
2428
2429 static void *established_get_next(struct seq_file *seq, void *cur)
2430 {
2431         struct sock *sk = cur;
2432         struct hlist_nulls_node *node;
2433         struct tcp_iter_state *st = seq->private;
2434
2435         ++st->num;
2436         ++st->offset;
2437
2438         sk = sk_nulls_next(sk);
2439
2440         sk_nulls_for_each_from(sk, node) {
2441                 if (seq_sk_match(seq, sk))
2442                         return sk;
2443         }
2444
2445         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2446         ++st->bucket;
2447         return established_get_first(seq);
2448 }
2449
2450 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2451 {
2452         struct tcp_iter_state *st = seq->private;
2453         void *rc;
2454
2455         st->bucket = 0;
2456         rc = established_get_first(seq);
2457
2458         while (rc && pos) {
2459                 rc = established_get_next(seq, rc);
2460                 --pos;
2461         }
2462         return rc;
2463 }
2464
2465 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2466 {
2467         void *rc;
2468         struct tcp_iter_state *st = seq->private;
2469
2470         st->state = TCP_SEQ_STATE_LISTENING;
2471         rc        = listening_get_idx(seq, &pos);
2472
2473         if (!rc) {
2474                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2475                 rc        = established_get_idx(seq, pos);
2476         }
2477
2478         return rc;
2479 }
2480
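/* Resume iteration at the bucket/offset remembered from the previous
 * read of the seq_file, so a chunked read of /proc/net/tcp does not
 * restart the walk from the beginning.
 */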
2481 static void *tcp_seek_last_pos(struct seq_file *seq)
2482 {
2483         struct tcp_iter_state *st = seq->private;
2484         int bucket = st->bucket;
2485         int offset = st->offset;
2486         int orig_num = st->num;
2487         void *rc = NULL;
2488
2489         switch (st->state) {
2490         case TCP_SEQ_STATE_LISTENING:
2491                 if (st->bucket > tcp_hashinfo.lhash2_mask)
2492                         break;
2493                 st->state = TCP_SEQ_STATE_LISTENING;
2494                 rc = listening_get_first(seq);
2495                 while (offset-- && rc && bucket == st->bucket)
2496                         rc = listening_get_next(seq, rc);
2497                 if (rc)
2498                         break;
2499                 st->bucket = 0;
2500                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2501                 fallthrough;
2502         case TCP_SEQ_STATE_ESTABLISHED:
2503                 if (st->bucket > tcp_hashinfo.ehash_mask)
2504                         break;
2505                 rc = established_get_first(seq);
2506                 while (offset-- && rc && bucket == st->bucket)
2507                         rc = established_get_next(seq, rc);
2508         }
2509
2510         st->num = orig_num;
2511
2512         return rc;
2513 }
2514
2515 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2516 {
2517         struct tcp_iter_state *st = seq->private;
2518         void *rc;
2519
2520         if (*pos && *pos == st->last_pos) {
2521                 rc = tcp_seek_last_pos(seq);
2522                 if (rc)
2523                         goto out;
2524         }
2525
2526         st->state = TCP_SEQ_STATE_LISTENING;
2527         st->num = 0;
2528         st->bucket = 0;
2529         st->offset = 0;
2530         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2531
2532 out:
2533         st->last_pos = *pos;
2534         return rc;
2535 }
2536 EXPORT_SYMBOL(tcp_seq_start);
2537
2538 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2539 {
2540         struct tcp_iter_state *st = seq->private;
2541         void *rc = NULL;
2542
2543         if (v == SEQ_START_TOKEN) {
2544                 rc = tcp_get_idx(seq, 0);
2545                 goto out;
2546         }
2547
2548         switch (st->state) {
2549         case TCP_SEQ_STATE_LISTENING:
2550                 rc = listening_get_next(seq, v);
2551                 if (!rc) {
2552                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2553                         st->bucket = 0;
2554                         st->offset = 0;
2555                         rc        = established_get_first(seq);
2556                 }
2557                 break;
2558         case TCP_SEQ_STATE_ESTABLISHED:
2559                 rc = established_get_next(seq, v);
2560                 break;
2561         }
2562 out:
2563         ++*pos;
2564         st->last_pos = *pos;
2565         return rc;
2566 }
2567 EXPORT_SYMBOL(tcp_seq_next);
2568
2569 void tcp_seq_stop(struct seq_file *seq, void *v)
2570 {
2571         struct tcp_iter_state *st = seq->private;
2572
2573         switch (st->state) {
2574         case TCP_SEQ_STATE_LISTENING:
2575                 if (v != SEQ_START_TOKEN)
2576                         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2577                 break;
2578         case TCP_SEQ_STATE_ESTABLISHED:
2579                 if (v)
2580                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2581                 break;
2582         }
2583 }
2584 EXPORT_SYMBOL(tcp_seq_stop);
2585
2586 static void get_openreq4(const struct request_sock *req,
2587                          struct seq_file *f, int i)
2588 {
2589         const struct inet_request_sock *ireq = inet_rsk(req);
2590         long delta = req->rsk_timer.expires - jiffies;
2591
2592         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2593                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2594                 i,
2595                 ireq->ir_loc_addr,
2596                 ireq->ir_num,
2597                 ireq->ir_rmt_addr,
2598                 ntohs(ireq->ir_rmt_port),
2599                 TCP_SYN_RECV,
2600                 0, 0, /* could print option size, but that is af dependent. */
2601                 1,    /* timers active (only the expire timer) */
2602                 jiffies_delta_to_clock_t(delta),
2603                 req->num_timeout,
2604                 from_kuid_munged(seq_user_ns(f),
2605                                  sock_i_uid(req->rsk_listener)),
2606                 0,  /* non standard timer */
2607                 0, /* open_requests have no inode */
2608                 0,
2609                 req);
2610 }
2611
2612 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2613 {
2614         int timer_active;
2615         unsigned long timer_expires;
2616         const struct tcp_sock *tp = tcp_sk(sk);
2617         const struct inet_connection_sock *icsk = inet_csk(sk);
2618         const struct inet_sock *inet = inet_sk(sk);
2619         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2620         __be32 dest = inet->inet_daddr;
2621         __be32 src = inet->inet_rcv_saddr;
2622         __u16 destp = ntohs(inet->inet_dport);
2623         __u16 srcp = ntohs(inet->inet_sport);
2624         int rx_queue;
2625         int state;
2626
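        /* timer_active codes reported in /proc/net/tcp:
         * 1 - retransmit/loss-probe timer, 2 - keepalive (sk_timer),
         * 4 - zero window probe timer, 0 - no timer pending.
         */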
2627         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2628             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2629             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2630                 timer_active    = 1;
2631                 timer_expires   = icsk->icsk_timeout;
2632         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2633                 timer_active    = 4;
2634                 timer_expires   = icsk->icsk_timeout;
2635         } else if (timer_pending(&sk->sk_timer)) {
2636                 timer_active    = 2;
2637                 timer_expires   = sk->sk_timer.expires;
2638         } else {
2639                 timer_active    = 0;
2640                 timer_expires = jiffies;
2641         }
2642
2643         state = inet_sk_state_load(sk);
2644         if (state == TCP_LISTEN)
2645                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2646         else
2647                 /* Because we don't lock the socket,
2648                  * we might find a transient negative value.
2649                  */
2650                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2651                                       READ_ONCE(tp->copied_seq), 0);
2652
2653         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2654                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2655                 i, src, srcp, dest, destp, state,
2656                 READ_ONCE(tp->write_seq) - tp->snd_una,
2657                 rx_queue,
2658                 timer_active,
2659                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2660                 icsk->icsk_retransmits,
2661                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2662                 icsk->icsk_probes_out,
2663                 sock_i_ino(sk),
2664                 refcount_read(&sk->sk_refcnt), sk,
2665                 jiffies_to_clock_t(icsk->icsk_rto),
2666                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2667                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2668                 tp->snd_cwnd,
2669                 state == TCP_LISTEN ?
2670                     fastopenq->max_qlen :
2671                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2672 }
2673
2674 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2675                                struct seq_file *f, int i)
2676 {
2677         long delta = tw->tw_timer.expires - jiffies;
2678         __be32 dest, src;
2679         __u16 destp, srcp;
2680
2681         dest  = tw->tw_daddr;
2682         src   = tw->tw_rcv_saddr;
2683         destp = ntohs(tw->tw_dport);
2684         srcp  = ntohs(tw->tw_sport);
2685
2686         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2687                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2688                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2689                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2690                 refcount_read(&tw->tw_refcnt), tw);
2691 }
2692
2693 #define TMPSZ 150
2694
2695 static int tcp4_seq_show(struct seq_file *seq, void *v)
2696 {
2697         struct tcp_iter_state *st;
2698         struct sock *sk = v;
2699
2700         seq_setwidth(seq, TMPSZ - 1);
2701         if (v == SEQ_START_TOKEN) {
2702                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2703                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2704                            "inode");
2705                 goto out;
2706         }
2707         st = seq->private;
2708
2709         if (sk->sk_state == TCP_TIME_WAIT)
2710                 get_timewait4_sock(v, seq, st->num);
2711         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2712                 get_openreq4(v, seq, st->num);
2713         else
2714                 get_tcp4_sock(v, seq, st->num);
2715 out:
2716         seq_pad(seq, '\n');
2717         return 0;
2718 }
2719
2720 #ifdef CONFIG_BPF_SYSCALL
2721 struct bpf_tcp_iter_state {
2722         struct tcp_iter_state state;
2723         unsigned int cur_sk;
2724         unsigned int end_sk;
2725         unsigned int max_sk;
2726         struct sock **batch;
2727         bool st_bucket_done;
2728 };
2729
2730 struct bpf_iter__tcp {
2731         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2732         __bpf_md_ptr(struct sock_common *, sk_common);
2733         uid_t uid __aligned(8);
2734 };
2735
2736 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2737                              struct sock_common *sk_common, uid_t uid)
2738 {
2739         struct bpf_iter__tcp ctx;
2740
2741         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2742         ctx.meta = meta;
2743         ctx.sk_common = sk_common;
2744         ctx.uid = uid;
2745         return bpf_iter_run_prog(prog, &ctx);
2746 }
2747
2748 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2749 {
2750         while (iter->cur_sk < iter->end_sk)
2751                 sock_put(iter->batch[iter->cur_sk++]);
2752 }
2753
2754 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2755                                       unsigned int new_batch_sz)
2756 {
2757         struct sock **new_batch;
2758
2759         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2760                              GFP_USER | __GFP_NOWARN);
2761         if (!new_batch)
2762                 return -ENOMEM;
2763
2764         bpf_iter_tcp_put_batch(iter);
2765         kvfree(iter->batch);
2766         iter->batch = new_batch;
2767         iter->max_sk = new_batch_sz;
2768
2769         return 0;
2770 }
2771
2772 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2773                                                  struct sock *start_sk)
2774 {
2775         struct bpf_tcp_iter_state *iter = seq->private;
2776         struct tcp_iter_state *st = &iter->state;
2777         struct inet_connection_sock *icsk;
2778         unsigned int expected = 1;
2779         struct sock *sk;
2780
2781         sock_hold(start_sk);
2782         iter->batch[iter->end_sk++] = start_sk;
2783
2784         icsk = inet_csk(start_sk);
2785         inet_lhash2_for_each_icsk_continue(icsk) {
2786                 sk = (struct sock *)icsk;
2787                 if (seq_sk_match(seq, sk)) {
2788                         if (iter->end_sk < iter->max_sk) {
2789                                 sock_hold(sk);
2790                                 iter->batch[iter->end_sk++] = sk;
2791                         }
2792                         expected++;
2793                 }
2794         }
2795         spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2796
2797         return expected;
2798 }
2799
2800 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2801                                                    struct sock *start_sk)
2802 {
2803         struct bpf_tcp_iter_state *iter = seq->private;
2804         struct tcp_iter_state *st = &iter->state;
2805         struct hlist_nulls_node *node;
2806         unsigned int expected = 1;
2807         struct sock *sk;
2808
2809         sock_hold(start_sk);
2810         iter->batch[iter->end_sk++] = start_sk;
2811
2812         sk = sk_nulls_next(start_sk);
2813         sk_nulls_for_each_from(sk, node) {
2814                 if (seq_sk_match(seq, sk)) {
2815                         if (iter->end_sk < iter->max_sk) {
2816                                 sock_hold(sk);
2817                                 iter->batch[iter->end_sk++] = sk;
2818                         }
2819                         expected++;
2820                 }
2821         }
2822         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2823
2824         return expected;
2825 }
2826
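/* Grab a reference on every matching socket of the current bucket into
 * iter->batch so the bucket lock can be dropped before the BPF program
 * runs on each of them (the program may lock the socket and sleep).  If
 * the batch array turns out too small, it is grown and the bucket is
 * re-walked once.
 */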
2827 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2828 {
2829         struct bpf_tcp_iter_state *iter = seq->private;
2830         struct tcp_iter_state *st = &iter->state;
2831         unsigned int expected;
2832         bool resized = false;
2833         struct sock *sk;
2834
2835         /* The st->bucket is done.  Directly advance to the next
2836          * bucket instead of having tcp_seek_last_pos() skip sockets
2837          * one by one in the current bucket, only to find out
2838          * it has to advance to the next bucket.
2839          */
2840         if (iter->st_bucket_done) {
2841                 st->offset = 0;
2842                 st->bucket++;
2843                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2844                     st->bucket > tcp_hashinfo.lhash2_mask) {
2845                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2846                         st->bucket = 0;
2847                 }
2848         }
2849
2850 again:
2851         /* Get a new batch */
2852         iter->cur_sk = 0;
2853         iter->end_sk = 0;
2854         iter->st_bucket_done = false;
2855
2856         sk = tcp_seek_last_pos(seq);
2857         if (!sk)
2858                 return NULL; /* Done */
2859
2860         if (st->state == TCP_SEQ_STATE_LISTENING)
2861                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2862         else
2863                 expected = bpf_iter_tcp_established_batch(seq, sk);
2864
2865         if (iter->end_sk == expected) {
2866                 iter->st_bucket_done = true;
2867                 return sk;
2868         }
2869
2870         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2871                 resized = true;
2872                 goto again;
2873         }
2874
2875         return sk;
2876 }
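/* A worked illustration of the batching above (the numbers are only an
 * example): with INIT_BATCH_SZ == 16 and a bucket holding 24 matching
 * sockets, the first pass fills all 16 slots and reports expected == 24,
 * so the batch array is grown to 24 * 3 / 2 == 36 entries and the bucket
 * is walked again from the saved offset.  If the bucket has not changed
 * in the meantime, the second pass batches all 24 sockets and
 * st_bucket_done is set; otherwise a partial batch is returned and the
 * remaining sockets are picked up on a later call.
 */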
2877
2878 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2879 {
2880         /* bpf iter does not support lseek, so it always
2881          * continues from where it was stop()-ped.
2882          */
2883         if (*pos)
2884                 return bpf_iter_tcp_batch(seq);
2885
2886         return SEQ_START_TOKEN;
2887 }
2888
2889 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2890 {
2891         struct bpf_tcp_iter_state *iter = seq->private;
2892         struct tcp_iter_state *st = &iter->state;
2893         struct sock *sk;
2894
2895         /* Whenever seq_next() is called, the sk at iter->cur_sk has
2896          * already been shown by seq_show(), so advance to the next
2897          * sk in the batch.
2898          */
2899         if (iter->cur_sk < iter->end_sk) {
2900                 /* Keeping st->num consistent in tcp_iter_state.
2901                  * bpf_iter_tcp does not use st->num.
2902                  * meta.seq_num is used instead.
2903                  */
2904                 st->num++;
2905                 /* Move st->offset to the next sk in the bucket such that
2906                  * the future start() will resume at st->offset in
2907                  * st->bucket.  See tcp_seek_last_pos().
2908                  */
2909                 st->offset++;
2910                 sock_put(iter->batch[iter->cur_sk++]);
2911         }
2912
2913         if (iter->cur_sk < iter->end_sk)
2914                 sk = iter->batch[iter->cur_sk];
2915         else
2916                 sk = bpf_iter_tcp_batch(seq);
2917
2918         ++*pos;
2919         /* Keeping st->last_pos consistent in tcp_iter_state.
2920          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2921          */
2922         st->last_pos = *pos;
2923         return sk;
2924 }
2925
2926 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2927 {
2928         struct bpf_iter_meta meta;
2929         struct bpf_prog *prog;
2930         struct sock *sk = v;
2931         bool slow;
2932         uid_t uid;
2933         int ret;
2934
2935         if (v == SEQ_START_TOKEN)
2936                 return 0;
2937
2938         if (sk_fullsock(sk))
2939                 slow = lock_sock_fast(sk);
2940
2941         if (unlikely(sk_unhashed(sk))) {
2942                 ret = SEQ_SKIP;
2943                 goto unlock;
2944         }
2945
2946         if (sk->sk_state == TCP_TIME_WAIT) {
2947                 uid = 0;
2948         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2949                 const struct request_sock *req = v;
2950
2951                 uid = from_kuid_munged(seq_user_ns(seq),
2952                                        sock_i_uid(req->rsk_listener));
2953         } else {
2954                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2955         }
2956
2957         meta.seq = seq;
2958         prog = bpf_iter_get_info(&meta, false);
2959         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2960
2961 unlock:
2962         if (sk_fullsock(sk))
2963                 unlock_sock_fast(sk, slow);
2964         return ret;
2965
2966 }
2967
2968 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2969 {
2970         struct bpf_tcp_iter_state *iter = seq->private;
2971         struct bpf_iter_meta meta;
2972         struct bpf_prog *prog;
2973
2974         if (!v) {
2975                 meta.seq = seq;
2976                 prog = bpf_iter_get_info(&meta, true);
2977                 if (prog)
2978                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2979         }
2980
2981         if (iter->cur_sk < iter->end_sk) {
2982                 bpf_iter_tcp_put_batch(iter);
2983                 iter->st_bucket_done = false;
2984         }
2985 }
2986
2987 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2988         .show           = bpf_iter_tcp_seq_show,
2989         .start          = bpf_iter_tcp_seq_start,
2990         .next           = bpf_iter_tcp_seq_next,
2991         .stop           = bpf_iter_tcp_seq_stop,
2992 };
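/* For a single read() on the iterator fd, the seq_file core roughly drives
 * the ops above as follows (sketch only; error handling and buffer
 * management omitted):
 *
 *         v = start(seq, &pos);           // SEQ_START_TOKEN or a batched sk
 *         while (v && output buffer has room) {
 *                 show(seq, v);           // runs the attached bpf prog
 *                 v = next(seq, v, &pos);
 *         }
 *         stop(seq, v);                   // with v == NULL the bpf prog is
 *                                         // invoked one last time to signal
 *                                         // the end of the iteration
 */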
2993 #endif
2994 static unsigned short seq_file_family(const struct seq_file *seq)
2995 {
2996         const struct tcp_seq_afinfo *afinfo;
2997
2998 #ifdef CONFIG_BPF_SYSCALL
2999         /* Iterated from bpf_iter.  Let the bpf prog do the filtering instead. */
3000         if (seq->op == &bpf_iter_tcp_seq_ops)
3001                 return AF_UNSPEC;
3002 #endif
3003
3004         /* Iterated from proc fs */
3005         afinfo = PDE_DATA(file_inode(seq->file));
3006         return afinfo->family;
3007 }
3008
3009 static const struct seq_operations tcp4_seq_ops = {
3010         .show           = tcp4_seq_show,
3011         .start          = tcp_seq_start,
3012         .next           = tcp_seq_next,
3013         .stop           = tcp_seq_stop,
3014 };
3015
3016 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3017         .family         = AF_INET,
3018 };
3019
3020 static int __net_init tcp4_proc_init_net(struct net *net)
3021 {
3022         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3023                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3024                 return -ENOMEM;
3025         return 0;
3026 }
3027
3028 static void __net_exit tcp4_proc_exit_net(struct net *net)
3029 {
3030         remove_proc_entry("tcp", net->proc_net);
3031 }
3032
3033 static struct pernet_operations tcp4_net_ops = {
3034         .init = tcp4_proc_init_net,
3035         .exit = tcp4_proc_exit_net,
3036 };
3037
3038 int __init tcp4_proc_init(void)
3039 {
3040         return register_pernet_subsys(&tcp4_net_ops);
3041 }
3042
3043 void tcp4_proc_exit(void)
3044 {
3045         unregister_pernet_subsys(&tcp4_net_ops);
3046 }
3047 #endif /* CONFIG_PROC_FS */
3048
3049 /* @wake is one when sk_stream_write_space() calls us.
3050  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3051  * This mimics the strategy used in sock_def_write_space().
3052  */
3053 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3054 {
3055         const struct tcp_sock *tp = tcp_sk(sk);
3056         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3057                             READ_ONCE(tp->snd_nxt);
3058
3059         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3060 }
3061 EXPORT_SYMBOL(tcp_stream_memory_free);
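/* A worked example for the check above, assuming a (purely illustrative)
 * tcp_notsent_lowat of 128 kB: with wake == 0 the stream counts as
 * writable while fewer than 128 kB are queued but unsent; with wake == 1
 * (the sk_stream_write_space() case) notsent_bytes is doubled by the
 * shift, so EPOLLOUT is signalled only once the unsent backlog drops
 * below 64 kB.
 */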
3062
3063 struct proto tcp_prot = {
3064         .name                   = "TCP",
3065         .owner                  = THIS_MODULE,
3066         .close                  = tcp_close,
3067         .pre_connect            = tcp_v4_pre_connect,
3068         .connect                = tcp_v4_connect,
3069         .disconnect             = tcp_disconnect,
3070         .accept                 = inet_csk_accept,
3071         .ioctl                  = tcp_ioctl,
3072         .init                   = tcp_v4_init_sock,
3073         .destroy                = tcp_v4_destroy_sock,
3074         .shutdown               = tcp_shutdown,
3075         .setsockopt             = tcp_setsockopt,
3076         .getsockopt             = tcp_getsockopt,
3077         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3078         .keepalive              = tcp_set_keepalive,
3079         .recvmsg                = tcp_recvmsg,
3080         .sendmsg                = tcp_sendmsg,
3081         .sendpage               = tcp_sendpage,
3082         .backlog_rcv            = tcp_v4_do_rcv,
3083         .release_cb             = tcp_release_cb,
3084         .hash                   = inet_hash,
3085         .unhash                 = inet_unhash,
3086         .get_port               = inet_csk_get_port,
3087         .put_port               = inet_put_port,
3088 #ifdef CONFIG_BPF_SYSCALL
3089         .psock_update_sk_prot   = tcp_bpf_update_proto,
3090 #endif
3091         .enter_memory_pressure  = tcp_enter_memory_pressure,
3092         .leave_memory_pressure  = tcp_leave_memory_pressure,
3093         .stream_memory_free     = tcp_stream_memory_free,
3094         .sockets_allocated      = &tcp_sockets_allocated,
3095         .orphan_count           = &tcp_orphan_count,
3096         .memory_allocated       = &tcp_memory_allocated,
3097         .memory_pressure        = &tcp_memory_pressure,
3098         .sysctl_mem             = sysctl_tcp_mem,
3099         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3100         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3101         .max_header             = MAX_TCP_HEADER,
3102         .obj_size               = sizeof(struct tcp_sock),
3103         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3104         .twsk_prot              = &tcp_timewait_sock_ops,
3105         .rsk_prot               = &tcp_request_sock_ops,
3106         .h.hashinfo             = &tcp_hashinfo,
3107         .no_autobind            = true,
3108         .diag_destroy           = tcp_abort,
3109 };
3110 EXPORT_SYMBOL(tcp_prot);
3111
3112 static void __net_exit tcp_sk_exit(struct net *net)
3113 {
3114         int cpu;
3115
3116         if (net->ipv4.tcp_congestion_control)
3117                 bpf_module_put(net->ipv4.tcp_congestion_control,
3118                                net->ipv4.tcp_congestion_control->owner);
3119
3120         for_each_possible_cpu(cpu)
3121                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3122         free_percpu(net->ipv4.tcp_sk);
3123 }
3124
3125 static int __net_init tcp_sk_init(struct net *net)
3126 {
3127         int res, cpu, cnt;
3128
3129         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3130         if (!net->ipv4.tcp_sk)
3131                 return -ENOMEM;
3132
3133         for_each_possible_cpu(cpu) {
3134                 struct sock *sk;
3135
3136                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3137                                            IPPROTO_TCP, net);
3138                 if (res)
3139                         goto fail;
3140                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3141
3142                 /* Enforce IP_DF and IPID==0 for RST and ACK packets
3143                  * sent in SYN-RECV and TIME-WAIT states.
3144                  */
3145                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3146
3147                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3148         }
3149
3150         net->ipv4.sysctl_tcp_ecn = 2;
3151         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3152
3153         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3154         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3155         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3156         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3157         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3158
3159         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3160         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3161         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3162
3163         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3164         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3165         net->ipv4.sysctl_tcp_syncookies = 1;
3166         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3167         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3168         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3169         net->ipv4.sysctl_tcp_orphan_retries = 0;
3170         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3171         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3172         net->ipv4.sysctl_tcp_tw_reuse = 2;
3173         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3174
3175         cnt = tcp_hashinfo.ehash_mask + 1;
3176         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3177         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3178
3179         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
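        /* Purely illustrative numbers for the two defaults above: with an
         * established hash of 524288 buckets, cnt == 524288, so
         * max_tw_buckets defaults to 262144 and max_syn_backlog to 4096.
         */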
3180         net->ipv4.sysctl_tcp_sack = 1;
3181         net->ipv4.sysctl_tcp_window_scaling = 1;
3182         net->ipv4.sysctl_tcp_timestamps = 1;
3183         net->ipv4.sysctl_tcp_early_retrans = 3;
3184         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3185         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3186         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3187         net->ipv4.sysctl_tcp_max_reordering = 300;
3188         net->ipv4.sysctl_tcp_dsack = 1;
3189         net->ipv4.sysctl_tcp_app_win = 31;
3190         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3191         net->ipv4.sysctl_tcp_frto = 2;
3192         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3193         /* This limits the fraction of the congestion window which a
3194          * single TSO frame may consume.  Building TSO frames which are
3195          * too large can cause TCP streams to be bursty.
3196          */
3197         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
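        /* E.g. (illustrative only) with a congestion window of 45 packets
         * and the default divisor of 3, a single TSO frame is capped at
         * 45 / 3 == 15 packets.
         */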
3198         /* Default TSQ limit of 16 TSO segments */
3199         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
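        /* I.e. 16 segments of up to 64 KiB each, or 1 MiB (16 * 65536 bytes). */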
3200         /* RFC 5961 challenge ACK rate limiting */
3201         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3202         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3203         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3204         net->ipv4.sysctl_tcp_autocorking = 1;
3205         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3206         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3207         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3208         if (net != &init_net) {
3209                 memcpy(net->ipv4.sysctl_tcp_rmem,
3210                        init_net.ipv4.sysctl_tcp_rmem,
3211                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3212                 memcpy(net->ipv4.sysctl_tcp_wmem,
3213                        init_net.ipv4.sysctl_tcp_wmem,
3214                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3215         }
3216         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3217         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3218         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3219         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3220         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3221         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3222
3223         /* Reno is always built in */
3224         if (!net_eq(net, &init_net) &&
3225             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3226                                init_net.ipv4.tcp_congestion_control->owner))
3227                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3228         else
3229                 net->ipv4.tcp_congestion_control = &tcp_reno;
3230
3231         return 0;
3232 fail:
3233         tcp_sk_exit(net);
3234
3235         return res;
3236 }
3237
3238 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3239 {
3240         struct net *net;
3241
3242         inet_twsk_purge(&tcp_hashinfo, AF_INET);
3243
3244         list_for_each_entry(net, net_exit_list, exit_list)
3245                 tcp_fastopen_ctx_destroy(net);
3246 }
3247
3248 static struct pernet_operations __net_initdata tcp_sk_ops = {
3249        .init       = tcp_sk_init,
3250        .exit       = tcp_sk_exit,
3251        .exit_batch = tcp_sk_exit_batch,
3252 };
3253
3254 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3255 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3256                      struct sock_common *sk_common, uid_t uid)
3257
3258 #define INIT_BATCH_SZ 16
3259
3260 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3261 {
3262         struct bpf_tcp_iter_state *iter = priv_data;
3263         int err;
3264
3265         err = bpf_iter_init_seq_net(priv_data, aux);
3266         if (err)
3267                 return err;
3268
3269         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3270         if (err) {
3271                 bpf_iter_fini_seq_net(priv_data);
3272                 return err;
3273         }
3274
3275         return 0;
3276 }
3277
3278 static void bpf_iter_fini_tcp(void *priv_data)
3279 {
3280         struct bpf_tcp_iter_state *iter = priv_data;
3281
3282         bpf_iter_fini_seq_net(priv_data);
3283         kvfree(iter->batch);
3284 }
3285
3286 static const struct bpf_iter_seq_info tcp_seq_info = {
3287         .seq_ops                = &bpf_iter_tcp_seq_ops,
3288         .init_seq_private       = bpf_iter_init_tcp,
3289         .fini_seq_private       = bpf_iter_fini_tcp,
3290         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3291 };
3292
3293 static const struct bpf_func_proto *
3294 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3295                             const struct bpf_prog *prog)
3296 {
3297         switch (func_id) {
3298         case BPF_FUNC_setsockopt:
3299                 return &bpf_sk_setsockopt_proto;
3300         case BPF_FUNC_getsockopt:
3301                 return &bpf_sk_getsockopt_proto;
3302         default:
3303                 return NULL;
3304         }
3305 }
3306
3307 static struct bpf_iter_reg tcp_reg_info = {
3308         .target                 = "tcp",
3309         .ctx_arg_info_size      = 1,
3310         .ctx_arg_info           = {
3311                 { offsetof(struct bpf_iter__tcp, sk_common),
3312                   PTR_TO_BTF_ID_OR_NULL },
3313         },
3314         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3315         .seq_info               = &tcp_seq_info,
3316 };
3317
3318 static void __init bpf_iter_register(void)
3319 {
3320         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3321         if (bpf_iter_reg_target(&tcp_reg_info))
3322                 pr_warn("Warning: could not register bpf iterator tcp\n");
3323 }
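/* A minimal sketch of a bpf program that could attach to the "tcp" target
 * registered above.  It lives in its own BPF object file, not in this one,
 * and the printed fields are only an illustration:
 *
 *         #include "vmlinux.h"
 *         #include <bpf/bpf_helpers.h>
 *         #include <bpf/bpf_tracing.h>
 *
 *         SEC("iter/tcp")
 *         int dump_tcp(struct bpf_iter__tcp *ctx)
 *         {
 *                 struct sock_common *skc = ctx->sk_common;
 *
 *                 if (!skc)
 *                         return 0;
 *                 BPF_SEQ_PRINTF(ctx->meta->seq, "family=%u state=%u\n",
 *                                skc->skc_family, skc->skc_state);
 *                 return 0;
 *         }
 *
 *         char LICENSE[] SEC("license") = "GPL";
 *
 * Such a program is typically pinned with "bpftool iter pin" and then read
 * like a procfs file; bpf_iter_tcp_get_func_proto() above additionally
 * exposes bpf_setsockopt()/bpf_getsockopt() to it.
 */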
3324
3325 #endif
3326
3327 void __init tcp_v4_init(void)
3328 {
3329         if (register_pernet_subsys(&tcp_sk_ops))
3330                 panic("Failed to create the TCP control socket.\n");
3331
3332 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3333         bpf_iter_register();
3334 #endif
3335 }