1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265
266
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
275
276 EXPORT_SYMBOL_GPL(tcp_orphan_count);
277
278 int sysctl_tcp_mem[3];
279 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
280 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
281
282 EXPORT_SYMBOL(sysctl_tcp_mem);
283 EXPORT_SYMBOL(sysctl_tcp_rmem);
284 EXPORT_SYMBOL(sysctl_tcp_wmem);
285
286 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
287 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
288
289 EXPORT_SYMBOL(tcp_memory_allocated);
290 EXPORT_SYMBOL(tcp_sockets_allocated);
291
292 /*
293  * Pressure flag: try to collapse.
294  * Technical note: it is used by multiple contexts non atomically.
295  * All the sk_stream_mem_schedule() is of this nature: accounting
296  * is strict, actions are advisory and have some latency.
297  */
298 int tcp_memory_pressure;
299
300 EXPORT_SYMBOL(tcp_memory_pressure);
301
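/* Note when TCP first enters global memory pressure: count the event and
 * set the (advisory, non-atomic) pressure flag checked by the senders.
 */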
302 void tcp_enter_memory_pressure(void)
303 {
304         if (!tcp_memory_pressure) {
305                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
306                 tcp_memory_pressure = 1;
307         }
308 }
309
310 EXPORT_SYMBOL(tcp_enter_memory_pressure);
311
312 /*
313  * LISTEN is a special case for poll..
314  */
315 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
316                                                poll_table *wait)
317 {
318         return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? (POLLIN | POLLRDNORM) : 0;
319 }
320
321 /*
322  *      Wait for a TCP event.
323  *
324  *      Note that we don't need to lock the socket, as the upper poll layers
325  *      take care of normal races (between the test and the event) and we don't
326  *      go look at any of the socket buffers directly.
327  */
328 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
329 {
330         unsigned int mask;
331         struct sock *sk = sock->sk;
332         struct tcp_sock *tp = tcp_sk(sk);
333
334         poll_wait(file, sk->sk_sleep, wait);
335         if (sk->sk_state == TCP_LISTEN)
336                 return tcp_listen_poll(sk, wait);
337
338         /* Socket is not locked. We are protected from async events
339            by poll logic and correct handling of state changes
340            made by other threads is impossible in any case.
341          */
342
343         mask = 0;
344         if (sk->sk_err)
345                 mask = POLLERR;
346
347         /*
348          * POLLHUP is certainly not done right. But poll() doesn't
349          * have a notion of HUP in just one direction, and for a
350          * socket the read side is more interesting.
351          *
352          * Some poll() documentation says that POLLHUP is incompatible
353          * with the POLLOUT/POLLWR flags, so somebody should check this
354          * all. But careful, it tends to be safer to return too many
355          * bits than too few, and you can easily break real applications
356          * if you don't tell them that something has hung up!
357          *
358          * Check-me.
359          *
360          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
361          * our fs/select.c). It means that after we received EOF,
362          * poll always returns immediately, making poll() on write()
363          * impossible in state CLOSE_WAIT. One solution is evident --- to set
364          * POLLHUP if and only if shutdown has been made in both directions.
365          * Actually, it is interesting to look at how Solaris and DUX
366          * solve this dilemma. I would prefer POLLHUP to be maskable,
367          * so that we could set it on SND_SHUTDOWN. BTW the examples given
368          * in Stevens' books assume exactly this behaviour; it explains
369          * why POLLHUP is incompatible with POLLOUT.    --ANK
370          *
371          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
372          * blocking on fresh not-connected or disconnected socket. --ANK
373          */
374         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
375                 mask |= POLLHUP;
376         if (sk->sk_shutdown & RCV_SHUTDOWN)
377                 mask |= POLLIN | POLLRDNORM;
378
379         /* Connected? */
380         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
381                 /* Potential race condition. If the read of tp below
382                  * escapes above the read of sk->sk_state, we can be
383                  * illegally awakened in SYN_* states. */
384                 if ((tp->rcv_nxt != tp->copied_seq) &&
385                     (tp->urg_seq != tp->copied_seq ||
386                      tp->rcv_nxt != tp->copied_seq + 1 ||
387                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
388                         mask |= POLLIN | POLLRDNORM;
389
390                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
391                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
392                                 mask |= POLLOUT | POLLWRNORM;
393                         } else {  /* send SIGIO later */
394                                 set_bit(SOCK_ASYNC_NOSPACE,
395                                         &sk->sk_socket->flags);
396                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
397
398                                 /* Race breaker. If space is freed after
399                                  * wspace test but before the flags are set,
400                                  * IO signal will be lost.
401                                  */
402                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
403                                         mask |= POLLOUT | POLLWRNORM;
404                         }
405                 }
406
407                 if (tp->urg_data & TCP_URG_VALID)
408                         mask |= POLLPRI;
409         }
410         return mask;
411 }
412
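/* ioctl handler for TCP sockets:
 *      SIOCINQ    - bytes readable from the receive queue (up to the urgent
 *                   mark, and not counting a queued FIN),
 *      SIOCATMARK - are we at the urgent mark?
 *      SIOCOUTQ   - bytes queued but not yet acknowledged by the peer.
 */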
413 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
414 {
415         struct tcp_sock *tp = tcp_sk(sk);
416         int answ;
417
418         switch (cmd) {
419         case SIOCINQ:
420                 if (sk->sk_state == TCP_LISTEN)
421                         return -EINVAL;
422
423                 lock_sock(sk);
424                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
425                         answ = 0;
426                 else if (sock_flag(sk, SOCK_URGINLINE) ||
427                          !tp->urg_data ||
428                          before(tp->urg_seq, tp->copied_seq) ||
429                          !before(tp->urg_seq, tp->rcv_nxt)) {
430                         answ = tp->rcv_nxt - tp->copied_seq;
431
432                         /* Subtract 1, if FIN is in queue. */
433                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
434                                 answ -=
435                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
436                 } else
437                         answ = tp->urg_seq - tp->copied_seq;
438                 release_sock(sk);
439                 break;
440         case SIOCATMARK:
441                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
442                 break;
443         case SIOCOUTQ:
444                 if (sk->sk_state == TCP_LISTEN)
445                         return -EINVAL;
446
447                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
448                         answ = 0;
449                 else
450                         answ = tp->write_seq - tp->snd_una;
451                 break;
452         default:
453                 return -ENOIOCTLCMD;
454         };
455
456         return put_user(answ, (int __user *)arg);
457 }
458
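/* Set PSH on this skb and remember how far we have pushed. */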
459 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
460 {
461         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
462         tp->pushed_seq = tp->write_seq;
463 }
464
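/* Force a push once more than half of the peer's largest-ever advertised
 * window has been queued since the last pushed byte.
 */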
465 static inline int forced_push(struct tcp_sock *tp)
466 {
467         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
468 }
469
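/* Initialise the TCP control block of a fresh skb, tail it onto the write
 * queue, charge it to the socket, and make it the send head if nothing
 * else is pending.
 */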
470 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
471                               struct sk_buff *skb)
472 {
473         skb->csum = 0;
474         TCP_SKB_CB(skb)->seq = tp->write_seq;
475         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
476         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
477         TCP_SKB_CB(skb)->sacked = 0;
478         skb_header_release(skb);
479         __skb_queue_tail(&sk->sk_write_queue, skb);
480         sk_charge_skb(sk, skb);
481         if (!sk->sk_send_head)
482                 sk->sk_send_head = skb;
483         if (tp->nonagle & TCP_NAGLE_PUSH)
484                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
485 }
486
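/* For MSG_OOB sends, enter urgent mode and advance the urgent pointer to
 * the end of what has been queued so far.
 */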
487 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
488                                 struct sk_buff *skb)
489 {
490         if (flags & MSG_OOB) {
491                 tp->urg_mode = 1;
492                 tp->snd_up = tp->write_seq;
493                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
494         }
495 }
496
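/* Push whatever is pending on the write queue, setting PSH/URG as needed.
 * With MSG_MORE the frames are corked instead, so more data can still be
 * merged into the last segment.
 */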
497 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
498                             int mss_now, int nonagle)
499 {
500         if (sk->sk_send_head) {
501                 struct sk_buff *skb = sk->sk_write_queue.prev;
502                 if (!(flags & MSG_MORE) || forced_push(tp))
503                         tcp_mark_push(tp, skb);
504                 tcp_mark_urg(tp, flags, skb);
505                 __tcp_push_pending_frames(sk, tp, mss_now,
506                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
507         }
508 }
509
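/* Zero-copy transmit worker used by tcp_sendpage(): attach the caller's
 * pages to write-queue skbs as paged fragments (coalescing with the last
 * fragment when possible) instead of copying the data into the skb head.
 */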
510 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
511                          size_t psize, int flags)
512 {
513         struct tcp_sock *tp = tcp_sk(sk);
514         int mss_now, size_goal;
515         int err;
516         ssize_t copied;
517         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
518
519         /* Wait for a connection to finish. */
520         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
521                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
522                         goto out_err;
523
524         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
525
526         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
527         size_goal = tp->xmit_size_goal;
528         copied = 0;
529
530         err = -EPIPE;
531         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
532                 goto do_error;
533
534         while (psize > 0) {
535                 struct sk_buff *skb = sk->sk_write_queue.prev;
536                 struct page *page = pages[poffset / PAGE_SIZE];
537                 int copy, i, can_coalesce;
538                 int offset = poffset % PAGE_SIZE;
539                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
540
541                 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
542 new_segment:
543                         if (!sk_stream_memory_free(sk))
544                                 goto wait_for_sndbuf;
545
546                         skb = sk_stream_alloc_pskb(sk, 0, 0,
547                                                    sk->sk_allocation);
548                         if (!skb)
549                                 goto wait_for_memory;
550
551                         skb_entail(sk, tp, skb);
552                         copy = size_goal;
553                 }
554
555                 if (copy > size)
556                         copy = size;
557
558                 i = skb_shinfo(skb)->nr_frags;
559                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
560                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
561                         tcp_mark_push(tp, skb);
562                         goto new_segment;
563                 }
564                 if (sk->sk_forward_alloc < copy &&
565                     !sk_stream_mem_schedule(sk, copy, 0))
566                         goto wait_for_memory;
567                 
568                 if (can_coalesce) {
569                         skb_shinfo(skb)->frags[i - 1].size += copy;
570                 } else {
571                         get_page(page);
572                         skb_fill_page_desc(skb, i, page, offset, copy);
573                 }
574
575                 skb->len += copy;
576                 skb->data_len += copy;
577                 skb->truesize += copy;
578                 sk->sk_wmem_queued += copy;
579                 sk->sk_forward_alloc -= copy;
580                 skb->ip_summed = CHECKSUM_HW;
581                 tp->write_seq += copy;
582                 TCP_SKB_CB(skb)->end_seq += copy;
583                 skb_shinfo(skb)->tso_segs = 0;
584
585                 if (!copied)
586                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
587
588                 copied += copy;
589                 poffset += copy;
590                 if (!(psize -= copy))
591                         goto out;
592
593                 if (skb->len < mss_now || (flags & MSG_OOB))
594                         continue;
595
596                 if (forced_push(tp)) {
597                         tcp_mark_push(tp, skb);
598                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
599                 } else if (skb == sk->sk_send_head)
600                         tcp_push_one(sk, mss_now);
601                 continue;
602
603 wait_for_sndbuf:
604                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
605 wait_for_memory:
606                 if (copied)
607                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
608
609                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
610                         goto do_error;
611
612                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
613                 size_goal = tp->xmit_size_goal;
614         }
615
616 out:
617         if (copied)
618                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
619         return copied;
620
621 do_error:
622         if (copied)
623                 goto out;
624 out_err:
625         return sk_stream_error(sk, flags, err);
626 }
627
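/* sendpage() entry point.  Zero-copy only works if the route supports
 * scatter/gather and checksum offload; otherwise fall back to the generic
 * copying sock_no_sendpage().
 */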
628 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
629                      size_t size, int flags)
630 {
631         ssize_t res;
632         struct sock *sk = sock->sk;
633
634 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
635
636         if (!(sk->sk_route_caps & NETIF_F_SG) ||
637             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
638                 return sock_no_sendpage(sock, page, offset, size, flags);
639
640 #undef TCP_ZC_CSUM_FLAGS
641
642         lock_sock(sk);
643         TCP_CHECK_TIMER(sk);
644         res = do_tcp_sendpages(sk, &page, offset, size, flags);
645         TCP_CHECK_TIMER(sk);
646         release_sock(sk);
647         return res;
648 }
649
650 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
651 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
652
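/* How much linear (non-paged) room to ask for in a new skb: none for
 * TSO-capable devices (data goes into page fragments), otherwise the MSS,
 * truncated to what fits in one page of skb head when the remainder can
 * still be carried in page fragments.
 */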
653 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
654 {
655         int tmp = tp->mss_cache;
656
657         if (sk->sk_route_caps & NETIF_F_SG) {
658                 if (sk->sk_route_caps & NETIF_F_TSO)
659                         tmp = 0;
660                 else {
661                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
662
663                         if (tmp >= pgbreak &&
664                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
665                                 tmp = pgbreak;
666                 }
667         }
668
669         return tmp;
670 }
671
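/* Copy data from user space into write-queue skbs and push them out.
 * Data is appended to the last unsent skb until it reaches the size goal;
 * once the skb head is full and the route supports SG, further data is
 * copied into page fragments backed by the socket's cached sndmsg page.
 */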
672 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
673                 size_t size)
674 {
675         struct iovec *iov;
676         struct tcp_sock *tp = tcp_sk(sk);
677         struct sk_buff *skb;
678         int iovlen, flags;
679         int mss_now, size_goal;
680         int err, copied;
681         long timeo;
682
683         lock_sock(sk);
684         TCP_CHECK_TIMER(sk);
685
686         flags = msg->msg_flags;
687         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
688
689         /* Wait for a connection to finish. */
690         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
691                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
692                         goto out_err;
693
694         /* This should be in poll */
695         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
696
697         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
698         size_goal = tp->xmit_size_goal;
699
700         /* Ok commence sending. */
701         iovlen = msg->msg_iovlen;
702         iov = msg->msg_iov;
703         copied = 0;
704
705         err = -EPIPE;
706         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
707                 goto do_error;
708
709         while (--iovlen >= 0) {
710                 int seglen = iov->iov_len;
711                 unsigned char __user *from = iov->iov_base;
712
713                 iov++;
714
715                 while (seglen > 0) {
716                         int copy;
717
718                         skb = sk->sk_write_queue.prev;
719
720                         if (!sk->sk_send_head ||
721                             (copy = size_goal - skb->len) <= 0) {
722
723 new_segment:
724                                 /* Allocate new segment. If the interface is SG,
725                                  * allocate an skb that fits in a single page.
726                                  */
727                                 if (!sk_stream_memory_free(sk))
728                                         goto wait_for_sndbuf;
729
730                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
731                                                            0, sk->sk_allocation);
732                                 if (!skb)
733                                         goto wait_for_memory;
734
735                                 /*
736                                  * Check whether we can use HW checksum.
737                                  */
738                                 if (sk->sk_route_caps &
739                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
740                                      NETIF_F_HW_CSUM))
741                                         skb->ip_summed = CHECKSUM_HW;
742
743                                 skb_entail(sk, tp, skb);
744                                 copy = size_goal;
745                         }
746
747                         /* Try to append data to the end of skb. */
748                         if (copy > seglen)
749                                 copy = seglen;
750
751                         /* Where to copy to? */
752                         if (skb_tailroom(skb) > 0) {
753                                 /* We have some space in skb head. Superb! */
754                                 if (copy > skb_tailroom(skb))
755                                         copy = skb_tailroom(skb);
756                                 if ((err = skb_add_data(skb, from, copy)) != 0)
757                                         goto do_fault;
758                         } else {
759                                 int merge = 0;
760                                 int i = skb_shinfo(skb)->nr_frags;
761                                 struct page *page = TCP_PAGE(sk);
762                                 int off = TCP_OFF(sk);
763
764                                 if (skb_can_coalesce(skb, i, page, off) &&
765                                     off != PAGE_SIZE) {
766                                         /* We can extend the last page
767                                          * fragment. */
768                                         merge = 1;
769                                 } else if (i == MAX_SKB_FRAGS ||
770                                            (!i &&
771                                            !(sk->sk_route_caps & NETIF_F_SG))) {
772                                         /* Need to add new fragment and cannot
773                                          * do this because interface is non-SG,
774                                          * or because all the page slots are
775                                          * busy. */
776                                         tcp_mark_push(tp, skb);
777                                         goto new_segment;
778                                 } else if (page) {
779                                         if (off == PAGE_SIZE) {
780                                                 put_page(page);
781                                                 TCP_PAGE(sk) = page = NULL;
782                                         }
783                                 }
784
785                                 if (!page) {
786                                         /* Allocate new cache page. */
787                                         if (!(page = sk_stream_alloc_page(sk)))
788                                                 goto wait_for_memory;
789                                         off = 0;
790                                 }
791
792                                 if (copy > PAGE_SIZE - off)
793                                         copy = PAGE_SIZE - off;
794
795                                 /* Time to copy data. We are close to
796                                  * the end! */
797                                 err = skb_copy_to_page(sk, from, skb, page,
798                                                        off, copy);
799                                 if (err) {
800                                         /* If this page was new, give it to the
801                                          * socket so it does not get leaked.
802                                          */
803                                         if (!TCP_PAGE(sk)) {
804                                                 TCP_PAGE(sk) = page;
805                                                 TCP_OFF(sk) = 0;
806                                         }
807                                         goto do_error;
808                                 }
809
810                                 /* Update the skb. */
811                                 if (merge) {
812                                         skb_shinfo(skb)->frags[i - 1].size +=
813                                                                         copy;
814                                 } else {
815                                         skb_fill_page_desc(skb, i, page, off, copy);
816                                         if (TCP_PAGE(sk)) {
817                                                 get_page(page);
818                                         } else if (off + copy < PAGE_SIZE) {
819                                                 get_page(page);
820                                                 TCP_PAGE(sk) = page;
821                                         }
822                                 }
823
824                                 TCP_OFF(sk) = off + copy;
825                         }
826
827                         if (!copied)
828                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
829
830                         tp->write_seq += copy;
831                         TCP_SKB_CB(skb)->end_seq += copy;
832                         skb_shinfo(skb)->tso_segs = 0;
833
834                         from += copy;
835                         copied += copy;
836                         if ((seglen -= copy) == 0 && iovlen == 0)
837                                 goto out;
838
839                         if (skb->len < mss_now || (flags & MSG_OOB))
840                                 continue;
841
842                         if (forced_push(tp)) {
843                                 tcp_mark_push(tp, skb);
844                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
845                         } else if (skb == sk->sk_send_head)
846                                 tcp_push_one(sk, mss_now);
847                         continue;
848
849 wait_for_sndbuf:
850                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
851 wait_for_memory:
852                         if (copied)
853                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
854
855                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
856                                 goto do_error;
857
858                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
859                         size_goal = tp->xmit_size_goal;
860                 }
861         }
862
863 out:
864         if (copied)
865                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
866         TCP_CHECK_TIMER(sk);
867         release_sock(sk);
868         return copied;
869
870 do_fault:
871         if (!skb->len) {
872                 if (sk->sk_send_head == skb)
873                         sk->sk_send_head = NULL;
874                 __skb_unlink(skb, &sk->sk_write_queue);
875                 sk_stream_free_skb(sk, skb);
876         }
877
878 do_error:
879         if (copied)
880                 goto out;
881 out_err:
882         err = sk_stream_error(sk, flags, err);
883         TCP_CHECK_TIMER(sk);
884         release_sock(sk);
885         return err;
886 }
887
888 /*
889  *      Handle reading urgent data. BSD has very simple semantics for
890  *      this, no blocking and very strange errors 8)
891  */
892
893 static int tcp_recv_urg(struct sock *sk, long timeo,
894                         struct msghdr *msg, int len, int flags,
895                         int *addr_len)
896 {
897         struct tcp_sock *tp = tcp_sk(sk);
898
899         /* No URG data to read. */
900         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
901             tp->urg_data == TCP_URG_READ)
902                 return -EINVAL; /* Yes this is right ! */
903
904         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
905                 return -ENOTCONN;
906
907         if (tp->urg_data & TCP_URG_VALID) {
908                 int err = 0;
909                 char c = tp->urg_data;
910
911                 if (!(flags & MSG_PEEK))
912                         tp->urg_data = TCP_URG_READ;
913
914                 /* Read urgent data. */
915                 msg->msg_flags |= MSG_OOB;
916
917                 if (len > 0) {
918                         if (!(flags & MSG_TRUNC))
919                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
920                         len = 1;
921                 } else
922                         msg->msg_flags |= MSG_TRUNC;
923
924                 return err ? -EFAULT : len;
925         }
926
927         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
928                 return 0;
929
930         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
931          * the available implementations agree in this case:
932          * this call should never block, independent of the
933          * blocking state of the socket.
934          * Mike <pall@rz.uni-karlsruhe.de>
935          */
936         return -EAGAIN;
937 }
938
939 /* Clean up the receive buffer for full frames taken by the user,
940  * then send an ACK if necessary.  COPIED is the number of bytes
941  * tcp_recvmsg has given to the user so far, it speeds up the
942  * calculation of whether or not we must ACK for the sake of
943  * a window update.
944  */
945 static void cleanup_rbuf(struct sock *sk, int copied)
946 {
947         struct tcp_sock *tp = tcp_sk(sk);
948         int time_to_ack = 0;
949
950 #if TCP_DEBUG
951         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
952
953         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
954 #endif
955
956         if (inet_csk_ack_scheduled(sk)) {
957                 const struct inet_connection_sock *icsk = inet_csk(sk);
958                    /* Delayed ACKs frequently hit locked sockets during bulk
959                     * receive. */
960                 if (icsk->icsk_ack.blocked ||
961                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
962                     tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
963                     /*
964                      * If this read emptied read buffer, we send ACK, if
965                      * connection is not bidirectional, user drained
966                      * receive buffer and there was a small segment
967                      * in queue.
968                      */
969                     (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
970                      !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
971                         time_to_ack = 1;
972         }
973
974         /* We send an ACK if we can now advertise a non-zero window
975          * which has been raised "significantly".
976          *
977          * Even if window raised up to infinity, do not send window open ACK
978          * in states, where we will not receive more. It is useless.
979          */
980         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
981                 __u32 rcv_window_now = tcp_receive_window(tp);
982
983                 /* Optimize, __tcp_select_window() is not cheap. */
984                 if (2*rcv_window_now <= tp->window_clamp) {
985                         __u32 new_window = __tcp_select_window(sk);
986
987                         /* Send ACK now, if this read freed lots of space
988                          * in our buffer. Certainly, new_window is the new window.
989                          * We can advertise it now, if it is not less than the current one.
990                          * "Lots" means "at least twice" here.
991                          */
992                         if (new_window && new_window >= 2 * rcv_window_now)
993                                 time_to_ack = 1;
994                 }
995         }
996         if (time_to_ack)
997                 tcp_send_ack(sk);
998 }
999
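/* Feed the segments parked on the prequeue to the regular receive path in
 * user context, with BHs disabled as the receive code expects.
 */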
1000 static void tcp_prequeue_process(struct sock *sk)
1001 {
1002         struct sk_buff *skb;
1003         struct tcp_sock *tp = tcp_sk(sk);
1004
1005         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1006
1007         /* RX process wants to run with BHs disabled, though it is not
1008          * necessary */
1009         local_bh_disable();
1010         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1011                 sk->sk_backlog_rcv(sk, skb);
1012         local_bh_enable();
1013
1014         /* Clear memory counter. */
1015         tp->ucopy.memory = 0;
1016 }
1017
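/* Find the skb in the receive queue that covers sequence number seq and
 * return the offset of seq within it (an skb carrying a FIN matches even
 * with no remaining data); NULL if no such skb is queued.
 */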
1018 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1019 {
1020         struct sk_buff *skb;
1021         u32 offset;
1022
1023         skb_queue_walk(&sk->sk_receive_queue, skb) {
1024                 offset = seq - TCP_SKB_CB(skb)->seq;
1025                 if (skb->h.th->syn)
1026                         offset--;
1027                 if (offset < skb->len || skb->h.th->fin) {
1028                         *off = offset;
1029                         return skb;
1030                 }
1031         }
1032         return NULL;
1033 }
1034
1035 /*
1036  * This routine provides an alternative to tcp_recvmsg() for routines
1037  * that would like to handle copying from skbuffs directly in 'sendfile'
1038  * fashion.
1039  * Note:
1040  *      - It is assumed that the socket was locked by the caller.
1041  *      - The routine does not block.
1042  *      - At present, there is no support for reading OOB data
1043  *        or for 'peeking' the socket using this routine
1044  *        (although both would be easy to implement).
1045  */
1046 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1047                   sk_read_actor_t recv_actor)
1048 {
1049         struct sk_buff *skb;
1050         struct tcp_sock *tp = tcp_sk(sk);
1051         u32 seq = tp->copied_seq;
1052         u32 offset;
1053         int copied = 0;
1054
1055         if (sk->sk_state == TCP_LISTEN)
1056                 return -ENOTCONN;
1057         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1058                 if (offset < skb->len) {
1059                         size_t used, len;
1060
1061                         len = skb->len - offset;
1062                         /* Stop reading if we hit a patch of urgent data */
1063                         if (tp->urg_data) {
1064                                 u32 urg_offset = tp->urg_seq - seq;
1065                                 if (urg_offset < len)
1066                                         len = urg_offset;
1067                                 if (!len)
1068                                         break;
1069                         }
1070                         used = recv_actor(desc, skb, offset, len);
1071                         if (used <= len) {
1072                                 seq += used;
1073                                 copied += used;
1074                                 offset += used;
1075                         }
1076                         if (offset != skb->len)
1077                                 break;
1078                 }
1079                 if (skb->h.th->fin) {
1080                         sk_eat_skb(sk, skb);
1081                         ++seq;
1082                         break;
1083                 }
1084                 sk_eat_skb(sk, skb);
1085                 if (!desc->count)
1086                         break;
1087         }
1088         tp->copied_seq = seq;
1089
1090         tcp_rcv_space_adjust(sk);
1091
1092         /* Clean up data we have read: This will do ACK frames. */
1093         if (copied)
1094                 cleanup_rbuf(sk, copied);
1095         return copied;
1096 }
1097
1098 /*
1099  *      This routine copies from a sock struct into the user buffer.
1100  *
1101  *      Technical note: in 2.3 we work on _locked_ socket, so that
1102  *      tricks with *seq access order and skb->users are not required.
1103  *      Probably, code can be easily improved even more.
1104  */
1105
1106 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1107                 size_t len, int nonblock, int flags, int *addr_len)
1108 {
1109         struct tcp_sock *tp = tcp_sk(sk);
1110         int copied = 0;
1111         u32 peek_seq;
1112         u32 *seq;
1113         unsigned long used;
1114         int err;
1115         int target;             /* Read at least this many bytes */
1116         long timeo;
1117         struct task_struct *user_recv = NULL;
1118
1119         lock_sock(sk);
1120
1121         TCP_CHECK_TIMER(sk);
1122
1123         err = -ENOTCONN;
1124         if (sk->sk_state == TCP_LISTEN)
1125                 goto out;
1126
1127         timeo = sock_rcvtimeo(sk, nonblock);
1128
1129         /* Urgent data needs to be handled specially. */
1130         if (flags & MSG_OOB)
1131                 goto recv_urg;
1132
1133         seq = &tp->copied_seq;
1134         if (flags & MSG_PEEK) {
1135                 peek_seq = tp->copied_seq;
1136                 seq = &peek_seq;
1137         }
1138
1139         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1140
1141         do {
1142                 struct sk_buff *skb;
1143                 u32 offset;
1144
1145                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1146                 if (tp->urg_data && tp->urg_seq == *seq) {
1147                         if (copied)
1148                                 break;
1149                         if (signal_pending(current)) {
1150                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1151                                 break;
1152                         }
1153                 }
1154
1155                 /* Next get a buffer. */
1156
1157                 skb = skb_peek(&sk->sk_receive_queue);
1158                 do {
1159                         if (!skb)
1160                                 break;
1161
1162                         /* Now that we have two receive queues this
1163                          * shouldn't happen.
1164                          */
1165                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1166                                 printk(KERN_INFO "recvmsg bug: copied %X "
1167                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1168                                 break;
1169                         }
1170                         offset = *seq - TCP_SKB_CB(skb)->seq;
1171                         if (skb->h.th->syn)
1172                                 offset--;
1173                         if (offset < skb->len)
1174                                 goto found_ok_skb;
1175                         if (skb->h.th->fin)
1176                                 goto found_fin_ok;
1177                         BUG_TRAP(flags & MSG_PEEK);
1178                         skb = skb->next;
1179                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1180
1181                 /* Well, if we have backlog, try to process it now. */
1182
1183                 if (copied >= target && !sk->sk_backlog.tail)
1184                         break;
1185
1186                 if (copied) {
1187                         if (sk->sk_err ||
1188                             sk->sk_state == TCP_CLOSE ||
1189                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1190                             !timeo ||
1191                             signal_pending(current) ||
1192                             (flags & MSG_PEEK))
1193                                 break;
1194                 } else {
1195                         if (sock_flag(sk, SOCK_DONE))
1196                                 break;
1197
1198                         if (sk->sk_err) {
1199                                 copied = sock_error(sk);
1200                                 break;
1201                         }
1202
1203                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1204                                 break;
1205
1206                         if (sk->sk_state == TCP_CLOSE) {
1207                                 if (!sock_flag(sk, SOCK_DONE)) {
1208                                         /* This occurs when user tries to read
1209                                          * from never connected socket.
1210                                          */
1211                                         copied = -ENOTCONN;
1212                                         break;
1213                                 }
1214                                 break;
1215                         }
1216
1217                         if (!timeo) {
1218                                 copied = -EAGAIN;
1219                                 break;
1220                         }
1221
1222                         if (signal_pending(current)) {
1223                                 copied = sock_intr_errno(timeo);
1224                                 break;
1225                         }
1226                 }
1227
1228                 cleanup_rbuf(sk, copied);
1229
1230                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1231                         /* Install new reader */
1232                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1233                                 user_recv = current;
1234                                 tp->ucopy.task = user_recv;
1235                                 tp->ucopy.iov = msg->msg_iov;
1236                         }
1237
1238                         tp->ucopy.len = len;
1239
1240                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1241                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1242
1243                         /* Ugly... If the prequeue is not empty, we have to
1244                          * process it before releasing the socket, otherwise
1245                          * the order will be broken at the second iteration.
1246                          * A more elegant solution is required!!!
1247                          *
1248                          * Look: we have the following (pseudo)queues:
1249                          *
1250                          * 1. packets in flight
1251                          * 2. backlog
1252                          * 3. prequeue
1253                          * 4. receive_queue
1254                          *
1255                          * Each queue can be processed only if the next ones
1256                          * are empty. At this point we have empty receive_queue.
1257                          * But the prequeue _can_ be non-empty after the 2nd iteration,
1258                          * when we jumped to the start of the loop because backlog
1259                          * processing added something to the receive_queue.
1260                          * We cannot release_sock(), because the backlog contains
1261                          * packets that arrived _after_ the prequeued ones.
1262                          *
1263                          * In short, the algorithm is clear: process all
1264                          * the queues in order. We could do it more directly,
1265                          * requeueing packets from the backlog to the prequeue if it
1266                          * is not empty. That is more elegant, but eats cycles,
1267                          * unfortunately.
1268                          */
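                        /* Illustrative walk-through (not from the original
                         * comment): on the first pass the receive_queue is
                         * drained; release_sock()/lock_sock() below lets the
                         * backlog run and possibly refill receive_queue; on
                         * the second pass any segments still sitting in the
                         * prequeue are older than those backlog segments, so
                         * the prequeue must be flushed here, before the
                         * socket is released again.
                         */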
1269                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1270                                 goto do_prequeue;
1271
1272                         /* __ Set realtime policy in scheduler __ */
1273                 }
1274
1275                 if (copied >= target) {
1276                         /* Do not sleep, just process backlog. */
1277                         release_sock(sk);
1278                         lock_sock(sk);
1279                 } else
1280                         sk_wait_data(sk, &timeo);
1281
1282                 if (user_recv) {
1283                         int chunk;
1284
1285                         /* __ Restore normal policy in scheduler __ */
1286
1287                         if ((chunk = len - tp->ucopy.len) != 0) {
1288                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1289                                 len -= chunk;
1290                                 copied += chunk;
1291                         }
1292
1293                         if (tp->rcv_nxt == tp->copied_seq &&
1294                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1295 do_prequeue:
1296                                 tcp_prequeue_process(sk);
1297
1298                                 if ((chunk = len - tp->ucopy.len) != 0) {
1299                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1300                                         len -= chunk;
1301                                         copied += chunk;
1302                                 }
1303                         }
1304                 }
1305                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1306                         if (net_ratelimit())
1307                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1308                                        current->comm, current->pid);
1309                         peek_seq = tp->copied_seq;
1310                 }
1311                 continue;
1312
1313         found_ok_skb:
1314                 /* Ok so how much can we use? */
1315                 used = skb->len - offset;
1316                 if (len < used)
1317                         used = len;
1318
1319                 /* Do we have urgent data here? */
1320                 if (tp->urg_data) {
1321                         u32 urg_offset = tp->urg_seq - *seq;
1322                         if (urg_offset < used) {
1323                                 if (!urg_offset) {
1324                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1325                                                 ++*seq;
1326                                                 offset++;
1327                                                 used--;
1328                                                 if (!used)
1329                                                         goto skip_copy;
1330                                         }
1331                                 } else
1332                                         used = urg_offset;
1333                         }
1334                 }
1335
1336                 if (!(flags & MSG_TRUNC)) {
1337                         err = skb_copy_datagram_iovec(skb, offset,
1338                                                       msg->msg_iov, used);
1339                         if (err) {
1340                                 /* Exception. Bailout! */
1341                                 if (!copied)
1342                                         copied = -EFAULT;
1343                                 break;
1344                         }
1345                 }
1346
1347                 *seq += used;
1348                 copied += used;
1349                 len -= used;
1350
1351                 tcp_rcv_space_adjust(sk);
1352
1353 skip_copy:
1354                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1355                         tp->urg_data = 0;
1356                         tcp_fast_path_check(sk, tp);
1357                 }
1358                 if (used + offset < skb->len)
1359                         continue;
1360
1361                 if (skb->h.th->fin)
1362                         goto found_fin_ok;
1363                 if (!(flags & MSG_PEEK))
1364                         sk_eat_skb(sk, skb);
1365                 continue;
1366
1367         found_fin_ok:
1368                 /* Process the FIN. */
1369                 ++*seq;
1370                 if (!(flags & MSG_PEEK))
1371                         sk_eat_skb(sk, skb);
1372                 break;
1373         } while (len > 0);
1374
1375         if (user_recv) {
1376                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1377                         int chunk;
1378
1379                         tp->ucopy.len = copied > 0 ? len : 0;
1380
1381                         tcp_prequeue_process(sk);
1382
1383                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1384                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1385                                 len -= chunk;
1386                                 copied += chunk;
1387                         }
1388                 }
1389
1390                 tp->ucopy.task = NULL;
1391                 tp->ucopy.len = 0;
1392         }
1393
1394         /* According to UNIX98, msg_name/msg_namelen are ignored
1395          * on a connected socket. I was just happy when I found this 8) --ANK
1396          */
1397
1398         /* Clean up data we have read: This will do ACK frames. */
1399         cleanup_rbuf(sk, copied);
1400
1401         TCP_CHECK_TIMER(sk);
1402         release_sock(sk);
1403         return copied;
1404
1405 out:
1406         TCP_CHECK_TIMER(sk);
1407         release_sock(sk);
1408         return err;
1409
1410 recv_urg:
1411         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1412         goto out;
1413 }
1414
1415 /*
1416  *      State processing on a close. This implements the state shift for
1417  *      sending our FIN frame. Note that we only send a FIN for some
1418  *      states. A shutdown() may have already sent the FIN, or we may be
1419  *      closed.
1420  */
1421
1422 static unsigned char new_state[16] = {
1423   /* current state:        new state:      action:      */
1424   /* (Invalid)          */ TCP_CLOSE,
1425   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1426   /* TCP_SYN_SENT       */ TCP_CLOSE,
1427   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1428   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1429   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1430   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1431   /* TCP_CLOSE          */ TCP_CLOSE,
1432   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1433   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1434   /* TCP_LISTEN         */ TCP_CLOSE,
1435   /* TCP_CLOSING        */ TCP_CLOSING,
1436 };
1437
1438 static int tcp_close_state(struct sock *sk)
1439 {
1440         int next = (int)new_state[sk->sk_state];
1441         int ns = next & TCP_STATE_MASK;
1442
1443         tcp_set_state(sk, ns);
1444
1445         return next & TCP_ACTION_FIN;
1446 }
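/* Worked example (illustrative, not part of the original code): for a socket
 * in TCP_CLOSE_WAIT, new_state[TCP_CLOSE_WAIT] is TCP_LAST_ACK | TCP_ACTION_FIN,
 * so tcp_close_state() moves the socket to TCP_LAST_ACK and returns a non-zero
 * TCP_ACTION_FIN bit, telling callers such as tcp_shutdown()/tcp_close() that
 * a FIN still has to be transmitted for this transition.
 */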
1447
1448 /*
1449  *      Shutdown the sending side of a connection. Much like close except
1450  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1451  */
1452
1453 void tcp_shutdown(struct sock *sk, int how)
1454 {
1455         /*      We need to grab some memory, and put together a FIN,
1456          *      and then put it into the queue to be sent.
1457          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1458          */
1459         if (!(how & SEND_SHUTDOWN))
1460                 return;
1461
1462         /* If we've already sent a FIN, or it's a closed state, skip this. */
1463         if ((1 << sk->sk_state) &
1464             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1465              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1466                 /* Clear out any half completed packets.  FIN if needed. */
1467                 if (tcp_close_state(sk))
1468                         tcp_send_fin(sk);
1469         }
1470 }
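/* Userspace sketch (illustrative; the helper below is hypothetical): a
 * shutdown(fd, SHUT_WR) call is what normally reaches tcp_shutdown() with
 * SEND_SHUTDOWN set - it queues a FIN but leaves the receive side usable.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void half_close(int fd)
{
	char buf[4096];

	shutdown(fd, SHUT_WR);			/* send our FIN */
	while (read(fd, buf, sizeof(buf)) > 0)
		;				/* drain until the peer closes too */
	close(fd);
}
#endif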
1471
1472 void tcp_close(struct sock *sk, long timeout)
1473 {
1474         struct sk_buff *skb;
1475         int data_was_unread = 0;
1476
1477         lock_sock(sk);
1478         sk->sk_shutdown = SHUTDOWN_MASK;
1479
1480         if (sk->sk_state == TCP_LISTEN) {
1481                 tcp_set_state(sk, TCP_CLOSE);
1482
1483                 /* Special case. */
1484                 inet_csk_listen_stop(sk);
1485
1486                 goto adjudge_to_death;
1487         }
1488
1489         /*  We need to flush the recv. buffs.  We do this only on the
1490          *  descriptor close, not protocol-sourced closes, because the
1491          *  reader process may not have drained the data yet!
1492          */
1493         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1494                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1495                           skb->h.th->fin;
1496                 data_was_unread += len;
1497                 __kfree_skb(skb);
1498         }
1499
1500         sk_stream_mem_reclaim(sk);
1501
1502         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1503          * 3.10, we send a RST here because data was lost.  To
1504          * witness the awful effects of the old behavior of always
1505          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1506          * a bulk GET in an FTP client, suspend the process, wait
1507          * for the client to advertise a zero window, then kill -9
1508          * the FTP client, wheee...  Note: timeout is always zero
1509          * in such a case.
1510          */
1511         if (data_was_unread) {
1512                 /* Unread data was tossed, zap the connection. */
1513                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1514                 tcp_set_state(sk, TCP_CLOSE);
1515                 tcp_send_active_reset(sk, GFP_KERNEL);
1516         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1517                 /* Check zero linger _after_ checking for unread data. */
1518                 sk->sk_prot->disconnect(sk, 0);
1519                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1520         } else if (tcp_close_state(sk)) {
1521                 /* We FIN if the application ate all the data before
1522                  * zapping the connection.
1523                  */
1524
1525                 /* RED-PEN. Formally speaking, we have broken TCP state
1526                  * machine. State transitions:
1527                  *
1528                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1529                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1530                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1531                  *
1532                  * are legal only when FIN has been sent (i.e. in window),
1533                  * rather than queued out of window. Purists blame.
1534                  *
1535                  * F.e. "RFC state" is ESTABLISHED,
1536                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1537                  *
1538                  * The visible deviations are that we sometimes
1539                  * enter the time-wait state when it is not really required
1540                  * (harmless), and do not send active resets when they are
1541                  * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1542                  * they look like CLOSING or LAST_ACK to Linux).
1543                  * Probably I missed some more holelets.
1544                  *                                              --ANK
1545                  */
1546                 tcp_send_fin(sk);
1547         }
1548
1549         sk_stream_wait_close(sk, timeout);
1550
1551 adjudge_to_death:
1552         /* It is the last release_sock in its life. It will remove backlog. */
1553         release_sock(sk);
1554
1555
1556         /* Now socket is owned by kernel and we acquire BH lock
1557            to finish close. No need to check for user refs.
1558          */
1559         local_bh_disable();
1560         bh_lock_sock(sk);
1561         BUG_TRAP(!sock_owned_by_user(sk));
1562
1563         sock_hold(sk);
1564         sock_orphan(sk);
1565
1566         /*      This is a (useful) BSD violation of the RFC. There is a
1567          *      problem with TCP as specified in that the other end could
1568          *      keep a socket open forever with no application left on this end.
1569          *      We use a 3 minute timeout (about the same as BSD) and then kill
1570          *      our end. If they send after that then tough - BUT: long enough
1571          *      that we won't make the old 4*rto = almost no time - whoops
1572          *      reset mistake.
1573          *
1574          *      Nope, it was not a mistake. It is really the desired behaviour,
1575          *      e.g. on http servers, where such sockets are useless but
1576          *      consume significant resources. Let's do it with the special
1577          *      linger2 option.                                 --ANK
1578          */
1579
1580         if (sk->sk_state == TCP_FIN_WAIT2) {
1581                 struct tcp_sock *tp = tcp_sk(sk);
1582                 if (tp->linger2 < 0) {
1583                         tcp_set_state(sk, TCP_CLOSE);
1584                         tcp_send_active_reset(sk, GFP_ATOMIC);
1585                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1586                 } else {
1587                         const int tmo = tcp_fin_time(sk);
1588
1589                         if (tmo > TCP_TIMEWAIT_LEN) {
1590                                 inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1591                         } else {
1592                                 atomic_inc(sk->sk_prot->orphan_count);
1593                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1594                                 goto out;
1595                         }
1596                 }
1597         }
1598         if (sk->sk_state != TCP_CLOSE) {
1599                 sk_stream_mem_reclaim(sk);
1600                 if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1601                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1602                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1603                         if (net_ratelimit())
1604                                 printk(KERN_INFO "TCP: too many orphaned "
1605                                        "sockets\n");
1606                         tcp_set_state(sk, TCP_CLOSE);
1607                         tcp_send_active_reset(sk, GFP_ATOMIC);
1608                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1609                 }
1610         }
1611         atomic_inc(sk->sk_prot->orphan_count);
1612
1613         if (sk->sk_state == TCP_CLOSE)
1614                 inet_csk_destroy_sock(sk);
1615         /* Otherwise, socket is reprieved until protocol close. */
1616
1617 out:
1618         bh_unlock_sock(sk);
1619         local_bh_enable();
1620         sock_put(sk);
1621 }
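/* Userspace sketch (illustrative; the helper below is hypothetical): the
 * zero-linger branch above (SOCK_LINGER with sk_lingertime == 0) is what an
 * application gets with SO_LINGER {l_onoff = 1, l_linger = 0}: close() then
 * aborts the connection via disconnect()/RST instead of the FIN handshake.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void abortive_close(int fd)
{
	struct linger lin = { .l_onoff = 1, .l_linger = 0 };

	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
	close(fd);		/* RST, no TIME_WAIT on this end */
}
#endif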
1622
1623 /* These states need RST on ABORT according to RFC793 */
1624
1625 static inline int tcp_need_reset(int state)
1626 {
1627         return (1 << state) &
1628                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1629                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1630 }
1631
1632 int tcp_disconnect(struct sock *sk, int flags)
1633 {
1634         struct inet_sock *inet = inet_sk(sk);
1635         struct inet_connection_sock *icsk = inet_csk(sk);
1636         struct tcp_sock *tp = tcp_sk(sk);
1637         int err = 0;
1638         int old_state = sk->sk_state;
1639
1640         if (old_state != TCP_CLOSE)
1641                 tcp_set_state(sk, TCP_CLOSE);
1642
1643         /* ABORT function of RFC793 */
1644         if (old_state == TCP_LISTEN) {
1645                 inet_csk_listen_stop(sk);
1646         } else if (tcp_need_reset(old_state) ||
1647                    (tp->snd_nxt != tp->write_seq &&
1648                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1649                 /* The last check adjusts for the discrepancy of Linux wrt. RFC
1650                  * states
1651                  */
1652                 tcp_send_active_reset(sk, gfp_any());
1653                 sk->sk_err = ECONNRESET;
1654         } else if (old_state == TCP_SYN_SENT)
1655                 sk->sk_err = ECONNRESET;
1656
1657         tcp_clear_xmit_timers(sk);
1658         __skb_queue_purge(&sk->sk_receive_queue);
1659         sk_stream_writequeue_purge(sk);
1660         __skb_queue_purge(&tp->out_of_order_queue);
1661
1662         inet->dport = 0;
1663
1664         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1665                 inet_reset_saddr(sk);
1666
1667         sk->sk_shutdown = 0;
1668         sock_reset_flag(sk, SOCK_DONE);
1669         tp->srtt = 0;
1670         if ((tp->write_seq += tp->max_window + 2) == 0)
1671                 tp->write_seq = 1;
1672         icsk->icsk_backoff = 0;
1673         tp->snd_cwnd = 2;
1674         tp->probes_out = 0;
1675         tp->packets_out = 0;
1676         tp->snd_ssthresh = 0x7fffffff;
1677         tp->snd_cwnd_cnt = 0;
1678         tcp_set_ca_state(tp, TCP_CA_Open);
1679         tcp_clear_retrans(tp);
1680         inet_csk_delack_init(sk);
1681         sk->sk_send_head = NULL;
1682         tp->rx_opt.saw_tstamp = 0;
1683         tcp_sack_reset(&tp->rx_opt);
1684         __sk_dst_reset(sk);
1685
1686         BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1687
1688         sk->sk_error_report(sk);
1689         return err;
1690 }
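/* Userspace sketch (illustrative; the helper below is hypothetical):
 * tcp_disconnect() is typically reached when an application dissolves a
 * connection by calling connect() with an AF_UNSPEC address, after which
 * the socket may be connected again.
 */
#if 0
#include <string.h>
#include <sys/socket.h>

static int tcp_dissolve(int fd)
{
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;
	return connect(fd, &sa, sizeof(sa));
}
#endif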
1691
1692 /*
1693  *      Socket option code for TCP.
1694  */
1695 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1696                    int optlen)
1697 {
1698         struct tcp_sock *tp = tcp_sk(sk);
1699         struct inet_connection_sock *icsk = inet_csk(sk);
1700         int val;
1701         int err = 0;
1702
1703         if (level != SOL_TCP)
1704                 return tp->af_specific->setsockopt(sk, level, optname,
1705                                                    optval, optlen);
1706
1707         /* This is a string value; all the others are ints. */
1708         if (optname == TCP_CONGESTION) {
1709                 char name[TCP_CA_NAME_MAX];
1710
1711                 if (optlen < 1)
1712                         return -EINVAL;
1713
1714                 val = strncpy_from_user(name, optval,
1715                                         min(TCP_CA_NAME_MAX-1, optlen));
1716                 if (val < 0)
1717                         return -EFAULT;
1718                 name[val] = 0;
1719
1720                 lock_sock(sk);
1721                 err = tcp_set_congestion_control(tp, name);
1722                 release_sock(sk);
1723                 return err;
1724         }
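        /* Userspace sketch (illustrative; the helper below is hypothetical and
         * assumes TCP_CONGESTION is visible via <netinet/tcp.h> or
         * <linux/tcp.h>): unlike the int options below, this one passes the
         * congestion control algorithm name as a string.
         */
#if 0
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

static int use_reno(int fd)
{
	static const char name[] = "reno";

	return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name));
}
#endif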
1725
1726         if (optlen < sizeof(int))
1727                 return -EINVAL;
1728
1729         if (get_user(val, (int __user *)optval))
1730                 return -EFAULT;
1731
1732         lock_sock(sk);
1733
1734         switch (optname) {
1735         case TCP_MAXSEG:
1736                 /* Values greater than the interface MTU won't take effect. However,
1737                  * at the point when this call is done we typically don't yet
1738                  * know which interface is going to be used. */
1739                 if (val < 8 || val > MAX_TCP_WINDOW) {
1740                         err = -EINVAL;
1741                         break;
1742                 }
1743                 tp->rx_opt.user_mss = val;
1744                 break;
1745
1746         case TCP_NODELAY:
1747                 if (val) {
1748                         /* TCP_NODELAY is weaker than TCP_CORK, so that
1749                          * this option on corked socket is remembered, but
1750                          * it is not activated until cork is cleared.
1751                          *
1752                          * However, when TCP_NODELAY is set we make
1753                          * an explicit push, which overrides even TCP_CORK
1754                          * for currently queued segments.
1755                          */
1756                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1757                         tcp_push_pending_frames(sk, tp);
1758                 } else {
1759                         tp->nonagle &= ~TCP_NAGLE_OFF;
1760                 }
1761                 break;
1762
1763         case TCP_CORK:
1764                 /* When set, indicates to always queue non-full frames.
1765                  * Later the user clears this option and we transmit
1766                  * any pending partial frames in the queue.  This is
1767                  * meant to be used alongside sendfile() to get properly
1768                  * filled frames when the user (for example) must write
1769                  * out headers with a write() call first and then use
1770                  * sendfile to send out the data parts.
1771                  *
1772                  * TCP_CORK can be set together with TCP_NODELAY and it is
1773                  * stronger than TCP_NODELAY.
1774                  */
1775                 if (val) {
1776                         tp->nonagle |= TCP_NAGLE_CORK;
1777                 } else {
1778                         tp->nonagle &= ~TCP_NAGLE_CORK;
1779                         if (tp->nonagle&TCP_NAGLE_OFF)
1780                                 tp->nonagle |= TCP_NAGLE_PUSH;
1781                         tcp_push_pending_frames(sk, tp);
1782                 }
1783                 break;
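                /* Userspace sketch (illustrative; the helper below is
                 * hypothetical) of the header-plus-sendfile() pattern the
                 * comment above describes: cork, write the headers, stream
                 * the file, then uncork to flush the last partial frame.
                 */
#if 0
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <unistd.h>

static void send_with_headers(int sock, int filefd, const char *hdr,
			      size_t hdrlen, size_t filelen)
{
	int on = 1, off = 0;

	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	write(sock, hdr, hdrlen);			/* queued, not pushed */
	sendfile(sock, filefd, NULL, filelen);		/* full frames only */
	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
}
#endif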
1784
1785         case TCP_KEEPIDLE:
1786                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
1787                         err = -EINVAL;
1788                 else {
1789                         tp->keepalive_time = val * HZ;
1790                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
1791                             !((1 << sk->sk_state) &
1792                               (TCPF_CLOSE | TCPF_LISTEN))) {
1793                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1794                                 if (tp->keepalive_time > elapsed)
1795                                         elapsed = tp->keepalive_time - elapsed;
1796                                 else
1797                                         elapsed = 0;
1798                                 inet_csk_reset_keepalive_timer(sk, elapsed);
1799                         }
1800                 }
1801                 break;
1802         case TCP_KEEPINTVL:
1803                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
1804                         err = -EINVAL;
1805                 else
1806                         tp->keepalive_intvl = val * HZ;
1807                 break;
1808         case TCP_KEEPCNT:
1809                 if (val < 1 || val > MAX_TCP_KEEPCNT)
1810                         err = -EINVAL;
1811                 else
1812                         tp->keepalive_probes = val;
1813                 break;
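                /* Userspace sketch (illustrative; the helper below is
                 * hypothetical): the three keepalive knobs above only matter
                 * once SO_KEEPALIVE is enabled; the times are in seconds
                 * (converted to jiffies above), TCP_KEEPCNT is a probe count.
                 */
#if 0
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void enable_keepalive(int fd)
{
	int on = 1, idle = 60, intvl = 10, cnt = 5;

	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
}
#endif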
1814         case TCP_SYNCNT:
1815                 if (val < 1 || val > MAX_TCP_SYNCNT)
1816                         err = -EINVAL;
1817                 else
1818                         icsk->icsk_syn_retries = val;
1819                 break;
1820
1821         case TCP_LINGER2:
1822                 if (val < 0)
1823                         tp->linger2 = -1;
1824                 else if (val > sysctl_tcp_fin_timeout / HZ)
1825                         tp->linger2 = 0;
1826                 else
1827                         tp->linger2 = val * HZ;
1828                 break;
1829
1830         case TCP_DEFER_ACCEPT:
1831                 icsk->icsk_accept_queue.rskq_defer_accept = 0;
1832                 if (val > 0) {
1833                         /* Translate value in seconds to number of
1834                          * retransmits */
1835                         while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
1836                                val > ((TCP_TIMEOUT_INIT / HZ) <<
1837                                        icsk->icsk_accept_queue.rskq_defer_accept))
1838                                 icsk->icsk_accept_queue.rskq_defer_accept++;
1839                         icsk->icsk_accept_queue.rskq_defer_accept++;
1840                 }
1841                 break;
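        /* Worked example (illustrative, assuming the usual TCP_TIMEOUT_INIT
         * of 3 * HZ): a request of val == 10 seconds loops while 10 > 3 and
         * 10 > 6 (rskq_defer_accept reaches 2), stops because 10 > 12 is
         * false, and the final increment leaves rskq_defer_accept == 3;
         * getsockopt(TCP_DEFER_ACCEPT) then reports 3 << (3 - 1) = 12 seconds.
         */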
1842
1843         case TCP_WINDOW_CLAMP:
1844                 if (!val) {
1845                         if (sk->sk_state != TCP_CLOSE) {
1846                                 err = -EINVAL;
1847                                 break;
1848                         }
1849                         tp->window_clamp = 0;
1850                 } else
1851                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1852                                                 SOCK_MIN_RCVBUF / 2 : val;
1853                 break;
1854
1855         case TCP_QUICKACK:
1856                 if (!val) {
1857                         icsk->icsk_ack.pingpong = 1;
1858                 } else {
1859                         icsk->icsk_ack.pingpong = 0;
1860                         if ((1 << sk->sk_state) &
1861                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1862                             inet_csk_ack_scheduled(sk)) {
1863                                 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1864                                 cleanup_rbuf(sk, 1);
1865                                 if (!(val & 1))
1866                                         icsk->icsk_ack.pingpong = 1;
1867                         }
1868                 }
1869                 break;
1870
1871         default:
1872                 err = -ENOPROTOOPT;
1873                 break;
1874         };
1875         release_sock(sk);
1876         return err;
1877 }
1878
1879 /* Return information about state of tcp endpoint in API format. */
1880 void tcp_get_info(struct sock *sk, struct tcp_info *info)
1881 {
1882         struct tcp_sock *tp = tcp_sk(sk);
1883         const struct inet_connection_sock *icsk = inet_csk(sk);
1884         u32 now = tcp_time_stamp;
1885
1886         memset(info, 0, sizeof(*info));
1887
1888         info->tcpi_state = sk->sk_state;
1889         info->tcpi_ca_state = tp->ca_state;
1890         info->tcpi_retransmits = icsk->icsk_retransmits;
1891         info->tcpi_probes = tp->probes_out;
1892         info->tcpi_backoff = icsk->icsk_backoff;
1893
1894         if (tp->rx_opt.tstamp_ok)
1895                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1896         if (tp->rx_opt.sack_ok)
1897                 info->tcpi_options |= TCPI_OPT_SACK;
1898         if (tp->rx_opt.wscale_ok) {
1899                 info->tcpi_options |= TCPI_OPT_WSCALE;
1900                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
1901                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
1902         } 
1903
1904         if (tp->ecn_flags&TCP_ECN_OK)
1905                 info->tcpi_options |= TCPI_OPT_ECN;
1906
1907         info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
1908         info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
1909         info->tcpi_snd_mss = tp->mss_cache;
1910         info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
1911
1912         info->tcpi_unacked = tp->packets_out;
1913         info->tcpi_sacked = tp->sacked_out;
1914         info->tcpi_lost = tp->lost_out;
1915         info->tcpi_retrans = tp->retrans_out;
1916         info->tcpi_fackets = tp->fackets_out;
1917
1918         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
1919         info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
1920         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
1921
1922         info->tcpi_pmtu = tp->pmtu_cookie;
1923         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
1924         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
1925         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
1926         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
1927         info->tcpi_snd_cwnd = tp->snd_cwnd;
1928         info->tcpi_advmss = tp->advmss;
1929         info->tcpi_reordering = tp->reordering;
1930
1931         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
1932         info->tcpi_rcv_space = tp->rcvq_space.space;
1933
1934         info->tcpi_total_retrans = tp->total_retrans;
1935 }
1936
1937 EXPORT_SYMBOL_GPL(tcp_get_info);
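/* Userspace sketch (illustrative; the helper below is hypothetical and assumes
 * struct tcp_info is available from <netinet/tcp.h>): tcp_get_info() backs the
 * TCP_INFO getsockopt, which copies out as much of struct tcp_info as the
 * caller's buffer length allows.
 */
#if 0
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int get_srtt_usec(int fd, unsigned int *srtt)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
		return -1;
	*srtt = info.tcpi_rtt;		/* smoothed RTT in microseconds */
	return 0;
}
#endif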
1938
1939 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
1940                    int __user *optlen)
1941 {
1942         struct inet_connection_sock *icsk = inet_csk(sk);
1943         struct tcp_sock *tp = tcp_sk(sk);
1944         int val, len;
1945
1946         if (level != SOL_TCP)
1947                 return tp->af_specific->getsockopt(sk, level, optname,
1948                                                    optval, optlen);
1949
1950         if (get_user(len, optlen))
1951                 return -EFAULT;
1952
1953         len = min_t(unsigned int, len, sizeof(int));
1954
1955         if (len < 0)
1956                 return -EINVAL;
1957
1958         switch (optname) {
1959         case TCP_MAXSEG:
1960                 val = tp->mss_cache;
1961                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
1962                         val = tp->rx_opt.user_mss;
1963                 break;
1964         case TCP_NODELAY:
1965                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
1966                 break;
1967         case TCP_CORK:
1968                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
1969                 break;
1970         case TCP_KEEPIDLE:
1971                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
1972                 break;
1973         case TCP_KEEPINTVL:
1974                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
1975                 break;
1976         case TCP_KEEPCNT:
1977                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
1978                 break;
1979         case TCP_SYNCNT:
1980                 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
1981                 break;
1982         case TCP_LINGER2:
1983                 val = tp->linger2;
1984                 if (val >= 0)
1985                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
1986                 break;
1987         case TCP_DEFER_ACCEPT:
1988                 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
1989                         ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
1990                 break;
1991         case TCP_WINDOW_CLAMP:
1992                 val = tp->window_clamp;
1993                 break;
1994         case TCP_INFO: {
1995                 struct tcp_info info;
1996
1997                 if (get_user(len, optlen))
1998                         return -EFAULT;
1999
2000                 tcp_get_info(sk, &info);
2001
2002                 len = min_t(unsigned int, len, sizeof(info));
2003                 if (put_user(len, optlen))
2004                         return -EFAULT;
2005                 if (copy_to_user(optval, &info, len))
2006                         return -EFAULT;
2007                 return 0;
2008         }
2009         case TCP_QUICKACK:
2010                 val = !icsk->icsk_ack.pingpong;
2011                 break;
2012
2013         case TCP_CONGESTION:
2014                 if (get_user(len, optlen))
2015                         return -EFAULT;
2016                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2017                 if (put_user(len, optlen))
2018                         return -EFAULT;
2019                 if (copy_to_user(optval, tp->ca_ops->name, len))
2020                         return -EFAULT;
2021                 return 0;
2022         default:
2023                 return -ENOPROTOOPT;
2024         };
2025
2026         if (put_user(len, optlen))
2027                 return -EFAULT;
2028         if (copy_to_user(optval, &val, len))
2029                 return -EFAULT;
2030         return 0;
2031 }
2032
2033
2034 extern void __skb_cb_too_small_for_tcp(int, int);
2035 extern struct tcp_congestion_ops tcp_reno;
2036
2037 static __initdata unsigned long thash_entries;
2038 static int __init set_thash_entries(char *str)
2039 {
2040         if (!str)
2041                 return 0;
2042         thash_entries = simple_strtoul(str, &str, 0);
2043         return 1;
2044 }
2045 __setup("thash_entries=", set_thash_entries);
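/* Usage note (illustrative): the established-hash size chosen below can be
 * overridden at boot time, e.g. by appending "thash_entries=131072" to the
 * kernel command line; otherwise alloc_large_system_hash() sizes it from the
 * amount of available memory.
 */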
2046
2047 void __init tcp_init(void)
2048 {
2049         struct sk_buff *skb = NULL;
2050         int order, i;
2051
2052         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2053                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2054                                            sizeof(skb->cb));
2055
2056         tcp_hashinfo.bind_bucket_cachep =
2057                 kmem_cache_create("tcp_bind_bucket",
2058                                   sizeof(struct inet_bind_bucket), 0,
2059                                   SLAB_HWCACHE_ALIGN, NULL, NULL);
2060         if (!tcp_hashinfo.bind_bucket_cachep)
2061                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2062
2063         /* Size and allocate the main established and bind bucket
2064          * hash tables.
2065          *
2066          * The methodology is similar to that of the buffer cache.
2067          */
2068         tcp_hashinfo.ehash =
2069                 alloc_large_system_hash("TCP established",
2070                                         sizeof(struct inet_ehash_bucket),
2071                                         thash_entries,
2072                                         (num_physpages >= 128 * 1024) ?
2073                                                 (25 - PAGE_SHIFT) :
2074                                                 (27 - PAGE_SHIFT),
2075                                         HASH_HIGHMEM,
2076                                         &tcp_hashinfo.ehash_size,
2077                                         NULL,
2078                                         0);
2079         tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2080         for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2081                 rwlock_init(&tcp_hashinfo.ehash[i].lock);
2082                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2083         }
2084
2085         tcp_hashinfo.bhash =
2086                 alloc_large_system_hash("TCP bind",
2087                                         sizeof(struct inet_bind_hashbucket),
2088                                         tcp_hashinfo.ehash_size,
2089                                         (num_physpages >= 128 * 1024) ?
2090                                                 (25 - PAGE_SHIFT) :
2091                                                 (27 - PAGE_SHIFT),
2092                                         HASH_HIGHMEM,
2093                                         &tcp_hashinfo.bhash_size,
2094                                         NULL,
2095                                         64 * 1024);
2096         tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2097         for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2098                 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2099                 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2100         }
2101
2102         /* Try to be a bit smarter and adjust defaults depending
2103          * on available memory.
2104          */
2105         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2106                         (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2107                         order++)
2108                 ;
2109         if (order >= 4) {
2110                 sysctl_local_port_range[0] = 32768;
2111                 sysctl_local_port_range[1] = 61000;
2112                 tcp_death_row.sysctl_max_tw_buckets = 180000;
2113                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2114                 sysctl_max_syn_backlog = 1024;
2115         } else if (order < 3) {
2116                 sysctl_local_port_range[0] = 1024 * (3 - order);
2117                 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2118                 sysctl_tcp_max_orphans >>= (3 - order);
2119                 sysctl_max_syn_backlog = 128;
2120         }
2121         tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
2122
2123         sysctl_tcp_mem[0] =  768 << order;
2124         sysctl_tcp_mem[1] = 1024 << order;
2125         sysctl_tcp_mem[2] = 1536 << order;
2126
2127         if (order < 3) {
2128                 sysctl_tcp_wmem[2] = 64 * 1024;
2129                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2130                 sysctl_tcp_rmem[1] = 43689;
2131                 sysctl_tcp_rmem[2] = 2 * 43689;
2132         }
2133
2134         printk(KERN_INFO "TCP: Hash tables configured "
2135                "(established %d bind %d)\n",
2136                tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2137
2138         tcp_register_congestion_control(&tcp_reno);
2139 }
2140
2141 EXPORT_SYMBOL(tcp_close);
2142 EXPORT_SYMBOL(tcp_disconnect);
2143 EXPORT_SYMBOL(tcp_getsockopt);
2144 EXPORT_SYMBOL(tcp_ioctl);
2145 EXPORT_SYMBOL(tcp_poll);
2146 EXPORT_SYMBOL(tcp_read_sock);
2147 EXPORT_SYMBOL(tcp_recvmsg);
2148 EXPORT_SYMBOL(tcp_sendmsg);
2149 EXPORT_SYMBOL(tcp_sendpage);
2150 EXPORT_SYMBOL(tcp_setsockopt);
2151 EXPORT_SYMBOL(tcp_shutdown);
2152 EXPORT_SYMBOL(tcp_statistics);