[TCP]: Simplify SKB data portion allocation with NETIF_F_SG.
net/ipv4/tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
 28  *              Alan Cox        :       All icmp error handling was broken;
 29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
 99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up with retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
 213  *              2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
 243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265
266
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274 kmem_cache_t *tcp_bucket_cachep;
275 kmem_cache_t *tcp_timewait_cachep;
276
277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279 int sysctl_tcp_mem[3];
280 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282
283 EXPORT_SYMBOL(sysctl_tcp_mem);
284 EXPORT_SYMBOL(sysctl_tcp_rmem);
285 EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
288 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
289
290 EXPORT_SYMBOL(tcp_memory_allocated);
291 EXPORT_SYMBOL(tcp_sockets_allocated);
292
293 /*
294  * Pressure flag: try to collapse.
 295  * Technical note: it is used by multiple contexts non-atomically.
 296  * All of sk_stream_mem_schedule() is of this nature: accounting
297  * is strict, actions are advisory and have some latency.
298  */
299 int tcp_memory_pressure;
300
301 EXPORT_SYMBOL(tcp_memory_pressure);
302
303 void tcp_enter_memory_pressure(void)
304 {
305         if (!tcp_memory_pressure) {
306                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307                 tcp_memory_pressure = 1;
308         }
309 }
310
311 EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313 /*
314  * LISTEN is a special case for poll..
315  */
316 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317                                                poll_table *wait)
318 {
319         return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320 }
321
322 /*
323  *      Wait for a TCP event.
324  *
325  *      Note that we don't need to lock the socket, as the upper poll layers
326  *      take care of normal races (between the test and the event) and we don't
327  *      go look at any of the socket buffers directly.
328  */
329 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330 {
331         unsigned int mask;
332         struct sock *sk = sock->sk;
333         struct tcp_sock *tp = tcp_sk(sk);
334
335         poll_wait(file, sk->sk_sleep, wait);
336         if (sk->sk_state == TCP_LISTEN)
337                 return tcp_listen_poll(sk, wait);
338
339         /* Socket is not locked. We are protected from async events
340            by poll logic and correct handling of state changes
 341            made by other threads is impossible in any case.
342          */
343
344         mask = 0;
345         if (sk->sk_err)
346                 mask = POLLERR;
347
348         /*
349          * POLLHUP is certainly not done right. But poll() doesn't
350          * have a notion of HUP in just one direction, and for a
351          * socket the read side is more interesting.
352          *
353          * Some poll() documentation says that POLLHUP is incompatible
 354          * with the POLLOUT/POLLWR flags, so somebody should check all of
 355          * this. But be careful: it tends to be safer to return too many
356          * bits than too few, and you can easily break real applications
357          * if you don't tell them that something has hung up!
358          *
359          * Check-me.
360          *
 361          * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
 362          * our fs/select.c). It means that after we have received EOF,
 363          * poll always returns immediately, making poll() for write readiness
 364          * impossible in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
365          * if and only if shutdown has been made in both directions.
 366          * Actually, it is interesting to look at how Solaris and DUX
 367          * solve this dilemma. I would prefer it if POLLHUP were maskable;
 368          * then we could set it on SND_SHUTDOWN. BTW the examples given
 369          * in Stevens' books assume exactly this behaviour, which explains
 370          * why POLLHUP is incompatible with POLLOUT.    --ANK
371          *
 372          * NOTE. A check for TCP_CLOSE is added. The goal is to prevent
 373          * blocking on a fresh, not-yet-connected or disconnected socket. --ANK
374          */
375         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376                 mask |= POLLHUP;
377         if (sk->sk_shutdown & RCV_SHUTDOWN)
378                 mask |= POLLIN | POLLRDNORM;
379
380         /* Connected? */
381         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 382                 /* Potential race condition. If the read of tp below
 383                  * escapes above sk->sk_state, we can be illegally awakened
 384                  * in SYN_* states. */
385                 if ((tp->rcv_nxt != tp->copied_seq) &&
386                     (tp->urg_seq != tp->copied_seq ||
387                      tp->rcv_nxt != tp->copied_seq + 1 ||
388                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389                         mask |= POLLIN | POLLRDNORM;
390
391                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393                                 mask |= POLLOUT | POLLWRNORM;
394                         } else {  /* send SIGIO later */
395                                 set_bit(SOCK_ASYNC_NOSPACE,
396                                         &sk->sk_socket->flags);
397                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398
399                                 /* Race breaker. If space is freed after
400                                  * wspace test but before the flags are set,
401                                  * IO signal will be lost.
402                                  */
403                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404                                         mask |= POLLOUT | POLLWRNORM;
405                         }
406                 }
407
408                 if (tp->urg_data & TCP_URG_VALID)
409                         mask |= POLLPRI;
410         }
411         return mask;
412 }
413
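/*
 *      Socket ioctl()s: SIOCINQ reports the number of unread bytes queued on
 *      the socket, SIOCATMARK whether the read pointer is at the urgent mark,
 *      and SIOCOUTQ the number of bytes written but not yet acknowledged.
 */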
414 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415 {
416         struct tcp_sock *tp = tcp_sk(sk);
417         int answ;
418
419         switch (cmd) {
420         case SIOCINQ:
421                 if (sk->sk_state == TCP_LISTEN)
422                         return -EINVAL;
423
424                 lock_sock(sk);
425                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426                         answ = 0;
427                 else if (sock_flag(sk, SOCK_URGINLINE) ||
428                          !tp->urg_data ||
429                          before(tp->urg_seq, tp->copied_seq) ||
430                          !before(tp->urg_seq, tp->rcv_nxt)) {
431                         answ = tp->rcv_nxt - tp->copied_seq;
432
433                         /* Subtract 1, if FIN is in queue. */
434                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435                                 answ -=
436                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437                 } else
438                         answ = tp->urg_seq - tp->copied_seq;
439                 release_sock(sk);
440                 break;
441         case SIOCATMARK:
442                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443                 break;
444         case SIOCOUTQ:
445                 if (sk->sk_state == TCP_LISTEN)
446                         return -EINVAL;
447
448                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449                         answ = 0;
450                 else
451                         answ = tp->write_seq - tp->snd_una;
452                 break;
453         default:
454                 return -ENOIOCTLCMD;
455         };
456
457         return put_user(answ, (int __user *)arg);
458 }
459
460
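/*
 *      Move a socket into the LISTEN state: allocate the SYN/accept queue,
 *      claim the local port via get_port() and hash the socket so that
 *      incoming connection requests can find it.
 */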
461 int tcp_listen_start(struct sock *sk)
462 {
463         struct inet_sock *inet = inet_sk(sk);
464         struct tcp_sock *tp = tcp_sk(sk);
465         int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467         if (rc != 0)
468                 return rc;
469
470         sk->sk_max_ack_backlog = 0;
471         sk->sk_ack_backlog = 0;
472         tcp_delack_init(tp);
473
 474         /* There is a race window here: we announce ourselves listening,
 475          * but this transition is still not validated by get_port().
 476          * It is OK, because this socket enters the hash table only
477          * after validation is complete.
478          */
479         sk->sk_state = TCP_LISTEN;
480         if (!sk->sk_prot->get_port(sk, inet->num)) {
481                 inet->sport = htons(inet->num);
482
483                 sk_dst_reset(sk);
484                 sk->sk_prot->hash(sk);
485
486                 return 0;
487         }
488
489         sk->sk_state = TCP_CLOSE;
490         reqsk_queue_destroy(&tp->accept_queue);
491         return -EADDRINUSE;
492 }
493
494 /*
495  *      This routine closes sockets which have been at least partially
496  *      opened, but not yet accepted.
497  */
498
499 static void tcp_listen_stop (struct sock *sk)
500 {
501         struct tcp_sock *tp = tcp_sk(sk);
502         struct listen_sock *lopt;
503         struct request_sock *acc_req;
504         struct request_sock *req;
505         int i;
506
507         tcp_delete_keepalive_timer(sk);
508
509         /* make all the listen_opt local to us */
510         lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
511         acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512
513         if (lopt->qlen) {
514                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
515                         while ((req = lopt->syn_table[i]) != NULL) {
516                                 lopt->syn_table[i] = req->dl_next;
517                                 lopt->qlen--;
518                                 reqsk_free(req);
519
520                 /* Following specs, it would be better either to send FIN
521                  * (and enter FIN-WAIT-1, it is normal close)
522                  * or to send active reset (abort).
 523                  * Certainly, it is pretty dangerous during a synflood, but that is
 524                  * a bad justification for our negligence 8)
525                  * To be honest, we are not able to make either
526                  * of the variants now.                 --ANK
527                  */
528                         }
529                 }
530         }
531         BUG_TRAP(!lopt->qlen);
532
533         kfree(lopt);
534
535         while ((req = acc_req) != NULL) {
536                 struct sock *child = req->sk;
537
538                 acc_req = req->dl_next;
539
540                 local_bh_disable();
541                 bh_lock_sock(child);
542                 BUG_TRAP(!sock_owned_by_user(child));
543                 sock_hold(child);
544
545                 tcp_disconnect(child, O_NONBLOCK);
546
547                 sock_orphan(child);
548
549                 atomic_inc(&tcp_orphan_count);
550
551                 tcp_destroy_sock(child);
552
553                 bh_unlock_sock(child);
554                 local_bh_enable();
555                 sock_put(child);
556
557                 sk_acceptq_removed(sk);
558                 __reqsk_free(req);
559         }
560         BUG_TRAP(!sk->sk_ack_backlog);
561 }
562
563 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
564 {
565         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
566         tp->pushed_seq = tp->write_seq;
567 }
568
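/* A push is forced once more than half of the peer's largest advertised
 * window has been written since the last pushed sequence.
 */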
569 static inline int forced_push(struct tcp_sock *tp)
570 {
571         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
572 }
573
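/* Append a freshly allocated skb to the tail of the write queue, charge
 * its memory to the socket and make it the send head if nothing else is
 * waiting to be transmitted.
 */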
574 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
575                               struct sk_buff *skb)
576 {
577         skb->csum = 0;
578         TCP_SKB_CB(skb)->seq = tp->write_seq;
579         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
580         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
581         TCP_SKB_CB(skb)->sacked = 0;
582         skb_header_release(skb);
583         __skb_queue_tail(&sk->sk_write_queue, skb);
584         sk_charge_skb(sk, skb);
585         if (!sk->sk_send_head)
586                 sk->sk_send_head = skb;
587         else if (tp->nonagle&TCP_NAGLE_PUSH)
588                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
589 }
590
591 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
592                                 struct sk_buff *skb)
593 {
594         if (flags & MSG_OOB) {
595                 tp->urg_mode = 1;
596                 tp->snd_up = tp->write_seq;
597                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
598         }
599 }
600
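/* Push out pending data: mark PSH and URG on the last skb as needed and
 * let __tcp_push_pending_frames() transmit what the window allows.
 */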
601 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
602                             int mss_now, int nonagle)
603 {
604         if (sk->sk_send_head) {
605                 struct sk_buff *skb = sk->sk_write_queue.prev;
606                 if (!(flags & MSG_MORE) || forced_push(tp))
607                         tcp_mark_push(tp, skb);
608                 tcp_mark_urg(tp, flags, skb);
609                 __tcp_push_pending_frames(sk, tp, mss_now,
610                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
611         }
612 }
613
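/*
 *      Zero-copy transmit path behind tcp_sendpage(): the caller's pages are
 *      attached to skb page fragments (coalescing with the last fragment when
 *      possible) instead of being copied into the skb's linear data area.
 */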
614 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
615                          size_t psize, int flags)
616 {
617         struct tcp_sock *tp = tcp_sk(sk);
618         int mss_now;
619         int err;
620         ssize_t copied;
621         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
622
623         /* Wait for a connection to finish. */
624         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
625                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
626                         goto out_err;
627
628         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629
630         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631         copied = 0;
632
633         err = -EPIPE;
634         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
635                 goto do_error;
636
637         while (psize > 0) {
638                 struct sk_buff *skb = sk->sk_write_queue.prev;
639                 struct page *page = pages[poffset / PAGE_SIZE];
640                 int copy, i, can_coalesce;
641                 int offset = poffset % PAGE_SIZE;
642                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643
644                 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
645 new_segment:
646                         if (!sk_stream_memory_free(sk))
647                                 goto wait_for_sndbuf;
648
649                         skb = sk_stream_alloc_pskb(sk, 0, 0,
650                                                    sk->sk_allocation);
651                         if (!skb)
652                                 goto wait_for_memory;
653
654                         skb_entail(sk, tp, skb);
655                         copy = mss_now;
656                 }
657
658                 if (copy > size)
659                         copy = size;
660
661                 i = skb_shinfo(skb)->nr_frags;
662                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
663                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
664                         tcp_mark_push(tp, skb);
665                         goto new_segment;
666                 }
667                 if (sk->sk_forward_alloc < copy &&
668                     !sk_stream_mem_schedule(sk, copy, 0))
669                         goto wait_for_memory;
670                 
671                 if (can_coalesce) {
672                         skb_shinfo(skb)->frags[i - 1].size += copy;
673                 } else {
674                         get_page(page);
675                         skb_fill_page_desc(skb, i, page, offset, copy);
676                 }
677
678                 skb->len += copy;
679                 skb->data_len += copy;
680                 skb->truesize += copy;
681                 sk->sk_wmem_queued += copy;
682                 sk->sk_forward_alloc -= copy;
683                 skb->ip_summed = CHECKSUM_HW;
684                 tp->write_seq += copy;
685                 TCP_SKB_CB(skb)->end_seq += copy;
686                 skb_shinfo(skb)->tso_segs = 0;
687
688                 if (!copied)
689                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
690
691                 copied += copy;
692                 poffset += copy;
693                 if (!(psize -= copy))
694                         goto out;
695
696                 if (skb->len != mss_now || (flags & MSG_OOB))
697                         continue;
698
699                 if (forced_push(tp)) {
700                         tcp_mark_push(tp, skb);
701                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
702                 } else if (skb == sk->sk_send_head)
703                         tcp_push_one(sk, mss_now);
704                 continue;
705
706 wait_for_sndbuf:
707                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
708 wait_for_memory:
709                 if (copied)
710                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
711
712                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
713                         goto do_error;
714
715                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
716         }
717
718 out:
719         if (copied)
720                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
721         return copied;
722
723 do_error:
724         if (copied)
725                 goto out;
726 out_err:
727         return sk_stream_error(sk, flags, err);
728 }
729
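/* sendpage() entry point: fall back to the copying sock_no_sendpage()
 * unless the route supports scatter/gather and checksum offload.
 */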
730 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
731                      size_t size, int flags)
732 {
733         ssize_t res;
734         struct sock *sk = sock->sk;
735
736 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
737
738         if (!(sk->sk_route_caps & NETIF_F_SG) ||
739             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
740                 return sock_no_sendpage(sock, page, offset, size, flags);
741
742 #undef TCP_ZC_CSUM_FLAGS
743
744         lock_sock(sk);
745         TCP_CHECK_TIMER(sk);
746         res = do_tcp_sendpages(sk, &page, offset, size, flags);
747         TCP_CHECK_TIMER(sk);
748         release_sock(sk);
749         return res;
750 }
751
752 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
753 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
754
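/* Size of the linear data area to allocate for a new segment.  On a
 * scatter/gather capable route the payload is placed in page fragments
 * instead, so no linear data space is reserved at all.
 */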
755 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756 {
757         int tmp = tp->mss_cache_std;
758
759         if (sk->sk_route_caps & NETIF_F_SG)
760                 tmp = 0;
761
762         return tmp;
763 }
764
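/*
 *      Copy user data into new or partially filled skbs on the write queue,
 *      pushing segments out as they fill, subject to the current MSS, Nagle
 *      and urgent-data settings.
 */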
765 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
766                 size_t size)
767 {
768         struct iovec *iov;
769         struct tcp_sock *tp = tcp_sk(sk);
770         struct sk_buff *skb;
771         int iovlen, flags;
772         int mss_now;
773         int err, copied;
774         long timeo;
775
776         lock_sock(sk);
777         TCP_CHECK_TIMER(sk);
778
779         flags = msg->msg_flags;
780         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
781
782         /* Wait for a connection to finish. */
783         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
784                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
785                         goto out_err;
786
787         /* This should be in poll */
788         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
789
790         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
791
792         /* Ok commence sending. */
793         iovlen = msg->msg_iovlen;
794         iov = msg->msg_iov;
795         copied = 0;
796
797         err = -EPIPE;
798         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
799                 goto do_error;
800
801         while (--iovlen >= 0) {
802                 int seglen = iov->iov_len;
803                 unsigned char __user *from = iov->iov_base;
804
805                 iov++;
806
807                 while (seglen > 0) {
808                         int copy;
809
810                         skb = sk->sk_write_queue.prev;
811
812                         if (!sk->sk_send_head ||
813                             (copy = mss_now - skb->len) <= 0) {
814
815 new_segment:
 816                                 /* Allocate a new segment. If the interface is SG,
 817                                  * allocate an skb fitting into a single page.
818                                  */
819                                 if (!sk_stream_memory_free(sk))
820                                         goto wait_for_sndbuf;
821
822                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
823                                                            0, sk->sk_allocation);
824                                 if (!skb)
825                                         goto wait_for_memory;
826
827                                 /*
828                                  * Check whether we can use HW checksum.
829                                  */
830                                 if (sk->sk_route_caps &
831                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
832                                      NETIF_F_HW_CSUM))
833                                         skb->ip_summed = CHECKSUM_HW;
834
835                                 skb_entail(sk, tp, skb);
836                                 copy = mss_now;
837                         }
838
839                         /* Try to append data to the end of skb. */
840                         if (copy > seglen)
841                                 copy = seglen;
842
843                         /* Where to copy to? */
844                         if (skb_tailroom(skb) > 0) {
845                                 /* We have some space in skb head. Superb! */
846                                 if (copy > skb_tailroom(skb))
847                                         copy = skb_tailroom(skb);
848                                 if ((err = skb_add_data(skb, from, copy)) != 0)
849                                         goto do_fault;
850                         } else {
851                                 int merge = 0;
852                                 int i = skb_shinfo(skb)->nr_frags;
853                                 struct page *page = TCP_PAGE(sk);
854                                 int off = TCP_OFF(sk);
855
856                                 if (skb_can_coalesce(skb, i, page, off) &&
857                                     off != PAGE_SIZE) {
858                                         /* We can extend the last page
859                                          * fragment. */
860                                         merge = 1;
861                                 } else if (i == MAX_SKB_FRAGS ||
862                                            (!i &&
863                                            !(sk->sk_route_caps & NETIF_F_SG))) {
 864                                         /* Need to add a new fragment and cannot
 865                                          * do this because the interface is non-SG,
 866                                          * or because all the page slots are
 867                                          * busy. */
868                                         tcp_mark_push(tp, skb);
869                                         goto new_segment;
870                                 } else if (page) {
871                                         if (off == PAGE_SIZE) {
872                                                 put_page(page);
873                                                 TCP_PAGE(sk) = page = NULL;
874                                         }
875                                 }
876
877                                 if (!page) {
878                                         /* Allocate new cache page. */
879                                         if (!(page = sk_stream_alloc_page(sk)))
880                                                 goto wait_for_memory;
881                                         off = 0;
882                                 }
883
884                                 if (copy > PAGE_SIZE - off)
885                                         copy = PAGE_SIZE - off;
886
887                                 /* Time to copy data. We are close to
888                                  * the end! */
889                                 err = skb_copy_to_page(sk, from, skb, page,
890                                                        off, copy);
891                                 if (err) {
892                                         /* If this page was new, give it to the
893                                          * socket so it does not get leaked.
894                                          */
895                                         if (!TCP_PAGE(sk)) {
896                                                 TCP_PAGE(sk) = page;
897                                                 TCP_OFF(sk) = 0;
898                                         }
899                                         goto do_error;
900                                 }
901
902                                 /* Update the skb. */
903                                 if (merge) {
904                                         skb_shinfo(skb)->frags[i - 1].size +=
905                                                                         copy;
906                                 } else {
907                                         skb_fill_page_desc(skb, i, page, off, copy);
908                                         if (TCP_PAGE(sk)) {
909                                                 get_page(page);
910                                         } else if (off + copy < PAGE_SIZE) {
911                                                 get_page(page);
912                                                 TCP_PAGE(sk) = page;
913                                         }
914                                 }
915
916                                 TCP_OFF(sk) = off + copy;
917                         }
918
919                         if (!copied)
920                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
921
922                         tp->write_seq += copy;
923                         TCP_SKB_CB(skb)->end_seq += copy;
924                         skb_shinfo(skb)->tso_segs = 0;
925
926                         from += copy;
927                         copied += copy;
928                         if ((seglen -= copy) == 0 && iovlen == 0)
929                                 goto out;
930
931                         if (skb->len != mss_now || (flags & MSG_OOB))
932                                 continue;
933
934                         if (forced_push(tp)) {
935                                 tcp_mark_push(tp, skb);
936                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
937                         } else if (skb == sk->sk_send_head)
938                                 tcp_push_one(sk, mss_now);
939                         continue;
940
941 wait_for_sndbuf:
942                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
943 wait_for_memory:
944                         if (copied)
945                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
946
947                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
948                                 goto do_error;
949
950                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
951                 }
952         }
953
954 out:
955         if (copied)
956                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
957         TCP_CHECK_TIMER(sk);
958         release_sock(sk);
959         return copied;
960
961 do_fault:
962         if (!skb->len) {
963                 if (sk->sk_send_head == skb)
964                         sk->sk_send_head = NULL;
965                 __skb_unlink(skb, skb->list);
966                 sk_stream_free_skb(sk, skb);
967         }
968
969 do_error:
970         if (copied)
971                 goto out;
972 out_err:
973         err = sk_stream_error(sk, flags, err);
974         TCP_CHECK_TIMER(sk);
975         release_sock(sk);
976         return err;
977 }
978
979 /*
980  *      Handle reading urgent data. BSD has very simple semantics for
981  *      this, no blocking and very strange errors 8)
982  */
983
984 static int tcp_recv_urg(struct sock *sk, long timeo,
985                         struct msghdr *msg, int len, int flags,
986                         int *addr_len)
987 {
988         struct tcp_sock *tp = tcp_sk(sk);
989
990         /* No URG data to read. */
991         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
992             tp->urg_data == TCP_URG_READ)
993                 return -EINVAL; /* Yes this is right ! */
994
995         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
996                 return -ENOTCONN;
997
998         if (tp->urg_data & TCP_URG_VALID) {
999                 int err = 0;
1000                 char c = tp->urg_data;
1001
1002                 if (!(flags & MSG_PEEK))
1003                         tp->urg_data = TCP_URG_READ;
1004
1005                 /* Read urgent data. */
1006                 msg->msg_flags |= MSG_OOB;
1007
1008                 if (len > 0) {
1009                         if (!(flags & MSG_TRUNC))
1010                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1011                         len = 1;
1012                 } else
1013                         msg->msg_flags |= MSG_TRUNC;
1014
1015                 return err ? -EFAULT : len;
1016         }
1017
1018         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1019                 return 0;
1020
1021         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1022          * the available implementations agree in this case:
1023          * this call should never block, independent of the
1024          * blocking state of the socket.
1025          * Mike <pall@rz.uni-karlsruhe.de>
1026          */
1027         return -EAGAIN;
1028 }
1029
1030 /* Clean up the receive buffer for full frames taken by the user,
1031  * then send an ACK if necessary.  COPIED is the number of bytes
1032  * tcp_recvmsg has given to the user so far, it speeds up the
1033  * calculation of whether or not we must ACK for the sake of
1034  * a window update.
1035  */
1036 static void cleanup_rbuf(struct sock *sk, int copied)
1037 {
1038         struct tcp_sock *tp = tcp_sk(sk);
1039         int time_to_ack = 0;
1040
1041 #if TCP_DEBUG
1042         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1043
1044         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1045 #endif
1046
1047         if (tcp_ack_scheduled(tp)) {
1048                    /* Delayed ACKs frequently hit locked sockets during bulk
1049                     * receive. */
1050                 if (tp->ack.blocked ||
1051                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1052                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1053                     /*
 1054                      * If this read emptied the read buffer, we send an ACK when
 1055                      * the connection is not bidirectional, the user drained the
 1056                      * receive buffer and there was a small segment
 1057                      * in the queue.
1058                      */
1059                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1060                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1061                         time_to_ack = 1;
1062         }
1063
1064         /* We send an ACK if we can now advertise a non-zero window
1065          * which has been raised "significantly".
1066          *
 1067          * Even if the window is raised up to infinity, do not send a window
 1068          * open ACK in states where we will not receive more data. It is useless.
1069          */
1070         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1071                 __u32 rcv_window_now = tcp_receive_window(tp);
1072
1073                 /* Optimize, __tcp_select_window() is not cheap. */
1074                 if (2*rcv_window_now <= tp->window_clamp) {
1075                         __u32 new_window = __tcp_select_window(sk);
1076
1077                         /* Send ACK now, if this read freed lots of space
 1078                          * in our buffer. Certainly, new_window is the new window.
 1079                          * We can advertise it now, if it is not less than the current one.
1080                          * "Lots" means "at least twice" here.
1081                          */
1082                         if (new_window && new_window >= 2 * rcv_window_now)
1083                                 time_to_ack = 1;
1084                 }
1085         }
1086         if (time_to_ack)
1087                 tcp_send_ack(sk);
1088 }
1089
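/* Feed the skbs accumulated on the prequeue through the normal receive
 * path (sk_backlog_rcv) and reset the prequeue memory counter.
 */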
1090 static void tcp_prequeue_process(struct sock *sk)
1091 {
1092         struct sk_buff *skb;
1093         struct tcp_sock *tp = tcp_sk(sk);
1094
1095         NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1096
 1097         /* The RX process wants to run with BHs disabled, though it is not
1098          * necessary */
1099         local_bh_disable();
1100         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1101                 sk->sk_backlog_rcv(sk, skb);
1102         local_bh_enable();
1103
1104         /* Clear memory counter. */
1105         tp->ucopy.memory = 0;
1106 }
1107
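/* Find the skb in the receive queue that covers sequence number 'seq'
 * and return the offset of 'seq' within it.
 */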
1108 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1109 {
1110         struct sk_buff *skb;
1111         u32 offset;
1112
1113         skb_queue_walk(&sk->sk_receive_queue, skb) {
1114                 offset = seq - TCP_SKB_CB(skb)->seq;
1115                 if (skb->h.th->syn)
1116                         offset--;
1117                 if (offset < skb->len || skb->h.th->fin) {
1118                         *off = offset;
1119                         return skb;
1120                 }
1121         }
1122         return NULL;
1123 }
1124
1125 /*
1126  * This routine provides an alternative to tcp_recvmsg() for routines
1127  * that would like to handle copying from skbuffs directly in 'sendfile'
1128  * fashion.
1129  * Note:
1130  *      - It is assumed that the socket was locked by the caller.
1131  *      - The routine does not block.
1132  *      - At present, there is no support for reading OOB data
1133  *        or for 'peeking' the socket using this routine
1134  *        (although both would be easy to implement).
1135  */
1136 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1137                   sk_read_actor_t recv_actor)
1138 {
1139         struct sk_buff *skb;
1140         struct tcp_sock *tp = tcp_sk(sk);
1141         u32 seq = tp->copied_seq;
1142         u32 offset;
1143         int copied = 0;
1144
1145         if (sk->sk_state == TCP_LISTEN)
1146                 return -ENOTCONN;
1147         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1148                 if (offset < skb->len) {
1149                         size_t used, len;
1150
1151                         len = skb->len - offset;
1152                         /* Stop reading if we hit a patch of urgent data */
1153                         if (tp->urg_data) {
1154                                 u32 urg_offset = tp->urg_seq - seq;
1155                                 if (urg_offset < len)
1156                                         len = urg_offset;
1157                                 if (!len)
1158                                         break;
1159                         }
1160                         used = recv_actor(desc, skb, offset, len);
1161                         if (used <= len) {
1162                                 seq += used;
1163                                 copied += used;
1164                                 offset += used;
1165                         }
1166                         if (offset != skb->len)
1167                                 break;
1168                 }
1169                 if (skb->h.th->fin) {
1170                         sk_eat_skb(sk, skb);
1171                         ++seq;
1172                         break;
1173                 }
1174                 sk_eat_skb(sk, skb);
1175                 if (!desc->count)
1176                         break;
1177         }
1178         tp->copied_seq = seq;
1179
1180         tcp_rcv_space_adjust(sk);
1181
1182         /* Clean up data we have read: This will do ACK frames. */
1183         if (copied)
1184                 cleanup_rbuf(sk, copied);
1185         return copied;
1186 }
1187
1188 /*
1189  *      This routine copies from a sock struct into the user buffer.
1190  *
 1191  *      Technical note: in 2.3 we work on a _locked_ socket, so that
 1192  *      tricks with *seq access order and skb->users are not required.
 1193  *      Probably, the code can easily be improved even more.
1194  */
1195
1196 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1197                 size_t len, int nonblock, int flags, int *addr_len)
1198 {
1199         struct tcp_sock *tp = tcp_sk(sk);
1200         int copied = 0;
1201         u32 peek_seq;
1202         u32 *seq;
1203         unsigned long used;
1204         int err;
1205         int target;             /* Read at least this many bytes */
1206         long timeo;
1207         struct task_struct *user_recv = NULL;
1208
1209         lock_sock(sk);
1210
1211         TCP_CHECK_TIMER(sk);
1212
1213         err = -ENOTCONN;
1214         if (sk->sk_state == TCP_LISTEN)
1215                 goto out;
1216
1217         timeo = sock_rcvtimeo(sk, nonblock);
1218
1219         /* Urgent data needs to be handled specially. */
1220         if (flags & MSG_OOB)
1221                 goto recv_urg;
1222
1223         seq = &tp->copied_seq;
1224         if (flags & MSG_PEEK) {
1225                 peek_seq = tp->copied_seq;
1226                 seq = &peek_seq;
1227         }
1228
1229         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1230
1231         do {
1232                 struct sk_buff *skb;
1233                 u32 offset;
1234
1235                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1236                 if (tp->urg_data && tp->urg_seq == *seq) {
1237                         if (copied)
1238                                 break;
1239                         if (signal_pending(current)) {
1240                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1241                                 break;
1242                         }
1243                 }
1244
1245                 /* Next get a buffer. */
1246
1247                 skb = skb_peek(&sk->sk_receive_queue);
1248                 do {
1249                         if (!skb)
1250                                 break;
1251
1252                         /* Now that we have two receive queues this
1253                          * shouldn't happen.
1254                          */
1255                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1256                                 printk(KERN_INFO "recvmsg bug: copied %X "
1257                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1258                                 break;
1259                         }
1260                         offset = *seq - TCP_SKB_CB(skb)->seq;
1261                         if (skb->h.th->syn)
1262                                 offset--;
1263                         if (offset < skb->len)
1264                                 goto found_ok_skb;
1265                         if (skb->h.th->fin)
1266                                 goto found_fin_ok;
1267                         BUG_TRAP(flags & MSG_PEEK);
1268                         skb = skb->next;
1269                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1270
1271                 /* Well, if we have backlog, try to process it now. */
1272
1273                 if (copied >= target && !sk->sk_backlog.tail)
1274                         break;
1275
1276                 if (copied) {
1277                         if (sk->sk_err ||
1278                             sk->sk_state == TCP_CLOSE ||
1279                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1280                             !timeo ||
1281                             signal_pending(current) ||
1282                             (flags & MSG_PEEK))
1283                                 break;
1284                 } else {
1285                         if (sock_flag(sk, SOCK_DONE))
1286                                 break;
1287
1288                         if (sk->sk_err) {
1289                                 copied = sock_error(sk);
1290                                 break;
1291                         }
1292
1293                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1294                                 break;
1295
1296                         if (sk->sk_state == TCP_CLOSE) {
1297                                 if (!sock_flag(sk, SOCK_DONE)) {
1298                                         /* This occurs when the user tries to read
1299                                          * from a socket that was never connected.
1300                                          */
1301                                         copied = -ENOTCONN;
1302                                         break;
1303                                 }
1304                                 break;
1305                         }
1306
1307                         if (!timeo) {
1308                                 copied = -EAGAIN;
1309                                 break;
1310                         }
1311
1312                         if (signal_pending(current)) {
1313                                 copied = sock_intr_errno(timeo);
1314                                 break;
1315                         }
1316                 }
1317
1318                 cleanup_rbuf(sk, copied);
1319
1320                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1321                         /* Install new reader */
1322                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1323                                 user_recv = current;
1324                                 tp->ucopy.task = user_recv;
1325                                 tp->ucopy.iov = msg->msg_iov;
1326                         }
1327
1328                         tp->ucopy.len = len;
1329
1330                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1331                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1332
1333                         /* Ugly... If prequeue is not empty, we have to
1334                          * process it before releasing socket, otherwise
1335                          * order will be broken at second iteration.
1336                          * More elegant solution is required!!!
1337                          *
1338                          * Look: we have the following (pseudo)queues:
1339                          *
1340                          * 1. packets in flight
1341                          * 2. backlog
1342                          * 3. prequeue
1343                          * 4. receive_queue
1344                          *
1345                          * Each queue can be processed only if the next ones
1346                          * are empty. At this point we have empty receive_queue.
1347                          * But the prequeue _can_ be non-empty after the 2nd iteration,
1348                          * when we jumped to the start of the loop because backlog
1349                          * processing added something to the receive_queue.
1350                          * We cannot release_sock(), because the backlog contains
1351                          * packets that arrived _after_ the prequeued ones.
1352                          *
1353                          * In short, the algorithm is clear: process all the
1354                          * queues in order. We could do it more directly by
1355                          * requeueing packets from the backlog to the prequeue when
1356                          * the latter is not empty. That would be more elegant, but
1357                          * unfortunately it eats cycles.
1358                          */
1359                         if (skb_queue_len(&tp->ucopy.prequeue))
1360                                 goto do_prequeue;
1361
1362                         /* __ Set realtime policy in scheduler __ */
1363                 }
1364
1365                 if (copied >= target) {
1366                         /* Do not sleep, just process backlog. */
1367                         release_sock(sk);
1368                         lock_sock(sk);
1369                 } else
1370                         sk_wait_data(sk, &timeo);
1371
1372                 if (user_recv) {
1373                         int chunk;
1374
1375                         /* __ Restore normal policy in scheduler __ */
1376
1377                         if ((chunk = len - tp->ucopy.len) != 0) {
1378                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1379                                 len -= chunk;
1380                                 copied += chunk;
1381                         }
1382
1383                         if (tp->rcv_nxt == tp->copied_seq &&
1384                             skb_queue_len(&tp->ucopy.prequeue)) {
1385 do_prequeue:
1386                                 tcp_prequeue_process(sk);
1387
1388                                 if ((chunk = len - tp->ucopy.len) != 0) {
1389                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1390                                         len -= chunk;
1391                                         copied += chunk;
1392                                 }
1393                         }
1394                 }
1395                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1396                         if (net_ratelimit())
1397                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1398                                        current->comm, current->pid);
1399                         peek_seq = tp->copied_seq;
1400                 }
1401                 continue;
1402
1403         found_ok_skb:
1404                 /* Ok so how much can we use? */
1405                 used = skb->len - offset;
1406                 if (len < used)
1407                         used = len;
1408
1409                 /* Do we have urgent data here? */
1410                 if (tp->urg_data) {
1411                         u32 urg_offset = tp->urg_seq - *seq;
1412                         if (urg_offset < used) {
1413                                 if (!urg_offset) {
1414                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1415                                                 ++*seq;
1416                                                 offset++;
1417                                                 used--;
1418                                                 if (!used)
1419                                                         goto skip_copy;
1420                                         }
1421                                 } else
1422                                         used = urg_offset;
1423                         }
1424                 }
1425
1426                 if (!(flags & MSG_TRUNC)) {
1427                         err = skb_copy_datagram_iovec(skb, offset,
1428                                                       msg->msg_iov, used);
1429                         if (err) {
1430                                 /* Exception. Bailout! */
1431                                 if (!copied)
1432                                         copied = -EFAULT;
1433                                 break;
1434                         }
1435                 }
1436
1437                 *seq += used;
1438                 copied += used;
1439                 len -= used;
1440
1441                 tcp_rcv_space_adjust(sk);
1442
1443 skip_copy:
1444                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1445                         tp->urg_data = 0;
1446                         tcp_fast_path_check(sk, tp);
1447                 }
1448                 if (used + offset < skb->len)
1449                         continue;
1450
1451                 if (skb->h.th->fin)
1452                         goto found_fin_ok;
1453                 if (!(flags & MSG_PEEK))
1454                         sk_eat_skb(sk, skb);
1455                 continue;
1456
1457         found_fin_ok:
1458                 /* Process the FIN. */
1459                 ++*seq;
1460                 if (!(flags & MSG_PEEK))
1461                         sk_eat_skb(sk, skb);
1462                 break;
1463         } while (len > 0);
1464
1465         if (user_recv) {
1466                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1467                         int chunk;
1468
1469                         tp->ucopy.len = copied > 0 ? len : 0;
1470
1471                         tcp_prequeue_process(sk);
1472
1473                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1474                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1475                                 len -= chunk;
1476                                 copied += chunk;
1477                         }
1478                 }
1479
1480                 tp->ucopy.task = NULL;
1481                 tp->ucopy.len = 0;
1482         }
1483
1484         /* According to UNIX98, msg_name/msg_namelen are ignored
1485          * on a connected socket. I was just happy when I found this 8) --ANK
1486          */
1487
1488         /* Clean up data we have read: This will do ACK frames. */
1489         cleanup_rbuf(sk, copied);
1490
1491         TCP_CHECK_TIMER(sk);
1492         release_sock(sk);
1493         return copied;
1494
1495 out:
1496         TCP_CHECK_TIMER(sk);
1497         release_sock(sk);
1498         return err;
1499
1500 recv_urg:
1501         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1502         goto out;
1503 }
1504
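/*
 * Illustrative userspace sketch (not part of this file): the MSG_PEEK and
 * MSG_WAITALL paths handled by tcp_recvmsg() above, as seen from an
 * application.  "fd" is assumed to be a connected TCP socket.
 */
#if 0
#include <sys/socket.h>
#include <stdio.h>

static void peek_then_read(int fd)
{
        char buf[128];
        ssize_t n;

        /* Look at pending data without consuming it (copied_seq is not advanced). */
        n = recv(fd, buf, sizeof(buf), MSG_PEEK);
        if (n > 0)
                printf("peeked %zd bytes\n", n);

        /* Block until the full buffer is filled, EOF, or an error. */
        n = recv(fd, buf, sizeof(buf), MSG_WAITALL);
        if (n > 0)
                printf("read %zd bytes\n", n);
}
#endif
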
1505 /*
1506  *      State processing on a close. This implements the state shift for
1507  *      sending our FIN frame. Note that we only send a FIN for some
1508  *      states. A shutdown() may have already sent the FIN, or we may be
1509  *      closed.
1510  */
1511
1512 static unsigned char new_state[16] = {
1513   /* current state:        new state:      action:      */
1514   /* (Invalid)          */ TCP_CLOSE,
1515   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1516   /* TCP_SYN_SENT       */ TCP_CLOSE,
1517   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1518   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1519   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1520   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1521   /* TCP_CLOSE          */ TCP_CLOSE,
1522   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1523   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1524   /* TCP_LISTEN         */ TCP_CLOSE,
1525   /* TCP_CLOSING        */ TCP_CLOSING,
1526 };
1527
1528 static int tcp_close_state(struct sock *sk)
1529 {
1530         int next = (int)new_state[sk->sk_state];
1531         int ns = next & TCP_STATE_MASK;
1532
1533         tcp_set_state(sk, ns);
1534
1535         return next & TCP_ACTION_FIN;
1536 }
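/*
 * For example, new_state[TCP_ESTABLISHED] above is
 * (TCP_FIN_WAIT1 | TCP_ACTION_FIN): tcp_close_state() moves the socket to
 * FIN_WAIT1 and reports that a FIN still has to be sent, whereas
 * new_state[TCP_LAST_ACK] is plain TCP_LAST_ACK, so no further FIN is needed.
 */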
1537
1538 /*
1539  *      Shutdown the sending side of a connection. Much like close except
1540  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1541  */
1542
1543 void tcp_shutdown(struct sock *sk, int how)
1544 {
1545         /*      We need to grab some memory, and put together a FIN,
1546          *      and then put it into the queue to be sent.
1547          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1548          */
1549         if (!(how & SEND_SHUTDOWN))
1550                 return;
1551
1552         /* If we've already sent a FIN, or it's a closed state, skip this. */
1553         if ((1 << sk->sk_state) &
1554             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1555              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1556                 /* Clear out any half completed packets.  FIN if needed. */
1557                 if (tcp_close_state(sk))
1558                         tcp_send_fin(sk);
1559         }
1560 }
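/*
 * Illustrative userspace sketch (not part of this file): the SEND_SHUTDOWN
 * case above is what a shutdown(fd, SHUT_WR) half-close ends up invoking; the
 * peer sees our FIN while we keep reading.  "fd" is assumed to be a connected
 * TCP socket.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void half_close(int fd)
{
        char buf[512];

        shutdown(fd, SHUT_WR);                  /* queue a FIN, keep reading */
        while (read(fd, buf, sizeof(buf)) > 0)
                ;                               /* drain the peer's remaining data */
        close(fd);
}
#endif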
1561
1562 /*
1563  * At this point, there should be no process reference to this
1564  * socket, and thus no user references at all.  Therefore we
1565  * can assume the socket waitqueue is inactive and nobody will
1566  * try to jump onto it.
1567  */
1568 void tcp_destroy_sock(struct sock *sk)
1569 {
1570         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1571         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1572
1573         /* It cannot be in hash table! */
1574         BUG_TRAP(sk_unhashed(sk));
1575
1576         /* If inet_sk(sk)->num is non-zero, the socket must be bound */
1577         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1578
1579         sk->sk_prot->destroy(sk);
1580
1581         sk_stream_kill_queues(sk);
1582
1583         xfrm_sk_free_policy(sk);
1584
1585 #ifdef INET_REFCNT_DEBUG
1586         if (atomic_read(&sk->sk_refcnt) != 1) {
1587                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1588                        sk, atomic_read(&sk->sk_refcnt));
1589         }
1590 #endif
1591
1592         atomic_dec(&tcp_orphan_count);
1593         sock_put(sk);
1594 }
1595
1596 void tcp_close(struct sock *sk, long timeout)
1597 {
1598         struct sk_buff *skb;
1599         int data_was_unread = 0;
1600
1601         lock_sock(sk);
1602         sk->sk_shutdown = SHUTDOWN_MASK;
1603
1604         if (sk->sk_state == TCP_LISTEN) {
1605                 tcp_set_state(sk, TCP_CLOSE);
1606
1607                 /* Special case. */
1608                 tcp_listen_stop(sk);
1609
1610                 goto adjudge_to_death;
1611         }
1612
1613         /*  We need to flush the receive buffers.  We do this only on the
1614          *  descriptor close, not on protocol-sourced closes, because the
1615          *  reader process may not have drained the data yet!
1616          */
1617         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1618                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1619                           skb->h.th->fin;
1620                 data_was_unread += len;
1621                 __kfree_skb(skb);
1622         }
1623
1624         sk_stream_mem_reclaim(sk);
1625
1626         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1627          * 3.10, we send a RST here because data was lost.  To
1628          * witness the awful effects of the old behavior of always
1629          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1630          * a bulk GET in an FTP client, suspend the process, wait
1631          * for the client to advertise a zero window, then kill -9
1632          * the FTP client, wheee...  Note: timeout is always zero
1633          * in such a case.
1634          */
1635         if (data_was_unread) {
1636                 /* Unread data was tossed, zap the connection. */
1637                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1638                 tcp_set_state(sk, TCP_CLOSE);
1639                 tcp_send_active_reset(sk, GFP_KERNEL);
1640         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1641                 /* Check zero linger _after_ checking for unread data. */
1642                 sk->sk_prot->disconnect(sk, 0);
1643                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1644         } else if (tcp_close_state(sk)) {
1645                 /* We FIN if the application ate all the data before
1646                  * zapping the connection.
1647                  */
1648
1649                 /* RED-PEN. Formally speaking, we have broken TCP state
1650                  * machine. State transitions:
1651                  *
1652                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1653                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1654                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1655                  *
1656                  * are legal only when the FIN has actually been sent (i.e. is
1657                  * in the window), rather than queued out of window. Purists may object.
1658                  *
1659                  * E.g. the "RFC state" is ESTABLISHED
1660                  * if the Linux state is FIN-WAIT-1 but the FIN has not been sent yet.
1661                  *
1662                  * The visible deviations are that we sometimes enter the
1663                  * time-wait state when it is not really required (harmless),
1664                  * and that we do not send active resets when the specs require
1665                  * them (TCP_ESTABLISHED and TCP_CLOSE_WAIT, which look like
1666                  * CLOSING or LAST_ACK to Linux).
1667                  * I have probably missed a few more small holes.
1668                  *                                              --ANK
1669                  */
1670                 tcp_send_fin(sk);
1671         }
1672
1673         sk_stream_wait_close(sk, timeout);
1674
1675 adjudge_to_death:
1676         /* It is the last release_sock in its life. It will remove backlog. */
1677         release_sock(sk);
1678
1679
1680         /* Now socket is owned by kernel and we acquire BH lock
1681            to finish close. No need to check for user refs.
1682          */
1683         local_bh_disable();
1684         bh_lock_sock(sk);
1685         BUG_TRAP(!sock_owned_by_user(sk));
1686
1687         sock_hold(sk);
1688         sock_orphan(sk);
1689
1690         /*      This is a (useful) BSD violation of the RFC. There is a
1691          *      problem with TCP as specified, in that the other end could
1692          *      keep a socket open forever with no application left at this end.
1693          *      We use a 3 minute timeout (about the same as BSD) and then kill
1694          *      our end. If they send after that then tough - BUT: long enough
1695          *      that we won't repeat the old "4*rto = almost no time - whoops
1696          *      reset" mistake.
1697          *
1698          *      Nope, it was not a mistake. It really is the desired behaviour,
1699          *      e.g. on HTTP servers, where such sockets are useless but
1700          *      consume significant resources. Let's do it with the special
1701          *      linger2 option.                                 --ANK
1702          */
1703
1704         if (sk->sk_state == TCP_FIN_WAIT2) {
1705                 struct tcp_sock *tp = tcp_sk(sk);
1706                 if (tp->linger2 < 0) {
1707                         tcp_set_state(sk, TCP_CLOSE);
1708                         tcp_send_active_reset(sk, GFP_ATOMIC);
1709                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1710                 } else {
1711                         int tmo = tcp_fin_time(tp);
1712
1713                         if (tmo > TCP_TIMEWAIT_LEN) {
1714                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1715                         } else {
1716                                 atomic_inc(&tcp_orphan_count);
1717                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1718                                 goto out;
1719                         }
1720                 }
1721         }
1722         if (sk->sk_state != TCP_CLOSE) {
1723                 sk_stream_mem_reclaim(sk);
1724                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1725                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1726                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1727                         if (net_ratelimit())
1728                                 printk(KERN_INFO "TCP: too many orphaned "
1729                                        "sockets\n");
1730                         tcp_set_state(sk, TCP_CLOSE);
1731                         tcp_send_active_reset(sk, GFP_ATOMIC);
1732                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1733                 }
1734         }
1735         atomic_inc(&tcp_orphan_count);
1736
1737         if (sk->sk_state == TCP_CLOSE)
1738                 tcp_destroy_sock(sk);
1739         /* Otherwise, socket is reprieved until protocol close. */
1740
1741 out:
1742         bh_unlock_sock(sk);
1743         local_bh_enable();
1744         sock_put(sk);
1745 }
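/*
 * Illustrative userspace sketch (not part of this file): the zero-linger
 * branch above (SOCK_LINGER set and sk_lingertime == 0) is the abortive close
 * an application requests with SO_LINGER; the connection is reset instead of
 * going through FIN/TIME-WAIT.  "fd" is assumed to be a connected TCP socket.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void abortive_close(int fd)
{
        struct linger lin = { .l_onoff = 1, .l_linger = 0 };

        setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
        close(fd);              /* sends an RST rather than a FIN */
}
#endif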
1746
1747 /* These states need RST on ABORT according to RFC793 */
1748
1749 static inline int tcp_need_reset(int state)
1750 {
1751         return (1 << state) &
1752                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1753                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1754 }
1755
1756 int tcp_disconnect(struct sock *sk, int flags)
1757 {
1758         struct inet_sock *inet = inet_sk(sk);
1759         struct tcp_sock *tp = tcp_sk(sk);
1760         int err = 0;
1761         int old_state = sk->sk_state;
1762
1763         if (old_state != TCP_CLOSE)
1764                 tcp_set_state(sk, TCP_CLOSE);
1765
1766         /* ABORT function of RFC793 */
1767         if (old_state == TCP_LISTEN) {
1768                 tcp_listen_stop(sk);
1769         } else if (tcp_need_reset(old_state) ||
1770                    (tp->snd_nxt != tp->write_seq &&
1771                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1772                 /* The last check adjusts for the discrepancy between Linux and
1773                  * the RFC states.
1774                  */
1775                 tcp_send_active_reset(sk, gfp_any());
1776                 sk->sk_err = ECONNRESET;
1777         } else if (old_state == TCP_SYN_SENT)
1778                 sk->sk_err = ECONNRESET;
1779
1780         tcp_clear_xmit_timers(sk);
1781         __skb_queue_purge(&sk->sk_receive_queue);
1782         sk_stream_writequeue_purge(sk);
1783         __skb_queue_purge(&tp->out_of_order_queue);
1784
1785         inet->dport = 0;
1786
1787         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1788                 inet_reset_saddr(sk);
1789
1790         sk->sk_shutdown = 0;
1791         sock_reset_flag(sk, SOCK_DONE);
1792         tp->srtt = 0;
1793         if ((tp->write_seq += tp->max_window + 2) == 0)
1794                 tp->write_seq = 1;
1795         tp->backoff = 0;
1796         tp->snd_cwnd = 2;
1797         tp->probes_out = 0;
1798         tp->packets_out = 0;
1799         tp->snd_ssthresh = 0x7fffffff;
1800         tp->snd_cwnd_cnt = 0;
1801         tcp_set_ca_state(tp, TCP_CA_Open);
1802         tcp_clear_retrans(tp);
1803         tcp_delack_init(tp);
1804         sk->sk_send_head = NULL;
1805         tp->rx_opt.saw_tstamp = 0;
1806         tcp_sack_reset(&tp->rx_opt);
1807         __sk_dst_reset(sk);
1808
1809         BUG_TRAP(!inet->num || tp->bind_hash);
1810
1811         sk->sk_error_report(sk);
1812         return err;
1813 }
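/*
 * Illustrative userspace note (an assumption about one common calling path,
 * not part of this file): tcp_disconnect() is typically reached when an
 * application re-connects a TCP socket with the address family set to
 * AF_UNSPEC, dissolving the association.
 */
#if 0
#include <sys/socket.h>
#include <string.h>

static int dissolve(int fd)
{
        struct sockaddr sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_family = AF_UNSPEC;
        return connect(fd, &sa, sizeof(sa));
}
#endif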
1814
1815 /*
1816  *      Wait for an incoming connection, avoid race
1817  *      conditions. This must be called with the socket locked.
1818  */
1819 static int wait_for_connect(struct sock *sk, long timeo)
1820 {
1821         struct tcp_sock *tp = tcp_sk(sk);
1822         DEFINE_WAIT(wait);
1823         int err;
1824
1825         /*
1826          * True wake-one mechanism for incoming connections: only
1827          * one process gets woken up, not the 'whole herd'.
1828          * Since we do not 'race & poll' for established sockets
1829          * anymore, the common case will execute the loop only once.
1830          *
1831          * Subtle issue: "add_wait_queue_exclusive()" will be added
1832          * after any current non-exclusive waiters, and we know that
1833          * it will always _stay_ after any new non-exclusive waiters
1834          * because all non-exclusive waiters are added at the
1835          * beginning of the wait-queue. As such, it's ok to "drop"
1836          * our exclusiveness temporarily when we get woken up without
1837          * having to remove and re-insert us on the wait queue.
1838          */
1839         for (;;) {
1840                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1841                                           TASK_INTERRUPTIBLE);
1842                 release_sock(sk);
1843                 if (reqsk_queue_empty(&tp->accept_queue))
1844                         timeo = schedule_timeout(timeo);
1845                 lock_sock(sk);
1846                 err = 0;
1847                 if (!reqsk_queue_empty(&tp->accept_queue))
1848                         break;
1849                 err = -EINVAL;
1850                 if (sk->sk_state != TCP_LISTEN)
1851                         break;
1852                 err = sock_intr_errno(timeo);
1853                 if (signal_pending(current))
1854                         break;
1855                 err = -EAGAIN;
1856                 if (!timeo)
1857                         break;
1858         }
1859         finish_wait(sk->sk_sleep, &wait);
1860         return err;
1861 }
1862
1863 /*
1864  *      This will accept the next outstanding connection.
1865  */
1866
1867 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1868 {
1869         struct tcp_sock *tp = tcp_sk(sk);
1870         struct sock *newsk;
1871         int error;
1872
1873         lock_sock(sk);
1874
1875         /* We need to make sure that this socket is listening,
1876          * and that it has something pending.
1877          */
1878         error = -EINVAL;
1879         if (sk->sk_state != TCP_LISTEN)
1880                 goto out_err;
1881
1882         /* Find already established connection */
1883         if (reqsk_queue_empty(&tp->accept_queue)) {
1884                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1885
1886                 /* If this is a non-blocking socket, don't sleep. */
1887                 error = -EAGAIN;
1888                 if (!timeo)
1889                         goto out_err;
1890
1891                 error = wait_for_connect(sk, timeo);
1892                 if (error)
1893                         goto out_err;
1894         }
1895
1896         newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1897         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1898 out:
1899         release_sock(sk);
1900         return newsk;
1901 out_err:
1902         newsk = NULL;
1903         *err = error;
1904         goto out;
1905 }
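/*
 * Illustrative userspace sketch (not part of this file): the -EAGAIN path in
 * tcp_accept() above is what a non-blocking listener observes while the
 * accept queue is empty.  "lfd" is assumed to be a listening TCP socket.
 */
#if 0
#include <sys/socket.h>
#include <fcntl.h>
#include <errno.h>

static int try_accept(int lfd)
{
        int cfd;

        fcntl(lfd, F_SETFL, fcntl(lfd, F_GETFL, 0) | O_NONBLOCK);
        cfd = accept(lfd, NULL, NULL);
        if (cfd < 0 && errno == EAGAIN)
                return -1;              /* nothing queued yet; retry later */
        return cfd;
}
#endif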
1906
1907 /*
1908  *      Socket option code for TCP.
1909  */
1910 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1911                    int optlen)
1912 {
1913         struct tcp_sock *tp = tcp_sk(sk);
1914         int val;
1915         int err = 0;
1916
1917         if (level != SOL_TCP)
1918                 return tp->af_specific->setsockopt(sk, level, optname,
1919                                                    optval, optlen);
1920
1921         /* This is a string value; all the others are ints. */
1922         if (optname == TCP_CONGESTION) {
1923                 char name[TCP_CA_NAME_MAX];
1924
1925                 if (optlen < 1)
1926                         return -EINVAL;
1927
1928                 val = strncpy_from_user(name, optval,
1929                                         min(TCP_CA_NAME_MAX-1, optlen));
1930                 if (val < 0)
1931                         return -EFAULT;
1932                 name[val] = 0;
1933
1934                 lock_sock(sk);
1935                 err = tcp_set_congestion_control(tp, name);
1936                 release_sock(sk);
1937                 return err;
1938         }
1939
1940         if (optlen < sizeof(int))
1941                 return -EINVAL;
1942
1943         if (get_user(val, (int __user *)optval))
1944                 return -EFAULT;
1945
1946         lock_sock(sk);
1947
1948         switch (optname) {
1949         case TCP_MAXSEG:
1950                 /* Values greater than the interface MTU won't take effect. However,
1951                  * at the point when this call is made we typically don't yet
1952                  * know which interface is going to be used. */
1953                 if (val < 8 || val > MAX_TCP_WINDOW) {
1954                         err = -EINVAL;
1955                         break;
1956                 }
1957                 tp->rx_opt.user_mss = val;
1958                 break;
1959
1960         case TCP_NODELAY:
1961                 if (val) {
1962                         /* TCP_NODELAY is weaker than TCP_CORK, so setting
1963                          * this option on a corked socket is remembered, but
1964                          * it is not activated until the cork is cleared.
1965                          *
1966                          * However, when TCP_NODELAY is set we make
1967                          * an explicit push, which overrides even TCP_CORK
1968                          * for currently queued segments.
1969                          */
1970                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1971                         tcp_push_pending_frames(sk, tp);
1972                 } else {
1973                         tp->nonagle &= ~TCP_NAGLE_OFF;
1974                 }
1975                 break;
1976
1977         case TCP_CORK:
1978                 /* When set, this indicates that non-full frames should always
1979                  * be queued.  Later the user clears this option and we transmit
1980                  * any pending partial frames in the queue.  This is
1981                  * meant to be used alongside sendfile() to get properly
1982                  * filled frames when the user (for example) must write
1983                  * out headers with a write() call first and then use
1984                  * sendfile to send out the data parts.
1985                  *
1986                  * TCP_CORK can be set together with TCP_NODELAY and it is
1987                  * stronger than TCP_NODELAY.
1988                  */
1989                 if (val) {
1990                         tp->nonagle |= TCP_NAGLE_CORK;
1991                 } else {
1992                         tp->nonagle &= ~TCP_NAGLE_CORK;
1993                         if (tp->nonagle&TCP_NAGLE_OFF)
1994                                 tp->nonagle |= TCP_NAGLE_PUSH;
1995                         tcp_push_pending_frames(sk, tp);
1996                 }
1997                 break;
1998
1999         case TCP_KEEPIDLE:
2000                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2001                         err = -EINVAL;
2002                 else {
2003                         tp->keepalive_time = val * HZ;
2004                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2005                             !((1 << sk->sk_state) &
2006                               (TCPF_CLOSE | TCPF_LISTEN))) {
2007                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2008                                 if (tp->keepalive_time > elapsed)
2009                                         elapsed = tp->keepalive_time - elapsed;
2010                                 else
2011                                         elapsed = 0;
2012                                 tcp_reset_keepalive_timer(sk, elapsed);
2013                         }
2014                 }
2015                 break;
2016         case TCP_KEEPINTVL:
2017                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2018                         err = -EINVAL;
2019                 else
2020                         tp->keepalive_intvl = val * HZ;
2021                 break;
2022         case TCP_KEEPCNT:
2023                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2024                         err = -EINVAL;
2025                 else
2026                         tp->keepalive_probes = val;
2027                 break;
2028         case TCP_SYNCNT:
2029                 if (val < 1 || val > MAX_TCP_SYNCNT)
2030                         err = -EINVAL;
2031                 else
2032                         tp->syn_retries = val;
2033                 break;
2034
2035         case TCP_LINGER2:
2036                 if (val < 0)
2037                         tp->linger2 = -1;
2038                 else if (val > sysctl_tcp_fin_timeout / HZ)
2039                         tp->linger2 = 0;
2040                 else
2041                         tp->linger2 = val * HZ;
2042                 break;
2043
2044         case TCP_DEFER_ACCEPT:
2045                 tp->defer_accept = 0;
2046                 if (val > 0) {
2047                         /* Translate the value in seconds into a number of
2048                          * retransmits. */
2049                         while (tp->defer_accept < 32 &&
2050                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2051                                        tp->defer_accept))
2052                                 tp->defer_accept++;
2053                         tp->defer_accept++;
2054                 }
2055                 break;
2056
2057         case TCP_WINDOW_CLAMP:
2058                 if (!val) {
2059                         if (sk->sk_state != TCP_CLOSE) {
2060                                 err = -EINVAL;
2061                                 break;
2062                         }
2063                         tp->window_clamp = 0;
2064                 } else
2065                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2066                                                 SOCK_MIN_RCVBUF / 2 : val;
2067                 break;
2068
2069         case TCP_QUICKACK:
2070                 if (!val) {
2071                         tp->ack.pingpong = 1;
2072                 } else {
2073                         tp->ack.pingpong = 0;
2074                         if ((1 << sk->sk_state) &
2075                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2076                             tcp_ack_scheduled(tp)) {
2077                                 tp->ack.pending |= TCP_ACK_PUSHED;
2078                                 cleanup_rbuf(sk, 1);
2079                                 if (!(val & 1))
2080                                         tp->ack.pingpong = 1;
2081                         }
2082                 }
2083                 break;
2084
2085         default:
2086                 err = -ENOPROTOOPT;
2087                 break;
2088         };
2089         release_sock(sk);
2090         return err;
2091 }
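/*
 * Illustrative userspace sketch (not part of this file): the TCP_CORK
 * behaviour described above, used the way the comment suggests - write a
 * header, sendfile() the body, then uncork to push the final partial frame.
 * "fd" is assumed to be a connected TCP socket, "filefd"/"filelen" an open
 * file; error handling is omitted.
 */
#if 0
#include <sys/socket.h>
#include <sys/sendfile.h>
#include <netinet/tcp.h>
#include <unistd.h>

static void send_header_and_file(int fd, const char *hdr, size_t hdrlen,
                                 int filefd, size_t filelen)
{
        int one = 1, zero = 0;

        setsockopt(fd, IPPROTO_TCP, TCP_CORK, &one, sizeof(one));
        write(fd, hdr, hdrlen);                 /* queued, not pushed yet */
        sendfile(fd, filefd, NULL, filelen);    /* coalesced with the header */
        setsockopt(fd, IPPROTO_TCP, TCP_CORK, &zero, sizeof(zero));
                                                /* uncork: push the partial frame */
}
#endif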
2092
2093 /* Return information about state of tcp endpoint in API format. */
2094 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2095 {
2096         struct tcp_sock *tp = tcp_sk(sk);
2097         u32 now = tcp_time_stamp;
2098
2099         memset(info, 0, sizeof(*info));
2100
2101         info->tcpi_state = sk->sk_state;
2102         info->tcpi_ca_state = tp->ca_state;
2103         info->tcpi_retransmits = tp->retransmits;
2104         info->tcpi_probes = tp->probes_out;
2105         info->tcpi_backoff = tp->backoff;
2106
2107         if (tp->rx_opt.tstamp_ok)
2108                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2109         if (tp->rx_opt.sack_ok)
2110                 info->tcpi_options |= TCPI_OPT_SACK;
2111         if (tp->rx_opt.wscale_ok) {
2112                 info->tcpi_options |= TCPI_OPT_WSCALE;
2113                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2114                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2115         } 
2116
2117         if (tp->ecn_flags&TCP_ECN_OK)
2118                 info->tcpi_options |= TCPI_OPT_ECN;
2119
2120         info->tcpi_rto = jiffies_to_usecs(tp->rto);
2121         info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2122         info->tcpi_snd_mss = tp->mss_cache_std;
2123         info->tcpi_rcv_mss = tp->ack.rcv_mss;
2124
2125         info->tcpi_unacked = tp->packets_out;
2126         info->tcpi_sacked = tp->sacked_out;
2127         info->tcpi_lost = tp->lost_out;
2128         info->tcpi_retrans = tp->retrans_out;
2129         info->tcpi_fackets = tp->fackets_out;
2130
2131         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2132         info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2133         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2134
2135         info->tcpi_pmtu = tp->pmtu_cookie;
2136         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2137         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2138         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2139         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2140         info->tcpi_snd_cwnd = tp->snd_cwnd;
2141         info->tcpi_advmss = tp->advmss;
2142         info->tcpi_reordering = tp->reordering;
2143
2144         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2145         info->tcpi_rcv_space = tp->rcvq_space.space;
2146
2147         info->tcpi_total_retrans = tp->total_retrans;
2148 }
2149
2150 EXPORT_SYMBOL_GPL(tcp_get_info);
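/*
 * Illustrative userspace sketch (not part of this file): the structure filled
 * in by tcp_get_info() is what an application retrieves via
 * getsockopt(TCP_INFO).  "fd" is assumed to be a connected TCP socket.
 */
#if 0
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <stdio.h>

static void dump_tcp_info(int fd)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
                printf("rtt %u us, cwnd %u, total retrans %u\n",
                       info.tcpi_rtt, info.tcpi_snd_cwnd,
                       info.tcpi_total_retrans);
}
#endif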
2151
2152 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2153                    int __user *optlen)
2154 {
2155         struct tcp_sock *tp = tcp_sk(sk);
2156         int val, len;
2157
2158         if (level != SOL_TCP)
2159                 return tp->af_specific->getsockopt(sk, level, optname,
2160                                                    optval, optlen);
2161
2162         if (get_user(len, optlen))
2163                 return -EFAULT;
2164
2165         len = min_t(unsigned int, len, sizeof(int));
2166
2167         if (len < 0)
2168                 return -EINVAL;
2169
2170         switch (optname) {
2171         case TCP_MAXSEG:
2172                 val = tp->mss_cache_std;
2173                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2174                         val = tp->rx_opt.user_mss;
2175                 break;
2176         case TCP_NODELAY:
2177                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2178                 break;
2179         case TCP_CORK:
2180                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2181                 break;
2182         case TCP_KEEPIDLE:
2183                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2184                 break;
2185         case TCP_KEEPINTVL:
2186                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2187                 break;
2188         case TCP_KEEPCNT:
2189                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2190                 break;
2191         case TCP_SYNCNT:
2192                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2193                 break;
2194         case TCP_LINGER2:
2195                 val = tp->linger2;
2196                 if (val >= 0)
2197                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2198                 break;
2199         case TCP_DEFER_ACCEPT:
2200                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2201                                                (tp->defer_accept - 1));
2202                 break;
2203         case TCP_WINDOW_CLAMP:
2204                 val = tp->window_clamp;
2205                 break;
2206         case TCP_INFO: {
2207                 struct tcp_info info;
2208
2209                 if (get_user(len, optlen))
2210                         return -EFAULT;
2211
2212                 tcp_get_info(sk, &info);
2213
2214                 len = min_t(unsigned int, len, sizeof(info));
2215                 if (put_user(len, optlen))
2216                         return -EFAULT;
2217                 if (copy_to_user(optval, &info, len))
2218                         return -EFAULT;
2219                 return 0;
2220         }
2221         case TCP_QUICKACK:
2222                 val = !tp->ack.pingpong;
2223                 break;
2224
2225         case TCP_CONGESTION:
2226                 if (get_user(len, optlen))
2227                         return -EFAULT;
2228                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2229                 if (put_user(len, optlen))
2230                         return -EFAULT;
2231                 if (copy_to_user(optval, tp->ca_ops->name, len))
2232                         return -EFAULT;
2233                 return 0;
2234         default:
2235                 return -ENOPROTOOPT;
2236         };
2237
2238         if (put_user(len, optlen))
2239                 return -EFAULT;
2240         if (copy_to_user(optval, &val, len))
2241                 return -EFAULT;
2242         return 0;
2243 }
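/*
 * Illustrative userspace sketch (not part of this file, and it assumes the
 * libc headers define TCP_CONGESTION): the string-valued option handled
 * specially in tcp_setsockopt()/tcp_getsockopt() above.  "fd" is assumed to
 * be a TCP socket and the requested algorithm ("reno" here) must be available
 * in the kernel.
 */
#if 0
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <stdio.h>
#include <string.h>

static void pick_reno(int fd)
{
        char name[16] = "reno";         /* TCP_CA_NAME_MAX is 16 in the kernel */
        socklen_t len = sizeof(name);

        setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name));
        if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
                printf("congestion control: %.*s\n", (int)len, name);
}
#endif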
2244
2245
2246 extern void __skb_cb_too_small_for_tcp(int, int);
2247 extern struct tcp_congestion_ops tcp_reno;
2248
2249 static __initdata unsigned long thash_entries;
2250 static int __init set_thash_entries(char *str)
2251 {
2252         if (!str)
2253                 return 0;
2254         thash_entries = simple_strtoul(str, &str, 0);
2255         return 1;
2256 }
2257 __setup("thash_entries=", set_thash_entries);
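/*
 * For example, booting with "thash_entries=131072" on the kernel command line
 * (the value is only illustrative) overrides the automatic sizing of the
 * established hash table performed in tcp_init() below.
 */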
2258
2259 void __init tcp_init(void)
2260 {
2261         struct sk_buff *skb = NULL;
2262         int order, i;
2263
2264         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2265                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2266                                            sizeof(skb->cb));
2267
2268         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2269                                               sizeof(struct tcp_bind_bucket),
2270                                               0, SLAB_HWCACHE_ALIGN,
2271                                               NULL, NULL);
2272         if (!tcp_bucket_cachep)
2273                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2274
2275         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2276                                                 sizeof(struct tcp_tw_bucket),
2277                                                 0, SLAB_HWCACHE_ALIGN,
2278                                                 NULL, NULL);
2279         if (!tcp_timewait_cachep)
2280                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2281
2282         /* Size and allocate the main established and bind bucket
2283          * hash tables.
2284          *
2285          * The methodology is similar to that of the buffer cache.
2286          */
2287         tcp_ehash = (struct tcp_ehash_bucket *)
2288                 alloc_large_system_hash("TCP established",
2289                                         sizeof(struct tcp_ehash_bucket),
2290                                         thash_entries,
2291                                         (num_physpages >= 128 * 1024) ?
2292                                                 (25 - PAGE_SHIFT) :
2293                                                 (27 - PAGE_SHIFT),
2294                                         HASH_HIGHMEM,
2295                                         &tcp_ehash_size,
2296                                         NULL,
2297                                         0);
2298         tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2299         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2300                 rwlock_init(&tcp_ehash[i].lock);
2301                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2302         }
2303
2304         tcp_bhash = (struct tcp_bind_hashbucket *)
2305                 alloc_large_system_hash("TCP bind",
2306                                         sizeof(struct tcp_bind_hashbucket),
2307                                         tcp_ehash_size,
2308                                         (num_physpages >= 128 * 1024) ?
2309                                                 (25 - PAGE_SHIFT) :
2310                                                 (27 - PAGE_SHIFT),
2311                                         HASH_HIGHMEM,
2312                                         &tcp_bhash_size,
2313                                         NULL,
2314                                         64 * 1024);
2315         tcp_bhash_size = 1 << tcp_bhash_size;
2316         for (i = 0; i < tcp_bhash_size; i++) {
2317                 spin_lock_init(&tcp_bhash[i].lock);
2318                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2319         }
2320
2321         /* Try to be a bit smarter and adjust defaults depending
2322          * on available memory.
2323          */
2324         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2325                         (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2326                         order++)
2327                 ;
2328         if (order >= 4) {
2329                 sysctl_local_port_range[0] = 32768;
2330                 sysctl_local_port_range[1] = 61000;
2331                 sysctl_tcp_max_tw_buckets = 180000;
2332                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2333                 sysctl_max_syn_backlog = 1024;
2334         } else if (order < 3) {
2335                 sysctl_local_port_range[0] = 1024 * (3 - order);
2336                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2337                 sysctl_tcp_max_orphans >>= (3 - order);
2338                 sysctl_max_syn_backlog = 128;
2339         }
2340         tcp_port_rover = sysctl_local_port_range[0] - 1;
2341
2342         sysctl_tcp_mem[0] =  768 << order;
2343         sysctl_tcp_mem[1] = 1024 << order;
2344         sysctl_tcp_mem[2] = 1536 << order;
2345
2346         if (order < 3) {
2347                 sysctl_tcp_wmem[2] = 64 * 1024;
2348                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2349                 sysctl_tcp_rmem[1] = 43689;
2350                 sysctl_tcp_rmem[2] = 2 * 43689;
2351         }
2352
2353         printk(KERN_INFO "TCP: Hash tables configured "
2354                "(established %d bind %d)\n",
2355                tcp_ehash_size << 1, tcp_bhash_size);
2356
2357         tcp_register_congestion_control(&tcp_reno);
2358 }
2359
2360 EXPORT_SYMBOL(tcp_accept);
2361 EXPORT_SYMBOL(tcp_close);
2362 EXPORT_SYMBOL(tcp_destroy_sock);
2363 EXPORT_SYMBOL(tcp_disconnect);
2364 EXPORT_SYMBOL(tcp_getsockopt);
2365 EXPORT_SYMBOL(tcp_ioctl);
2366 EXPORT_SYMBOL(tcp_poll);
2367 EXPORT_SYMBOL(tcp_read_sock);
2368 EXPORT_SYMBOL(tcp_recvmsg);
2369 EXPORT_SYMBOL(tcp_sendmsg);
2370 EXPORT_SYMBOL(tcp_sendpage);
2371 EXPORT_SYMBOL(tcp_setsockopt);
2372 EXPORT_SYMBOL(tcp_shutdown);
2373 EXPORT_SYMBOL(tcp_statistics);
2374 EXPORT_SYMBOL(tcp_timewait_cachep);