[TCP]: Honour sk_bound_dev_if in tcp_v4_send_ack
[sfrench/cifs-2.6.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/netdma.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80
81 #include <linux/crypto.h>
82 #include <linux/scatterlist.h>
83
/*
 * Tunables.  In the code visible here only sysctl_tcp_tw_reuse is read
 * (by tcp_twsk_unique(), when deciding whether a TIME-WAIT entry may be
 * taken over by a new connection).
 */
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/*
 * Socket used for sending RSTs.  tcp_v4_send_ack() hands the same socket
 * to ip_send_reply() for ACKs that are sent outside a full socket context.
 */
static struct socket *tcp_socket __read_mostly;

/* Forward declaration; defined further down in this file. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
/* Forward declarations of the MD5 signature helpers used by the reply paths. */
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   int tcplen);
#endif

/* Hash tables passed to the bind/hash/lookup helpers used throughout this file. */
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
        .lhash_users = ATOMIC_INIT(0),
        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
109
110 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
111 {
112         return inet_csk_get_port(&tcp_hashinfo, sk, snum,
113                                  inet_csk_bind_conflict);
114 }
115
116 static void tcp_v4_hash(struct sock *sk)
117 {
118         inet_hash(&tcp_hashinfo, sk);
119 }
120
121 void tcp_unhash(struct sock *sk)
122 {
123         inet_unhash(&tcp_hashinfo, sk);
124 }
125
126 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
127 {
128         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
129                                           ip_hdr(skb)->saddr,
130                                           tcp_hdr(skb)->dest,
131                                           tcp_hdr(skb)->source);
132 }
133
134 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
135 {
136         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
137         struct tcp_sock *tp = tcp_sk(sk);
138
139         /* With PAWS, it is safe from the viewpoint
140            of data integrity. Even without PAWS it is safe provided sequence
141            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
142
143            Actually, the idea is close to VJ's one, only timestamp cache is
144            held not per host, but per port pair and TW bucket is used as state
145            holder.
146
147            If TW bucket has been already destroyed we fall back to VJ's scheme
148            and use initial timestamp retrieved from peer table.
149          */
150         if (tcptw->tw_ts_recent_stamp &&
151             (twp == NULL || (sysctl_tcp_tw_reuse &&
152                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
153                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
154                 if (tp->write_seq == 0)
155                         tp->write_seq = 1;
156                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
157                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
158                 sock_hold(sktw);
159                 return 1;
160         }
161
162         return 0;
163 }
164
165 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
166
/*
 * This will initiate an outgoing connection.
 *
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: length of @uaddr in bytes
 *
 * Returns 0 on success.  Returns -EINVAL when @addr_len is too short or a
 * source-route option is set with a zero destination, -EAFNOSUPPORT when
 * the family is not AF_INET, -ENETUNREACH for multicast/broadcast routes,
 * or the error reported by the routing, hashing or tcp_connect() steps.
 * On the late failure path the socket is put back into TCP_CLOSE.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        __be32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        /* With a source-route option the route goes to the option's faddr. */
        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk, 1);
        if (tmp < 0) {
                if (tmp == -ENETUNREACH)
                        IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
                return tmp;
        }

        /* TCP cannot connect to multicast or broadcast destinations. */
        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        /* Without source routing, take the destination the route resolved. */
        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        /* Fill in the source address from the route if none is set yet. */
        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);
                /*
                 * VJ's idea. We save last timestamp seen from
                 * the destination in peer table, when entering state
                 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
                 * when trying new connection.
                 */
                if (peer != NULL &&
                    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        /* Account for IP options in the extension header length. */
        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

        tp->rx_opt.mss_clamp = 536;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and not releasing socket
         * lock select source port, enter ourselves into the hash tables and
         * complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        /* Re-resolve the route now that the source port is known. */
        err = ip_route_newports(&rt, IPPROTO_TCP,
                                inet->sport, inet->dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);

        /* Pick an initial sequence number unless one was inherited. */
        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        /*
         * rt is cleared so the failure path below calls ip_rt_put(NULL).
         * NOTE(review): presumably the route reference now belongs to the
         * socket after sk_setup_caps() -- confirm.
         */
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}
290
291 /*
292  * This routine does path mtu discovery as defined in RFC1191.
293  */
294 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
295 {
296         struct dst_entry *dst;
297         struct inet_sock *inet = inet_sk(sk);
298
299         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
300          * send out by Linux are always <576bytes so they should go through
301          * unfragmented).
302          */
303         if (sk->sk_state == TCP_LISTEN)
304                 return;
305
306         /* We don't check in the destentry if pmtu discovery is forbidden
307          * on this route. We just assume that no packet_to_big packets
308          * are send back when pmtu discovery is not active.
309          * There is a small race when the user changes this flag in the
310          * route, but I think that's acceptable.
311          */
312         if ((dst = __sk_dst_check(sk, 0)) == NULL)
313                 return;
314
315         dst->ops->update_pmtu(dst, mtu);
316
317         /* Something is about to be wrong... Remember soft error
318          * for the case, if this connection will not able to recover.
319          */
320         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
321                 sk->sk_err_soft = EMSGSIZE;
322
323         mtu = dst_mtu(dst);
324
325         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
326             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
327                 tcp_sync_mss(sk, mtu);
328
329                 /* Resend the TCP packet because it's
330                  * clear that the old packet has been
331                  * dropped. This is the new "fast" path mtu
332                  * discovery.
333                  */
334                 tcp_simple_retransmit(sk);
335         } /* else let the usual retransmit timer handle it */
336 }
337
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * @skb:  ICMP error; skb->data points at the quoted IP header, which is
 *        followed by the start of the quoted TCP header.
 * @info: extra value from the ICMP message; passed to do_pmtu_discovery()
 *        as the MTU for ICMP_FRAG_NEEDED.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct sock *sk;
        __u32 seq;
        int err;

        /* Need the full quoted IP header plus 8 bytes of TCP header. */
        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }

        /*
         * The quoted packet is one we sent, so its destination/source are
         * handed to the lookup in that order.
         */
        sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
                         th->source, inet_iif(skb));
        if (!sk) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
        /* TIME-WAIT sockets only need their reference dropped. */
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        /* Ignore errors quoting a sequence number outside the send window. */
        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        /* Translate the ICMP type/code into an errno value. */
        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        /* Handle states that never fall through to the common path below. */
        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                BUG_TRAP(!req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can f.e. if SYNs crossed.
                             */
                /* Hard error if we hold the socket, soft error otherwise. */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note, that in modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters even this two messages finally lose
         * their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        /* Release the lock taken above and the reference from inet_lookup(). */
        bh_unlock_sock(sk);
        sock_put(sk);
}
500
501 /* This routine computes an IPv4 TCP checksum. */
502 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
503 {
504         struct inet_sock *inet = inet_sk(sk);
505         struct tcphdr *th = tcp_hdr(skb);
506
507         if (skb->ip_summed == CHECKSUM_PARTIAL) {
508                 th->check = ~tcp_v4_check(len, inet->saddr,
509                                           inet->daddr, 0);
510                 skb->csum_start = skb_transport_header(skb) - skb->head;
511                 skb->csum_offset = offsetof(struct tcphdr, check);
512         } else {
513                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
514                                          csum_partial((char *)th,
515                                                       th->doff << 2,
516                                                       skb->csum));
517         }
518 }
519
520 int tcp_v4_gso_send_check(struct sk_buff *skb)
521 {
522         const struct iphdr *iph;
523         struct tcphdr *th;
524
525         if (!pskb_may_pull(skb, sizeof(*th)))
526                 return -EINVAL;
527
528         iph = ip_hdr(skb);
529         th = tcp_hdr(skb);
530
531         th->check = 0;
532         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
533         skb->csum_start = skb_transport_header(skb) - skb->head;
534         skb->csum_offset = offsetof(struct tcphdr, check);
535         skb->ip_summed = CHECKSUM_PARTIAL;
536         return 0;
537 }
538
539 /*
540  *      This routine will send an RST to the other tcp.
541  *
542  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
543  *                    for reset.
544  *      Answer: if a packet caused RST, it is not for a socket
545  *              existing in our system, if it is matched to a socket,
546  *              it is just duplicate segment or bug in other side's TCP.
547  *              So that we build reply only basing on parameters
548  *              arrived with segment.
549  *      Exception: precedence violation. We do not implement it in any case.
550  */
551
552 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
553 {
554         struct tcphdr *th = tcp_hdr(skb);
555         struct {
556                 struct tcphdr th;
557 #ifdef CONFIG_TCP_MD5SIG
558                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
559 #endif
560         } rep;
561         struct ip_reply_arg arg;
562 #ifdef CONFIG_TCP_MD5SIG
563         struct tcp_md5sig_key *key;
564 #endif
565
566         /* Never send a reset in response to a reset. */
567         if (th->rst)
568                 return;
569
570         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
571                 return;
572
573         /* Swap the send and the receive. */
574         memset(&rep, 0, sizeof(rep));
575         rep.th.dest   = th->source;
576         rep.th.source = th->dest;
577         rep.th.doff   = sizeof(struct tcphdr) / 4;
578         rep.th.rst    = 1;
579
580         if (th->ack) {
581                 rep.th.seq = th->ack_seq;
582         } else {
583                 rep.th.ack = 1;
584                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
585                                        skb->len - (th->doff << 2));
586         }
587
588         memset(&arg, 0, sizeof(arg));
589         arg.iov[0].iov_base = (unsigned char *)&rep;
590         arg.iov[0].iov_len  = sizeof(rep.th);
591
592 #ifdef CONFIG_TCP_MD5SIG
593         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
594         if (key) {
595                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
596                                    (TCPOPT_NOP << 16) |
597                                    (TCPOPT_MD5SIG << 8) |
598                                    TCPOLEN_MD5SIG);
599                 /* Update length and the length the header thinks exists */
600                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
601                 rep.th.doff = arg.iov[0].iov_len / 4;
602
603                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
604                                         key,
605                                         ip_hdr(skb)->daddr,
606                                         ip_hdr(skb)->saddr,
607                                         &rep.th, IPPROTO_TCP,
608                                         arg.iov[0].iov_len);
609         }
610 #endif
611         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
612                                       ip_hdr(skb)->saddr, /* XXX */
613                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
614         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
615
616         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
617
618         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
619         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
620 }
621
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

/*
 * Build and send a bare ACK in reply to @skb without a full socket.
 *
 * @twsk: TIME-WAIT socket the segment matched, or NULL (the request-sock
 *        caller passes NULL)
 * @skb:  incoming segment being acknowledged
 * @seq:  sequence number to place in the reply (host order)
 * @ack:  acknowledgment number to place in the reply (host order)
 * @win:  window value to advertise (host order, stored with htons())
 * @ts:   timestamp to echo; when zero no timestamp option is added
 */
static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts)
{
        struct tcphdr *th = tcp_hdr(skb);
        /* Reply header plus room for timestamp and (optionally) MD5 options. */
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
        struct tcp_md5sig_key tw_key;
#endif

        /* Only the header is zeroed; option words are written when used. */
        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                /* Timestamp option occupies opt[0..2]. */
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tcp_time_stamp);
                rep.opt[2] = htonl(ts);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        /*
         * The SKB holds an incoming packet, but may not have a valid ->sk
         * pointer. This is especially the case when we're dealing with a
         * TIME_WAIT ack, because the sk structure is long gone, and only
         * the tcp_timewait_sock remains. So the md5 key is stashed in that
         * structure, and we use it in preference.  I believe that (twsk ||
         * skb->sk) holds true, but we program defensively.
         */
        if (!twsk && skb->sk) {
                key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
        } else if (twsk && twsk->tw_md5_keylen) {
                tw_key.key = twsk->tw_md5_key;
                tw_key.keylen = twsk->tw_md5_keylen;
                key = &tw_key;
        } else
                key = NULL;

        if (key) {
                /* The MD5 option follows the timestamp option when present. */
                int offset = (ts) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
                                        key,
                                        ip_hdr(skb)->daddr,
                                        ip_hdr(skb)->saddr,
                                        &rep.th, IPPROTO_TCP,
                                        arg.iov[0].iov_len);
        }
#endif
        /* Seed the checksum with the pseudo-header over the full reply length. */
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        /* Honour the device binding recorded in the TIME-WAIT socket. */
        if (twsk)
                arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
715
716 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
717 {
718         struct inet_timewait_sock *tw = inet_twsk(sk);
719         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
720
721         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
722                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
723                         tcptw->tw_ts_recent);
724
725         inet_twsk_put(tw);
726 }
727
728 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
729                                   struct request_sock *req)
730 {
731         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
732                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
733                         req->ts_recent);
734 }
735
736 /*
737  *      Send a SYN-ACK after having received an ACK.
738  *      This still operates on a request_sock only, not on a big
739  *      socket.
740  */
741 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
742                               struct dst_entry *dst)
743 {
744         const struct inet_request_sock *ireq = inet_rsk(req);
745         int err = -1;
746         struct sk_buff * skb;
747
748         /* First, grab a route. */
749         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
750                 goto out;
751
752         skb = tcp_make_synack(sk, dst, req);
753
754         if (skb) {
755                 struct tcphdr *th = tcp_hdr(skb);
756
757                 th->check = tcp_v4_check(skb->len,
758                                          ireq->loc_addr,
759                                          ireq->rmt_addr,
760                                          csum_partial((char *)th, skb->len,
761                                                       skb->csum));
762
763                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
764                                             ireq->rmt_addr,
765                                             ireq->opt);
766                 err = net_xmit_eval(err);
767         }
768
769 out:
770         dst_release(dst);
771         return err;
772 }
773
774 /*
775  *      IPv4 request_sock destructor.
776  */
777 static void tcp_v4_reqsk_destructor(struct request_sock *req)
778 {
779         kfree(inet_rsk(req)->opt);
780 }
781
782 #ifdef CONFIG_SYN_COOKIES
783 static void syn_flood_warning(struct sk_buff *skb)
784 {
785         static unsigned long warntime;
786
787         if (time_after(jiffies, (warntime + HZ * 60))) {
788                 warntime = jiffies;
789                 printk(KERN_INFO
790                        "possible SYN flooding on port %d. Sending cookies.\n",
791                        ntohs(tcp_hdr(skb)->dest));
792         }
793 }
794 #endif
795
796 /*
797  * Save and compile IPv4 options into the request_sock if needed.
798  */
799 static struct ip_options *tcp_v4_save_options(struct sock *sk,
800                                               struct sk_buff *skb)
801 {
802         struct ip_options *opt = &(IPCB(skb)->opt);
803         struct ip_options *dopt = NULL;
804
805         if (opt && opt->optlen) {
806                 int opt_size = optlength(opt);
807                 dopt = kmalloc(opt_size, GFP_ATOMIC);
808                 if (dopt) {
809                         if (ip_options_echo(dopt, skb)) {
810                                 kfree(dopt);
811                                 dopt = NULL;
812                         }
813                 }
814         }
815         return dopt;
816 }
817
818 #ifdef CONFIG_TCP_MD5SIG
819 /*
820  * RFC2385 MD5 checksumming requires a mapping of
821  * IP address->MD5 Key.
822  * We need to maintain these in the sk structure.
823  */
824
825 /* Find the Key structure for an address.  */
826 static struct tcp_md5sig_key *
827                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
828 {
829         struct tcp_sock *tp = tcp_sk(sk);
830         int i;
831
832         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
833                 return NULL;
834         for (i = 0; i < tp->md5sig_info->entries4; i++) {
835                 if (tp->md5sig_info->keys4[i].addr == addr)
836                         return (struct tcp_md5sig_key *)
837                                                 &tp->md5sig_info->keys4[i];
838         }
839         return NULL;
840 }
841
842 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
843                                          struct sock *addr_sk)
844 {
845         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
846 }
847
848 EXPORT_SYMBOL(tcp_v4_md5_lookup);
849
850 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
851                                                       struct request_sock *req)
852 {
853         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
854 }
855
856 /* This can be called on a newly created socket, from other files */
857 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
858                       u8 *newkey, u8 newkeylen)
859 {
860         /* Add Key to the list */
861         struct tcp4_md5sig_key *key;
862         struct tcp_sock *tp = tcp_sk(sk);
863         struct tcp4_md5sig_key *keys;
864
865         key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
866         if (key) {
867                 /* Pre-existing entry - just update that one. */
868                 kfree(key->key);
869                 key->key = newkey;
870                 key->keylen = newkeylen;
871         } else {
872                 struct tcp_md5sig_info *md5sig;
873
874                 if (!tp->md5sig_info) {
875                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
876                                                   GFP_ATOMIC);
877                         if (!tp->md5sig_info) {
878                                 kfree(newkey);
879                                 return -ENOMEM;
880                         }
881                 }
882                 if (tcp_alloc_md5sig_pool() == NULL) {
883                         kfree(newkey);
884                         return -ENOMEM;
885                 }
886                 md5sig = tp->md5sig_info;
887
888                 if (md5sig->alloced4 == md5sig->entries4) {
889                         keys = kmalloc((sizeof(*keys) *
890                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
891                         if (!keys) {
892                                 kfree(newkey);
893                                 tcp_free_md5sig_pool();
894                                 return -ENOMEM;
895                         }
896
897                         if (md5sig->entries4)
898                                 memcpy(keys, md5sig->keys4,
899                                        sizeof(*keys) * md5sig->entries4);
900
901                         /* Free old key list, and reference new one */
902                         if (md5sig->keys4)
903                                 kfree(md5sig->keys4);
904                         md5sig->keys4 = keys;
905                         md5sig->alloced4++;
906                 }
907                 md5sig->entries4++;
908                 md5sig->keys4[md5sig->entries4 - 1].addr   = addr;
909                 md5sig->keys4[md5sig->entries4 - 1].key    = newkey;
910                 md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
911         }
912         return 0;
913 }
914
915 EXPORT_SYMBOL(tcp_v4_md5_do_add);
916
917 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
918                                u8 *newkey, u8 newkeylen)
919 {
920         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
921                                  newkey, newkeylen);
922 }
923
924 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
925 {
926         struct tcp_sock *tp = tcp_sk(sk);
927         int i;
928
929         for (i = 0; i < tp->md5sig_info->entries4; i++) {
930                 if (tp->md5sig_info->keys4[i].addr == addr) {
931                         /* Free the key */
932                         kfree(tp->md5sig_info->keys4[i].key);
933                         tp->md5sig_info->entries4--;
934
935                         if (tp->md5sig_info->entries4 == 0) {
936                                 kfree(tp->md5sig_info->keys4);
937                                 tp->md5sig_info->keys4 = NULL;
938                                 tp->md5sig_info->alloced4 = 0;
939                         } else if (tp->md5sig_info->entries4 != i) {
940                                 /* Need to do some manipulation */
941                                 memcpy(&tp->md5sig_info->keys4[i],
942                                        &tp->md5sig_info->keys4[i+1],
943                                        (tp->md5sig_info->entries4 - i) *
944                                         sizeof(struct tcp4_md5sig_key));
945                         }
946                         tcp_free_md5sig_pool();
947                         return 0;
948                 }
949         }
950         return -ENOENT;
951 }
952
953 EXPORT_SYMBOL(tcp_v4_md5_do_del);
954
955 static void tcp_v4_clear_md5_list(struct sock *sk)
956 {
957         struct tcp_sock *tp = tcp_sk(sk);
958
959         /* Free each key, then the set of key keys,
960          * the crypto element, and then decrement our
961          * hold on the last resort crypto.
962          */
963         if (tp->md5sig_info->entries4) {
964                 int i;
965                 for (i = 0; i < tp->md5sig_info->entries4; i++)
966                         kfree(tp->md5sig_info->keys4[i].key);
967                 tp->md5sig_info->entries4 = 0;
968                 tcp_free_md5sig_pool();
969         }
970         if (tp->md5sig_info->keys4) {
971                 kfree(tp->md5sig_info->keys4);
972                 tp->md5sig_info->keys4 = NULL;
973                 tp->md5sig_info->alloced4  = 0;
974         }
975 }
976
977 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
978                                  int optlen)
979 {
980         struct tcp_md5sig cmd;
981         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
982         u8 *newkey;
983
984         if (optlen < sizeof(cmd))
985                 return -EINVAL;
986
987         if (copy_from_user(&cmd, optval, sizeof(cmd)))
988                 return -EFAULT;
989
990         if (sin->sin_family != AF_INET)
991                 return -EINVAL;
992
993         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
994                 if (!tcp_sk(sk)->md5sig_info)
995                         return -ENOENT;
996                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
997         }
998
999         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1000                 return -EINVAL;
1001
1002         if (!tcp_sk(sk)->md5sig_info) {
1003                 struct tcp_sock *tp = tcp_sk(sk);
1004                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
1005
1006                 if (!p)
1007                         return -EINVAL;
1008
1009                 tp->md5sig_info = p;
1010
1011         }
1012
1013         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1014         if (!newkey)
1015                 return -ENOMEM;
1016         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1017                                  newkey, cmd.tcpm_keylen);
1018 }
1019
1020 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1021                                    __be32 saddr, __be32 daddr,
1022                                    struct tcphdr *th, int protocol,
1023                                    int tcplen)
1024 {
1025         struct scatterlist sg[4];
1026         __u16 data_len;
1027         int block = 0;
1028         __sum16 old_checksum;
1029         struct tcp_md5sig_pool *hp;
1030         struct tcp4_pseudohdr *bp;
1031         struct hash_desc *desc;
1032         int err;
1033         unsigned int nbytes = 0;
1034
1035         /*
1036          * Okay, so RFC2385 is turned on for this connection,
1037          * so we need to generate the MD5 hash for the packet now.
1038          */
1039
1040         hp = tcp_get_md5sig_pool();
1041         if (!hp)
1042                 goto clear_hash_noput;
1043
1044         bp = &hp->md5_blk.ip4;
1045         desc = &hp->md5_desc;
1046
1047         /*
1048          * 1. the TCP pseudo-header (in the order: source IP address,
1049          * destination IP address, zero-padded protocol number, and
1050          * segment length)
1051          */
1052         bp->saddr = saddr;
1053         bp->daddr = daddr;
1054         bp->pad = 0;
1055         bp->protocol = protocol;
1056         bp->len = htons(tcplen);
1057         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1058         nbytes += sizeof(*bp);
1059
1060         /* 2. the TCP header, excluding options, and assuming a
1061          * checksum of zero/
1062          */
1063         old_checksum = th->check;
1064         th->check = 0;
1065         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1066         nbytes += sizeof(struct tcphdr);
1067
1068         /* 3. the TCP segment data (if any) */
1069         data_len = tcplen - (th->doff << 2);
1070         if (data_len > 0) {
1071                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1072                 sg_set_buf(&sg[block++], data, data_len);
1073                 nbytes += data_len;
1074         }
1075
1076         /* 4. an independently-specified key or password, known to both
1077          * TCPs and presumably connection-specific
1078          */
1079         sg_set_buf(&sg[block++], key->key, key->keylen);
1080         nbytes += key->keylen;
1081
1082         /* Now store the Hash into the packet */
1083         err = crypto_hash_init(desc);
1084         if (err)
1085                 goto clear_hash;
1086         err = crypto_hash_update(desc, sg, nbytes);
1087         if (err)
1088                 goto clear_hash;
1089         err = crypto_hash_final(desc, md5_hash);
1090         if (err)
1091                 goto clear_hash;
1092
1093         /* Reset header, and free up the crypto */
1094         tcp_put_md5sig_pool();
1095         th->check = old_checksum;
1096
1097 out:
1098         return 0;
1099 clear_hash:
1100         tcp_put_md5sig_pool();
1101 clear_hash_noput:
1102         memset(md5_hash, 0, 16);
1103         goto out;
1104 }
1105
1106 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1107                          struct sock *sk,
1108                          struct dst_entry *dst,
1109                          struct request_sock *req,
1110                          struct tcphdr *th, int protocol,
1111                          int tcplen)
1112 {
1113         __be32 saddr, daddr;
1114
1115         if (sk) {
1116                 saddr = inet_sk(sk)->saddr;
1117                 daddr = inet_sk(sk)->daddr;
1118         } else {
1119                 struct rtable *rt = (struct rtable *)dst;
1120                 BUG_ON(!rt);
1121                 saddr = rt->rt_src;
1122                 daddr = rt->rt_dst;
1123         }
1124         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1125                                        saddr, daddr,
1126                                        th, protocol, tcplen);
1127 }
1128
1129 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1130
1131 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1132 {
1133         /*
1134          * This gets called for each TCP segment that arrives
1135          * so we want to be efficient.
1136          * We have 3 drop cases:
1137          * o No MD5 hash and one expected.
1138          * o MD5 hash and we're not expecting one.
1139          * o MD5 hash and its wrong.
1140          */
1141         __u8 *hash_location = NULL;
1142         struct tcp_md5sig_key *hash_expected;
1143         const struct iphdr *iph = ip_hdr(skb);
1144         struct tcphdr *th = tcp_hdr(skb);
1145         int length = (th->doff << 2) - sizeof(struct tcphdr);
1146         int genhash;
1147         unsigned char *ptr;
1148         unsigned char newhash[16];
1149
1150         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1151
1152         /*
1153          * If the TCP option length is less than the TCP_MD5SIG
1154          * option length, then we can shortcut
1155          */
1156         if (length < TCPOLEN_MD5SIG) {
1157                 if (hash_expected)
1158                         return 1;
1159                 else
1160                         return 0;
1161         }
1162
1163         /* Okay, we can't shortcut - we have to grub through the options */
1164         ptr = (unsigned char *)(th + 1);
1165         while (length > 0) {
1166                 int opcode = *ptr++;
1167                 int opsize;
1168
1169                 switch (opcode) {
1170                 case TCPOPT_EOL:
1171                         goto done_opts;
1172                 case TCPOPT_NOP:
1173                         length--;
1174                         continue;
1175                 default:
1176                         opsize = *ptr++;
1177                         if (opsize < 2)
1178                                 goto done_opts;
1179                         if (opsize > length)
1180                                 goto done_opts;
1181
1182                         if (opcode == TCPOPT_MD5SIG) {
1183                                 hash_location = ptr;
1184                                 goto done_opts;
1185                         }
1186                 }
1187                 ptr += opsize-2;
1188                 length -= opsize;
1189         }
1190 done_opts:
1191         /* We've parsed the options - do we have a hash? */
1192         if (!hash_expected && !hash_location)
1193                 return 0;
1194
1195         if (hash_expected && !hash_location) {
1196                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1197                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1198                                NIPQUAD(iph->saddr), ntohs(th->source),
1199                                NIPQUAD(iph->daddr), ntohs(th->dest));
1200                 return 1;
1201         }
1202
1203         if (!hash_expected && hash_location) {
1204                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1205                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1206                                NIPQUAD(iph->saddr), ntohs(th->source),
1207                                NIPQUAD(iph->daddr), ntohs(th->dest));
1208                 return 1;
1209         }
1210
1211         /* Okay, so this is hash_expected and hash_location -
1212          * so we need to calculate the checksum.
1213          */
1214         genhash = tcp_v4_do_calc_md5_hash(newhash,
1215                                           hash_expected,
1216                                           iph->saddr, iph->daddr,
1217                                           th, sk->sk_protocol,
1218                                           skb->len);
1219
1220         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1221                 if (net_ratelimit()) {
1222                         printk(KERN_INFO "MD5 Hash failed for "
1223                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1224                                NIPQUAD(iph->saddr), ntohs(th->source),
1225                                NIPQUAD(iph->daddr), ntohs(th->dest),
1226                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1227                 }
1228                 return 1;
1229         }
1230         return 0;
1231 }
1232
1233 #endif
1234
1235 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1236         .family         =       PF_INET,
1237         .obj_size       =       sizeof(struct tcp_request_sock),
1238         .rtx_syn_ack    =       tcp_v4_send_synack,
1239         .send_ack       =       tcp_v4_reqsk_send_ack,
1240         .destructor     =       tcp_v4_reqsk_destructor,
1241         .send_reset     =       tcp_v4_send_reset,
1242 };
1243
1244 #ifdef CONFIG_TCP_MD5SIG
1245 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1246         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1247 };
1248 #endif
1249
1250 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1251         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1252         .twsk_unique    = tcp_twsk_unique,
1253         .twsk_destructor= tcp_twsk_destructor,
1254 };
1255
/*
 * Handle an incoming connection request (@skb) on socket @sk.
 *
 * Allocates a request_sock, parses the TCP options of the segment,
 * records the address pair and saved IP options, picks an initial
 * sequence number (from a SYN cookie, from the value already stored in
 * TCP_SKB_CB(skb)->when, or freshly generated) and sends a SYN-ACK.
 * Without cookies the request is then added to the request queue;
 * with cookies it is freed right after the SYN-ACK has been sent.
 *
 * Always returns 0, whether the request was accepted or dropped.
 */
1256 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1257 {
1258         struct inet_request_sock *ireq;
1259         struct tcp_options_received tmp_opt;
1260         struct request_sock *req;
1261         __be32 saddr = ip_hdr(skb)->saddr;
1262         __be32 daddr = ip_hdr(skb)->daddr;
1263         __u32 isn = TCP_SKB_CB(skb)->when;
1264         struct dst_entry *dst = NULL;
1265 #ifdef CONFIG_SYN_COOKIES
1266         int want_cookie = 0;
1267 #else
1268 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1269 #endif
1270
1271         /* Never answer to SYNs sent to broadcast or multicast */
1272         if (((struct rtable *)skb->dst)->rt_flags &
1273             (RTCF_BROADCAST | RTCF_MULTICAST))
1274                 goto drop;
1275
1276         /* TW buckets are converted to open requests without
1277          * limitations, they conserve resources and peer is
1278          * evidently real one.
1279          */
        /* Queue full and no ISN preset: use cookies if enabled, else drop.
         * Note the "else" below binds to "goto drop" only when
         * CONFIG_SYN_COOKIES is set; otherwise the goto is unconditional.
         */
1280         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1281 #ifdef CONFIG_SYN_COOKIES
1282                 if (sysctl_tcp_syncookies) {
1283                         want_cookie = 1;
1284                 } else
1285 #endif
1286                 goto drop;
1287         }
1288
1289         /* Accept backlog is full. If we have already queued enough
1290          * of warm entries in syn queue, drop request. It is better than
1291          * clogging syn queue with openreqs with exponentially increasing
1292          * timeout.
1293          */
1294         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1295                 goto drop;
1296
1297         req = reqsk_alloc(&tcp_request_sock_ops);
1298         if (!req)
1299                 goto drop;
1300
1301 #ifdef CONFIG_TCP_MD5SIG
1302         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1303 #endif
1304
1305         tcp_clear_options(&tmp_opt);
1306         tmp_opt.mss_clamp = 536;
1307         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1308
1309         tcp_parse_options(skb, &tmp_opt, 0);
1310
        /* In cookie mode the options just parsed are discarded again. */
1311         if (want_cookie) {
1312                 tcp_clear_options(&tmp_opt);
1313                 tmp_opt.saw_tstamp = 0;
1314         }
1315
1316         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1317                 /* Some OSes (unknown ones, but I see them on web server, which
1318                  * contains information interesting only for windows'
1319                  * users) do not send their stamp in SYN. It is easy case.
1320                  * We simply do not advertise TS support.
1321                  */
1322                 tmp_opt.saw_tstamp = 0;
1323                 tmp_opt.tstamp_ok  = 0;
1324         }
1325         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1326
1327         tcp_openreq_init(req, &tmp_opt, skb);
1328
1329         if (security_inet_conn_request(sk, skb, req))
1330                 goto drop_and_free;
1331
        /* Record the address pair from the request's point of view:
         * our local address is the packet's destination.
         */
1332         ireq = inet_rsk(req);
1333         ireq->loc_addr = daddr;
1334         ireq->rmt_addr = saddr;
1335         ireq->opt = tcp_v4_save_options(sk, skb);
1336         if (!want_cookie)
1337                 TCP_ECN_create_request(req, tcp_hdr(skb));
1338
1339         if (want_cookie) {
1340 #ifdef CONFIG_SYN_COOKIES
1341                 syn_flood_warning(skb);
1342 #endif
1343                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1344         } else if (!isn) {
1345                 struct inet_peer *peer = NULL;
1346
1347                 /* VJ's idea. We save last timestamp seen
1348                  * from the destination in peer table, when entering
1349                  * state TIME-WAIT, and check against it before
1350                  * accepting new connection request.
1351                  *
1352                  * If "isn" is not zero, this request hit alive
1353                  * timewait bucket, so that all the necessary checks
1354                  * are made in the function processing timewait state.
1355                  */
1356                 if (tmp_opt.saw_tstamp &&
1357                     tcp_death_row.sysctl_tw_recycle &&
1358                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1359                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1360                     peer->v4daddr == saddr) {
1361                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1362                             (s32)(peer->tcp_ts - req->ts_recent) >
1363                                                         TCP_PAWS_WINDOW) {
1364                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1365                                 dst_release(dst);
1366                                 goto drop_and_free;
1367                         }
1368                 }
1369                 /* Kill the following clause, if you dislike this way. */
1370                 else if (!sysctl_tcp_syncookies &&
1371                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1372                           (sysctl_max_syn_backlog >> 2)) &&
1373                          (!peer || !peer->tcp_ts_stamp) &&
1374                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1375                         /* Without syncookies last quarter of
1376                          * backlog is filled with destinations,
1377                          * proven to be alive.
1378                          * It means that we continue to communicate
1379                          * to destinations, already remembered
1380                          * to the moment of synflood.
1381                          */
1382                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1383                                        "request from %u.%u.%u.%u/%u\n",
1384                                        NIPQUAD(saddr),
1385                                        ntohs(tcp_hdr(skb)->source));
1386                         dst_release(dst);
1387                         goto drop_and_free;
1388                 }
1389
1390                 isn = tcp_v4_init_sequence(skb);
1391         }
1392         tcp_rsk(req)->snt_isn = isn;
1393
        /* dst may still be NULL here; it is handed on to
         * tcp_v4_send_synack() and not released in this function afterwards.
         */
1394         if (tcp_v4_send_synack(sk, req, dst))
1395                 goto drop_and_free;
1396
        /* Cookie mode keeps no request state; otherwise queue the request. */
1397         if (want_cookie) {
1398                 reqsk_free(req);
1399         } else {
1400                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1401         }
1402         return 0;
1403
1404 drop_and_free:
1405         reqsk_free(req);
1406 drop:
1407         return 0;
1408 }
1409
1410
1411 /*
1412  * The three way handshake has completed - we got a valid synack -
1413  * now create the new socket.
1414  */
1415 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1416                                   struct request_sock *req,
1417                                   struct dst_entry *dst)
1418 {
1419         struct inet_request_sock *ireq;
1420         struct inet_sock *newinet;
1421         struct tcp_sock *newtp;
1422         struct sock *newsk;
1423 #ifdef CONFIG_TCP_MD5SIG
1424         struct tcp_md5sig_key *key;
1425 #endif
1426
1427         if (sk_acceptq_is_full(sk))
1428                 goto exit_overflow;
1429
1430         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1431                 goto exit;
1432
1433         newsk = tcp_create_openreq_child(sk, req, skb);
1434         if (!newsk)
1435                 goto exit;
1436
1437         newsk->sk_gso_type = SKB_GSO_TCPV4;
1438         sk_setup_caps(newsk, dst);
1439
1440         newtp                 = tcp_sk(newsk);
1441         newinet               = inet_sk(newsk);
1442         ireq                  = inet_rsk(req);
1443         newinet->daddr        = ireq->rmt_addr;
1444         newinet->rcv_saddr    = ireq->loc_addr;
1445         newinet->saddr        = ireq->loc_addr;
1446         newinet->opt          = ireq->opt;
1447         ireq->opt             = NULL;
1448         newinet->mc_index     = inet_iif(skb);
1449         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1450         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1451         if (newinet->opt)
1452                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1453         newinet->id = newtp->write_seq ^ jiffies;
1454
1455         tcp_mtup_init(newsk);
1456         tcp_sync_mss(newsk, dst_mtu(dst));
1457         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1458         tcp_initialize_rcv_mss(newsk);
1459
1460 #ifdef CONFIG_TCP_MD5SIG
1461         /* Copy over the MD5 key from the original socket */
1462         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1463                 /*
1464                  * We're using one, so create a matching key
1465                  * on the newsk structure. If we fail to get
1466                  * memory, then we end up not copying the key
1467                  * across. Shucks.
1468                  */
1469                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1470                 if (newkey != NULL)
1471                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1472                                           newkey, key->keylen);
1473         }
1474 #endif
1475
1476         __inet_hash(&tcp_hashinfo, newsk, 0);
1477         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1478
1479         return newsk;
1480
1481 exit_overflow:
1482         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1483 exit:
1484         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1485         dst_release(dst);
1486         return NULL;
1487 }
1488
1489 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1490 {
1491         struct tcphdr *th = tcp_hdr(skb);
1492         const struct iphdr *iph = ip_hdr(skb);
1493         struct sock *nsk;
1494         struct request_sock **prev;
1495         /* Find possible connection requests. */
1496         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1497                                                        iph->saddr, iph->daddr);
1498         if (req)
1499                 return tcp_check_req(sk, skb, req, prev);
1500
1501         nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source,
1502                                       iph->daddr, th->dest, inet_iif(skb));
1503
1504         if (nsk) {
1505                 if (nsk->sk_state != TCP_TIME_WAIT) {
1506                         bh_lock_sock(nsk);
1507                         return nsk;
1508                 }
1509                 inet_twsk_put(inet_twsk(nsk));
1510                 return NULL;
1511         }
1512
1513 #ifdef CONFIG_SYN_COOKIES
1514         if (!th->rst && !th->syn && th->ack)
1515                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1516 #endif
1517         return sk;
1518 }
1519
1520 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1521 {
1522         const struct iphdr *iph = ip_hdr(skb);
1523
1524         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1525                 if (!tcp_v4_check(skb->len, iph->saddr,
1526                                   iph->daddr, skb->csum)) {
1527                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1528                         return 0;
1529                 }
1530         }
1531
1532         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1533                                        skb->len, IPPROTO_TCP, 0);
1534
1535         if (skb->len <= 76) {
1536                 return __skb_checksum_complete(skb);
1537         }
1538         return 0;
1539 }
1540
1541
1542 /* The socket must have it's spinlock held when we get
1543  * here.
1544  *
1545  * We have a potential double-lock case here, so even when
1546  * doing backlog processing we use the BH locking scheme.
1547  * This is because we cannot sleep with the original spinlock
1548  * held.
1549  */
1550 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1551 {
1552         struct sock *rsk;
1553 #ifdef CONFIG_TCP_MD5SIG
1554         /*
1555          * We really want to reject the packet as early as possible
1556          * if:
1557          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1558          *  o There is an MD5 option and we're not expecting one
1559          */
1560         if (tcp_v4_inbound_md5_hash(sk, skb))
1561                 goto discard;
1562 #endif
1563
1564         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1565                 TCP_CHECK_TIMER(sk);
1566                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1567                         rsk = sk;
1568                         goto reset;
1569                 }
1570                 TCP_CHECK_TIMER(sk);
1571                 return 0;
1572         }
1573
1574         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1575                 goto csum_err;
1576
1577         if (sk->sk_state == TCP_LISTEN) {
1578                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1579                 if (!nsk)
1580                         goto discard;
1581
1582                 if (nsk != sk) {
1583                         if (tcp_child_process(sk, nsk, skb)) {
1584                                 rsk = nsk;
1585                                 goto reset;
1586                         }
1587                         return 0;
1588                 }
1589         }
1590
1591         TCP_CHECK_TIMER(sk);
1592         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1593                 rsk = sk;
1594                 goto reset;
1595         }
1596         TCP_CHECK_TIMER(sk);
1597         return 0;
1598
1599 reset:
1600         tcp_v4_send_reset(rsk, skb);
1601 discard:
1602         kfree_skb(skb);
1603         /* Be careful here. If this function gets more complicated and
1604          * gcc suffers from register pressure on the x86, sk (in %ebx)
1605          * might be destroyed here. This current version compiles correctly,
1606          * but you have been warned.
1607          */
1608         return 0;
1609
1610 csum_err:
1611         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1612         goto discard;
1613 }
1614
1615 /*
1616  *      From tcp_input.c
1617  */
1618
/*
 * tcp_v4_rcv - receive one IPv4 TCP segment and hand it to its socket.
 * @skb: the incoming segment.
 *
 * Sanity-checks the TCP header, fills in TCP_SKB_CB(skb), looks the socket
 * up with __inet_lookup() and then, under bh_lock_sock_nested(), either
 * processes the segment with tcp_v4_do_rcv() (unless tcp_prequeue() takes
 * it) or adds it to the backlog when the socket is owned by the user.
 * Sockets in TCP_TIME_WAIT are handled separately at do_time_wait.
 *
 * Returns the tcp_v4_do_rcv() result when that function is called, and 0
 * in every other case, including all discard paths.
 */
1619 int tcp_v4_rcv(struct sk_buff *skb)
1620 {
1621         const struct iphdr *iph;
1622         struct tcphdr *th;
1623         struct sock *sk;
1624         int ret;
1625
1626         if (skb->pkt_type != PACKET_HOST)
1627                 goto discard_it;
1628
1629         /* Count it even if it's bad */
1630         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1631
1632         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1633                 goto discard_it;
1634
1635         th = tcp_hdr(skb);
1636
             /* A data offset smaller than the fixed header is malformed. */
1637         if (th->doff < sizeof(struct tcphdr) / 4)
1638                 goto bad_packet;
1639         if (!pskb_may_pull(skb, th->doff * 4))
1640                 goto discard_it;
1641
1642         /* An explanation is required here, I think.
1643          * Packet length and doff are validated by header prediction,
1644          * provided case of th->doff==0 is eliminated.
1645          * So, we defer the checks. */
1646         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1647                 goto bad_packet;
1648
             /* Reload th after the pull above (presumably the header data
              * may have been moved - TODO confirm). */
1649         th = tcp_hdr(skb);
1650         iph = ip_hdr(skb);
1651         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1652         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1653                                     skb->len - th->doff * 4);
1654         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1655         TCP_SKB_CB(skb)->when    = 0;
1656         TCP_SKB_CB(skb)->flags   = iph->tos;
1657         TCP_SKB_CB(skb)->sacked  = 0;
1658
1659         sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source,
1660                            iph->daddr, th->dest, inet_iif(skb));
1661         if (!sk)
1662                 goto no_tcp_socket;
1663
             /* Also re-entered from do_time_wait once a listener replaced sk. */
1664 process:
1665         if (sk->sk_state == TCP_TIME_WAIT)
1666                 goto do_time_wait;
1667
1668         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1669                 goto discard_and_relse;
1670         nf_reset(skb);
1671
1672         if (sk_filter(sk, skb))
1673                 goto discard_and_relse;
1674
1675         skb->dev = NULL;
1676
1677         bh_lock_sock_nested(sk);
1678         ret = 0;
             /* Socket not owned by the user: handle the segment now;
              * otherwise it goes to the backlog below. */
1679         if (!sock_owned_by_user(sk)) {
1680 #ifdef CONFIG_NET_DMA
1681                 struct tcp_sock *tp = tcp_sk(sk);
1682                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1683                         tp->ucopy.dma_chan = get_softnet_dma();
1684                 if (tp->ucopy.dma_chan)
1685                         ret = tcp_v4_do_rcv(sk, skb);
1686                 else
1687 #endif
1688                 {
                             /* tcp_v4_do_rcv() runs only when tcp_prequeue()
                              * returns 0. */
1689                         if (!tcp_prequeue(sk, skb))
1690                         ret = tcp_v4_do_rcv(sk, skb);
1691                 }
1692         } else
1693                 sk_add_backlog(sk, skb);
1694         bh_unlock_sock(sk);
1695
1696         sock_put(sk);
1697
1698         return ret;
1699
1700 no_tcp_socket:
1701         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1702                 goto discard_it;
1703
1704         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
                     /* bad_packet is also a goto target of the header checks
                      * at the top: the error is counted, no reset is sent. */
1705 bad_packet:
1706                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1707         } else {
1708                 tcp_v4_send_reset(NULL, skb);
1709         }
1710
1711 discard_it:
1712         /* Discard frame. */
1713         kfree_skb(skb);
1714         return 0;
1715
1716 discard_and_relse:
1717         sock_put(sk);
1718         goto discard_it;
1719
             /* From here on sk is accessed as a TIME_WAIT socket through
              * inet_twsk(); the early exits drop it with inet_twsk_put(). */
1720 do_time_wait:
1721         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1722                 inet_twsk_put(inet_twsk(sk));
1723                 goto discard_it;
1724         }
1725
1726         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1727                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1728                 inet_twsk_put(inet_twsk(sk));
1729                 goto discard_it;
1730         }
1731         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1732         case TCP_TW_SYN: {
1733                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1734                                                         iph->daddr, th->dest,
1735                                                         inet_iif(skb));
                     /* A listener matches: deschedule and release the
                      * TIME_WAIT socket, then process against the listener. */
1736                 if (sk2) {
1737                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1738                         inet_twsk_put(inet_twsk(sk));
1739                         sk = sk2;
1740                         goto process;
1741                 }
1742                 /* Fall through to ACK */
1743         }
1744         case TCP_TW_ACK:
1745                 tcp_v4_timewait_ack(sk, skb);
1746                 break;
1747         case TCP_TW_RST:
1748                 goto no_tcp_socket;
1749         case TCP_TW_SUCCESS:;
1750         }
1751         goto discard_it;
1752 }
1753
1754 /* VJ's idea. Save last timestamp seen from this destination
1755  * and hold it at least for normal timewait interval to use for duplicate
1756  * segment detection in subsequent connections, before they enter synchronized
1757  * state.
1758  */
1759
1760 int tcp_v4_remember_stamp(struct sock *sk)
1761 {
1762         struct inet_sock *inet = inet_sk(sk);
1763         struct tcp_sock *tp = tcp_sk(sk);
1764         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1765         struct inet_peer *peer = NULL;
1766         int release_it = 0;
1767
1768         if (!rt || rt->rt_dst != inet->daddr) {
1769                 peer = inet_getpeer(inet->daddr, 1);
1770                 release_it = 1;
1771         } else {
1772                 if (!rt->peer)
1773                         rt_bind_peer(rt, 1);
1774                 peer = rt->peer;
1775         }
1776
1777         if (peer) {
1778                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1779                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1780                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1781                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1782                         peer->tcp_ts = tp->rx_opt.ts_recent;
1783                 }
1784                 if (release_it)
1785                         inet_putpeer(peer);
1786                 return 1;
1787         }
1788
1789         return 0;
1790 }
1791
1792 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1793 {
1794         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1795
1796         if (peer) {
1797                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1798
1799                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1800                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1801                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1802                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1803                         peer->tcp_ts       = tcptw->tw_ts_recent;
1804                 }
1805                 inet_putpeer(peer);
1806                 return 1;
1807         }
1808
1809         return 0;
1810 }
1811
1812 struct inet_connection_sock_af_ops ipv4_specific = {
1813         .queue_xmit        = ip_queue_xmit,
1814         .send_check        = tcp_v4_send_check,
1815         .rebuild_header    = inet_sk_rebuild_header,
1816         .conn_request      = tcp_v4_conn_request,
1817         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1818         .remember_stamp    = tcp_v4_remember_stamp,
1819         .net_header_len    = sizeof(struct iphdr),
1820         .setsockopt        = ip_setsockopt,
1821         .getsockopt        = ip_getsockopt,
1822         .addr2sockaddr     = inet_csk_addr2sockaddr,
1823         .sockaddr_len      = sizeof(struct sockaddr_in),
1824 #ifdef CONFIG_COMPAT
1825         .compat_setsockopt = compat_ip_setsockopt,
1826         .compat_getsockopt = compat_ip_getsockopt,
1827 #endif
1828 };
1829
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5 signature hooks installed in tp->af_specific. */
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        .md5_parse              = tcp_v4_parse_md5_keys,
        .md5_add                = tcp_v4_md5_add_func,
        .md5_lookup             = tcp_v4_md5_lookup,
        .calc_md5_hash          = tcp_v4_calc_md5_hash,
};
#endif
1838
1839 /* NOTE: A lot of things set to zero explicitly by call to
1840  *       sk_alloc() so need not be done here.
1841  */
1842 static int tcp_v4_init_sock(struct sock *sk)
1843 {
1844         struct inet_connection_sock *icsk = inet_csk(sk);
1845         struct tcp_sock *tp = tcp_sk(sk);
1846
1847         skb_queue_head_init(&tp->out_of_order_queue);
1848         tcp_init_xmit_timers(sk);
1849         tcp_prequeue_init(tp);
1850
1851         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1852         tp->mdev = TCP_TIMEOUT_INIT;
1853
1854         /* So many TCP implementations out there (incorrectly) count the
1855          * initial SYN frame in their delayed-ACK and congestion control
1856          * algorithms that we must have the following bandaid to talk
1857          * efficiently to them.  -DaveM
1858          */
1859         tp->snd_cwnd = 2;
1860
1861         /* See draft-stevens-tcpca-spec-01 for discussion of the
1862          * initialization of these values.
1863          */
1864         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1865         tp->snd_cwnd_clamp = ~0;
1866         tp->mss_cache = 536;
1867
1868         tp->reordering = sysctl_tcp_reordering;
1869         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1870
1871         sk->sk_state = TCP_CLOSE;
1872
1873         sk->sk_write_space = sk_stream_write_space;
1874         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1875
1876         icsk->icsk_af_ops = &ipv4_specific;
1877         icsk->icsk_sync_mss = tcp_sync_mss;
1878 #ifdef CONFIG_TCP_MD5SIG
1879         tp->af_specific = &tcp_sock_ipv4_specific;
1880 #endif
1881
1882         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1883         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1884
1885         atomic_inc(&tcp_sockets_allocated);
1886
1887         return 0;
1888 }
1889
1890 int tcp_v4_destroy_sock(struct sock *sk)
1891 {
1892         struct tcp_sock *tp = tcp_sk(sk);
1893
1894         tcp_clear_xmit_timers(sk);
1895
1896         tcp_cleanup_congestion_control(sk);
1897
1898         /* Cleanup up the write buffer. */
1899         tcp_write_queue_purge(sk);
1900
1901         /* Cleans up our, hopefully empty, out_of_order_queue. */
1902         __skb_queue_purge(&tp->out_of_order_queue);
1903
1904 #ifdef CONFIG_TCP_MD5SIG
1905         /* Clean up the MD5 key list, if any */
1906         if (tp->md5sig_info) {
1907                 tcp_v4_clear_md5_list(sk);
1908                 kfree(tp->md5sig_info);
1909                 tp->md5sig_info = NULL;
1910         }
1911 #endif
1912
1913 #ifdef CONFIG_NET_DMA
1914         /* Cleans up our sk_async_wait_queue */
1915         __skb_queue_purge(&sk->sk_async_wait_queue);
1916 #endif
1917
1918         /* Clean prequeue, it must be empty really */
1919         __skb_queue_purge(&tp->ucopy.prequeue);
1920
1921         /* Clean up a referenced TCP bind bucket. */
1922         if (inet_csk(sk)->icsk_bind_hash)
1923                 inet_put_port(&tcp_hashinfo, sk);
1924
1925         /*
1926          * If sendmsg cached page exists, toss it.
1927          */
1928         if (sk->sk_sndmsg_page) {
1929                 __free_page(sk->sk_sndmsg_page);
1930                 sk->sk_sndmsg_page = NULL;
1931         }
1932
1933         atomic_dec(&tcp_sockets_allocated);
1934
1935         return 0;
1936 }
1937
1938 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1939
1940 #ifdef CONFIG_PROC_FS
1941 /* Proc filesystem TCP sock list dumping. */
1942
1943 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1944 {
1945         return hlist_empty(head) ? NULL :
1946                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1947 }
1948
1949 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1950 {
1951         return tw->tw_node.next ?
1952                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1953 }
1954
/*
 * listening_get_next - advance the /proc iterator over listening sockets.
 * @seq: seq_file whose ->private is the struct tcp_iter_state.
 * @cur: current position: NULL to start at bucket 0, otherwise a
 *       struct sock, or a struct request_sock when st->state is
 *       TCP_SEQ_STATE_OPENREQ.
 *
 * Returns the next socket or pending request whose family equals
 * st->family, or NULL once all INET_LHTABLE_SIZE buckets are exhausted.
 * When a request is returned, the syn_wait_lock of st->syn_wait_sk is
 * still read-locked.
 */
1955 static void *listening_get_next(struct seq_file *seq, void *cur)
1956 {
1957         struct inet_connection_sock *icsk;
1958         struct hlist_node *node;
1959         struct sock *sk = cur;
1960         struct tcp_iter_state* st = seq->private;
1961
             /* First call: start with the head of bucket 0. */
1962         if (!sk) {
1963                 st->bucket = 0;
1964                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1965                 goto get_sk;
1966         }
1967
1968         ++st->num;
1969
1970         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1971                 struct request_sock *req = cur;
1972
1973                 icsk = inet_csk(st->syn_wait_sk);
1974                 req = req->dl_next;
1975                 while (1) {
1976                         while (req) {
1977                                 if (req->rsk_ops->family == st->family) {
1978                                         cur = req;
1979                                         goto out;
1980                                 }
1981                                 req = req->dl_next;
1982                         }
                             /* This syn_table slot is done: take the next
                              * one, or stop after the last. */
1983                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1984                                 break;
1985 get_req:
1986                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1987                 }
                     /* All requests of syn_wait_sk were visited: go on with
                      * the next listening socket and drop syn_wait_lock. */
1988                 sk        = sk_next(st->syn_wait_sk);
1989                 st->state = TCP_SEQ_STATE_LISTENING;
1990                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1991         } else {
                     /* cur was a listening socket: walk its pending
                      * requests first, if it has any. */
1992                 icsk = inet_csk(sk);
1993                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1994                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1995                         goto start_req;
1996                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1997                 sk = sk_next(sk);
1998         }
1999 get_sk:
2000         sk_for_each_from(sk, node) {
2001                 if (sk->sk_family == st->family) {
2002                         cur = sk;
2003                         goto out;
2004                 }
                     /* Socket of another family: its request queue is still
                      * scanned for requests of st->family. */
2005                 icsk = inet_csk(sk);
2006                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2007                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
                             /* Reached with syn_wait_lock read-held; it stays
                              * held while the requests are iterated. */
2008 start_req:
2009                         st->uid         = sock_i_uid(sk);
2010                         st->syn_wait_sk = sk;
2011                         st->state       = TCP_SEQ_STATE_OPENREQ;
2012                         st->sbucket     = 0;
2013                         goto get_req;
2014                 }
2015                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2016         }
             /* Bucket exhausted: continue with the next one, if any. */
2017         if (++st->bucket < INET_LHTABLE_SIZE) {
2018                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2019                 goto get_sk;
2020         }
2021         cur = NULL;
2022 out:
2023         return cur;
2024 }
2025
2026 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2027 {
2028         void *rc = listening_get_next(seq, NULL);
2029
2030         while (rc && *pos) {
2031                 rc = listening_get_next(seq, rc);
2032                 --*pos;
2033         }
2034         return rc;
2035 }
2036
2037 static void *established_get_first(struct seq_file *seq)
2038 {
2039         struct tcp_iter_state* st = seq->private;
2040         void *rc = NULL;
2041
2042         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2043                 struct sock *sk;
2044                 struct hlist_node *node;
2045                 struct inet_timewait_sock *tw;
2046
2047                 /* We can reschedule _before_ having picked the target: */
2048                 cond_resched_softirq();
2049
2050                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2051                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2052                         if (sk->sk_family != st->family) {
2053                                 continue;
2054                         }
2055                         rc = sk;
2056                         goto out;
2057                 }
2058                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2059                 inet_twsk_for_each(tw, node,
2060                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2061                         if (tw->tw_family != st->family) {
2062                                 continue;
2063                         }
2064                         rc = tw;
2065                         goto out;
2066                 }
2067                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2068                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2069         }
2070 out:
2071         return rc;
2072 }
2073
/*
 * established_get_next - advance the /proc iterator over the ehash table.
 * @seq: seq_file whose ->private is the struct tcp_iter_state.
 * @cur: current entry: a struct sock, or a struct inet_timewait_sock when
 *       st->state is TCP_SEQ_STATE_TIME_WAIT.
 *
 * Within one bucket the established chain is walked first, then twchain.
 * Returns the next entry of family st->family, or NULL after the last
 * bucket.  On a non-NULL return the lock of bucket st->bucket is
 * read-held; on NULL it has been released.
 */
2074 static void *established_get_next(struct seq_file *seq, void *cur)
2075 {
2076         struct sock *sk = cur;
2077         struct inet_timewait_sock *tw;
2078         struct hlist_node *node;
2079         struct tcp_iter_state* st = seq->private;
2080
2081         ++st->num;
2082
2083         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2084                 tw = cur;
2085                 tw = tw_next(tw);
                     /* Also entered from below once the established chain of
                      * the current bucket has been exhausted. */
2086 get_tw:
2087                 while (tw && tw->tw_family != st->family) {
2088                         tw = tw_next(tw);
2089                 }
2090                 if (tw) {
2091                         cur = tw;
2092                         goto out;
2093                 }
                     /* twchain done: release this bucket and move on. */
2094                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2095                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2096
2097                 /* We can reschedule between buckets: */
2098                 cond_resched_softirq();
2099
2100                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2101                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2102                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2103                 } else {
2104                         cur = NULL;
2105                         goto out;
2106                 }
2107         } else
2108                 sk = sk_next(sk);
2109
2110         sk_for_each_from(sk, node) {
2111                 if (sk->sk_family == st->family)
2112                         goto found;
2113         }
2114
             /* No more established sockets here: switch to this bucket's
              * timewait chain. */
2115         st->state = TCP_SEQ_STATE_TIME_WAIT;
2116         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2117         goto get_tw;
2118 found:
2119         cur = sk;
2120 out:
2121         return cur;
2122 }
2123
2124 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2125 {
2126         void *rc = established_get_first(seq);
2127
2128         while (rc && pos) {
2129                 rc = established_get_next(seq, rc);
2130                 --pos;
2131         }
2132         return rc;
2133 }
2134
2135 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2136 {
2137         void *rc;
2138         struct tcp_iter_state* st = seq->private;
2139
2140         inet_listen_lock(&tcp_hashinfo);
2141         st->state = TCP_SEQ_STATE_LISTENING;
2142         rc        = listening_get_idx(seq, &pos);
2143
2144         if (!rc) {
2145                 inet_listen_unlock(&tcp_hashinfo);
2146                 local_bh_disable();
2147                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2148                 rc        = established_get_idx(seq, pos);
2149         }
2150
2151         return rc;
2152 }
2153
2154 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2155 {
2156         struct tcp_iter_state* st = seq->private;
2157         st->state = TCP_SEQ_STATE_LISTENING;
2158         st->num = 0;
2159         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2160 }
2161
2162 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2163 {
2164         void *rc = NULL;
2165         struct tcp_iter_state* st;
2166
2167         if (v == SEQ_START_TOKEN) {
2168                 rc = tcp_get_idx(seq, 0);
2169                 goto out;
2170         }
2171         st = seq->private;
2172
2173         switch (st->state) {
2174         case TCP_SEQ_STATE_OPENREQ:
2175         case TCP_SEQ_STATE_LISTENING:
2176                 rc = listening_get_next(seq, v);
2177                 if (!rc) {
2178                         inet_listen_unlock(&tcp_hashinfo);
2179                         local_bh_disable();
2180                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2181                         rc        = established_get_first(seq);
2182                 }
2183                 break;
2184         case TCP_SEQ_STATE_ESTABLISHED:
2185         case TCP_SEQ_STATE_TIME_WAIT:
2186                 rc = established_get_next(seq, v);
2187                 break;
2188         }
2189 out:
2190         ++*pos;
2191         return rc;
2192 }
2193
2194 static void tcp_seq_stop(struct seq_file *seq, void *v)
2195 {
2196         struct tcp_iter_state* st = seq->private;
2197
2198         switch (st->state) {
2199         case TCP_SEQ_STATE_OPENREQ:
2200                 if (v) {
2201                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2202                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2203                 }
2204         case TCP_SEQ_STATE_LISTENING:
2205                 if (v != SEQ_START_TOKEN)
2206                         inet_listen_unlock(&tcp_hashinfo);
2207                 break;
2208         case TCP_SEQ_STATE_TIME_WAIT:
2209         case TCP_SEQ_STATE_ESTABLISHED:
2210                 if (v)
2211                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2212                 local_bh_enable();
2213                 break;
2214         }
2215 }
2216
2217 static int tcp_seq_open(struct inode *inode, struct file *file)
2218 {
2219         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2220         struct seq_file *seq;
2221         struct tcp_iter_state *s;
2222         int rc;
2223
2224         if (unlikely(afinfo == NULL))
2225                 return -EINVAL;
2226
2227         s = kzalloc(sizeof(*s), GFP_KERNEL);
2228         if (!s)
2229                 return -ENOMEM;
2230         s->family               = afinfo->family;
2231         s->seq_ops.start        = tcp_seq_start;
2232         s->seq_ops.next         = tcp_seq_next;
2233         s->seq_ops.show         = afinfo->seq_show;
2234         s->seq_ops.stop         = tcp_seq_stop;
2235
2236         rc = seq_open(file, &s->seq_ops);
2237         if (rc)
2238                 goto out_kfree;
2239         seq          = file->private_data;
2240         seq->private = s;
2241 out:
2242         return rc;
2243 out_kfree:
2244         kfree(s);
2245         goto out;
2246 }
2247
2248 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2249 {
2250         int rc = 0;
2251         struct proc_dir_entry *p;
2252
2253         if (!afinfo)
2254                 return -EINVAL;
2255         afinfo->seq_fops->owner         = afinfo->owner;
2256         afinfo->seq_fops->open          = tcp_seq_open;
2257         afinfo->seq_fops->read          = seq_read;
2258         afinfo->seq_fops->llseek        = seq_lseek;
2259         afinfo->seq_fops->release       = seq_release_private;
2260
2261         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2262         if (p)
2263                 p->data = afinfo;
2264         else
2265                 rc = -ENOMEM;
2266         return rc;
2267 }
2268
2269 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2270 {
2271         if (!afinfo)
2272                 return;
2273         proc_net_remove(afinfo->name);
2274         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2275 }
2276
2277 static void get_openreq4(struct sock *sk, struct request_sock *req,
2278                          char *tmpbuf, int i, int uid)
2279 {
2280         const struct inet_request_sock *ireq = inet_rsk(req);
2281         int ttd = req->expires - jiffies;
2282
2283         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2284                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2285                 i,
2286                 ireq->loc_addr,
2287                 ntohs(inet_sk(sk)->sport),
2288                 ireq->rmt_addr,
2289                 ntohs(ireq->rmt_port),
2290                 TCP_SYN_RECV,
2291                 0, 0, /* could print option size, but that is af dependent. */
2292                 1,    /* timers active (only the expire timer) */
2293                 jiffies_to_clock_t(ttd),
2294                 req->retrans,
2295                 uid,
2296                 0,  /* non standard timer */
2297                 0, /* open_requests have no inode */
2298                 atomic_read(&sk->sk_refcnt),
2299                 req);
2300 }
2301
2302 static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2303 {
2304         int timer_active;
2305         unsigned long timer_expires;
2306         struct tcp_sock *tp = tcp_sk(sk);
2307         const struct inet_connection_sock *icsk = inet_csk(sk);
2308         struct inet_sock *inet = inet_sk(sk);
2309         __be32 dest = inet->daddr;
2310         __be32 src = inet->rcv_saddr;
2311         __u16 destp = ntohs(inet->dport);
2312         __u16 srcp = ntohs(inet->sport);
2313
2314         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2315                 timer_active    = 1;
2316                 timer_expires   = icsk->icsk_timeout;
2317         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2318                 timer_active    = 4;
2319                 timer_expires   = icsk->icsk_timeout;
2320         } else if (timer_pending(&sk->sk_timer)) {
2321                 timer_active    = 2;
2322                 timer_expires   = sk->sk_timer.expires;
2323         } else {
2324                 timer_active    = 0;
2325                 timer_expires = jiffies;
2326         }
2327
2328         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2329                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2330                 i, src, srcp, dest, destp, sk->sk_state,
2331                 tp->write_seq - tp->snd_una,
2332                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2333                                              (tp->rcv_nxt - tp->copied_seq),
2334                 timer_active,
2335                 jiffies_to_clock_t(timer_expires - jiffies),
2336                 icsk->icsk_retransmits,
2337                 sock_i_uid(sk),
2338                 icsk->icsk_probes_out,
2339                 sock_i_ino(sk),
2340                 atomic_read(&sk->sk_refcnt), sk,
2341                 icsk->icsk_rto,
2342                 icsk->icsk_ack.ato,
2343                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2344                 tp->snd_cwnd,
2345                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2346 }
2347
2348 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2349                                char *tmpbuf, int i)
2350 {
2351         __be32 dest, src;
2352         __u16 destp, srcp;
2353         int ttd = tw->tw_ttd - jiffies;
2354
2355         if (ttd < 0)
2356                 ttd = 0;
2357
2358         dest  = tw->tw_daddr;
2359         src   = tw->tw_rcv_saddr;
2360         destp = ntohs(tw->tw_dport);
2361         srcp  = ntohs(tw->tw_sport);
2362
2363         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2364                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2365                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2366                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2367                 atomic_read(&tw->tw_refcnt), tw);
2368 }
2369
2370 #define TMPSZ 150
2371
2372 static int tcp4_seq_show(struct seq_file *seq, void *v)
2373 {
2374         struct tcp_iter_state* st;
2375         char tmpbuf[TMPSZ + 1];
2376
2377         if (v == SEQ_START_TOKEN) {
2378                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2379                            "  sl  local_address rem_address   st tx_queue "
2380                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2381                            "inode");
2382                 goto out;
2383         }
2384         st = seq->private;
2385
2386         switch (st->state) {
2387         case TCP_SEQ_STATE_LISTENING:
2388         case TCP_SEQ_STATE_ESTABLISHED:
2389                 get_tcp4_sock(v, tmpbuf, st->num);
2390                 break;
2391         case TCP_SEQ_STATE_OPENREQ:
2392                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2393                 break;
2394         case TCP_SEQ_STATE_TIME_WAIT:
2395                 get_timewait4_sock(v, tmpbuf, st->num);
2396                 break;
2397         }
2398         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2399 out:
2400         return 0;
2401 }
2402
2403 static struct file_operations tcp4_seq_fops;
2404 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2405         .owner          = THIS_MODULE,
2406         .name           = "tcp",
2407         .family         = AF_INET,
2408         .seq_show       = tcp4_seq_show,
2409         .seq_fops       = &tcp4_seq_fops,
2410 };
2411
2412 int __init tcp4_proc_init(void)
2413 {
2414         return tcp_proc_register(&tcp4_seq_afinfo);
2415 }
2416
2417 void tcp4_proc_exit(void)
2418 {
2419         tcp_proc_unregister(&tcp4_seq_afinfo);
2420 }
2421 #endif /* CONFIG_PROC_FS */
2422
2423 struct proto tcp_prot = {
2424         .name                   = "TCP",
2425         .owner                  = THIS_MODULE,
2426         .close                  = tcp_close,
2427         .connect                = tcp_v4_connect,
2428         .disconnect             = tcp_disconnect,
2429         .accept                 = inet_csk_accept,
2430         .ioctl                  = tcp_ioctl,
2431         .init                   = tcp_v4_init_sock,
2432         .destroy                = tcp_v4_destroy_sock,
2433         .shutdown               = tcp_shutdown,
2434         .setsockopt             = tcp_setsockopt,
2435         .getsockopt             = tcp_getsockopt,
2436         .sendmsg                = tcp_sendmsg,
2437         .recvmsg                = tcp_recvmsg,
2438         .backlog_rcv            = tcp_v4_do_rcv,
2439         .hash                   = tcp_v4_hash,
2440         .unhash                 = tcp_unhash,
2441         .get_port               = tcp_v4_get_port,
2442         .enter_memory_pressure  = tcp_enter_memory_pressure,
2443         .sockets_allocated      = &tcp_sockets_allocated,
2444         .orphan_count           = &tcp_orphan_count,
2445         .memory_allocated       = &tcp_memory_allocated,
2446         .memory_pressure        = &tcp_memory_pressure,
2447         .sysctl_mem             = sysctl_tcp_mem,
2448         .sysctl_wmem            = sysctl_tcp_wmem,
2449         .sysctl_rmem            = sysctl_tcp_rmem,
2450         .max_header             = MAX_TCP_HEADER,
2451         .obj_size               = sizeof(struct tcp_sock),
2452         .twsk_prot              = &tcp_timewait_sock_ops,
2453         .rsk_prot               = &tcp_request_sock_ops,
2454 #ifdef CONFIG_COMPAT
2455         .compat_setsockopt      = compat_tcp_setsockopt,
2456         .compat_getsockopt      = compat_tcp_getsockopt,
2457 #endif
2458 };
2459
2460 void __init tcp_v4_init(struct net_proto_family *ops)
2461 {
2462         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2463                                      IPPROTO_TCP) < 0)
2464                 panic("Failed to create the TCP control socket.\n");
2465 }
2466
/* Symbols from this file made available through EXPORT_SYMBOL(). */
2467 EXPORT_SYMBOL(ipv4_specific);
2468 EXPORT_SYMBOL(tcp_hashinfo);
2469 EXPORT_SYMBOL(tcp_prot);
2470 EXPORT_SYMBOL(tcp_unhash);
2471 EXPORT_SYMBOL(tcp_v4_conn_request);
2472 EXPORT_SYMBOL(tcp_v4_connect);
2473 EXPORT_SYMBOL(tcp_v4_do_rcv);
2474 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2475 EXPORT_SYMBOL(tcp_v4_send_check);
2476 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2477
/* The /proc registration helpers are only built with CONFIG_PROC_FS. */
2478 #ifdef CONFIG_PROC_FS
2479 EXPORT_SYMBOL(tcp_proc_register);
2480 EXPORT_SYMBOL(tcp_proc_unregister);
2481 #endif
/* Sysctl variables exported as data symbols. */
2482 EXPORT_SYMBOL(sysctl_local_port_range);
2483 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2484