net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84
85 int sysctl_tcp_tw_reuse __read_mostly;
86 int sysctl_tcp_low_latency __read_mostly;
87
88 /* Check TCP sequence numbers in ICMP packets. */
89 #define ICMP_MIN_LENGTH 8
90
91 /* Socket used for sending RSTs */
92 static struct socket *tcp_socket __read_mostly;
93
94 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
95
96 #ifdef CONFIG_TCP_MD5SIG
97 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
98                                                    __be32 addr);
99 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
100                                    __be32 saddr, __be32 daddr,
101                                    struct tcphdr *th, int protocol,
102                                    int tcplen);
103 #endif
104
105 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
106         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
107         .lhash_users = ATOMIC_INIT(0),
108         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
109 };
110
111 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
112 {
113         return inet_csk_get_port(&tcp_hashinfo, sk, snum,
114                                  inet_csk_bind_conflict);
115 }
116
117 static void tcp_v4_hash(struct sock *sk)
118 {
119         inet_hash(&tcp_hashinfo, sk);
120 }
121
122 void tcp_unhash(struct sock *sk)
123 {
124         inet_unhash(&tcp_hashinfo, sk);
125 }
126
127 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
128 {
129         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
130                                           ip_hdr(skb)->saddr,
131                                           tcp_hdr(skb)->dest,
132                                           tcp_hdr(skb)->source);
133 }
134
135 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
136 {
137         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
138         struct tcp_sock *tp = tcp_sk(sk);
139
140         /* With PAWS, it is safe from the viewpoint
141            of data integrity. Even without PAWS it is safe provided sequence
142            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144            Actually, the idea is close to VJ's, only the timestamp cache is
145            held not per host but per port pair, and the TW bucket is used as
146            the state holder.
147
148            If the TW bucket has already been destroyed we fall back to VJ's
149            scheme and use the initial timestamp retrieved from the peer table.
150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (twp == NULL || (sysctl_tcp_tw_reuse &&
153                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
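                /* Start the new connection's sequence space beyond anything
                 * the old incarnation could still have in flight: the old
                 * snd_nxt plus a maximum window (65535) plus a little slack.
                 */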
154                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
155                 if (tp->write_seq == 0)
156                         tp->write_seq = 1;
157                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
158                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
159                 sock_hold(sktw);
160                 return 1;
161         }
162
163         return 0;
164 }
165
166 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
167
168 /* This will initiate an outgoing connection. */
169 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
170 {
171         struct inet_sock *inet = inet_sk(sk);
172         struct tcp_sock *tp = tcp_sk(sk);
173         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
174         struct rtable *rt;
175         __be32 daddr, nexthop;
176         int tmp;
177         int err;
178
179         if (addr_len < sizeof(struct sockaddr_in))
180                 return -EINVAL;
181
182         if (usin->sin_family != AF_INET)
183                 return -EAFNOSUPPORT;
184
185         nexthop = daddr = usin->sin_addr.s_addr;
186         if (inet->opt && inet->opt->srr) {
187                 if (!daddr)
188                         return -EINVAL;
189                 nexthop = inet->opt->faddr;
190         }
191
192         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
193                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
194                                IPPROTO_TCP,
195                                inet->sport, usin->sin_port, sk, 1);
196         if (tmp < 0) {
197                 if (tmp == -ENETUNREACH)
198                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
199                 return tmp;
200         }
201
202         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
203                 ip_rt_put(rt);
204                 return -ENETUNREACH;
205         }
206
207         if (!inet->opt || !inet->opt->srr)
208                 daddr = rt->rt_dst;
209
210         if (!inet->saddr)
211                 inet->saddr = rt->rt_src;
212         inet->rcv_saddr = inet->saddr;
213
214         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
215                 /* Reset inherited state */
216                 tp->rx_opt.ts_recent       = 0;
217                 tp->rx_opt.ts_recent_stamp = 0;
218                 tp->write_seq              = 0;
219         }
220
221         if (tcp_death_row.sysctl_tw_recycle &&
222             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
223                 struct inet_peer *peer = rt_get_peer(rt);
224                 /*
225                  * VJ's idea. We save the last timestamp seen from
226                  * the destination in the peer table when entering the
227                  * TIME-WAIT state, and initialize rx_opt.ts_recent from it
228                  * when trying a new connection.
229                  */
230                 if (peer != NULL &&
231                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
232                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
233                         tp->rx_opt.ts_recent = peer->tcp_ts;
234                 }
235         }
236
237         inet->dport = usin->sin_port;
238         inet->daddr = daddr;
239
240         inet_csk(sk)->icsk_ext_hdr_len = 0;
241         if (inet->opt)
242                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
243
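        /* 536 is the RFC 1122 default MSS for IPv4: the 576-byte default
         * datagram size minus 40 bytes of IP and TCP headers.
         */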
244         tp->rx_opt.mss_clamp = 536;
245
246         /* Socket identity is still unknown (sport may be zero).
247          * However, we set the state to SYN-SENT and, without releasing the
248          * socket lock, select a source port, enter ourselves into the hash
249          * tables and complete initialization after this.
250          */
251         tcp_set_state(sk, TCP_SYN_SENT);
252         err = inet_hash_connect(&tcp_death_row, sk);
253         if (err)
254                 goto failure;
255
256         err = ip_route_newports(&rt, IPPROTO_TCP,
257                                 inet->sport, inet->dport, sk);
258         if (err)
259                 goto failure;
260
261         /* OK, now commit destination to socket.  */
262         sk->sk_gso_type = SKB_GSO_TCPV4;
263         sk_setup_caps(sk, &rt->u.dst);
264
265         if (!tp->write_seq)
266                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
267                                                            inet->daddr,
268                                                            inet->sport,
269                                                            usin->sin_port);
270
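        /* Seed the per-socket IP ID counter from the initial sequence number
         * mixed with jiffies; presumably this keeps the starting value from
         * being easily predictable.
         */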
271         inet->id = tp->write_seq ^ jiffies;
272
273         err = tcp_connect(sk);
274         rt = NULL;
275         if (err)
276                 goto failure;
277
278         return 0;
279
280 failure:
281         /*
282          * This unhashes the socket and releases the local port,
283          * if necessary.
284          */
285         tcp_set_state(sk, TCP_CLOSE);
286         ip_rt_put(rt);
287         sk->sk_route_caps = 0;
288         inet->dport = 0;
289         return err;
290 }
291
292 /*
293  * This routine does path mtu discovery as defined in RFC1191.
294  */
295 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
296 {
297         struct dst_entry *dst;
298         struct inet_sock *inet = inet_sk(sk);
299
300         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
301          * sent out by Linux are always < 576 bytes, so they should go through
302          * unfragmented).
303          */
304         if (sk->sk_state == TCP_LISTEN)
305                 return;
306
307         /* We don't check in the dst entry whether PMTU discovery is forbidden
308          * on this route. We just assume that no packet-too-big packets
309          * are sent back when PMTU discovery is not active.
310          * There is a small race when the user changes this flag in the
311          * route, but I think that's acceptable.
312          */
313         if ((dst = __sk_dst_check(sk, 0)) == NULL)
314                 return;
315
316         dst->ops->update_pmtu(dst, mtu);
317
318         /* Something is about to go wrong... Remember the soft error
319          * in case this connection is not able to recover.
320          */
321         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
322                 sk->sk_err_soft = EMSGSIZE;
323
324         mtu = dst_mtu(dst);
325
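        /* icsk_pmtu_cookie caches the path MTU that the current MSS was
         * derived from; only resync the MSS when PMTU discovery is enabled
         * and the newly learned path MTU is smaller than that cached value.
         */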
326         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
327             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
328                 tcp_sync_mss(sk, mtu);
329
330                 /* Resend the TCP packet because it's
331                  * clear that the old packet has been
332                  * dropped. This is the new "fast" path mtu
333                  * discovery.
334                  */
335                 tcp_simple_retransmit(sk);
336         } /* else let the usual retransmit timer handle it */
337 }
338
339 /*
340  * This routine is called by the ICMP module when it gets some
341  * sort of error condition.  If err < 0 then the socket should
342  * be closed and the error returned to the user.  If err > 0
343  * it's just the icmp type << 8 | icmp code.  After adjustment
344  * header points to the first 8 bytes of the tcp header.  We need
345  * to find the appropriate port.
346  *
347  * The locking strategy used here is very "optimistic". When
348  * someone else accesses the socket the ICMP is just dropped
349  * and for some paths there is no check at all.
350  * A more general error queue to queue errors for later handling
351  * is probably better.
352  *
353  */
354
355 void tcp_v4_err(struct sk_buff *skb, u32 info)
356 {
357         struct iphdr *iph = (struct iphdr *)skb->data;
358         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
359         struct tcp_sock *tp;
360         struct inet_sock *inet;
361         const int type = icmp_hdr(skb)->type;
362         const int code = icmp_hdr(skb)->code;
363         struct sock *sk;
364         __u32 seq;
365         int err;
366
367         if (skb->len < (iph->ihl << 2) + 8) {
368                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
369                 return;
370         }
371
372         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
373                          th->source, inet_iif(skb));
374         if (!sk) {
375                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
376                 return;
377         }
378         if (sk->sk_state == TCP_TIME_WAIT) {
379                 inet_twsk_put(inet_twsk(sk));
380                 return;
381         }
382
383         bh_lock_sock(sk);
384         /* If too many ICMPs get dropped on busy
385          * servers this needs to be solved differently.
386          */
387         if (sock_owned_by_user(sk))
388                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
389
390         if (sk->sk_state == TCP_CLOSE)
391                 goto out;
392
393         tp = tcp_sk(sk);
394         seq = ntohl(th->seq);
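        /* The sequence number quoted in the ICMP payload must refer to data
         * we currently have outstanding (between snd_una and snd_nxt);
         * otherwise the ICMP is stale or bogus and is ignored.
         */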
395         if (sk->sk_state != TCP_LISTEN &&
396             !between(seq, tp->snd_una, tp->snd_nxt)) {
397                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
398                 goto out;
399         }
400
401         switch (type) {
402         case ICMP_SOURCE_QUENCH:
403                 /* Just silently ignore these. */
404                 goto out;
405         case ICMP_PARAMETERPROB:
406                 err = EPROTO;
407                 break;
408         case ICMP_DEST_UNREACH:
409                 if (code > NR_ICMP_UNREACH)
410                         goto out;
411
412                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
413                         if (!sock_owned_by_user(sk))
414                                 do_pmtu_discovery(sk, iph, info);
415                         goto out;
416                 }
417
418                 err = icmp_err_convert[code].errno;
419                 break;
420         case ICMP_TIME_EXCEEDED:
421                 err = EHOSTUNREACH;
422                 break;
423         default:
424                 goto out;
425         }
426
427         switch (sk->sk_state) {
428                 struct request_sock *req, **prev;
429         case TCP_LISTEN:
430                 if (sock_owned_by_user(sk))
431                         goto out;
432
433                 req = inet_csk_search_req(sk, &prev, th->dest,
434                                           iph->daddr, iph->saddr);
435                 if (!req)
436                         goto out;
437
438                 /* ICMPs are not backlogged, hence we cannot get
439                    an established socket here.
440                  */
441                 BUG_TRAP(!req->sk);
442
443                 if (seq != tcp_rsk(req)->snt_isn) {
444                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
445                         goto out;
446                 }
447
448                 /*
449                  * Still in SYN_RECV, just remove it silently.
450                  * There is no good way to pass the error to the newly
451                  * created socket, and POSIX does not want network
452                  * errors returned from accept().
453                  */
454                 inet_csk_reqsk_queue_drop(sk, req, prev);
455                 goto out;
456
457         case TCP_SYN_SENT:
458         case TCP_SYN_RECV:  /* Cannot happen normally.
459                                It can occur, e.g., if SYNs crossed.
460                              */
461                 if (!sock_owned_by_user(sk)) {
462                         sk->sk_err = err;
463
464                         sk->sk_error_report(sk);
465
466                         tcp_done(sk);
467                 } else {
468                         sk->sk_err_soft = err;
469                 }
470                 goto out;
471         }
472
473         /* If we've already connected we will keep trying
474          * until we time out, or the user gives up.
475          *
476          * RFC 1122 4.2.3.9 allows considering only PROTO_UNREACH and
477          * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
478          * but it is obsoleted by PMTU discovery).
479          *
480          * Note that in the modern internet, where routing is unreliable
481          * and broken firewalls sit in every dark corner sending random
482          * errors ordered by their masters, even these two messages finally
483          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
484          *
485          * Now we are in compliance with RFCs.
486          *                                                      --ANK (980905)
487          */
488
489         inet = inet_sk(sk);
490         if (!sock_owned_by_user(sk) && inet->recverr) {
491                 sk->sk_err = err;
492                 sk->sk_error_report(sk);
493         } else  { /* Only an error on timeout */
494                 sk->sk_err_soft = err;
495         }
496
497 out:
498         bh_unlock_sock(sk);
499         sock_put(sk);
500 }
501
502 /* This routine computes an IPv4 TCP checksum. */
503 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
504 {
505         struct inet_sock *inet = inet_sk(sk);
506         struct tcphdr *th = tcp_hdr(skb);
507
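        /* With CHECKSUM_PARTIAL the device finishes the TCP checksum for us:
         * we seed th->check with the complement of the pseudo-header checksum
         * and record where the final sum belongs (csum_start/csum_offset).
         * Otherwise the full checksum is computed here in software.
         */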
508         if (skb->ip_summed == CHECKSUM_PARTIAL) {
509                 th->check = ~tcp_v4_check(len, inet->saddr,
510                                           inet->daddr, 0);
511                 skb->csum_start = skb_transport_header(skb) - skb->head;
512                 skb->csum_offset = offsetof(struct tcphdr, check);
513         } else {
514                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
515                                          csum_partial((char *)th,
516                                                       th->doff << 2,
517                                                       skb->csum));
518         }
519 }
520
521 int tcp_v4_gso_send_check(struct sk_buff *skb)
522 {
523         const struct iphdr *iph;
524         struct tcphdr *th;
525
526         if (!pskb_may_pull(skb, sizeof(*th)))
527                 return -EINVAL;
528
529         iph = ip_hdr(skb);
530         th = tcp_hdr(skb);
531
532         th->check = 0;
533         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
534         skb->csum_start = skb_transport_header(skb) - skb->head;
535         skb->csum_offset = offsetof(struct tcphdr, check);
536         skb->ip_summed = CHECKSUM_PARTIAL;
537         return 0;
538 }
539
540 /*
541  *      This routine will send an RST to the other tcp.
542  *
543  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
544  *                    for the reset?
545  *      Answer: if a packet caused an RST, it is not for a socket
546  *              existing in our system; if it is matched to a socket,
547  *              it is just a duplicate segment or a bug in the other side's TCP.
548  *              So we build the reply based only on the parameters that
549  *              arrived with the segment.
550  *      Exception: precedence violation. We do not implement it in any case.
551  */
552
553 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
554 {
555         struct tcphdr *th = tcp_hdr(skb);
556         struct {
557                 struct tcphdr th;
558 #ifdef CONFIG_TCP_MD5SIG
559                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
560 #endif
561         } rep;
562         struct ip_reply_arg arg;
563 #ifdef CONFIG_TCP_MD5SIG
564         struct tcp_md5sig_key *key;
565 #endif
566
567         /* Never send a reset in response to a reset. */
568         if (th->rst)
569                 return;
570
571         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
572                 return;
573
574         /* Swap the send and the receive. */
575         memset(&rep, 0, sizeof(rep));
576         rep.th.dest   = th->source;
577         rep.th.source = th->dest;
578         rep.th.doff   = sizeof(struct tcphdr) / 4;
579         rep.th.rst    = 1;
580
581         if (th->ack) {
582                 rep.th.seq = th->ack_seq;
583         } else {
584                 rep.th.ack = 1;
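                /* ACK everything in the offending segment: payload length
                 * plus one sequence number each for SYN and FIN, per RFC 793.
                 */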
585                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
586                                        skb->len - (th->doff << 2));
587         }
588
589         memset(&arg, 0, sizeof(arg));
590         arg.iov[0].iov_base = (unsigned char *)&rep;
591         arg.iov[0].iov_len  = sizeof(rep.th);
592
593 #ifdef CONFIG_TCP_MD5SIG
594         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
595         if (key) {
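                /* The 18-byte MD5SIG option (kind 19, length 18, 16-byte
                 * digest) is padded with two leading NOPs to keep the TCP
                 * options 32-bit aligned (TCPOLEN_MD5SIG_ALIGNED bytes total).
                 */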
596                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
597                                    (TCPOPT_NOP << 16) |
598                                    (TCPOPT_MD5SIG << 8) |
599                                    TCPOLEN_MD5SIG);
600                 /* Update length and the length the header thinks exists */
601                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
602                 rep.th.doff = arg.iov[0].iov_len / 4;
603
604                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
605                                         key,
606                                         ip_hdr(skb)->daddr,
607                                         ip_hdr(skb)->saddr,
608                                         &rep.th, IPPROTO_TCP,
609                                         arg.iov[0].iov_len);
610         }
611 #endif
612         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
613                                       ip_hdr(skb)->saddr, /* XXX */
614                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
615         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
616
617         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
618
619         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
620         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
621 }
622
623 /* The code below, which sends ACKs in the SYN-RECV and TIME-WAIT states
624    outside of socket context, is certainly ugly. What can I do?
625  */
626
627 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
628                             struct sk_buff *skb, u32 seq, u32 ack,
629                             u32 win, u32 ts)
630 {
631         struct tcphdr *th = tcp_hdr(skb);
632         struct {
633                 struct tcphdr th;
634                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
635 #ifdef CONFIG_TCP_MD5SIG
636                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
637 #endif
638                         ];
639         } rep;
640         struct ip_reply_arg arg;
641 #ifdef CONFIG_TCP_MD5SIG
642         struct tcp_md5sig_key *key;
643         struct tcp_md5sig_key tw_key;
644 #endif
645
646         memset(&rep.th, 0, sizeof(struct tcphdr));
647         memset(&arg, 0, sizeof(arg));
648
649         arg.iov[0].iov_base = (unsigned char *)&rep;
650         arg.iov[0].iov_len  = sizeof(rep.th);
651         if (ts) {
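                /* Timestamp option, NOP-NOP padded to 12 aligned bytes:
                 * TSval is our current tcp_time_stamp, TSecr echoes the
                 * peer's most recent timestamp (ts).
                 */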
652                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
653                                    (TCPOPT_TIMESTAMP << 8) |
654                                    TCPOLEN_TIMESTAMP);
655                 rep.opt[1] = htonl(tcp_time_stamp);
656                 rep.opt[2] = htonl(ts);
657                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
658         }
659
660         /* Swap the send and the receive. */
661         rep.th.dest    = th->source;
662         rep.th.source  = th->dest;
663         rep.th.doff    = arg.iov[0].iov_len / 4;
664         rep.th.seq     = htonl(seq);
665         rep.th.ack_seq = htonl(ack);
666         rep.th.ack     = 1;
667         rep.th.window  = htons(win);
668
669 #ifdef CONFIG_TCP_MD5SIG
670         /*
671          * The SKB holds an incoming packet, but may not have a valid ->sk
672          * pointer. This is especially the case when we're dealing with a
673          * TIME_WAIT ack, because the sk structure is long gone, and only
674          * the tcp_timewait_sock remains. So the md5 key is stashed in that
675          * structure, and we use it in preference.  I believe that (twsk ||
676          * skb->sk) holds true, but we program defensively.
677          */
678         if (!twsk && skb->sk) {
679                 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
680         } else if (twsk && twsk->tw_md5_keylen) {
681                 tw_key.key = twsk->tw_md5_key;
682                 tw_key.keylen = twsk->tw_md5_keylen;
683                 key = &tw_key;
684         } else
685                 key = NULL;
686
687         if (key) {
688                 int offset = (ts) ? 3 : 0;
689
690                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
691                                           (TCPOPT_NOP << 16) |
692                                           (TCPOPT_MD5SIG << 8) |
693                                           TCPOLEN_MD5SIG);
694                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
695                 rep.th.doff = arg.iov[0].iov_len/4;
696
697                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
698                                         key,
699                                         ip_hdr(skb)->daddr,
700                                         ip_hdr(skb)->saddr,
701                                         &rep.th, IPPROTO_TCP,
702                                         arg.iov[0].iov_len);
703         }
704 #endif
705         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
706                                       ip_hdr(skb)->saddr, /* XXX */
707                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
708         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
709         if (twsk)
710                 arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
711
712         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
713
714         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
715 }
716
717 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
718 {
719         struct inet_timewait_sock *tw = inet_twsk(sk);
720         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
721
722         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
723                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
724                         tcptw->tw_ts_recent);
725
726         inet_twsk_put(tw);
727 }
728
729 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
730                                   struct request_sock *req)
731 {
732         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
733                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
734                         req->ts_recent);
735 }
736
737 /*
738  *      Send a SYN-ACK after having received an ACK.
739  *      This still operates on a request_sock only, not on a big
740  *      socket.
741  */
742 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
743                               struct dst_entry *dst)
744 {
745         const struct inet_request_sock *ireq = inet_rsk(req);
746         int err = -1;
747         struct sk_buff * skb;
748
749         /* First, grab a route. */
750         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
751                 goto out;
752
753         skb = tcp_make_synack(sk, dst, req);
754
755         if (skb) {
756                 struct tcphdr *th = tcp_hdr(skb);
757
758                 th->check = tcp_v4_check(skb->len,
759                                          ireq->loc_addr,
760                                          ireq->rmt_addr,
761                                          csum_partial((char *)th, skb->len,
762                                                       skb->csum));
763
764                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
765                                             ireq->rmt_addr,
766                                             ireq->opt);
767                 err = net_xmit_eval(err);
768         }
769
770 out:
771         dst_release(dst);
772         return err;
773 }
774
775 /*
776  *      IPv4 request_sock destructor.
777  */
778 static void tcp_v4_reqsk_destructor(struct request_sock *req)
779 {
780         kfree(inet_rsk(req)->opt);
781 }
782
783 #ifdef CONFIG_SYN_COOKIES
784 static void syn_flood_warning(struct sk_buff *skb)
785 {
786         static unsigned long warntime;
787
788         if (time_after(jiffies, (warntime + HZ * 60))) {
789                 warntime = jiffies;
790                 printk(KERN_INFO
791                        "possible SYN flooding on port %d. Sending cookies.\n",
792                        ntohs(tcp_hdr(skb)->dest));
793         }
794 }
795 #endif
796
797 /*
798  * Save and compile IPv4 options into the request_sock if needed.
799  */
800 static struct ip_options *tcp_v4_save_options(struct sock *sk,
801                                               struct sk_buff *skb)
802 {
803         struct ip_options *opt = &(IPCB(skb)->opt);
804         struct ip_options *dopt = NULL;
805
806         if (opt && opt->optlen) {
807                 int opt_size = optlength(opt);
808                 dopt = kmalloc(opt_size, GFP_ATOMIC);
809                 if (dopt) {
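                        /* ip_options_echo() builds the option set for the
                         * reply direction (reversing any recorded source
                         * route); on failure we simply drop the options.
                         */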
810                         if (ip_options_echo(dopt, skb)) {
811                                 kfree(dopt);
812                                 dopt = NULL;
813                         }
814                 }
815         }
816         return dopt;
817 }
818
819 #ifdef CONFIG_TCP_MD5SIG
820 /*
821  * RFC2385 MD5 checksumming requires a mapping of
822  * IP address->MD5 Key.
823  * We need to maintain these in the sk structure.
824  */
825
826 /* Find the Key structure for an address.  */
827 static struct tcp_md5sig_key *
828                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
829 {
830         struct tcp_sock *tp = tcp_sk(sk);
831         int i;
832
833         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
834                 return NULL;
835         for (i = 0; i < tp->md5sig_info->entries4; i++) {
836                 if (tp->md5sig_info->keys4[i].addr == addr)
837                         return &tp->md5sig_info->keys4[i].base;
838         }
839         return NULL;
840 }
841
842 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
843                                          struct sock *addr_sk)
844 {
845         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
846 }
847
848 EXPORT_SYMBOL(tcp_v4_md5_lookup);
849
850 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
851                                                       struct request_sock *req)
852 {
853         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
854 }
855
856 /* This can be called on a newly created socket, from other files */
857 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
858                       u8 *newkey, u8 newkeylen)
859 {
860         /* Add Key to the list */
861         struct tcp_md5sig_key *key;
862         struct tcp_sock *tp = tcp_sk(sk);
863         struct tcp4_md5sig_key *keys;
864
865         key = tcp_v4_md5_do_lookup(sk, addr);
866         if (key) {
867                 /* Pre-existing entry - just update that one. */
868                 kfree(key->key);
869                 key->key = newkey;
870                 key->keylen = newkeylen;
871         } else {
872                 struct tcp_md5sig_info *md5sig;
873
874                 if (!tp->md5sig_info) {
875                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
876                                                   GFP_ATOMIC);
877                         if (!tp->md5sig_info) {
878                                 kfree(newkey);
879                                 return -ENOMEM;
880                         }
881                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
882                 }
883                 if (tcp_alloc_md5sig_pool() == NULL) {
884                         kfree(newkey);
885                         return -ENOMEM;
886                 }
887                 md5sig = tp->md5sig_info;
888
889                 if (md5sig->alloced4 == md5sig->entries4) {
890                         keys = kmalloc((sizeof(*keys) *
891                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
892                         if (!keys) {
893                                 kfree(newkey);
894                                 tcp_free_md5sig_pool();
895                                 return -ENOMEM;
896                         }
897
898                         if (md5sig->entries4)
899                                 memcpy(keys, md5sig->keys4,
900                                        sizeof(*keys) * md5sig->entries4);
901
902                         /* Free old key list, and reference new one */
903                         if (md5sig->keys4)
904                                 kfree(md5sig->keys4);
905                         md5sig->keys4 = keys;
906                         md5sig->alloced4++;
907                 }
908                 md5sig->entries4++;
909                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
910                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
911                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
912         }
913         return 0;
914 }
915
916 EXPORT_SYMBOL(tcp_v4_md5_do_add);
917
918 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
919                                u8 *newkey, u8 newkeylen)
920 {
921         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
922                                  newkey, newkeylen);
923 }
924
925 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
926 {
927         struct tcp_sock *tp = tcp_sk(sk);
928         int i;
929
930         for (i = 0; i < tp->md5sig_info->entries4; i++) {
931                 if (tp->md5sig_info->keys4[i].addr == addr) {
932                         /* Free the key */
933                         kfree(tp->md5sig_info->keys4[i].base.key);
934                         tp->md5sig_info->entries4--;
935
936                         if (tp->md5sig_info->entries4 == 0) {
937                                 kfree(tp->md5sig_info->keys4);
938                                 tp->md5sig_info->keys4 = NULL;
939                                 tp->md5sig_info->alloced4 = 0;
940                         } else if (tp->md5sig_info->entries4 != i) {
941                                 /* Need to do some manipulation */
942                                 memcpy(&tp->md5sig_info->keys4[i],
943                                        &tp->md5sig_info->keys4[i+1],
944                                        (tp->md5sig_info->entries4 - i) *
945                                         sizeof(struct tcp4_md5sig_key));
946                         }
947                         tcp_free_md5sig_pool();
948                         return 0;
949                 }
950         }
951         return -ENOENT;
952 }
953
954 EXPORT_SYMBOL(tcp_v4_md5_do_del);
955
956 static void tcp_v4_clear_md5_list(struct sock *sk)
957 {
958         struct tcp_sock *tp = tcp_sk(sk);
959
960         /* Free each key, then the set of keys,
961          * the crypto element, and then decrement our
962          * hold on the last resort crypto.
963          */
964         if (tp->md5sig_info->entries4) {
965                 int i;
966                 for (i = 0; i < tp->md5sig_info->entries4; i++)
967                         kfree(tp->md5sig_info->keys4[i].base.key);
968                 tp->md5sig_info->entries4 = 0;
969                 tcp_free_md5sig_pool();
970         }
971         if (tp->md5sig_info->keys4) {
972                 kfree(tp->md5sig_info->keys4);
973                 tp->md5sig_info->keys4 = NULL;
974                 tp->md5sig_info->alloced4  = 0;
975         }
976 }
977
978 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
979                                  int optlen)
980 {
981         struct tcp_md5sig cmd;
982         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
983         u8 *newkey;
984
985         if (optlen < sizeof(cmd))
986                 return -EINVAL;
987
988         if (copy_from_user(&cmd, optval, sizeof(cmd)))
989                 return -EFAULT;
990
991         if (sin->sin_family != AF_INET)
992                 return -EINVAL;
993
994         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
995                 if (!tcp_sk(sk)->md5sig_info)
996                         return -ENOENT;
997                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
998         }
999
1000         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1001                 return -EINVAL;
1002
1003         if (!tcp_sk(sk)->md5sig_info) {
1004                 struct tcp_sock *tp = tcp_sk(sk);
1005                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
1006
1007                 if (!p)
1008                         return -EINVAL;
1009
1010                 tp->md5sig_info = p;
1011                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1012         }
1013
1014         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1015         if (!newkey)
1016                 return -ENOMEM;
1017         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1018                                  newkey, cmd.tcpm_keylen);
1019 }
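/*
 * For reference, a userspace peer would install a key with the TCP_MD5SIG
 * socket option roughly as sketched below (illustrative only; "fd",
 * "peer_sin" and "secret" are placeholders, not part of this file):
 *
 *	struct tcp_md5sig md5;
 *
 *	memset(&md5, 0, sizeof(md5));
 *	memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */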
1020
1021 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1022                                    __be32 saddr, __be32 daddr,
1023                                    struct tcphdr *th, int protocol,
1024                                    int tcplen)
1025 {
1026         struct scatterlist sg[4];
1027         __u16 data_len;
1028         int block = 0;
1029         __sum16 old_checksum;
1030         struct tcp_md5sig_pool *hp;
1031         struct tcp4_pseudohdr *bp;
1032         struct hash_desc *desc;
1033         int err;
1034         unsigned int nbytes = 0;
1035
1036         /*
1037          * Okay, so RFC2385 is turned on for this connection,
1038          * so we need to generate the MD5 hash for the packet now.
1039          */
1040
1041         hp = tcp_get_md5sig_pool();
1042         if (!hp)
1043                 goto clear_hash_noput;
1044
1045         bp = &hp->md5_blk.ip4;
1046         desc = &hp->md5_desc;
1047
1048         /*
1049          * 1. the TCP pseudo-header (in the order: source IP address,
1050          * destination IP address, zero-padded protocol number, and
1051          * segment length)
1052          */
1053         bp->saddr = saddr;
1054         bp->daddr = daddr;
1055         bp->pad = 0;
1056         bp->protocol = protocol;
1057         bp->len = htons(tcplen);
1058
1059         sg_init_table(sg, 4);
1060
1061         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1062         nbytes += sizeof(*bp);
1063
1064         /* 2. the TCP header, excluding options, and assuming a
1065          * checksum of zero.
1066          */
1067         old_checksum = th->check;
1068         th->check = 0;
1069         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1070         nbytes += sizeof(struct tcphdr);
1071
1072         /* 3. the TCP segment data (if any) */
1073         data_len = tcplen - (th->doff << 2);
1074         if (data_len > 0) {
1075                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1076                 sg_set_buf(&sg[block++], data, data_len);
1077                 nbytes += data_len;
1078         }
1079
1080         /* 4. an independently-specified key or password, known to both
1081          * TCPs and presumably connection-specific
1082          */
1083         sg_set_buf(&sg[block++], key->key, key->keylen);
1084         nbytes += key->keylen;
1085
1086         sg_mark_end(&sg[block - 1]);
1087
1088         /* Now store the Hash into the packet */
1089         err = crypto_hash_init(desc);
1090         if (err)
1091                 goto clear_hash;
1092         err = crypto_hash_update(desc, sg, nbytes);
1093         if (err)
1094                 goto clear_hash;
1095         err = crypto_hash_final(desc, md5_hash);
1096         if (err)
1097                 goto clear_hash;
1098
1099         /* Reset header, and free up the crypto */
1100         tcp_put_md5sig_pool();
1101         th->check = old_checksum;
1102
1103 out:
1104         return 0;
1105 clear_hash:
1106         tcp_put_md5sig_pool();
1107 clear_hash_noput:
1108         memset(md5_hash, 0, 16);
1109         goto out;
1110 }
1111
1112 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1113                          struct sock *sk,
1114                          struct dst_entry *dst,
1115                          struct request_sock *req,
1116                          struct tcphdr *th, int protocol,
1117                          int tcplen)
1118 {
1119         __be32 saddr, daddr;
1120
1121         if (sk) {
1122                 saddr = inet_sk(sk)->saddr;
1123                 daddr = inet_sk(sk)->daddr;
1124         } else {
1125                 struct rtable *rt = (struct rtable *)dst;
1126                 BUG_ON(!rt);
1127                 saddr = rt->rt_src;
1128                 daddr = rt->rt_dst;
1129         }
1130         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1131                                        saddr, daddr,
1132                                        th, protocol, tcplen);
1133 }
1134
1135 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1136
1137 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1138 {
1139         /*
1140          * This gets called for each TCP segment that arrives
1141          * so we want to be efficient.
1142          * We have 3 drop cases:
1143          * o No MD5 hash and one expected.
1144          * o MD5 hash and we're not expecting one.
1145          * o MD5 hash and it's wrong.
1146          */
1147         __u8 *hash_location = NULL;
1148         struct tcp_md5sig_key *hash_expected;
1149         const struct iphdr *iph = ip_hdr(skb);
1150         struct tcphdr *th = tcp_hdr(skb);
1151         int length = (th->doff << 2) - sizeof(struct tcphdr);
1152         int genhash;
1153         unsigned char *ptr;
1154         unsigned char newhash[16];
1155
1156         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1157
1158         /*
1159          * If the TCP option length is less than the TCP_MD5SIG
1160          * option length, then we can shortcut
1161          */
1162         if (length < TCPOLEN_MD5SIG) {
1163                 if (hash_expected)
1164                         return 1;
1165                 else
1166                         return 0;
1167         }
1168
1169         /* Okay, we can't shortcut - we have to grub through the options */
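        /* TCP options are encoded as <kind> <length> <data>, where length
         * covers the kind and length bytes themselves; EOL and NOP are
         * single-byte options with no length field.
         */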
1170         ptr = (unsigned char *)(th + 1);
1171         while (length > 0) {
1172                 int opcode = *ptr++;
1173                 int opsize;
1174
1175                 switch (opcode) {
1176                 case TCPOPT_EOL:
1177                         goto done_opts;
1178                 case TCPOPT_NOP:
1179                         length--;
1180                         continue;
1181                 default:
1182                         opsize = *ptr++;
1183                         if (opsize < 2)
1184                                 goto done_opts;
1185                         if (opsize > length)
1186                                 goto done_opts;
1187
1188                         if (opcode == TCPOPT_MD5SIG) {
1189                                 hash_location = ptr;
1190                                 goto done_opts;
1191                         }
1192                 }
1193                 ptr += opsize-2;
1194                 length -= opsize;
1195         }
1196 done_opts:
1197         /* We've parsed the options - do we have a hash? */
1198         if (!hash_expected && !hash_location)
1199                 return 0;
1200
1201         if (hash_expected && !hash_location) {
1202                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1203                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1204                                NIPQUAD(iph->saddr), ntohs(th->source),
1205                                NIPQUAD(iph->daddr), ntohs(th->dest));
1206                 return 1;
1207         }
1208
1209         if (!hash_expected && hash_location) {
1210                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1211                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1212                                NIPQUAD(iph->saddr), ntohs(th->source),
1213                                NIPQUAD(iph->daddr), ntohs(th->dest));
1214                 return 1;
1215         }
1216
1217         /* Okay, so this is hash_expected and hash_location -
1218          * so we need to calculate the checksum.
1219          */
1220         genhash = tcp_v4_do_calc_md5_hash(newhash,
1221                                           hash_expected,
1222                                           iph->saddr, iph->daddr,
1223                                           th, sk->sk_protocol,
1224                                           skb->len);
1225
1226         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1227                 if (net_ratelimit()) {
1228                         printk(KERN_INFO "MD5 Hash failed for "
1229                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1230                                NIPQUAD(iph->saddr), ntohs(th->source),
1231                                NIPQUAD(iph->daddr), ntohs(th->dest),
1232                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1233                 }
1234                 return 1;
1235         }
1236         return 0;
1237 }
1238
1239 #endif
1240
1241 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1242         .family         =       PF_INET,
1243         .obj_size       =       sizeof(struct tcp_request_sock),
1244         .rtx_syn_ack    =       tcp_v4_send_synack,
1245         .send_ack       =       tcp_v4_reqsk_send_ack,
1246         .destructor     =       tcp_v4_reqsk_destructor,
1247         .send_reset     =       tcp_v4_send_reset,
1248 };
1249
1250 #ifdef CONFIG_TCP_MD5SIG
1251 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1252         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1253 };
1254 #endif
1255
1256 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1257         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1258         .twsk_unique    = tcp_twsk_unique,
1259         .twsk_destructor= tcp_twsk_destructor,
1260 };
1261
1262 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1263 {
1264         struct inet_request_sock *ireq;
1265         struct tcp_options_received tmp_opt;
1266         struct request_sock *req;
1267         __be32 saddr = ip_hdr(skb)->saddr;
1268         __be32 daddr = ip_hdr(skb)->daddr;
1269         __u32 isn = TCP_SKB_CB(skb)->when;
1270         struct dst_entry *dst = NULL;
1271 #ifdef CONFIG_SYN_COOKIES
1272         int want_cookie = 0;
1273 #else
1274 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1275 #endif
1276
1277         /* Never answer SYNs sent to broadcast or multicast */
1278         if (((struct rtable *)skb->dst)->rt_flags &
1279             (RTCF_BROADCAST | RTCF_MULTICAST))
1280                 goto drop;
1281
1282         /* TW buckets are converted to open requests without
1283          * limitations, since they conserve resources and the peer is
1284          * evidently a real one.
1285          */
1286         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1287 #ifdef CONFIG_SYN_COOKIES
1288                 if (sysctl_tcp_syncookies) {
1289                         want_cookie = 1;
1290                 } else
1291 #endif
1292                 goto drop;
1293         }
1294
1295         /* Accept backlog is full. If we have already queued enough
1296          * warm entries in the SYN queue, drop the request. It is better than
1297          * clogging the SYN queue with open requests whose timeouts increase
1298          * exponentially.
1299          */
1300         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1301                 goto drop;
1302
1303         req = reqsk_alloc(&tcp_request_sock_ops);
1304         if (!req)
1305                 goto drop;
1306
1307 #ifdef CONFIG_TCP_MD5SIG
1308         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1309 #endif
1310
1311         tcp_clear_options(&tmp_opt);
1312         tmp_opt.mss_clamp = 536;
1313         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1314
1315         tcp_parse_options(skb, &tmp_opt, 0);
1316
1317         if (want_cookie) {
1318                 tcp_clear_options(&tmp_opt);
1319                 tmp_opt.saw_tstamp = 0;
1320         }
1321
1322         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1323                 /* Some OSes (unknown ones, but I see them on a web server which
1324                  * contains information interesting only for Windows
1325                  * users) do not send their timestamp in the SYN. It is an easy
1326                  * case: we simply do not advertise TS support.
1327                  */
1328                 tmp_opt.saw_tstamp = 0;
1329                 tmp_opt.tstamp_ok  = 0;
1330         }
1331         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1332
1333         tcp_openreq_init(req, &tmp_opt, skb);
1334
1335         if (security_inet_conn_request(sk, skb, req))
1336                 goto drop_and_free;
1337
1338         ireq = inet_rsk(req);
1339         ireq->loc_addr = daddr;
1340         ireq->rmt_addr = saddr;
1341         ireq->opt = tcp_v4_save_options(sk, skb);
1342         if (!want_cookie)
1343                 TCP_ECN_create_request(req, tcp_hdr(skb));
1344
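        /* With syncookies the "ISN" returned by cookie_v4_init_sequence()
         * encodes the connection parameters, so no request state needs to be
         * kept: the request_sock is freed again once the SYN-ACK is sent.
         */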
1345         if (want_cookie) {
1346 #ifdef CONFIG_SYN_COOKIES
1347                 syn_flood_warning(skb);
1348 #endif
1349                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1350         } else if (!isn) {
1351                 struct inet_peer *peer = NULL;
1352
1353                 /* VJ's idea. We save the last timestamp seen
1354                  * from the destination in the peer table when entering
1355                  * TIME-WAIT state, and check against it before
1356                  * accepting a new connection request.
1357                  *
1358                  * If "isn" is not zero, this request hit a live
1359                  * timewait bucket, so all the necessary checks
1360                  * are made in the function processing the timewait state.
1361                  */
1362                 if (tmp_opt.saw_tstamp &&
1363                     tcp_death_row.sysctl_tw_recycle &&
1364                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1365                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1366                     peer->v4daddr == saddr) {
1367                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1368                             (s32)(peer->tcp_ts - req->ts_recent) >
1369                                                         TCP_PAWS_WINDOW) {
1370                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1371                                 dst_release(dst);
1372                                 goto drop_and_free;
1373                         }
1374                 }
1375                 /* Kill the following clause, if you dislike this way. */
1376                 else if (!sysctl_tcp_syncookies &&
1377                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1378                           (sysctl_max_syn_backlog >> 2)) &&
1379                          (!peer || !peer->tcp_ts_stamp) &&
1380                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1381                         /* Without syncookies, the last quarter of the
1382                          * backlog is filled with destinations
1383                          * proven to be alive.
1384                          * It means that we continue to communicate with
1385                          * destinations already remembered at
1386                          * the moment of the synflood.
1387                          */
1388                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1389                                        "request from %u.%u.%u.%u/%u\n",
1390                                        NIPQUAD(saddr),
1391                                        ntohs(tcp_hdr(skb)->source));
1392                         dst_release(dst);
1393                         goto drop_and_free;
1394                 }
1395
1396                 isn = tcp_v4_init_sequence(skb);
1397         }
1398         tcp_rsk(req)->snt_isn = isn;
1399
1400         if (tcp_v4_send_synack(sk, req, dst))
1401                 goto drop_and_free;
1402
1403         if (want_cookie) {
1404                 reqsk_free(req);
1405         } else {
1406                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1407         }
1408         return 0;
1409
1410 drop_and_free:
1411         reqsk_free(req);
1412 drop:
1413         return 0;
1414 }
1415
1416
1417 /*
1418  * The three way handshake has completed - we got a valid synack -
1419  * now create the new socket.
1420  */
1421 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1422                                   struct request_sock *req,
1423                                   struct dst_entry *dst)
1424 {
1425         struct inet_request_sock *ireq;
1426         struct inet_sock *newinet;
1427         struct tcp_sock *newtp;
1428         struct sock *newsk;
1429 #ifdef CONFIG_TCP_MD5SIG
1430         struct tcp_md5sig_key *key;
1431 #endif
1432
1433         if (sk_acceptq_is_full(sk))
1434                 goto exit_overflow;
1435
1436         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1437                 goto exit;
1438
1439         newsk = tcp_create_openreq_child(sk, req, skb);
1440         if (!newsk)
1441                 goto exit;
1442
1443         newsk->sk_gso_type = SKB_GSO_TCPV4;
1444         sk_setup_caps(newsk, dst);
1445
1446         newtp                 = tcp_sk(newsk);
1447         newinet               = inet_sk(newsk);
1448         ireq                  = inet_rsk(req);
1449         newinet->daddr        = ireq->rmt_addr;
1450         newinet->rcv_saddr    = ireq->loc_addr;
1451         newinet->saddr        = ireq->loc_addr;
1452         newinet->opt          = ireq->opt;
1453         ireq->opt             = NULL;
1454         newinet->mc_index     = inet_iif(skb);
1455         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1456         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1457         if (newinet->opt)
1458                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1459         newinet->id = newtp->write_seq ^ jiffies;
1460
1461         tcp_mtup_init(newsk);
1462         tcp_sync_mss(newsk, dst_mtu(dst));
1463         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1464         tcp_initialize_rcv_mss(newsk);
1465
1466 #ifdef CONFIG_TCP_MD5SIG
1467         /* Copy over the MD5 key from the original socket */
1468         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1469                 /*
1470                  * We're using one, so create a matching key
1471                  * on the newsk structure. If we fail to get
1472                  * memory, then we end up not copying the key
1473                  * across. Shucks.
1474                  */
1475                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1476                 if (newkey != NULL)
1477                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1478                                           newkey, key->keylen);
1479         }
1480 #endif
1481
1482         __inet_hash(&tcp_hashinfo, newsk, 0);
1483         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1484
1485         return newsk;
1486
1487 exit_overflow:
1488         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1489 exit:
1490         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1491         dst_release(dst);
1492         return NULL;
1493 }
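
/*
 * For reference, kmemdup() in the MD5 block above simply allocates a new
 * buffer and copies the key into it.  A userspace sketch, with a
 * hypothetical name to avoid suggesting this is the kernel helper:
 */
#include <stdlib.h>
#include <string.h>

static void *user_memdup(const void *src, size_t len)
{
	void *p = malloc(len);

	/* On allocation failure the caller above simply skips the key copy. */
	if (p)
		memcpy(p, src, len);
	return p;
}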
1494
1495 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1496 {
1497         struct tcphdr *th = tcp_hdr(skb);
1498         const struct iphdr *iph = ip_hdr(skb);
1499         struct sock *nsk;
1500         struct request_sock **prev;
1501         /* Find possible connection requests. */
1502         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1503                                                        iph->saddr, iph->daddr);
1504         if (req)
1505                 return tcp_check_req(sk, skb, req, prev);
1506
1507         nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source,
1508                                       iph->daddr, th->dest, inet_iif(skb));
1509
1510         if (nsk) {
1511                 if (nsk->sk_state != TCP_TIME_WAIT) {
1512                         bh_lock_sock(nsk);
1513                         return nsk;
1514                 }
1515                 inet_twsk_put(inet_twsk(nsk));
1516                 return NULL;
1517         }
1518
1519 #ifdef CONFIG_SYN_COOKIES
1520         if (!th->rst && !th->syn && th->ack)
1521                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1522 #endif
1523         return sk;
1524 }
1525
1526 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1527 {
1528         const struct iphdr *iph = ip_hdr(skb);
1529
1530         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1531                 if (!tcp_v4_check(skb->len, iph->saddr,
1532                                   iph->daddr, skb->csum)) {
1533                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1534                         return 0;
1535                 }
1536         }
1537
1538         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1539                                        skb->len, IPPROTO_TCP, 0);
1540
1541         if (skb->len <= 76) {
1542                 return __skb_checksum_complete(skb);
1543         }
1544         return 0;
1545 }
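
/*
 * Standalone sketch (userspace C, not the kernel csum helpers) of the
 * RFC 793 checksum that tcp_v4_check()/csum_tcpudp_nofold() verify above:
 * a 16-bit one's complement sum over the IPv4 pseudo-header (saddr, daddr,
 * protocol, TCP length) followed by the TCP header and payload.  A received
 * segment checks out when summing everything, including the checksum field,
 * and complementing the result yields zero.  saddr/daddr are assumed to be
 * passed in network byte order.
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static uint16_t ones_sum16(const void *data, size_t len, uint32_t sum)
{
	const uint8_t *p = data;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* odd trailing byte, padded with zero */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

static uint16_t tcp4_checksum(uint32_t saddr, uint32_t daddr,
			      const void *segment, size_t len)
{
	uint8_t pseudo[12];
	uint32_t sum;

	memcpy(&pseudo[0], &saddr, 4);
	memcpy(&pseudo[4], &daddr, 4);
	pseudo[8]  = 0;			/* zero pad */
	pseudo[9]  = 6;			/* IPPROTO_TCP */
	pseudo[10] = len >> 8;		/* TCP length, high byte */
	pseudo[11] = len & 0xff;	/* TCP length, low byte */

	sum = ones_sum16(pseudo, sizeof(pseudo), 0);
	return ~ones_sum16(segment, len, sum);
}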
1546
1547
1548 /* The socket must have its spinlock held when we get
1549  * here.
1550  *
1551  * We have a potential double-lock case here, so even when
1552  * doing backlog processing we use the BH locking scheme.
1553  * This is because we cannot sleep with the original spinlock
1554  * held.
1555  */
1556 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1557 {
1558         struct sock *rsk;
1559 #ifdef CONFIG_TCP_MD5SIG
1560         /*
1561          * We really want to reject the packet as early as possible
1562          * if:
1563          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1564          *  o There is an MD5 option and we're not expecting one
1565          */
1566         if (tcp_v4_inbound_md5_hash(sk, skb))
1567                 goto discard;
1568 #endif
1569
1570         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1571                 TCP_CHECK_TIMER(sk);
1572                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1573                         rsk = sk;
1574                         goto reset;
1575                 }
1576                 TCP_CHECK_TIMER(sk);
1577                 return 0;
1578         }
1579
1580         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1581                 goto csum_err;
1582
1583         if (sk->sk_state == TCP_LISTEN) {
1584                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1585                 if (!nsk)
1586                         goto discard;
1587
1588                 if (nsk != sk) {
1589                         if (tcp_child_process(sk, nsk, skb)) {
1590                                 rsk = nsk;
1591                                 goto reset;
1592                         }
1593                         return 0;
1594                 }
1595         }
1596
1597         TCP_CHECK_TIMER(sk);
1598         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1599                 rsk = sk;
1600                 goto reset;
1601         }
1602         TCP_CHECK_TIMER(sk);
1603         return 0;
1604
1605 reset:
1606         tcp_v4_send_reset(rsk, skb);
1607 discard:
1608         kfree_skb(skb);
1609         /* Be careful here. If this function gets more complicated and
1610          * gcc suffers from register pressure on the x86, sk (in %ebx)
1611          * might be destroyed here. This current version compiles correctly,
1612          * but you have been warned.
1613          */
1614         return 0;
1615
1616 csum_err:
1617         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1618         goto discard;
1619 }
1620
1621 /*
1622  *      From tcp_input.c
1623  */
1624
1625 int tcp_v4_rcv(struct sk_buff *skb)
1626 {
1627         const struct iphdr *iph;
1628         struct tcphdr *th;
1629         struct sock *sk;
1630         int ret;
1631
1632         if (skb->pkt_type != PACKET_HOST)
1633                 goto discard_it;
1634
1635         /* Count it even if it's bad */
1636         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1637
1638         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1639                 goto discard_it;
1640
1641         th = tcp_hdr(skb);
1642
1643         if (th->doff < sizeof(struct tcphdr) / 4)
1644                 goto bad_packet;
1645         if (!pskb_may_pull(skb, th->doff * 4))
1646                 goto discard_it;
1647
1648         /* An explanation is required here, I think.
1649          * Packet length and doff are validated by header prediction,
1650          * provided that the case of th->doff == 0 is eliminated.
1651          * So, we defer the checks. */
1652         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1653                 goto bad_packet;
1654
1655         th = tcp_hdr(skb);
1656         iph = ip_hdr(skb);
1657         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1658         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1659                                     skb->len - th->doff * 4);
1660         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1661         TCP_SKB_CB(skb)->when    = 0;
1662         TCP_SKB_CB(skb)->flags   = iph->tos;
1663         TCP_SKB_CB(skb)->sacked  = 0;
1664
1665         sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source,
1666                            iph->daddr, th->dest, inet_iif(skb));
1667         if (!sk)
1668                 goto no_tcp_socket;
1669
1670 process:
1671         if (sk->sk_state == TCP_TIME_WAIT)
1672                 goto do_time_wait;
1673
1674         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1675                 goto discard_and_relse;
1676         nf_reset(skb);
1677
1678         if (sk_filter(sk, skb))
1679                 goto discard_and_relse;
1680
1681         skb->dev = NULL;
1682
1683         bh_lock_sock_nested(sk);
1684         ret = 0;
1685         if (!sock_owned_by_user(sk)) {
1686 #ifdef CONFIG_NET_DMA
1687                 struct tcp_sock *tp = tcp_sk(sk);
1688                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1689                         tp->ucopy.dma_chan = get_softnet_dma();
1690                 if (tp->ucopy.dma_chan)
1691                         ret = tcp_v4_do_rcv(sk, skb);
1692                 else
1693 #endif
1694                 {
1695                         if (!tcp_prequeue(sk, skb))
1696                                 ret = tcp_v4_do_rcv(sk, skb);
1697                 }
1698         } else
1699                 sk_add_backlog(sk, skb);
1700         bh_unlock_sock(sk);
1701
1702         sock_put(sk);
1703
1704         return ret;
1705
1706 no_tcp_socket:
1707         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1708                 goto discard_it;
1709
1710         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1711 bad_packet:
1712                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1713         } else {
1714                 tcp_v4_send_reset(NULL, skb);
1715         }
1716
1717 discard_it:
1718         /* Discard frame. */
1719         kfree_skb(skb);
1720         return 0;
1721
1722 discard_and_relse:
1723         sock_put(sk);
1724         goto discard_it;
1725
1726 do_time_wait:
1727         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1728                 inet_twsk_put(inet_twsk(sk));
1729                 goto discard_it;
1730         }
1731
1732         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1733                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1734                 inet_twsk_put(inet_twsk(sk));
1735                 goto discard_it;
1736         }
1737         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1738         case TCP_TW_SYN: {
1739                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1740                                                         iph->daddr, th->dest,
1741                                                         inet_iif(skb));
1742                 if (sk2) {
1743                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1744                         inet_twsk_put(inet_twsk(sk));
1745                         sk = sk2;
1746                         goto process;
1747                 }
1748                 /* Fall through to ACK */
1749         }
1750         case TCP_TW_ACK:
1751                 tcp_v4_timewait_ack(sk, skb);
1752                 break;
1753         case TCP_TW_RST:
1754                 goto no_tcp_socket;
1755         case TCP_TW_SUCCESS:;
1756         }
1757         goto discard_it;
1758 }
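
/*
 * Sketch of the TCP_SKB_CB(skb)->end_seq computation above: SYN and FIN
 * each occupy one unit of sequence space, so a segment covers
 * [seq, seq + syn + fin + payload_len).  A hypothetical standalone helper,
 * not the kernel macro:
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t tcp_end_seq(uint32_t seq, int syn, int fin,
			    uint32_t payload_len)
{
	return seq + syn + fin + payload_len;
}

int main(void)
{
	/* A bare SYN carries no data but still consumes one sequence number. */
	printf("%u\n", tcp_end_seq(1000, 1, 0, 0));	/* prints 1001 */
	return 0;
}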
1759
1760 /* VJ's idea. Save the last timestamp seen from this destination
1761  * and hold it for at least the normal timewait interval, to use for duplicate
1762  * segment detection in subsequent connections before they enter the
1763  * synchronized state.
1764  */
1765
1766 int tcp_v4_remember_stamp(struct sock *sk)
1767 {
1768         struct inet_sock *inet = inet_sk(sk);
1769         struct tcp_sock *tp = tcp_sk(sk);
1770         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1771         struct inet_peer *peer = NULL;
1772         int release_it = 0;
1773
1774         if (!rt || rt->rt_dst != inet->daddr) {
1775                 peer = inet_getpeer(inet->daddr, 1);
1776                 release_it = 1;
1777         } else {
1778                 if (!rt->peer)
1779                         rt_bind_peer(rt, 1);
1780                 peer = rt->peer;
1781         }
1782
1783         if (peer) {
1784                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1785                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1786                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1787                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1788                         peer->tcp_ts = tp->rx_opt.ts_recent;
1789                 }
1790                 if (release_it)
1791                         inet_putpeer(peer);
1792                 return 1;
1793         }
1794
1795         return 0;
1796 }
1797
1798 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1799 {
1800         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1801
1802         if (peer) {
1803                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1804
1805                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1806                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1807                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1808                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1809                         peer->tcp_ts       = tcptw->tw_ts_recent;
1810                 }
1811                 inet_putpeer(peer);
1812                 return 1;
1813         }
1814
1815         return 0;
1816 }
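
/*
 * Userspace sketch of the inet_peer timestamp-cache update rule shared by
 * tcp_v4_remember_stamp() and tcp_v4_tw_remember_stamp() above.  The signed
 * 32-bit subtraction makes the "is the cached value older?" test safe across
 * timestamp wraparound.  PAWS_MSL and all structure/field names here are
 * illustrative stand-ins, not the kernel definitions.
 */
#include <stdint.h>
#include <stdbool.h>

#define PAWS_MSL 60	/* seconds; stand-in for TCP_PAWS_MSL */

struct peer_cache {
	uint32_t tcp_ts;	/* last timestamp value seen from the peer */
	uint32_t tcp_ts_stamp;	/* local time (seconds) when it was recorded */
};

static void remember_stamp(struct peer_cache *peer, uint32_t ts_recent,
			   uint32_t ts_recent_stamp, uint32_t now)
{
	bool cached_not_newer = (int32_t)(peer->tcp_ts - ts_recent) <= 0;
	bool cached_stale     = peer->tcp_ts_stamp + PAWS_MSL < now &&
				peer->tcp_ts_stamp <= ts_recent_stamp;

	if (cached_not_newer || cached_stale) {
		peer->tcp_ts_stamp = ts_recent_stamp;
		peer->tcp_ts       = ts_recent;
	}
}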
1817
1818 struct inet_connection_sock_af_ops ipv4_specific = {
1819         .queue_xmit        = ip_queue_xmit,
1820         .send_check        = tcp_v4_send_check,
1821         .rebuild_header    = inet_sk_rebuild_header,
1822         .conn_request      = tcp_v4_conn_request,
1823         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1824         .remember_stamp    = tcp_v4_remember_stamp,
1825         .net_header_len    = sizeof(struct iphdr),
1826         .setsockopt        = ip_setsockopt,
1827         .getsockopt        = ip_getsockopt,
1828         .addr2sockaddr     = inet_csk_addr2sockaddr,
1829         .sockaddr_len      = sizeof(struct sockaddr_in),
1830 #ifdef CONFIG_COMPAT
1831         .compat_setsockopt = compat_ip_setsockopt,
1832         .compat_getsockopt = compat_ip_getsockopt,
1833 #endif
1834 };
1835
1836 #ifdef CONFIG_TCP_MD5SIG
1837 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1838         .md5_lookup             = tcp_v4_md5_lookup,
1839         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1840         .md5_add                = tcp_v4_md5_add_func,
1841         .md5_parse              = tcp_v4_parse_md5_keys,
1842 };
1843 #endif
1844
1845 /* NOTE: A lot of things are set to zero explicitly by the call to
1846  *       sk_alloc(), so they need not be done here.
1847  */
1848 static int tcp_v4_init_sock(struct sock *sk)
1849 {
1850         struct inet_connection_sock *icsk = inet_csk(sk);
1851         struct tcp_sock *tp = tcp_sk(sk);
1852
1853         skb_queue_head_init(&tp->out_of_order_queue);
1854         tcp_init_xmit_timers(sk);
1855         tcp_prequeue_init(tp);
1856
1857         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1858         tp->mdev = TCP_TIMEOUT_INIT;
1859
1860         /* So many TCP implementations out there (incorrectly) count the
1861          * initial SYN frame in their delayed-ACK and congestion control
1862          * algorithms that we must have the following bandaid to talk
1863          * efficiently to them.  -DaveM
1864          */
1865         tp->snd_cwnd = 2;
1866
1867         /* See draft-stevens-tcpca-spec-01 for discussion of the
1868          * initialization of these values.
1869          */
1870         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1871         tp->snd_cwnd_clamp = ~0;
1872         tp->mss_cache = 536;
1873
1874         tp->reordering = sysctl_tcp_reordering;
1875         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1876
1877         sk->sk_state = TCP_CLOSE;
1878
1879         sk->sk_write_space = sk_stream_write_space;
1880         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1881
1882         icsk->icsk_af_ops = &ipv4_specific;
1883         icsk->icsk_sync_mss = tcp_sync_mss;
1884 #ifdef CONFIG_TCP_MD5SIG
1885         tp->af_specific = &tcp_sock_ipv4_specific;
1886 #endif
1887
1888         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1889         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1890
1891         atomic_inc(&tcp_sockets_allocated);
1892
1893         return 0;
1894 }
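
/*
 * Standalone sketch (textbook slow start, not kernel code) of why
 * snd_ssthresh starts at "infinity" above: with no loss history, cwnd is
 * allowed to keep growing by one segment per ACK until the first loss sets
 * a real threshold; above the threshold it grows by roughly one segment per
 * window of ACKs.  Names and the credit counter are illustrative.
 */
#include <stdint.h>

static uint32_t cwnd_after_ack(uint32_t cwnd, uint32_t ssthresh,
			       uint32_t *avoid_credit)
{
	if (cwnd < ssthresh)
		return cwnd + 1;		/* slow start */
	if (++*avoid_credit >= cwnd) {		/* congestion avoidance */
		*avoid_credit = 0;
		return cwnd + 1;
	}
	return cwnd;
}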
1895
1896 int tcp_v4_destroy_sock(struct sock *sk)
1897 {
1898         struct tcp_sock *tp = tcp_sk(sk);
1899
1900         tcp_clear_xmit_timers(sk);
1901
1902         tcp_cleanup_congestion_control(sk);
1903
1904         /* Clean up the write buffer. */
1905         tcp_write_queue_purge(sk);
1906
1907         /* Cleans up our, hopefully empty, out_of_order_queue. */
1908         __skb_queue_purge(&tp->out_of_order_queue);
1909
1910 #ifdef CONFIG_TCP_MD5SIG
1911         /* Clean up the MD5 key list, if any */
1912         if (tp->md5sig_info) {
1913                 tcp_v4_clear_md5_list(sk);
1914                 kfree(tp->md5sig_info);
1915                 tp->md5sig_info = NULL;
1916         }
1917 #endif
1918
1919 #ifdef CONFIG_NET_DMA
1920         /* Cleans up our sk_async_wait_queue */
1921         __skb_queue_purge(&sk->sk_async_wait_queue);
1922 #endif
1923
1924         /* Clean up the prequeue; it really should be empty already. */
1925         __skb_queue_purge(&tp->ucopy.prequeue);
1926
1927         /* Clean up a referenced TCP bind bucket. */
1928         if (inet_csk(sk)->icsk_bind_hash)
1929                 inet_put_port(&tcp_hashinfo, sk);
1930
1931         /*
1932          * If a sendmsg cached page exists, toss it.
1933          */
1934         if (sk->sk_sndmsg_page) {
1935                 __free_page(sk->sk_sndmsg_page);
1936                 sk->sk_sndmsg_page = NULL;
1937         }
1938
1939         atomic_dec(&tcp_sockets_allocated);
1940
1941         return 0;
1942 }
1943
1944 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1945
1946 #ifdef CONFIG_PROC_FS
1947 /* Proc filesystem TCP sock list dumping. */
1948
1949 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1950 {
1951         return hlist_empty(head) ? NULL :
1952                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1953 }
1954
1955 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1956 {
1957         return tw->tw_node.next ?
1958                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1959 }
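
/*
 * Userspace sketch of the hlist walk that tw_head()/tw_next() perform
 * above: a singly linked chain whose head holds only a "first" pointer and
 * whose end is marked by NULL.  The real hlist embeds the node inside the
 * timewait socket and recovers the object with container_of(); this sketch
 * flattens that, and all names are illustrative.
 */
#include <stddef.h>

struct tw_node {
	struct tw_node *next;
	int family;
};

struct tw_bucket {
	struct tw_node *first;
};

static struct tw_node *sketch_tw_head(struct tw_bucket *b)
{
	return b->first;		/* NULL when the bucket is empty */
}

static struct tw_node *sketch_tw_next(struct tw_node *tw)
{
	return tw->next;		/* NULL at the end of the chain */
}

/* Skipping entries of the wrong family, as established_get_next() does: */
static struct tw_node *next_of_family(struct tw_node *tw, int family)
{
	while (tw && tw->family != family)
		tw = sketch_tw_next(tw);
	return tw;
}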
1960
1961 static void *listening_get_next(struct seq_file *seq, void *cur)
1962 {
1963         struct inet_connection_sock *icsk;
1964         struct hlist_node *node;
1965         struct sock *sk = cur;
1966         struct tcp_iter_state* st = seq->private;
1967
1968         if (!sk) {
1969                 st->bucket = 0;
1970                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1971                 goto get_sk;
1972         }
1973
1974         ++st->num;
1975
1976         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1977                 struct request_sock *req = cur;
1978
1979                 icsk = inet_csk(st->syn_wait_sk);
1980                 req = req->dl_next;
1981                 while (1) {
1982                         while (req) {
1983                                 if (req->rsk_ops->family == st->family) {
1984                                         cur = req;
1985                                         goto out;
1986                                 }
1987                                 req = req->dl_next;
1988                         }
1989                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1990                                 break;
1991 get_req:
1992                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1993                 }
1994                 sk        = sk_next(st->syn_wait_sk);
1995                 st->state = TCP_SEQ_STATE_LISTENING;
1996                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1997         } else {
1998                 icsk = inet_csk(sk);
1999                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2000                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2001                         goto start_req;
2002                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2003                 sk = sk_next(sk);
2004         }
2005 get_sk:
2006         sk_for_each_from(sk, node) {
2007                 if (sk->sk_family == st->family) {
2008                         cur = sk;
2009                         goto out;
2010                 }
2011                 icsk = inet_csk(sk);
2012                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2013                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2014 start_req:
2015                         st->uid         = sock_i_uid(sk);
2016                         st->syn_wait_sk = sk;
2017                         st->state       = TCP_SEQ_STATE_OPENREQ;
2018                         st->sbucket     = 0;
2019                         goto get_req;
2020                 }
2021                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2022         }
2023         if (++st->bucket < INET_LHTABLE_SIZE) {
2024                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2025                 goto get_sk;
2026         }
2027         cur = NULL;
2028 out:
2029         return cur;
2030 }
2031
2032 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2033 {
2034         void *rc = listening_get_next(seq, NULL);
2035
2036         while (rc && *pos) {
2037                 rc = listening_get_next(seq, rc);
2038                 --*pos;
2039         }
2040         return rc;
2041 }
2042
2043 static void *established_get_first(struct seq_file *seq)
2044 {
2045         struct tcp_iter_state* st = seq->private;
2046         void *rc = NULL;
2047
2048         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2049                 struct sock *sk;
2050                 struct hlist_node *node;
2051                 struct inet_timewait_sock *tw;
2052                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2053
2054                 read_lock_bh(lock);
2055                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2056                         if (sk->sk_family != st->family) {
2057                                 continue;
2058                         }
2059                         rc = sk;
2060                         goto out;
2061                 }
2062                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2063                 inet_twsk_for_each(tw, node,
2064                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2065                         if (tw->tw_family != st->family) {
2066                                 continue;
2067                         }
2068                         rc = tw;
2069                         goto out;
2070                 }
2071                 read_unlock_bh(lock);
2072                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2073         }
2074 out:
2075         return rc;
2076 }
2077
2078 static void *established_get_next(struct seq_file *seq, void *cur)
2079 {
2080         struct sock *sk = cur;
2081         struct inet_timewait_sock *tw;
2082         struct hlist_node *node;
2083         struct tcp_iter_state* st = seq->private;
2084
2085         ++st->num;
2086
2087         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2088                 tw = cur;
2089                 tw = tw_next(tw);
2090 get_tw:
2091                 while (tw && tw->tw_family != st->family) {
2092                         tw = tw_next(tw);
2093                 }
2094                 if (tw) {
2095                         cur = tw;
2096                         goto out;
2097                 }
2098                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2099                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2100
2101                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2102                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2103                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2104                 } else {
2105                         cur = NULL;
2106                         goto out;
2107                 }
2108         } else
2109                 sk = sk_next(sk);
2110
2111         sk_for_each_from(sk, node) {
2112                 if (sk->sk_family == st->family)
2113                         goto found;
2114         }
2115
2116         st->state = TCP_SEQ_STATE_TIME_WAIT;
2117         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2118         goto get_tw;
2119 found:
2120         cur = sk;
2121 out:
2122         return cur;
2123 }
2124
2125 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2126 {
2127         void *rc = established_get_first(seq);
2128
2129         while (rc && pos) {
2130                 rc = established_get_next(seq, rc);
2131                 --pos;
2132         }
2133         return rc;
2134 }
2135
2136 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2137 {
2138         void *rc;
2139         struct tcp_iter_state* st = seq->private;
2140
2141         inet_listen_lock(&tcp_hashinfo);
2142         st->state = TCP_SEQ_STATE_LISTENING;
2143         rc        = listening_get_idx(seq, &pos);
2144
2145         if (!rc) {
2146                 inet_listen_unlock(&tcp_hashinfo);
2147                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2148                 rc        = established_get_idx(seq, pos);
2149         }
2150
2151         return rc;
2152 }
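
/*
 * Userspace sketch of the two-phase positional lookup tcp_get_idx()
 * performs above: walk the listening table first, and if pos runs past its
 * end, continue into the established table with whatever offset is left.
 * Locking and the hash-bucket walk are elided; plain arrays stand in for
 * the kernel tables, and all names are illustrative.
 */
#include <stddef.h>

static const char *get_idx(const char **listening, size_t nlisten,
			   const char **established, size_t nestab,
			   size_t pos)
{
	if (pos < nlisten)
		return listening[pos];
	pos -= nlisten;
	if (pos < nestab)
		return established[pos];
	return NULL;			/* past the end of both tables */
}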
2153
2154 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2155 {
2156         struct tcp_iter_state* st = seq->private;
2157         st->state = TCP_SEQ_STATE_LISTENING;
2158         st->num = 0;
2159         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2160 }
2161
2162 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2163 {
2164         void *rc = NULL;
2165         struct tcp_iter_state* st;
2166
2167         if (v == SEQ_START_TOKEN) {
2168                 rc = tcp_get_idx(seq, 0);
2169                 goto out;
2170         }
2171         st = seq->private;
2172
2173         switch (st->state) {
2174         case TCP_SEQ_STATE_OPENREQ:
2175         case TCP_SEQ_STATE_LISTENING:
2176                 rc = listening_get_next(seq, v);
2177                 if (!rc) {
2178                         inet_listen_unlock(&tcp_hashinfo);
2179                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2180                         rc        = established_get_first(seq);
2181                 }
2182                 break;
2183         case TCP_SEQ_STATE_ESTABLISHED:
2184         case TCP_SEQ_STATE_TIME_WAIT:
2185                 rc = established_get_next(seq, v);
2186                 break;
2187         }
2188 out:
2189         ++*pos;
2190         return rc;
2191 }
2192
2193 static void tcp_seq_stop(struct seq_file *seq, void *v)
2194 {
2195         struct tcp_iter_state* st = seq->private;
2196
2197         switch (st->state) {
2198         case TCP_SEQ_STATE_OPENREQ:
2199                 if (v) {
2200                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2201                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2202                 }
2203         case TCP_SEQ_STATE_LISTENING:
2204                 if (v != SEQ_START_TOKEN)
2205                         inet_listen_unlock(&tcp_hashinfo);
2206                 break;
2207         case TCP_SEQ_STATE_TIME_WAIT:
2208         case TCP_SEQ_STATE_ESTABLISHED:
2209                 if (v)
2210                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2211                 break;
2212         }
2213 }
2214
2215 static int tcp_seq_open(struct inode *inode, struct file *file)
2216 {
2217         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2218         struct seq_file *seq;
2219         struct tcp_iter_state *s;
2220         int rc;
2221
2222         if (unlikely(afinfo == NULL))
2223                 return -EINVAL;
2224
2225         s = kzalloc(sizeof(*s), GFP_KERNEL);
2226         if (!s)
2227                 return -ENOMEM;
2228         s->family               = afinfo->family;
2229         s->seq_ops.start        = tcp_seq_start;
2230         s->seq_ops.next         = tcp_seq_next;
2231         s->seq_ops.show         = afinfo->seq_show;
2232         s->seq_ops.stop         = tcp_seq_stop;
2233
2234         rc = seq_open(file, &s->seq_ops);
2235         if (rc)
2236                 goto out_kfree;
2237         seq          = file->private_data;
2238         seq->private = s;
2239 out:
2240         return rc;
2241 out_kfree:
2242         kfree(s);
2243         goto out;
2244 }
2245
2246 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2247 {
2248         int rc = 0;
2249         struct proc_dir_entry *p;
2250
2251         if (!afinfo)
2252                 return -EINVAL;
2253         afinfo->seq_fops->owner         = afinfo->owner;
2254         afinfo->seq_fops->open          = tcp_seq_open;
2255         afinfo->seq_fops->read          = seq_read;
2256         afinfo->seq_fops->llseek        = seq_lseek;
2257         afinfo->seq_fops->release       = seq_release_private;
2258
2259         p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops);
2260         if (p)
2261                 p->data = afinfo;
2262         else
2263                 rc = -ENOMEM;
2264         return rc;
2265 }
2266
2267 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2268 {
2269         if (!afinfo)
2270                 return;
2271         proc_net_remove(&init_net, afinfo->name);
2272         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2273 }
2274
2275 static void get_openreq4(struct sock *sk, struct request_sock *req,
2276                          char *tmpbuf, int i, int uid)
2277 {
2278         const struct inet_request_sock *ireq = inet_rsk(req);
2279         int ttd = req->expires - jiffies;
2280
2281         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2282                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2283                 i,
2284                 ireq->loc_addr,
2285                 ntohs(inet_sk(sk)->sport),
2286                 ireq->rmt_addr,
2287                 ntohs(ireq->rmt_port),
2288                 TCP_SYN_RECV,
2289                 0, 0, /* could print option size, but that is af dependent. */
2290                 1,    /* timers active (only the expire timer) */
2291                 jiffies_to_clock_t(ttd),
2292                 req->retrans,
2293                 uid,
2294                 0,  /* non standard timer */
2295                 0, /* open_requests have no inode */
2296                 atomic_read(&sk->sk_refcnt),
2297                 req);
2298 }
2299
2300 static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2301 {
2302         int timer_active;
2303         unsigned long timer_expires;
2304         struct tcp_sock *tp = tcp_sk(sk);
2305         const struct inet_connection_sock *icsk = inet_csk(sk);
2306         struct inet_sock *inet = inet_sk(sk);
2307         __be32 dest = inet->daddr;
2308         __be32 src = inet->rcv_saddr;
2309         __u16 destp = ntohs(inet->dport);
2310         __u16 srcp = ntohs(inet->sport);
2311
2312         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2313                 timer_active    = 1;
2314                 timer_expires   = icsk->icsk_timeout;
2315         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2316                 timer_active    = 4;
2317                 timer_expires   = icsk->icsk_timeout;
2318         } else if (timer_pending(&sk->sk_timer)) {
2319                 timer_active    = 2;
2320                 timer_expires   = sk->sk_timer.expires;
2321         } else {
2322                 timer_active    = 0;
2323                 timer_expires = jiffies;
2324         }
2325
2326         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2327                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2328                 i, src, srcp, dest, destp, sk->sk_state,
2329                 tp->write_seq - tp->snd_una,
2330                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2331                                              (tp->rcv_nxt - tp->copied_seq),
2332                 timer_active,
2333                 jiffies_to_clock_t(timer_expires - jiffies),
2334                 icsk->icsk_retransmits,
2335                 sock_i_uid(sk),
2336                 icsk->icsk_probes_out,
2337                 sock_i_ino(sk),
2338                 atomic_read(&sk->sk_refcnt), sk,
2339                 icsk->icsk_rto,
2340                 icsk->icsk_ack.ato,
2341                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2342                 tp->snd_cwnd,
2343                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2344 }
2345
2346 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2347                                char *tmpbuf, int i)
2348 {
2349         __be32 dest, src;
2350         __u16 destp, srcp;
2351         int ttd = tw->tw_ttd - jiffies;
2352
2353         if (ttd < 0)
2354                 ttd = 0;
2355
2356         dest  = tw->tw_daddr;
2357         src   = tw->tw_rcv_saddr;
2358         destp = ntohs(tw->tw_dport);
2359         srcp  = ntohs(tw->tw_sport);
2360
2361         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2362                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2363                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2364                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2365                 atomic_read(&tw->tw_refcnt), tw);
2366 }
2367
2368 #define TMPSZ 150
2369
2370 static int tcp4_seq_show(struct seq_file *seq, void *v)
2371 {
2372         struct tcp_iter_state* st;
2373         char tmpbuf[TMPSZ + 1];
2374
2375         if (v == SEQ_START_TOKEN) {
2376                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2377                            "  sl  local_address rem_address   st tx_queue "
2378                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2379                            "inode");
2380                 goto out;
2381         }
2382         st = seq->private;
2383
2384         switch (st->state) {
2385         case TCP_SEQ_STATE_LISTENING:
2386         case TCP_SEQ_STATE_ESTABLISHED:
2387                 get_tcp4_sock(v, tmpbuf, st->num);
2388                 break;
2389         case TCP_SEQ_STATE_OPENREQ:
2390                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2391                 break;
2392         case TCP_SEQ_STATE_TIME_WAIT:
2393                 get_timewait4_sock(v, tmpbuf, st->num);
2394                 break;
2395         }
2396         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2397 out:
2398         return 0;
2399 }
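
/*
 * Reader-side sketch (userspace C) of consuming the lines that
 * tcp4_seq_show() emits in /proc/net/tcp.  Addresses are the raw __be32
 * values printed with %08X, so on a little-endian host the octets appear
 * byte-reversed and assigning the parsed value straight into in_addr
 * recovers them; ports and the state are plain hex.  Error handling is
 * deliberately minimal.
 */
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);		/* skip the header line */
	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, lport, raddr, rport, state;
		struct in_addr a;

		if (sscanf(line, "%*d: %X:%X %X:%X %X",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		a.s_addr = laddr;	/* raw bytes already match this host */
		printf("%s:%u state %02X\n", inet_ntoa(a), lport, state);
	}
	fclose(f);
	return 0;
}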
2400
2401 static struct file_operations tcp4_seq_fops;
2402 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2403         .owner          = THIS_MODULE,
2404         .name           = "tcp",
2405         .family         = AF_INET,
2406         .seq_show       = tcp4_seq_show,
2407         .seq_fops       = &tcp4_seq_fops,
2408 };
2409
2410 int __init tcp4_proc_init(void)
2411 {
2412         return tcp_proc_register(&tcp4_seq_afinfo);
2413 }
2414
2415 void tcp4_proc_exit(void)
2416 {
2417         tcp_proc_unregister(&tcp4_seq_afinfo);
2418 }
2419 #endif /* CONFIG_PROC_FS */
2420
2421 DEFINE_PROTO_INUSE(tcp)
2422
2423 struct proto tcp_prot = {
2424         .name                   = "TCP",
2425         .owner                  = THIS_MODULE,
2426         .close                  = tcp_close,
2427         .connect                = tcp_v4_connect,
2428         .disconnect             = tcp_disconnect,
2429         .accept                 = inet_csk_accept,
2430         .ioctl                  = tcp_ioctl,
2431         .init                   = tcp_v4_init_sock,
2432         .destroy                = tcp_v4_destroy_sock,
2433         .shutdown               = tcp_shutdown,
2434         .setsockopt             = tcp_setsockopt,
2435         .getsockopt             = tcp_getsockopt,
2436         .recvmsg                = tcp_recvmsg,
2437         .backlog_rcv            = tcp_v4_do_rcv,
2438         .hash                   = tcp_v4_hash,
2439         .unhash                 = tcp_unhash,
2440         .get_port               = tcp_v4_get_port,
2441         .enter_memory_pressure  = tcp_enter_memory_pressure,
2442         .sockets_allocated      = &tcp_sockets_allocated,
2443         .orphan_count           = &tcp_orphan_count,
2444         .memory_allocated       = &tcp_memory_allocated,
2445         .memory_pressure        = &tcp_memory_pressure,
2446         .sysctl_mem             = sysctl_tcp_mem,
2447         .sysctl_wmem            = sysctl_tcp_wmem,
2448         .sysctl_rmem            = sysctl_tcp_rmem,
2449         .max_header             = MAX_TCP_HEADER,
2450         .obj_size               = sizeof(struct tcp_sock),
2451         .twsk_prot              = &tcp_timewait_sock_ops,
2452         .rsk_prot               = &tcp_request_sock_ops,
2453 #ifdef CONFIG_COMPAT
2454         .compat_setsockopt      = compat_tcp_setsockopt,
2455         .compat_getsockopt      = compat_tcp_getsockopt,
2456 #endif
2457         REF_PROTO_INUSE(tcp)
2458 };
2459
2460 void __init tcp_v4_init(struct net_proto_family *ops)
2461 {
2462         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2463                                      IPPROTO_TCP) < 0)
2464                 panic("Failed to create the TCP control socket.\n");
2465 }
2466
2467 EXPORT_SYMBOL(ipv4_specific);
2468 EXPORT_SYMBOL(tcp_hashinfo);
2469 EXPORT_SYMBOL(tcp_prot);
2470 EXPORT_SYMBOL(tcp_unhash);
2471 EXPORT_SYMBOL(tcp_v4_conn_request);
2472 EXPORT_SYMBOL(tcp_v4_connect);
2473 EXPORT_SYMBOL(tcp_v4_do_rcv);
2474 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2475 EXPORT_SYMBOL(tcp_v4_send_check);
2476 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2477
2478 #ifdef CONFIG_PROC_FS
2479 EXPORT_SYMBOL(tcp_proc_register);
2480 EXPORT_SYMBOL(tcp_proc_unregister);
2481 #endif
2482 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2483