[NETFILTER]: conntrack: fix refcount leak when finding expectation
net/ipv4/netfilter/ip_conntrack_core.c
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *      - new API and handling of conntrack/nat helpers
 *      - now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *      - add usage/reference counts to ip_conntrack_expect
 *      - export ip_conntrack[_expect]_{find_get,put} functions
 */

#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>

/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
   registrations and conntrack timers. */
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>

#define IP_CONNTRACK_VERSION    "2.4"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(ip_conntrack_lock);

/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size __read_mostly = 0;
int ip_conntrack_max __read_mostly;
struct list_head *ip_conntrack_hash __read_mostly;
static kmem_cache_t *ip_conntrack_cachep __read_mostly;
static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid __read_mostly;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc __read_mostly;

static unsigned int ip_conntrack_next_id;
static unsigned int ip_conntrack_expect_next_id;
#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);

DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);

/* Deliver cached events and clear the cache entry - must be called with
 * softirqs disabled locally */
static inline void
__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
{
        DEBUGP("ecache: delivering events for %p\n", ecache->ct);
        if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
                atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
                                           ecache->ct);
        ecache->events = 0;
        ip_conntrack_put(ecache->ct);
        ecache->ct = NULL;
}

/* Deliver all cached events for a particular conntrack.  This is called
 * by code prior to async packet handling or freeing the skb */
void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
{
        struct ip_conntrack_ecache *ecache;

        local_bh_disable();
        ecache = &__get_cpu_var(ip_conntrack_ecache);
        if (ecache->ct == ct)
                __ip_ct_deliver_cached_events(ecache);
        local_bh_enable();
}

void __ip_ct_event_cache_init(struct ip_conntrack *ct)
{
        struct ip_conntrack_ecache *ecache;

        /* take care of delivering potentially old events */
        ecache = &__get_cpu_var(ip_conntrack_ecache);
        BUG_ON(ecache->ct == ct);
        if (ecache->ct)
                __ip_ct_deliver_cached_events(ecache);
        /* initialize for this conntrack/packet */
        ecache->ct = ct;
        nf_conntrack_get(&ct->ct_general);
}

/* Flush the event cache - touches other CPUs' data and must not be called
 * while packets are still passing through the code */
static void ip_ct_event_cache_flush(void)
{
        struct ip_conntrack_ecache *ecache;
        int cpu;

        for_each_possible_cpu(cpu) {
                ecache = &per_cpu(ip_conntrack_ecache, cpu);
                if (ecache->ct)
                        ip_conntrack_put(ecache->ct);
        }
}
#else
static inline void ip_ct_event_cache_flush(void) {}
#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */

DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
                            unsigned int size, unsigned int rnd)
{
        return (jhash_3words((__force u32)tuple->src.ip,
                             ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             rnd) % size);
}

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
        return __hash_conntrack(tuple, ip_conntrack_htable_size,
                                ip_conntrack_hash_rnd);
}

int
ip_ct_get_tuple(const struct iphdr *iph,
                const struct sk_buff *skb,
                unsigned int dataoff,
                struct ip_conntrack_tuple *tuple,
                const struct ip_conntrack_protocol *protocol)
{
        /* Never happens */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;
        tuple->dst.dir = IP_CT_DIR_ORIGINAL;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig,
                   const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;
        inverse->dst.dir = !orig->dst.dir;

        return protocol->invert_tuple(inverse, orig);
}

/* ip_conntrack_expect helper functions */
void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
{
        ASSERT_WRITE_LOCK(&ip_conntrack_lock);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));
        list_del(&exp->list);
        CONNTRACK_STAT_INC(expect_delete);
        exp->master->expecting--;
        ip_conntrack_expect_put(exp);
}

static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *exp = (void *)ul_expect;

        write_lock_bh(&ip_conntrack_lock);
        ip_ct_unlink_expect(exp);
        write_unlock_bh(&ip_conntrack_lock);
        ip_conntrack_expect_put(exp);
}

struct ip_conntrack_expect *
__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
                        return i;
        }
        return NULL;
}

/* Just find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        read_lock_bh(&ip_conntrack_lock);
        i = __ip_conntrack_expect_find(tuple);
        if (i)
                atomic_inc(&i->use);
        read_unlock_bh(&ip_conntrack_lock);

        return i;
}

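/* Usage sketch (illustrative, not part of the original file): a caller of
 * ip_conntrack_expect_find() receives a reference and must balance it with
 * ip_conntrack_expect_put() once it is done with the expectation:
 *
 *	struct ip_conntrack_expect *exp;
 *
 *	exp = ip_conntrack_expect_find(&tuple);
 *	if (exp) {
 *		... inspect exp->tuple, exp->master ...
 *		ip_conntrack_expect_put(exp);
 *	}
 */
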
/* If an expectation for this connection is found, it is deleted from the
 * global list and then returned.  Permanent expectations stay on the list,
 * so for those we take an extra reference instead. */
static struct ip_conntrack_expect *
find_expectation(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                /* If master is not in hash table yet (ie. packet hasn't left
                   this machine yet), how can the other end know about the
                   expectation?  Hence these are not the droids you are
                   looking for (if master ct never got confirmed, we'd hold a
                   reference to it and weird things would happen to future
                   packets). */
                if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
                    && is_confirmed(i->master)) {
                        if (i->flags & IP_CT_EXPECT_PERMANENT) {
                                atomic_inc(&i->use);
                                return i;
                        } else if (del_timer(&i->timeout)) {
                                ip_ct_unlink_expect(i);
                                return i;
                        }
                }
        }
        return NULL;
}

/* delete all expectations for this conntrack */
void ip_ct_remove_expectations(struct ip_conntrack *ct)
{
        struct ip_conntrack_expect *i, *tmp;

        /* Optimization: most connections never expect any others. */
        if (ct->expecting == 0)
                return;

        list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
                if (i->master == ct && del_timer(&i->timeout)) {
                        ip_ct_unlink_expect(i);
                        ip_conntrack_expect_put(i);
                }
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        DEBUGP("clean_from_lists(%p)\n", ct);
        ASSERT_WRITE_LOCK(&ip_conntrack_lock);
        list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);

        /* Destroy all pending expectations */
        ip_ct_remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
        struct ip_conntrack_protocol *proto;
        struct ip_conntrack_helper *helper;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        ip_conntrack_event(IPCT_DESTROY, ct);
        set_bit(IPS_DYING_BIT, &ct->status);

        helper = ct->helper;
        if (helper && helper->destroy)
                helper->destroy(ct);

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        write_lock_bh(&ip_conntrack_lock);
        /* Expectations will have been removed in clean_from_lists,
         * except TFTP can create an expectation on the first packet,
         * before the connection is in the list, so we need to clean
         * here, too. */
        ip_ct_remove_expectations(ct);

        /* We overload the first tuple to link into the unconfirmed list. */
        if (!is_confirmed(ct)) {
                BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
                list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        }

        CONNTRACK_STAT_INC(delete);
        write_unlock_bh(&ip_conntrack_lock);

        if (ct->master)
                ip_conntrack_put(ct->master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        ip_conntrack_free(ct);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        write_lock_bh(&ip_conntrack_lock);
        /* Inside lock so preempt is disabled on module removal path.
         * Otherwise we can get spurious warnings. */
        CONNTRACK_STAT_INC(delete_list);
        clean_from_lists(ct);
        write_unlock_bh(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        ASSERT_READ_LOCK(&ip_conntrack_lock);
        list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
                if (tuplehash_to_ctrack(h) != ignored_conntrack &&
                    ip_ct_tuple_equal(tuple, &h->tuple)) {
                        CONNTRACK_STAT_INC(found);
                        return h;
                }
                CONNTRACK_STAT_INC(searched);
        }

        return NULL;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        read_lock_bh(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
        read_unlock_bh(&ip_conntrack_lock);

        return h;
}

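/* Usage sketch (illustrative only): ip_conntrack_find_get() takes a
 * reference on the conntrack, so the caller must drop it again, as
 * getorigdst() near the end of this file does:
 *
 *	struct ip_conntrack_tuple_hash *h;
 *
 *	h = ip_conntrack_find_get(&tuple, NULL);
 *	if (h) {
 *		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
 *		... use ct ...
 *		ip_conntrack_put(ct);
 *	}
 */
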
static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
                                        unsigned int hash,
                                        unsigned int repl_hash)
{
        ct->id = ++ip_conntrack_next_id;
        list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
                 &ip_conntrack_hash[hash]);
        list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
                 &ip_conntrack_hash[repl_hash]);
}

void ip_conntrack_hash_insert(struct ip_conntrack *ct)
{
        unsigned int hash, repl_hash;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        write_lock_bh(&ip_conntrack_lock);
        __ip_conntrack_hash_insert(ct, hash, repl_hash);
        write_unlock_bh(&ip_conntrack_lock);
}

/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = ip_conntrack_get(*pskb, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in the other direction.  The actual packet
           which created the connection will be IP_CT_NEW or, for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        write_lock_bh(&ip_conntrack_lock);

        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost the race. */
        list_for_each_entry(h, &ip_conntrack_hash[hash], list)
                if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                      &h->tuple))
                        goto out;
        list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
                if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                      &h->tuple))
                        goto out;

        /* Remove from unconfirmed list */
        list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

        __ip_conntrack_hash_insert(ct, hash, repl_hash);
        /* Timer relative to confirmation time, not original
           setting time, otherwise we'd get timer wrap in
           weird delay cases. */
        ct->timeout.expires += jiffies;
        add_timer(&ct->timeout);
        atomic_inc(&ct->ct_general.use);
        set_bit(IPS_CONFIRMED_BIT, &ct->status);
        CONNTRACK_STAT_INC(insert);
        write_unlock_bh(&ip_conntrack_lock);
        if (ct->helper)
                ip_conntrack_event_cache(IPCT_HELPER, *pskb);
#ifdef CONFIG_IP_NF_NAT_NEEDED
        if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
            test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
                ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
#endif
        ip_conntrack_event_cache(master_ct(ct) ?
                                 IPCT_RELATED : IPCT_NEW, *pskb);

        return NF_ACCEPT;

out:
        CONNTRACK_STAT_INC(insert_failed);
        write_unlock_bh(&ip_conntrack_lock);
        return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        read_lock_bh(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        read_unlock_bh(&ip_conntrack_lock);

        return h != NULL;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct = NULL, *tmp;
        int dropped = 0;

        read_lock_bh(&ip_conntrack_lock);
        list_for_each_entry_reverse(h, chain, list) {
                tmp = tuplehash_to_ctrack(h);
                if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
                        ct = tmp;
                        atomic_inc(&ct->ct_general.use);
                        break;
                }
        }
        read_unlock_bh(&ip_conntrack_lock);

        if (!ct)
                return dropped;

        if (del_timer(&ct->timeout)) {
                death_by_timeout((unsigned long)ct);
                dropped = 1;
                CONNTRACK_STAT_INC(early_drop);
        }
        ip_conntrack_put(ct);
        return dropped;
}

static struct ip_conntrack_helper *
__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_helper *h;

        list_for_each_entry(h, &helpers, list) {
                if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
                        return h;
        }
        return NULL;
}

struct ip_conntrack_helper *
ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_helper *helper;

        /* need ip_conntrack_lock to ensure that the helper exists until
         * try_module_get() is called */
        read_lock_bh(&ip_conntrack_lock);

        helper = __ip_conntrack_helper_find(tuple);
        if (helper) {
                /* need to increase the module usage count to ensure the
                 * helper will not go away while the caller is e.g. busy
                 * putting a conntrack in the hash that uses the helper */
                if (!try_module_get(helper->me))
                        helper = NULL;
        }

        read_unlock_bh(&ip_conntrack_lock);

        return helper;
}

void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
{
        module_put(helper->me);
}

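/* Usage sketch (illustrative only): the find_get/put pair pins the helper
 * module for as long as the reference is held:
 *
 *	struct ip_conntrack_helper *helper;
 *
 *	helper = ip_conntrack_helper_find_get(&tuple);
 *	if (helper) {
 *		... use helper->help(), helper->name, ...
 *		ip_conntrack_helper_put(helper);
 *	}
 */
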
struct ip_conntrack_protocol *
__ip_conntrack_proto_find(u_int8_t protocol)
{
        return ip_ct_protos[protocol];
}

/* this is guaranteed to always return a valid protocol helper, since
 * it falls back to generic_protocol */
struct ip_conntrack_protocol *
ip_conntrack_proto_find_get(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        preempt_disable();
        p = __ip_conntrack_proto_find(protocol);
        if (p) {
                if (!try_module_get(p->me))
                        p = &ip_conntrack_generic_protocol;
        }
        preempt_enable();

        return p;
}

void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
{
        module_put(p->me);
}

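/* Usage sketch (illustrative only): look up an L4 protocol handler and
 * release the module reference when finished:
 *
 *	struct ip_conntrack_protocol *proto;
 *
 *	proto = ip_conntrack_proto_find_get(IPPROTO_TCP);
 *	... call e.g. proto->packet() ...
 *	ip_conntrack_proto_put(proto);
 */
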
struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
                                        struct ip_conntrack_tuple *repl)
{
        struct ip_conntrack *conntrack;

        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        /* We don't want any race condition at early drop stage */
        atomic_inc(&ip_conntrack_count);

        if (ip_conntrack_max
            && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
                unsigned int hash = hash_conntrack(orig);
                /* Try dropping from this hash chain. */
                if (!early_drop(&ip_conntrack_hash[hash])) {
                        atomic_dec(&ip_conntrack_count);
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                atomic_dec(&ip_conntrack_count);
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        return conntrack;
}

void
ip_conntrack_free(struct ip_conntrack *conntrack)
{
        atomic_dec(&ip_conntrack_count);
        kmem_cache_free(ip_conntrack_cachep, conntrack);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
 * failed due to stress.  Otherwise it really is unclassifiable */
static struct ip_conntrack_tuple_hash *
init_conntrack(struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        struct ip_conntrack_expect *exp;

        if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
        if (conntrack == NULL || IS_ERR(conntrack))
                return (struct ip_conntrack_tuple_hash *)conntrack;

        if (!protocol->new(conntrack, skb)) {
                ip_conntrack_free(conntrack);
                return NULL;
        }

        write_lock_bh(&ip_conntrack_lock);
        exp = find_expectation(tuple);

        if (exp) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                        conntrack, exp);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = exp->master;
#ifdef CONFIG_IP_NF_CONNTRACK_MARK
                conntrack->mark = exp->master->mark;
#endif
#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
    defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
                /* this is ugly, but there is no other place to put it */
                conntrack->nat.masq_index = exp->master->nat.masq_index;
#endif
#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
                conntrack->secmark = exp->master->secmark;
#endif
                nf_conntrack_get(&conntrack->master->ct_general);
                CONNTRACK_STAT_INC(expect_new);
        } else {
                conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);

                CONNTRACK_STAT_INC(new);
        }

        /* Overload tuple linked list to put us in unconfirmed list. */
        list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

        write_unlock_bh(&ip_conntrack_lock);

        if (exp) {
                if (exp->expectfn)
                        exp->expectfn(conntrack, exp);
                ip_conntrack_expect_put(exp);
        }

        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
                             &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }
        ct = tuplehash_to_ctrack(h);

        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set reply bit if this packet OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &ct->ct_general;
        skb->nfctinfo = *ctinfo;
        return ct;
}

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply = 0;
        int ret;

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct) {
                CONNTRACK_STAT_INC(ignore);
                return NF_ACCEPT;
        }

        /* Never happens */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
                        printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                               (*pskb)->nh.iph->protocol, hooknum);
                }
                return NF_DROP;
        }

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);

        /* It may be a special packet, error, unclean...
         * the inverse of the return code tells the netfilter
         * core what to do with the packet. */
        if (proto->error != NULL
            && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
                CONNTRACK_STAT_INC(error);
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum,
                                     &ctinfo))) {
                /* Not valid part of a connection */
                CONNTRACK_STAT_INC(invalid);
                return NF_ACCEPT;
        }

        if (IS_ERR(ct)) {
                /* Too stressed to deal. */
                CONNTRACK_STAT_INC(drop);
                return NF_DROP;
        }

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret < 0) {
                /* Invalid: the inverse of the return code tells
                 * the netfilter core what to do */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
                ip_conntrack_event_cache(IPCT_STATUS, *pskb);

        return ret;
}

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return ip_ct_invert_tuple(inverse, orig,
                                  __ip_conntrack_proto_find(orig->dst.protonum));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *a,
                               const struct ip_conntrack_expect *b)
{
        /* Part covered by intersection of masks must be unequal,
           otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
                = { { a->mask.src.ip & b->mask.src.ip,
                      { a->mask.src.u.all & b->mask.src.u.all } },
                    { a->mask.dst.ip & b->mask.dst.ip,
                      { a->mask.dst.u.all & b->mask.dst.u.all },
                      a->mask.dst.protonum & b->mask.dst.protonum } };

        return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

static inline int expect_matches(const struct ip_conntrack_expect *a,
                                 const struct ip_conntrack_expect *b)
{
        return a->master == b->master
                && ip_ct_tuple_equal(&a->tuple, &b->tuple)
                && ip_ct_tuple_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
        struct ip_conntrack_expect *i;

        write_lock_bh(&ip_conntrack_lock);
        /* choose the oldest expectation to evict */
        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, exp) && del_timer(&i->timeout)) {
                        ip_ct_unlink_expect(i);
                        write_unlock_bh(&ip_conntrack_lock);
                        ip_conntrack_expect_put(i);
                        return;
                }
        }
        write_unlock_bh(&ip_conntrack_lock);
}

/* We don't increase the master conntrack refcount for non-fulfilled
 * expectations.  During conntrack destruction, the expectations are
 * always killed before the conntrack itself. */
struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
        struct ip_conntrack_expect *new;

        new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }
        new->master = me;
        atomic_set(&new->use, 1);
        return new;
}

void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
        if (atomic_dec_and_test(&exp->use))
                kmem_cache_free(ip_conntrack_expect_cachep, exp);
}

static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
        atomic_inc(&exp->use);  /* reference held by the expectation list */
        exp->master->expecting++;
        list_add(&exp->list, &ip_conntrack_expect_list);

        init_timer(&exp->timeout);
        exp->timeout.data = (unsigned long)exp;
        exp->timeout.function = expectation_timed_out;
        exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
        add_timer(&exp->timeout);

        exp->id = ++ip_conntrack_expect_next_id;
        atomic_inc(&exp->use);  /* reference held by the running timer */
        CONNTRACK_STAT_INC(expect_create);
}

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct ip_conntrack *master)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (i->master == master) {
                        if (del_timer(&i->timeout)) {
                                ip_ct_unlink_expect(i);
                                ip_conntrack_expect_put(i);
                        }
                        break;
                }
        }
}

static inline int refresh_timer(struct ip_conntrack_expect *i)
{
        if (!del_timer(&i->timeout))
                return 0;

        i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
        add_timer(&i->timeout);
        return 1;
}

int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
{
        struct ip_conntrack_expect *i;
        int ret;

        DEBUGP("ip_conntrack_expect_related %p\n", expect);
        DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

        write_lock_bh(&ip_conntrack_lock);
        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, expect)) {
                        /* Refresh timer: if it's dying, ignore.. */
                        if (refresh_timer(i)) {
                                ret = 0;
                                goto out;
                        }
                } else if (expect_clash(i, expect)) {
                        ret = -EBUSY;
                        goto out;
                }
        }

        /* Will be over limit? */
        if (expect->master->helper->max_expected &&
            expect->master->expecting >= expect->master->helper->max_expected)
                evict_oldest_expect(expect->master);

        ip_conntrack_expect_insert(expect);
        ip_conntrack_expect_event(IPEXP_NEW, expect);
        ret = 0;
out:
        write_unlock_bh(&ip_conntrack_lock);
        return ret;
}

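/* Usage sketch (illustrative only; patterned after in-tree helpers such
 * as ip_conntrack_ftp): allocate an expectation, fill it in, register it,
 * and drop the allocation reference regardless of the outcome:
 *
 *	struct ip_conntrack_expect *exp;
 *
 *	exp = ip_conntrack_expect_alloc(ct);
 *	if (!exp)
 *		return NF_DROP;
 *	... fill in exp->tuple, exp->mask, exp->flags, exp->expectfn ...
 *	if (ip_conntrack_expect_related(exp) != 0)
 *		ret = NF_DROP;
 *	ip_conntrack_expect_put(exp);
 */
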
/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __ip_conntrack_confirm */
void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                              const struct ip_conntrack_tuple *newreply)
{
        write_lock_bh(&ip_conntrack_lock);
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master && conntrack->expecting == 0)
                conntrack->helper = __ip_conntrack_helper_find(newreply);
        write_unlock_bh(&ip_conntrack_lock);
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        BUG_ON(me->timeout == 0);
        write_lock_bh(&ip_conntrack_lock);
        list_add(&me->list, &helpers);
        write_unlock_bh(&ip_conntrack_lock);

        return 0;
}

struct ip_conntrack_helper *
__ip_conntrack_helper_find_byname(const char *name)
{
        struct ip_conntrack_helper *h;

        list_for_each_entry(h, &helpers, list) {
                if (!strcmp(h->name, name))
                        return h;
        }

        return NULL;
}

static inline void unhelp(struct ip_conntrack_tuple_hash *i,
                          const struct ip_conntrack_helper *me)
{
        if (tuplehash_to_ctrack(i)->helper == me) {
                ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
                tuplehash_to_ctrack(i)->helper = NULL;
        }
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_expect *exp, *tmp;

        /* Need write lock here, to delete helper. */
        write_lock_bh(&ip_conntrack_lock);
        list_del(&me->list);

        /* Get rid of expectations */
        list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
                if (exp->master->helper == me && del_timer(&exp->timeout)) {
                        ip_ct_unlink_expect(exp);
                        ip_conntrack_expect_put(exp);
                }
        }
        /* Detach the helper from all conntracks that use it. */
        list_for_each_entry(h, &unconfirmed, list)
                unhelp(h, me);
        for (i = 0; i < ip_conntrack_htable_size; i++) {
                list_for_each_entry(h, &ip_conntrack_hash[i], list)
                        unhelp(h, me);
        }
        write_unlock_bh(&ip_conntrack_lock);

        /* Someone could still be looking at the helper in a bh. */
        synchronize_net();
}

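/* Usage sketch (illustrative only; the struct name and field values are
 * made up): a helper module pairs the register/unregister calls in its
 * init and exit paths:
 *
 *	static struct ip_conntrack_helper my_helper = {
 *		.name		= "myproto",
 *		.me		= THIS_MODULE,
 *		.max_expected	= 1,
 *		.timeout	= 5 * 60,
 *	};
 *
 *	ip_conntrack_helper_register(&my_helper);	(module init)
 *	ip_conntrack_helper_unregister(&my_helper);	(module exit)
 */
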
/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __ip_ct_refresh_acct(struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo,
                        const struct sk_buff *skb,
                        unsigned long extra_jiffies,
                        int do_acct)
{
        int event = 0;

        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
        IP_NF_ASSERT(skb);

        write_lock_bh(&ip_conntrack_lock);

        /* Only update if this is not a fixed timeout */
        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
                write_unlock_bh(&ip_conntrack_lock);
                return;
        }

        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
                event = IPCT_REFRESH;
        } else {
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                        event = IPCT_REFRESH;
                }
        }

#ifdef CONFIG_IP_NF_CT_ACCT
        if (do_acct) {
                ct->counters[CTINFO2DIR(ctinfo)].packets++;
                ct->counters[CTINFO2DIR(ctinfo)].bytes +=
                                                ntohs(skb->nh.iph->tot_len);
                if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
                    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
                        event |= IPCT_COUNTER_FILLING;
        }
#endif

        write_unlock_bh(&ip_conntrack_lock);

        /* must be unlocked when calling event cache */
        if (event)
                ip_conntrack_event_cache(event, skb);
}

#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
/* Generic function for tcp/udp/sctp/dccp and the like.  This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
                               const struct ip_conntrack_tuple *tuple)
{
        NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
                &tuple->src.u.tcp.port);
        NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
                &tuple->dst.u.tcp.port);
        return 0;

nfattr_failure:
        return -1;
}

int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
                               struct ip_conntrack_tuple *t)
{
        if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
                return -EINVAL;

        t->src.u.tcp.port =
                *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
        t->dst.u.tcp.port =
                *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);

        return 0;
}
#endif

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
        skb_orphan(skb);

        local_bh_disable();
        skb = ip_defrag(skb, user);
        local_bh_enable();

        if (skb)
                ip_send_check(skb->nh.iph);
        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        /* This ICMP is in reverse direction to the packet which caused it */
        ct = ip_conntrack_get(skb, &ctinfo);

        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach to new skbuff, and increment count */
        nskb->nfct = &ct->ct_general;
        nskb->nfctinfo = ctinfo;
        nf_conntrack_get(nskb->nfct);
}

1231
1232 /* Bring out ya dead! */
1233 static struct ip_conntrack *
1234 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1235                 void *data, unsigned int *bucket)
1236 {
1237         struct ip_conntrack_tuple_hash *h;
1238         struct ip_conntrack *ct;
1239
1240         write_lock_bh(&ip_conntrack_lock);
1241         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1242                 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1243                         ct = tuplehash_to_ctrack(h);
1244                         if (iter(ct, data))
1245                                 goto found;
1246                 }
1247         }
1248         list_for_each_entry(h, &unconfirmed, list) {
1249                 ct = tuplehash_to_ctrack(h);
1250                 if (iter(ct, data))
1251                         goto found;
1252         }
1253         write_unlock_bh(&ip_conntrack_lock);
1254         return NULL;
1255
1256 found:
1257         atomic_inc(&ct->ct_general.use);
1258         write_unlock_bh(&ip_conntrack_lock);
1259         return ct;
1260 }
1261
void
ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
        struct ip_conntrack *ct;
        unsigned int bucket = 0;

        while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
                /* Time to push up daisies... */
                if (del_timer(&ct->timeout))
                        death_by_timeout((unsigned long)ct);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(ct);
        }
}

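/* Usage sketch (illustrative only): a caller-supplied predicate, in the
 * style of kill_all() below; returning nonzero condemns the entry.  This
 * hypothetical one would flush every ICMP conntrack:
 *
 *	static int kill_icmp(struct ip_conntrack *i, void *data)
 *	{
 *		return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
 *		       == IPPROTO_ICMP;
 *	}
 *
 *	ip_ct_iterate_cleanup(kill_icmp, NULL);
 */
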
/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %Zu\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;
                struct ip_conntrack *ct = tuplehash_to_ctrack(h);

                sin.sin_family = AF_INET;
                sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;
                memset(sin.sin_zero, 0, sizeof(sin.sin_zero));

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(ct);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}

static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};

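/* Userspace view (illustrative only; connect_upstream() is hypothetical):
 * a transparent proxy recovers the pre-NAT destination of an accepted TCP
 * connection via getsockopt():
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *		connect_upstream(dst.sin_addr, dst.sin_port);
 */
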
static int kill_all(struct ip_conntrack *i, void *data)
{
        return 1;
}

void ip_conntrack_flush(void)
{
        ip_ct_iterate_cleanup(kill_all, NULL);
}

static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
{
        if (vmalloced)
                vfree(hash);
        else
                free_pages((unsigned long)hash,
                           get_order(sizeof(struct list_head) * size));
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;

        /* This makes sure all current packets have passed through
           the netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

        ip_ct_event_cache_flush();
 i_see_dead_people:
        ip_conntrack_flush();
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }
        /* wait until all references to ip_conntrack_untracked are dropped */
        while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
                schedule();

        kmem_cache_destroy(ip_conntrack_cachep);
        kmem_cache_destroy(ip_conntrack_expect_cachep);
        free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
                            ip_conntrack_htable_size);
        nf_unregister_sockopt(&so_getorigdst);
}

static struct list_head *alloc_hashtable(int size, int *vmalloced)
{
        struct list_head *hash;
        unsigned int i;

        *vmalloced = 0;
        hash = (void *)__get_free_pages(GFP_KERNEL,
                                        get_order(sizeof(struct list_head)
                                                  * size));
        if (!hash) {
                *vmalloced = 1;
                printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
                hash = vmalloc(sizeof(struct list_head) * size);
        }

        if (hash)
                for (i = 0; i < size; i++)
                        INIT_LIST_HEAD(&hash[i]);

        return hash;
}

static int set_hashsize(const char *val, struct kernel_param *kp)
{
        int i, bucket, hashsize, vmalloced;
        int old_vmalloced, old_size;
        int rnd;
        struct list_head *hash, *old_hash;
        struct ip_conntrack_tuple_hash *h;

        /* On boot, we can set this without any fancy locking. */
        if (!ip_conntrack_htable_size)
                return param_set_int(val, kp);

        hashsize = simple_strtol(val, NULL, 0);
        if (!hashsize)
                return -EINVAL;

        hash = alloc_hashtable(hashsize, &vmalloced);
        if (!hash)
                return -ENOMEM;

        /* We have to rehash for the new table anyway, so we can also
         * use a new random seed */
        get_random_bytes(&rnd, 4);

        write_lock_bh(&ip_conntrack_lock);
        for (i = 0; i < ip_conntrack_htable_size; i++) {
                while (!list_empty(&ip_conntrack_hash[i])) {
                        h = list_entry(ip_conntrack_hash[i].next,
                                       struct ip_conntrack_tuple_hash, list);
                        list_del(&h->list);
                        bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
                        list_add_tail(&h->list, &hash[bucket]);
                }
        }
        old_size = ip_conntrack_htable_size;
        old_vmalloced = ip_conntrack_vmalloc;
        old_hash = ip_conntrack_hash;

        ip_conntrack_htable_size = hashsize;
        ip_conntrack_vmalloc = vmalloced;
        ip_conntrack_hash = hash;
        ip_conntrack_hash_rnd = rnd;
        write_unlock_bh(&ip_conntrack_lock);

        free_conntrack_hash(old_hash, old_vmalloced, old_size);
        return 0;
}

module_param_call(hashsize, set_hashsize, param_get_uint,
                  &ip_conntrack_htable_size, 0600);

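/* Usage sketch (illustrative only; the sysfs path assumes the usual
 * /sys/module layout for module parameters): with set_hashsize() wired up
 * above, the table size can be given at load time or changed on a live
 * system:
 *
 *	modprobe ip_conntrack hashsize=16384
 *	echo 16384 > /sys/module/ip_conntrack/parameters/hashsize
 */
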
int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: a 32MB
         * machine has 256 buckets; >= 1GB machines have 8192 buckets. */
        if (!ip_conntrack_htable_size) {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;

        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
                                            &ip_conntrack_vmalloc);
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                0, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }

        ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
                                        sizeof(struct ip_conntrack_expect),
                                        0, 0, NULL, NULL);
        if (!ip_conntrack_expect_cachep) {
                printk(KERN_ERR "Unable to create ip_expect slab cache\n");
                goto err_free_conntrack_slab;
        }

        /* Don't NEED lock here, but good form anyway. */
        write_lock_bh(&ip_conntrack_lock);
        for (i = 0; i < MAX_IP_CT_PROTO; i++)
                ip_ct_protos[i] = &ip_conntrack_generic_protocol;
        /* Sew in builtin protocols. */
        ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
        ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
        ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
        write_unlock_bh(&ip_conntrack_lock);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);

        return ret;

err_free_conntrack_slab:
        kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
        free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
                            ip_conntrack_htable_size);
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}