1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14  *      - new API and handling of conntrack/nat helpers
15  *      - now capable of multiple expectations for one master
16  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17  *      - add usage/reference counts to ip_conntrack_expect
18  *      - export ip_conntrack[_expect]_{find_get,put} functions
19  * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20  *      - generalize the L3 protocol dependent part.
21  * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22  *      - add support for various sizes of conntrack structures.
23  *
24  * Derived from net/ipv4/netfilter/ip_conntrack_core.c
25  */
26
27 #include <linux/config.h>
28 #include <linux/types.h>
29 #include <linux/netfilter.h>
30 #include <linux/module.h>
31 #include <linux/skbuff.h>
32 #include <linux/proc_fs.h>
33 #include <linux/vmalloc.h>
34 #include <linux/stddef.h>
35 #include <linux/slab.h>
36 #include <linux/random.h>
37 #include <linux/jhash.h>
38 #include <linux/err.h>
39 #include <linux/percpu.h>
40 #include <linux/moduleparam.h>
41 #include <linux/notifier.h>
42 #include <linux/kernel.h>
43 #include <linux/netdevice.h>
44 #include <linux/socket.h>
45
46 /* This rwlock protects the main hash table, protocol/helper/expected
47    registrations, and conntrack timers */
48 #define ASSERT_READ_LOCK(x)
49 #define ASSERT_WRITE_LOCK(x)
50
51 #include <net/netfilter/nf_conntrack.h>
52 #include <net/netfilter/nf_conntrack_l3proto.h>
53 #include <net/netfilter/nf_conntrack_protocol.h>
54 #include <net/netfilter/nf_conntrack_helper.h>
55 #include <net/netfilter/nf_conntrack_core.h>
56 #include <linux/netfilter_ipv4/listhelp.h>
57
58 #define NF_CONNTRACK_VERSION    "0.4.1"
59
60 #if 0
61 #define DEBUGP printk
62 #else
63 #define DEBUGP(format, args...)
64 #endif
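/* Set the #if above to 1 to enable verbose conntrack debugging via printk. */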
65
66 DEFINE_RWLOCK(nf_conntrack_lock);
67
68 /* nf_conntrack_standalone needs this */
69 atomic_t nf_conntrack_count = ATOMIC_INIT(0);
70
71 void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
72 LIST_HEAD(nf_conntrack_expect_list);
73 struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
74 struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
75 static LIST_HEAD(helpers);
76 unsigned int nf_conntrack_htable_size = 0;
77 int nf_conntrack_max;
78 struct list_head *nf_conntrack_hash;
79 static kmem_cache_t *nf_conntrack_expect_cachep;
80 struct nf_conn nf_conntrack_untracked;
81 unsigned int nf_ct_log_invalid;
82 static LIST_HEAD(unconfirmed);
83 static int nf_conntrack_vmalloc;
84
85 static unsigned int nf_conntrack_next_id = 1;
86 static unsigned int nf_conntrack_expect_next_id = 1;
87 #ifdef CONFIG_NF_CONNTRACK_EVENTS
88 struct notifier_block *nf_conntrack_chain;
89 struct notifier_block *nf_conntrack_expect_chain;
90
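/* Per-CPU event cache: events for the conntrack currently being processed
 * on this CPU are accumulated in ->events and delivered through
 * nf_conntrack_chain in a single notifier call. */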
91 DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
92
93 /* deliver cached events and clear cache entry - must be called with locally
94  * disabled softirqs */
95 static inline void
96 __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
97 {
98         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
99         if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
100             && ecache->events)
101                 notifier_call_chain(&nf_conntrack_chain, ecache->events,
102                                     ecache->ct);
103
104         ecache->events = 0;
105         nf_ct_put(ecache->ct);
106         ecache->ct = NULL;
107 }
108
109 /* Deliver all cached events for a particular conntrack. This is called
110  * by code prior to async packet handling for freeing the skb */
111 void nf_ct_deliver_cached_events(const struct nf_conn *ct)
112 {
113         struct nf_conntrack_ecache *ecache;
114
115         local_bh_disable();
116         ecache = &__get_cpu_var(nf_conntrack_ecache);
117         if (ecache->ct == ct)
118                 __nf_ct_deliver_cached_events(ecache);
119         local_bh_enable();
120 }
121
122 /* Deliver cached events for old pending events, if current conntrack != old */
123 void __nf_ct_event_cache_init(struct nf_conn *ct)
124 {
125         struct nf_conntrack_ecache *ecache;
126         
127         /* take care of delivering potentially old events */
128         ecache = &__get_cpu_var(nf_conntrack_ecache);
129         BUG_ON(ecache->ct == ct);
130         if (ecache->ct)
131                 __nf_ct_deliver_cached_events(ecache);
132         /* initialize for this conntrack/packet */
133         ecache->ct = ct;
134         nf_conntrack_get(&ct->ct_general);
135 }
136
137 /* flush the event cache - touches other CPUs' data and must not be called
138  * while packets are still passing through the code */
139 static void nf_ct_event_cache_flush(void)
140 {
141         struct nf_conntrack_ecache *ecache;
142         int cpu;
143
144         for_each_cpu(cpu) {
145                 ecache = &per_cpu(nf_conntrack_ecache, cpu);
146                 if (ecache->ct)
147                         nf_ct_put(ecache->ct);
148         }
149 }
150 #else
151 static inline void nf_ct_event_cache_flush(void) {}
152 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
153
154 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
155 EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
156
157 /*
158  * This scheme offers various sizes of "struct nf_conn", depending on
159  * the features needed (helper, nat, ...)
160  */
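/* nf_ct_cache[] below is indexed by the NF_CT_F_* feature bitmask,
 * giving one slab cache per feature combination in use. */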
161
162 #define NF_CT_FEATURES_NAMELEN  256
163 static struct {
164         /* name of slab cache. printed in /proc/slabinfo */
165         char *name;
166
167         /* size of slab cache */
168         size_t size;
169
170         /* slab cache pointer */
171         kmem_cache_t *cachep;
172
173         /* allocated slab cache + modules which use this slab cache */
174         int use;
175
176         /* Initialization */
177         int (*init_conntrack)(struct nf_conn *, u_int32_t);
178
179 } nf_ct_cache[NF_CT_F_NUM];
180
181 /* protect members of nf_ct_cache except of "use" */
182 DEFINE_RWLOCK(nf_ct_cache_lock);
183
184 /* This avoids calling kmem_cache_create() with the same name simultaneously */
185 DECLARE_MUTEX(nf_ct_cache_mutex);
186
187 extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
188 struct nf_conntrack_protocol *
189 __nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
190 {
191         if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
192                 return &nf_conntrack_generic_protocol;
193
194         return nf_ct_protos[l3proto][protocol];
195 }
196
197 /* this is guaranteed to always return a valid protocol helper, since
198  * it falls back to generic_protocol */
199 struct nf_conntrack_protocol *
200 nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
201 {
202         struct nf_conntrack_protocol *p;
203
204         preempt_disable();
205         p = __nf_ct_proto_find(l3proto, protocol);
206         if (p) {
207                 if (!try_module_get(p->me))
208                         p = &nf_conntrack_generic_protocol;
209         }
210         preempt_enable();
211         
212         return p;
213 }
214
215 void nf_ct_proto_put(struct nf_conntrack_protocol *p)
216 {
217         module_put(p->me);
218 }
219
220 struct nf_conntrack_l3proto *
221 nf_ct_l3proto_find_get(u_int16_t l3proto)
222 {
223         struct nf_conntrack_l3proto *p;
224
225         preempt_disable();
226         p = __nf_ct_l3proto_find(l3proto);
227         if (p) {
228                 if (!try_module_get(p->me))
229                         p = &nf_conntrack_generic_l3proto;
230         }
231         preempt_enable();
232
233         return p;
234 }
235
236 void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
237 {
238         module_put(p->me);
239 }
240
241 static int nf_conntrack_hash_rnd_initted;
242 static unsigned int nf_conntrack_hash_rnd;
243
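/* Fold a tuple into a chain index: jhash the source and destination L3
 * addresses (keyed by l3num/protonum and the port pairs), then mix both
 * halves with the random seed so hash chain placement stays unpredictable
 * to remote senders. */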
244 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
245                                   unsigned int size, unsigned int rnd)
246 {
247         unsigned int a, b;
248         a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
249                   ((tuple->src.l3num) << 16) | tuple->dst.protonum);
250         b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
251                         (tuple->src.u.all << 16) | tuple->dst.u.all);
252
253         return jhash_2words(a, b, rnd) % size;
254 }
255
256 static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
257 {
258         return __hash_conntrack(tuple, nf_conntrack_htable_size,
259                                 nf_conntrack_hash_rnd);
260 }
261
262 /* Initialize "struct nf_conn" which has spaces for helper */
263 static int
264 init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features)
265 {
266
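        /* Round the data area pointer up to the alignment of
         * union nf_conntrack_help so ->help is properly aligned. */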
267         conntrack->help = (union nf_conntrack_help *)
268                 (((unsigned long)conntrack->data
269                   + (__alignof__(union nf_conntrack_help) - 1))
270                  & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1))));
271         return 0;
272 }
273
274 int nf_conntrack_register_cache(u_int32_t features, const char *name,
275                                 size_t size,
276                                 int (*init)(struct nf_conn *, u_int32_t))
277 {
278         int ret = 0;
279         char *cache_name;
280         kmem_cache_t *cachep;
281
282         DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
283                features, name, size);
284
285         if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
286                 DEBUGP("nf_conntrack_register_cache: invalid features: 0x%x\n",
287                         features);
288                 return -EINVAL;
289         }
290
291         down(&nf_ct_cache_mutex);
292
293         write_lock_bh(&nf_ct_cache_lock);
294         /* e.g.: multiple helpers are loaded */
295         if (nf_ct_cache[features].use > 0) {
296                 DEBUGP("nf_conntrack_register_cache: already registered.\n");
297                 if ((!strncmp(nf_ct_cache[features].name, name,
298                               NF_CT_FEATURES_NAMELEN))
299                     && nf_ct_cache[features].size == size
300                     && nf_ct_cache[features].init_conntrack == init) {
301                         DEBUGP("nf_conntrack_register_cache: reusing.\n");
302                         nf_ct_cache[features].use++;
303                         ret = 0;
304                 } else
305                         ret = -EBUSY;
306
307                 write_unlock_bh(&nf_ct_cache_lock);
308                 up(&nf_ct_cache_mutex);
309                 return ret;
310         }
311         write_unlock_bh(&nf_ct_cache_lock);
312
313         /*
314          * The memory holding the name of the slab cache must stay valid
315          * until the cache is destroyed.
316          */
317         cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
318         if (cache_name == NULL) {
319                 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
320                 ret = -ENOMEM;
321                 goto out_up_mutex;
322         }
323
324         if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
325                                                 >= NF_CT_FEATURES_NAMELEN) {
326                 printk("nf_conntrack_register_cache: name too long\n");
327                 ret = -EINVAL;
328                 goto out_free_name;
329         }
330
331         cachep = kmem_cache_create(cache_name, size, 0, 0,
332                                    NULL, NULL);
333         if (!cachep) {
334                 printk("nf_conntrack_register_cache: Can't create slab cache "
335                        "for the features = 0x%x\n", features);
336                 ret = -ENOMEM;
337                 goto out_free_name;
338         }
339
340         write_lock_bh(&nf_ct_cache_lock);
341         nf_ct_cache[features].use = 1;
342         nf_ct_cache[features].size = size;
343         nf_ct_cache[features].init_conntrack = init;
344         nf_ct_cache[features].cachep = cachep;
345         nf_ct_cache[features].name = cache_name;
346         write_unlock_bh(&nf_ct_cache_lock);
347
348         goto out_up_mutex;
349
350 out_free_name:
351         kfree(cache_name);
352 out_up_mutex:
353         up(&nf_ct_cache_mutex);
354         return ret;
355 }
356
357 /* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
358 void nf_conntrack_unregister_cache(u_int32_t features)
359 {
360         kmem_cache_t *cachep;
361         char *name;
362
363         /*
364          * This ensures that kmem_cache_create() isn't called before the old
365          * slab cache has been destroyed.
366          */
367         DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
368         down(&nf_ct_cache_mutex);
369
370         write_lock_bh(&nf_ct_cache_lock);
371         if (--nf_ct_cache[features].use > 0) {
372                 write_unlock_bh(&nf_ct_cache_lock);
373                 up(&nf_ct_cache_mutex);
374                 return;
375         }
376         cachep = nf_ct_cache[features].cachep;
377         name = nf_ct_cache[features].name;
378         nf_ct_cache[features].cachep = NULL;
379         nf_ct_cache[features].name = NULL;
380         nf_ct_cache[features].init_conntrack = NULL;
381         nf_ct_cache[features].size = 0;
382         write_unlock_bh(&nf_ct_cache_lock);
383
384         synchronize_net();
385
386         kmem_cache_destroy(cachep);
387         kfree(name);
388
389         up(&nf_ct_cache_mutex);
390 }
391
392 int
393 nf_ct_get_tuple(const struct sk_buff *skb,
394                 unsigned int nhoff,
395                 unsigned int dataoff,
396                 u_int16_t l3num,
397                 u_int8_t protonum,
398                 struct nf_conntrack_tuple *tuple,
399                 const struct nf_conntrack_l3proto *l3proto,
400                 const struct nf_conntrack_protocol *protocol)
401 {
402         NF_CT_TUPLE_U_BLANK(tuple);
403
404         tuple->src.l3num = l3num;
405         if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
406                 return 0;
407
408         tuple->dst.protonum = protonum;
409         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
410
411         return protocol->pkt_to_tuple(skb, dataoff, tuple);
412 }
413
414 int
415 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
416                    const struct nf_conntrack_tuple *orig,
417                    const struct nf_conntrack_l3proto *l3proto,
418                    const struct nf_conntrack_protocol *protocol)
419 {
420         NF_CT_TUPLE_U_BLANK(inverse);
421
422         inverse->src.l3num = orig->src.l3num;
423         if (l3proto->invert_tuple(inverse, orig) == 0)
424                 return 0;
425
426         inverse->dst.dir = !orig->dst.dir;
427
428         inverse->dst.protonum = orig->dst.protonum;
429         return protocol->invert_tuple(inverse, orig);
430 }
431
432 /* nf_conntrack_expect helper functions */
433 void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
434 {
435         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
436         NF_CT_ASSERT(!timer_pending(&exp->timeout));
437         list_del(&exp->list);
438         NF_CT_STAT_INC(expect_delete);
439         exp->master->expecting--;
440         nf_conntrack_expect_put(exp);
441 }
442
443 static void expectation_timed_out(unsigned long ul_expect)
444 {
445         struct nf_conntrack_expect *exp = (void *)ul_expect;
446
447         write_lock_bh(&nf_conntrack_lock);
448         nf_ct_unlink_expect(exp);
449         write_unlock_bh(&nf_conntrack_lock);
450         nf_conntrack_expect_put(exp);
451 }
452
453 struct nf_conntrack_expect *
454 __nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
455 {
456         struct nf_conntrack_expect *i;
457         
458         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
459                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
460                         atomic_inc(&i->use);
461                         return i;
462                 }
463         }
464         return NULL;
465 }
466
467 /* Just find an expectation corresponding to a tuple. */
468 struct nf_conntrack_expect *
469 nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
470 {
471         struct nf_conntrack_expect *i;
472         
473         read_lock_bh(&nf_conntrack_lock);
474         i = __nf_conntrack_expect_find(tuple);
475         read_unlock_bh(&nf_conntrack_lock);
476
477         return i;
478 }
479
480 /* If an expectation for this connection is found, it gets deleted from
481  * the global list and then returned. */
482 static struct nf_conntrack_expect *
483 find_expectation(const struct nf_conntrack_tuple *tuple)
484 {
485         struct nf_conntrack_expect *i;
486
487         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
488         /* If master is not in hash table yet (ie. packet hasn't left
489            this machine yet), how can the other end know about the expectation?
490            Hence these are not the droids you are looking for (if
491            master ct never got confirmed, we'd hold a reference to it
492            and weird things would happen to future packets). */
493                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
494                     && nf_ct_is_confirmed(i->master)) {
495                         if (i->flags & NF_CT_EXPECT_PERMANENT) {
496                                 atomic_inc(&i->use);
497                                 return i;
498                         } else if (del_timer(&i->timeout)) {
499                                 nf_ct_unlink_expect(i);
500                                 return i;
501                         }
502                 }
503         }
504         return NULL;
505 }
506
507 /* delete all expectations for this conntrack */
508 void nf_ct_remove_expectations(struct nf_conn *ct)
509 {
510         struct nf_conntrack_expect *i, *tmp;
511
512         /* Optimization: most connections never expect any others. */
513         if (ct->expecting == 0)
514                 return;
515
516         list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
517                 if (i->master == ct && del_timer(&i->timeout)) {
518                         nf_ct_unlink_expect(i);
519                         nf_conntrack_expect_put(i);
520                 }
521         }
522 }
523
524 static void
525 clean_from_lists(struct nf_conn *ct)
526 {
527         unsigned int ho, hr;
528         
529         DEBUGP("clean_from_lists(%p)\n", ct);
530         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
531
532         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
533         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
534         LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
535         LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
536
537         /* Destroy all pending expectations */
538         nf_ct_remove_expectations(ct);
539 }
540
541 static void
542 destroy_conntrack(struct nf_conntrack *nfct)
543 {
544         struct nf_conn *ct = (struct nf_conn *)nfct;
545         struct nf_conntrack_l3proto *l3proto;
546         struct nf_conntrack_protocol *proto;
547
548         DEBUGP("destroy_conntrack(%p)\n", ct);
549         NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
550         NF_CT_ASSERT(!timer_pending(&ct->timeout));
551
552         nf_conntrack_event(IPCT_DESTROY, ct);
553         set_bit(IPS_DYING_BIT, &ct->status);
554
555         /* To make sure we don't get any weird locking issues here:
556          * destroy_conntrack() MUST NOT be called with a write lock
557          * to nf_conntrack_lock!!! -HW */
558         l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
559         if (l3proto && l3proto->destroy)
560                 l3proto->destroy(ct);
561
562         proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
563         if (proto && proto->destroy)
564                 proto->destroy(ct);
565
566         if (nf_conntrack_destroyed)
567                 nf_conntrack_destroyed(ct);
568
569         write_lock_bh(&nf_conntrack_lock);
570         /* Expectations will have been removed in clean_from_lists,
571          * except TFTP can create an expectation on the first packet,
572          * before connection is in the list, so we need to clean here,
573          * too. */
574         nf_ct_remove_expectations(ct);
575
576         /* We overload first tuple to link into unconfirmed list. */
577         if (!nf_ct_is_confirmed(ct)) {
578                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
579                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
580         }
581
582         NF_CT_STAT_INC(delete);
583         write_unlock_bh(&nf_conntrack_lock);
584
585         if (ct->master)
586                 nf_ct_put(ct->master);
587
588         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
589         nf_conntrack_free(ct);
590 }
591
592 static void death_by_timeout(unsigned long ul_conntrack)
593 {
594         struct nf_conn *ct = (void *)ul_conntrack;
595
596         write_lock_bh(&nf_conntrack_lock);
597         /* Inside lock so preempt is disabled on module removal path.
598          * Otherwise we can get spurious warnings. */
599         NF_CT_STAT_INC(delete_list);
600         clean_from_lists(ct);
601         write_unlock_bh(&nf_conntrack_lock);
602         nf_ct_put(ct);
603 }
604
605 static inline int
606 conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
607                     const struct nf_conntrack_tuple *tuple,
608                     const struct nf_conn *ignored_conntrack)
609 {
610         ASSERT_READ_LOCK(&nf_conntrack_lock);
611         return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
612                 && nf_ct_tuple_equal(tuple, &i->tuple);
613 }
614
615 struct nf_conntrack_tuple_hash *
616 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
617                     const struct nf_conn *ignored_conntrack)
618 {
619         struct nf_conntrack_tuple_hash *h;
620         unsigned int hash = hash_conntrack(tuple);
621
622         ASSERT_READ_LOCK(&nf_conntrack_lock);
623         list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
624                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
625                         NF_CT_STAT_INC(found);
626                         return h;
627                 }
628                 NF_CT_STAT_INC(searched);
629         }
630
631         return NULL;
632 }
633
634 /* Find a connection corresponding to a tuple. */
635 struct nf_conntrack_tuple_hash *
636 nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
637                       const struct nf_conn *ignored_conntrack)
638 {
639         struct nf_conntrack_tuple_hash *h;
640
641         read_lock_bh(&nf_conntrack_lock);
642         h = __nf_conntrack_find(tuple, ignored_conntrack);
643         if (h)
644                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
645         read_unlock_bh(&nf_conntrack_lock);
646
647         return h;
648 }
649
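/* Each conntrack is hashed twice, once per direction, so a lookup by
 * either the original or the reply tuple finds the same entry. */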
650 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
651                                        unsigned int hash,
652                                        unsigned int repl_hash) 
653 {
654         ct->id = ++nf_conntrack_next_id;
655         list_prepend(&nf_conntrack_hash[hash],
656                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
657         list_prepend(&nf_conntrack_hash[repl_hash],
658                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
659 }
660
661 void nf_conntrack_hash_insert(struct nf_conn *ct)
662 {
663         unsigned int hash, repl_hash;
664
665         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
666         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
667
668         write_lock_bh(&nf_conntrack_lock);
669         __nf_conntrack_hash_insert(ct, hash, repl_hash);
670         write_unlock_bh(&nf_conntrack_lock);
671 }
672
673 /* Confirm a connection given skb; places it in hash table */
674 int
675 __nf_conntrack_confirm(struct sk_buff **pskb)
676 {
677         unsigned int hash, repl_hash;
678         struct nf_conn *ct;
679         enum ip_conntrack_info ctinfo;
680
681         ct = nf_ct_get(*pskb, &ctinfo);
682
683         /* ipt_REJECT uses nf_conntrack_attach to attach related
684            ICMP/TCP RST packets in other direction.  Actual packet
685            which created connection will be IP_CT_NEW or for an
686            expected connection, IP_CT_RELATED. */
687         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
688                 return NF_ACCEPT;
689
690         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
691         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
692
693         /* We're not in hash table, and we refuse to set up related
694            connections for unconfirmed conns.  But packet copies and
695            REJECT will give spurious warnings here. */
696         /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
697
698         /* No external references means no one else could have
699            confirmed us. */
700         NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
701         DEBUGP("Confirming conntrack %p\n", ct);
702
703         write_lock_bh(&nf_conntrack_lock);
704
705         /* See if there's one in the list already, including reverse:
706            NAT could have grabbed it without realizing, since we're
707            not in the hash.  If there is, we lost race. */
708         if (!LIST_FIND(&nf_conntrack_hash[hash],
709                        conntrack_tuple_cmp,
710                        struct nf_conntrack_tuple_hash *,
711                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
712             && !LIST_FIND(&nf_conntrack_hash[repl_hash],
713                           conntrack_tuple_cmp,
714                           struct nf_conntrack_tuple_hash *,
715                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
716                 /* Remove from unconfirmed list */
717                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
718
719                 __nf_conntrack_hash_insert(ct, hash, repl_hash);
720                 /* Timer relative to confirmation time, not original
721                    setting time, otherwise we'd get timer wrap in
722                    weird delay cases. */
723                 ct->timeout.expires += jiffies;
724                 add_timer(&ct->timeout);
725                 atomic_inc(&ct->ct_general.use);
726                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
727                 NF_CT_STAT_INC(insert);
728                 write_unlock_bh(&nf_conntrack_lock);
729                 if (ct->helper)
730                         nf_conntrack_event_cache(IPCT_HELPER, *pskb);
731 #ifdef CONFIG_NF_NAT_NEEDED
732                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
733                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
734                         nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
735 #endif
736                 nf_conntrack_event_cache(master_ct(ct) ?
737                                          IPCT_RELATED : IPCT_NEW, *pskb);
738                 return NF_ACCEPT;
739         }
740
741         NF_CT_STAT_INC(insert_failed);
742         write_unlock_bh(&nf_conntrack_lock);
743         return NF_DROP;
744 }
745
746 /* Returns true if a connection corresponds to the tuple (required
747    for NAT). */
748 int
749 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
750                          const struct nf_conn *ignored_conntrack)
751 {
752         struct nf_conntrack_tuple_hash *h;
753
754         read_lock_bh(&nf_conntrack_lock);
755         h = __nf_conntrack_find(tuple, ignored_conntrack);
756         read_unlock_bh(&nf_conntrack_lock);
757
758         return h != NULL;
759 }
760
761 /* There's a small race here where we may free a just-assured
762    connection.  Too bad: we're in trouble anyway. */
763 static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
764 {
765         return !(test_bit(IPS_ASSURED_BIT,
766                           &nf_ct_tuplehash_to_ctrack(i)->status));
767 }
768
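/* Table full: try to make room by evicting the oldest entry in this
 * hash chain that has not been assured (IPS_ASSURED still clear). */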
769 static int early_drop(struct list_head *chain)
770 {
771         /* Traverse backwards: gives us oldest, which is roughly LRU */
772         struct nf_conntrack_tuple_hash *h;
773         struct nf_conn *ct = NULL;
774         int dropped = 0;
775
776         read_lock_bh(&nf_conntrack_lock);
777         h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
778         if (h) {
779                 ct = nf_ct_tuplehash_to_ctrack(h);
780                 atomic_inc(&ct->ct_general.use);
781         }
782         read_unlock_bh(&nf_conntrack_lock);
783
784         if (!ct)
785                 return dropped;
786
787         if (del_timer(&ct->timeout)) {
788                 death_by_timeout((unsigned long)ct);
789                 dropped = 1;
790                 NF_CT_STAT_INC(early_drop);
791         }
792         nf_ct_put(ct);
793         return dropped;
794 }
795
796 static inline int helper_cmp(const struct nf_conntrack_helper *i,
797                              const struct nf_conntrack_tuple *rtuple)
798 {
799         return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
800 }
801
802 static struct nf_conntrack_helper *
803 __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
804 {
805         return LIST_FIND(&helpers, helper_cmp,
806                          struct nf_conntrack_helper *,
807                          tuple);
808 }
809
810 struct nf_conntrack_helper *
811 nf_ct_helper_find_get(const struct nf_conntrack_tuple *tuple)
812 {
813         struct nf_conntrack_helper *helper;
814
815         /* need nf_conntrack_lock to assure that helper exists until
816          * try_module_get() is called */
817         read_lock_bh(&nf_conntrack_lock);
818
819         helper = __nf_ct_helper_find(tuple);
820         if (helper) {
821                 /* need to increase module usage count to assure helper will
822                  * not go away while the caller is e.g. busy putting a
823                  * conntrack in the hash that uses the helper */
824                 if (!try_module_get(helper->me))
825                         helper = NULL;
826         }
827
828         read_unlock_bh(&nf_conntrack_lock);
829
830         return helper;
831 }
832
833 void nf_ct_helper_put(struct nf_conntrack_helper *helper)
834 {
835         module_put(helper->me);
836 }
837
838 static struct nf_conn *
839 __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
840                      const struct nf_conntrack_tuple *repl,
841                      const struct nf_conntrack_l3proto *l3proto)
842 {
843         struct nf_conn *conntrack = NULL;
844         u_int32_t features = 0;
845
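        /* The hash seed is picked lazily on first use, presumably so
         * more entropy is available than at early boot. */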
846         if (!nf_conntrack_hash_rnd_initted) {
847                 get_random_bytes(&nf_conntrack_hash_rnd, 4);
848                 nf_conntrack_hash_rnd_initted = 1;
849         }
850
851         if (nf_conntrack_max
852             && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
853                 unsigned int hash = hash_conntrack(orig);
854                 /* Try dropping from this hash chain. */
855                 if (!early_drop(&nf_conntrack_hash[hash])) {
856                         if (net_ratelimit())
857                                 printk(KERN_WARNING
858                                        "nf_conntrack: table full, dropping"
859                                        " packet.\n");
860                         return ERR_PTR(-ENOMEM);
861                 }
862         }
863
864         /* Find the features needed by this conntrack. */
865         features = l3proto->get_features(orig);
866         read_lock_bh(&nf_conntrack_lock);
867         if (__nf_ct_helper_find(repl) != NULL)
868                 features |= NF_CT_F_HELP;
869         read_unlock_bh(&nf_conntrack_lock);
870
871         DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
872
873         read_lock_bh(&nf_ct_cache_lock);
874
875         if (!nf_ct_cache[features].use) {
876                 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
877                         features);
878                 goto out;
879         }
880
881         conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
882         if (conntrack == NULL) {
883                 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
884                 goto out;
885         }
886
887         memset(conntrack, 0, nf_ct_cache[features].size);
888         conntrack->features = features;
889         if (nf_ct_cache[features].init_conntrack &&
890             nf_ct_cache[features].init_conntrack(conntrack, features) < 0) {
891                 DEBUGP("nf_conntrack_alloc: failed to init\n");
892                 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
893                 conntrack = NULL;
894                 goto out;
895         }
896
897         atomic_set(&conntrack->ct_general.use, 1);
898         conntrack->ct_general.destroy = destroy_conntrack;
899         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
900         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
901         /* Don't set timer yet: wait for confirmation */
902         init_timer(&conntrack->timeout);
903         conntrack->timeout.data = (unsigned long)conntrack;
904         conntrack->timeout.function = death_by_timeout;
905
906         atomic_inc(&nf_conntrack_count);
907 out:
908         read_unlock_bh(&nf_ct_cache_lock);
909         return conntrack;
910 }
911
912 struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
913                                    const struct nf_conntrack_tuple *repl)
914 {
915         struct nf_conntrack_l3proto *l3proto;
916
917         l3proto = __nf_ct_l3proto_find(orig->src.l3num);
918         return __nf_conntrack_alloc(orig, repl, l3proto);
919 }
920
921 void nf_conntrack_free(struct nf_conn *conntrack)
922 {
923         u_int32_t features = conntrack->features;
924         NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
925         DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
926                conntrack);
927         kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
928         atomic_dec(&nf_conntrack_count);
929 }
930
931 /* Allocate a new conntrack: we return -ENOMEM if classification
932    failed due to stress.  Otherwise it really is unclassifiable. */
933 static struct nf_conntrack_tuple_hash *
934 init_conntrack(const struct nf_conntrack_tuple *tuple,
935                struct nf_conntrack_l3proto *l3proto,
936                struct nf_conntrack_protocol *protocol,
937                struct sk_buff *skb,
938                unsigned int dataoff)
939 {
940         struct nf_conn *conntrack;
941         struct nf_conntrack_tuple repl_tuple;
942         struct nf_conntrack_expect *exp;
943
944         if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
945                 DEBUGP("Can't invert tuple.\n");
946                 return NULL;
947         }
948
949         conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
950         if (conntrack == NULL || IS_ERR(conntrack)) {
951                 DEBUGP("Can't allocate conntrack.\n");
952                 return (struct nf_conntrack_tuple_hash *)conntrack;
953         }
954
955         if (!protocol->new(conntrack, skb, dataoff)) {
956                 nf_conntrack_free(conntrack);
957                 DEBUGP("init conntrack: can't track with proto module\n");
958                 return NULL;
959         }
960
961         write_lock_bh(&nf_conntrack_lock);
962         exp = find_expectation(tuple);
963
964         if (exp) {
965                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
966                         conntrack, exp);
967                 /* Welcome, Mr. Bond.  We've been expecting you... */
968                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
969                 conntrack->master = exp->master;
970 #ifdef CONFIG_NF_CONNTRACK_MARK
971                 conntrack->mark = exp->master->mark;
972 #endif
973                 nf_conntrack_get(&conntrack->master->ct_general);
974                 NF_CT_STAT_INC(expect_new);
975         } else {
976                 conntrack->helper = __nf_ct_helper_find(&repl_tuple);
977
978                 NF_CT_STAT_INC(new);
979         }
980
981         /* Overload tuple linked list to put us in unconfirmed list. */
982         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
983
984         write_unlock_bh(&nf_conntrack_lock);
985
986         if (exp) {
987                 if (exp->expectfn)
988                         exp->expectfn(conntrack, exp);
989                 nf_conntrack_expect_put(exp);
990         }
991
992         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
993 }
994
995 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
996 static inline struct nf_conn *
997 resolve_normal_ct(struct sk_buff *skb,
998                   unsigned int dataoff,
999                   u_int16_t l3num,
1000                   u_int8_t protonum,
1001                   struct nf_conntrack_l3proto *l3proto,
1002                   struct nf_conntrack_protocol *proto,
1003                   int *set_reply,
1004                   enum ip_conntrack_info *ctinfo)
1005 {
1006         struct nf_conntrack_tuple tuple;
1007         struct nf_conntrack_tuple_hash *h;
1008         struct nf_conn *ct;
1009
1010         if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
1011                              dataoff, l3num, protonum, &tuple, l3proto,
1012                              proto)) {
1013                 DEBUGP("resolve_normal_ct: Can't get tuple\n");
1014                 return NULL;
1015         }
1016
1017         /* look for tuple match */
1018         h = nf_conntrack_find_get(&tuple, NULL);
1019         if (!h) {
1020                 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
1021                 if (!h)
1022                         return NULL;
1023                 if (IS_ERR(h))
1024                         return (void *)h;
1025         }
1026         ct = nf_ct_tuplehash_to_ctrack(h);
1027
1028         /* It exists; we have (non-exclusive) reference. */
1029         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1030                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1031                 /* Please set reply bit if this packet OK */
1032                 *set_reply = 1;
1033         } else {
1034                 /* Once we've had two way comms, always ESTABLISHED. */
1035                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1036                         DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
1037                         *ctinfo = IP_CT_ESTABLISHED;
1038                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1039                         DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
1040                         *ctinfo = IP_CT_RELATED;
1041                 } else {
1042                         DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
1043                         *ctinfo = IP_CT_NEW;
1044                 }
1045                 *set_reply = 0;
1046         }
1047         skb->nfct = &ct->ct_general;
1048         skb->nfctinfo = *ctinfo;
1049         return ct;
1050 }
1051
1052 unsigned int
1053 nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
1054 {
1055         struct nf_conn *ct;
1056         enum ip_conntrack_info ctinfo;
1057         struct nf_conntrack_l3proto *l3proto;
1058         struct nf_conntrack_protocol *proto;
1059         unsigned int dataoff;
1060         u_int8_t protonum;
1061         int set_reply = 0;
1062         int ret;
1063
1064         /* Previously seen (loopback or untracked)?  Ignore. */
1065         if ((*pskb)->nfct) {
1066                 NF_CT_STAT_INC(ignore);
1067                 return NF_ACCEPT;
1068         }
1069
1070         l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
1071         if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
1072                 DEBUGP("not prepared to track yet or error occurred\n");
1073                 return -ret;
1074         }
1075
1076         proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
1077
1078         /* It may be a special packet, error, unclean...
1079          * the inverse of the return code tells the netfilter
1080          * core what to do with the packet. */
1081         if (proto->error != NULL &&
1082             (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
1083                 NF_CT_STAT_INC(error);
1084                 NF_CT_STAT_INC(invalid);
1085                 return -ret;
1086         }
1087
1088         ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
1089                                &set_reply, &ctinfo);
1090         if (!ct) {
1091                 /* Not valid part of a connection */
1092                 NF_CT_STAT_INC(invalid);
1093                 return NF_ACCEPT;
1094         }
1095
1096         if (IS_ERR(ct)) {
1097                 /* Too stressed to deal. */
1098                 NF_CT_STAT_INC(drop);
1099                 return NF_DROP;
1100         }
1101
1102         NF_CT_ASSERT((*pskb)->nfct);
1103
1104         ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
1105         if (ret < 0) {
1106                 /* Invalid: inverse of the return code tells
1107                  * the netfilter core what to do */
1108                 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
1109                 nf_conntrack_put((*pskb)->nfct);
1110                 (*pskb)->nfct = NULL;
1111                 NF_CT_STAT_INC(invalid);
1112                 return -ret;
1113         }
1114
1115         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1116                 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
1117
1118         return ret;
1119 }
1120
1121 int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1122                          const struct nf_conntrack_tuple *orig)
1123 {
1124         return nf_ct_invert_tuple(inverse, orig,
1125                                   __nf_ct_l3proto_find(orig->src.l3num),
1126                                   __nf_ct_proto_find(orig->src.l3num,
1127                                                      orig->dst.protonum));
1128 }
1129
1130 /* Would two expected things clash? */
1131 static inline int expect_clash(const struct nf_conntrack_expect *a,
1132                                const struct nf_conntrack_expect *b)
1133 {
1134         /* Part covered by intersection of masks must be unequal,
1135            otherwise they clash */
1136         struct nf_conntrack_tuple intersect_mask;
1137         int count;
1138
1139         intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1140         intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1141         intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1142         intersect_mask.dst.protonum = a->mask.dst.protonum
1143                                         & b->mask.dst.protonum;
1144
1145         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) {
1146                 intersect_mask.src.u3.all[count] =
1147                         a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1148         }
1149
1150         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) {
1151                 intersect_mask.dst.u3.all[count] =
1152                         a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1153         }
1154
1155         return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1156 }
1157
1158 static inline int expect_matches(const struct nf_conntrack_expect *a,
1159                                  const struct nf_conntrack_expect *b)
1160 {
1161         return a->master == b->master
1162                 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1163                 && nf_ct_tuple_equal(&a->mask, &b->mask);
1164 }
1165
1166 /* Generally a bad idea to call this: could have matched already. */
1167 void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1168 {
1169         struct nf_conntrack_expect *i;
1170
1171         write_lock_bh(&nf_conntrack_lock);
1172         /* choose the oldest expectation to evict */
1173         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1174                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1175                         nf_ct_unlink_expect(i);
1176                         write_unlock_bh(&nf_conntrack_lock);
1177                         nf_conntrack_expect_put(i);
1178                         return;
1179                 }
1180         }
1181         write_unlock_bh(&nf_conntrack_lock);
1182 }
1183
1184 /* We don't increase the master conntrack refcount for non-fulfilled
1185  * conntracks. During the conntrack destruction, the expectations are
1186  * always killed before the conntrack itself */
1187 struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1188 {
1189         struct nf_conntrack_expect *new;
1190
1191         new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1192         if (!new) {
1193                 DEBUGP("expect_related: OOM allocating expect\n");
1194                 return NULL;
1195         }
1196         new->master = me;
1197         atomic_set(&new->use, 1);
1198         return new;
1199 }
1200
1201 void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1202 {
1203         if (atomic_dec_and_test(&exp->use))
1204                 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1205 }
1206
1207 static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1208 {
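        /* Two references are taken here: one for the expect list linkage
         * and, below, one for the running timer; nf_ct_unlink_expect()
         * and expectation_timed_out() drop them again. */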
1209         atomic_inc(&exp->use);
1210         exp->master->expecting++;
1211         list_add(&exp->list, &nf_conntrack_expect_list);
1212
1213         init_timer(&exp->timeout);
1214         exp->timeout.data = (unsigned long)exp;
1215         exp->timeout.function = expectation_timed_out;
1216         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
1217         add_timer(&exp->timeout);
1218
1219         exp->id = ++nf_conntrack_expect_next_id;
1220         atomic_inc(&exp->use);
1221         NF_CT_STAT_INC(expect_create);
1222 }
1223
1224 /* Race with expectations being used means we could have none to find; OK. */
1225 static void evict_oldest_expect(struct nf_conn *master)
1226 {
1227         struct nf_conntrack_expect *i;
1228
1229         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1230                 if (i->master == master) {
1231                         if (del_timer(&i->timeout)) {
1232                                 nf_ct_unlink_expect(i);
1233                                 nf_conntrack_expect_put(i);
1234                         }
1235                         break;
1236                 }
1237         }
1238 }
1239
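/* Re-arm a pending expectation timer.  Returns 0 if the timer had already
 * fired, i.e. the expectation is dying and must not be reused. */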
1240 static inline int refresh_timer(struct nf_conntrack_expect *i)
1241 {
1242         if (!del_timer(&i->timeout))
1243                 return 0;
1244
1245         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1246         add_timer(&i->timeout);
1247         return 1;
1248 }
1249
1250 int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1251 {
1252         struct nf_conntrack_expect *i;
1253         struct nf_conn *master = expect->master;
1254         int ret;
1255
1256         DEBUGP("nf_conntrack_expect_related %p\n", expect);
1257         DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1258         DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);
1259
1260         write_lock_bh(&nf_conntrack_lock);
1261         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1262                 if (expect_matches(i, expect)) {
1263                         /* Refresh timer: if it's dying, ignore.. */
1264                         if (refresh_timer(i)) {
1265                                 ret = 0;
1266                                 goto out;
1267                         }
1268                 } else if (expect_clash(i, expect)) {
1269                         ret = -EBUSY;
1270                         goto out;
1271                 }
1272         }
1273         /* Will be over limit? */
1274         if (master->helper->max_expected && 
1275             master->expecting >= master->helper->max_expected)
1276                 evict_oldest_expect(master);
1277
1278         nf_conntrack_expect_insert(expect);
1279         nf_conntrack_expect_event(IPEXP_NEW, expect);
1280         ret = 0;
1281 out:
1282         write_unlock_bh(&nf_conntrack_lock);
1283         return ret;
1284 }
1285
1286 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1287    implicitly racy: see __nf_conntrack_confirm */
1288 void nf_conntrack_alter_reply(struct nf_conn *conntrack,
1289                               const struct nf_conntrack_tuple *newreply)
1290 {
1291         write_lock_bh(&nf_conntrack_lock);
1292         /* Should be unconfirmed, so not in hash table yet */
1293         NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack));
1294
1295         DEBUGP("Altering reply tuple of %p to ", conntrack);
1296         NF_CT_DUMP_TUPLE(newreply);
1297
1298         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1299         if (!conntrack->master && conntrack->expecting == 0)
1300                 conntrack->helper = __nf_ct_helper_find(newreply);
1301         write_unlock_bh(&nf_conntrack_lock);
1302 }
1303
1304 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1305 {
1306         int ret;
1307         BUG_ON(me->timeout == 0);
1308
1309         ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1310                                           sizeof(struct nf_conn)
1311                                           + sizeof(union nf_conntrack_help)
1312                                           + __alignof__(union nf_conntrack_help),
1313                                           init_conntrack_for_helper);
1314         if (ret < 0) {
1315                 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1316                 return ret;
1317         }
1318         write_lock_bh(&nf_conntrack_lock);
1319         list_prepend(&helpers, me);
1320         write_unlock_bh(&nf_conntrack_lock);
1321
1322         return 0;
1323 }
1324
1325 struct nf_conntrack_helper *
1326 __nf_conntrack_helper_find_byname(const char *name)
1327 {
1328         struct nf_conntrack_helper *h;
1329
1330         list_for_each_entry(h, &helpers, list) {
1331                 if (!strcmp(h->name, name))
1332                         return h;
1333         }
1334
1335         return NULL;
1336 }
1337
1338 static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1339                          const struct nf_conntrack_helper *me)
1340 {
1341         if (nf_ct_tuplehash_to_ctrack(i)->helper == me) {
1342                 nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i));
1343                 nf_ct_tuplehash_to_ctrack(i)->helper = NULL;
1344         }
1345         return 0;
1346 }
1347
1348 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1349 {
1350         unsigned int i;
1351         struct nf_conntrack_expect *exp, *tmp;
1352
1353         /* Need write lock here, to delete helper. */
1354         write_lock_bh(&nf_conntrack_lock);
1355         LIST_DELETE(&helpers, me);
1356
1357         /* Get rid of expectations */
1358         list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1359                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1360                         nf_ct_unlink_expect(exp);
1361                         nf_conntrack_expect_put(exp);
1362                 }
1363         }
1364
1365         /* Get rid of expecteds, set helpers to NULL. */
1366         LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1367         for (i = 0; i < nf_conntrack_htable_size; i++)
1368                 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1369                             struct nf_conntrack_tuple_hash *, me);
1370         write_unlock_bh(&nf_conntrack_lock);
1371
1372         /* Someone could be still looking at the helper in a bh. */
1373         synchronize_net();
1374 }
1375
1376 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1377 void __nf_ct_refresh_acct(struct nf_conn *ct,
1378                           enum ip_conntrack_info ctinfo,
1379                           const struct sk_buff *skb,
1380                           unsigned long extra_jiffies,
1381                           int do_acct)
1382 {
1383         int event = 0;
1384
1385         NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1386         NF_CT_ASSERT(skb);
1387
1388         write_lock_bh(&nf_conntrack_lock);
1389
1390         /* If not in hash table, timer will not be active yet */
1391         if (!nf_ct_is_confirmed(ct)) {
1392                 ct->timeout.expires = extra_jiffies;
1393                 event = IPCT_REFRESH;
1394         } else {
1395                 /* Need del_timer for race avoidance (may already be dying). */
1396                 if (del_timer(&ct->timeout)) {
1397                         ct->timeout.expires = jiffies + extra_jiffies;
1398                         add_timer(&ct->timeout);
1399                         event = IPCT_REFRESH;
1400                 }
1401         }
1402
1403 #ifdef CONFIG_NF_CT_ACCT
1404         if (do_acct) {
1405                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1406                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1407                         skb->len - (unsigned int)(skb->nh.raw - skb->data);
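                /* A counter with the top bit set is half-way to wrapping:
                 * flag the event so listeners can pick the values up before
                 * the 32-bit counters overflow. */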
1408                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1409                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1410                         event |= IPCT_COUNTER_FILLING;
1411         }
1412 #endif
1413
1414         write_unlock_bh(&nf_conntrack_lock);
1415
1416         /* must be unlocked when calling event cache */
1417         if (event)
1418                 nf_conntrack_event_cache(event, skb);
1419 }
1420
1421 #if defined(CONFIG_NF_CT_NETLINK) || \
1422     defined(CONFIG_NF_CT_NETLINK_MODULE)
1423
1424 #include <linux/netfilter/nfnetlink.h>
1425 #include <linux/netfilter/nfnetlink_conntrack.h>
1426
1427 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1428  * in nf_conntrack_core, since we don't want the protocols to autoload
1429  * or depend on ctnetlink */
1430 int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1431                                const struct nf_conntrack_tuple *tuple)
1432 {
1433         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1434                 &tuple->src.u.tcp.port);
1435         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1436                 &tuple->dst.u.tcp.port);
1437         return 0;
1438
1439 nfattr_failure:
1440         return -1;
1441 }

static const size_t cta_min_proto[CTA_PROTO_MAX] = {
        [CTA_PROTO_SRC_PORT-1]  = sizeof(u_int16_t),
        [CTA_PROTO_DST_PORT-1]  = sizeof(u_int16_t)
};

int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
                               struct nf_conntrack_tuple *t)
{
        if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
                return -EINVAL;

        if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
                return -EINVAL;

        t->src.u.tcp.port =
                *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
        t->dst.u.tcp.port =
                *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);

        return 0;
}
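
/* A sketch of how a port-based protocol would wire these up in its
 * struct nf_conntrack_protocol (field names as used by the ctnetlink
 * aware trackers of this era; treat as illustrative, not definitive):
 *
 *      .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr,
 *      .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple,
 */
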
#endif

/* Used by ipt_REJECT and ip6t_REJECT. */
void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
        struct nf_conn *ct;
        enum ip_conntrack_info ctinfo;

        /* This ICMP is in reverse direction to the packet which caused it */
        ct = nf_ct_get(skb, &ctinfo);
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach to new skbuff, and increment count */
        nskb->nfct = &ct->ct_general;
        nskb->nfctinfo = ctinfo;
        nf_conntrack_get(nskb->nfct);
}
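
/* Illustrative only: a REJECT-style target hands the state of the
 * offending packet to the reply it generates, roughly:
 *
 *      nskb = build_reject_reply(oldskb);      // hypothetical helper
 *      __nf_conntrack_attach(nskb, oldskb);
 *
 * The real callers live in the REJECT target modules, which reach
 * this function through an attach hook so they need not depend on
 * conntrack being loaded.
 */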

/* Adapter: LIST_FIND_W() hands us tuple-hash entries, while callers
 * supply a predicate on the conntrack itself. */
static inline int
do_iter(const struct nf_conntrack_tuple_hash *i,
        int (*iter)(struct nf_conn *i, void *data),
        void *data)
{
        return iter(nf_ct_tuplehash_to_ctrack(i), data);
}

/* Bring out ya dead! */
static struct nf_conntrack_tuple_hash *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
                void *data, unsigned int *bucket)
{
        struct nf_conntrack_tuple_hash *h = NULL;

        write_lock_bh(&nf_conntrack_lock);
        for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
                h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
                                struct nf_conntrack_tuple_hash *, iter, data);
                if (h)
                        break;
        }
        if (!h)
                h = LIST_FIND_W(&unconfirmed, do_iter,
                                struct nf_conntrack_tuple_hash *, iter, data);
        if (h)
                atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
        write_unlock_bh(&nf_conntrack_lock);

        return h;
}
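
/* Note: a non-NULL return carries a reference (the atomic_inc above),
 * which the caller must drop with nf_ct_put() once it is done. */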

void
nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
{
        struct nf_conntrack_tuple_hash *h;
        unsigned int bucket = 0;

        while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
                struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
                /* Time to push up daisies... */
                if (del_timer(&ct->timeout))
                        death_by_timeout((unsigned long)ct);
                /* ... else the timer will get him soon. */

                nf_ct_put(ct);
        }
}
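
/* A sketch of a caller-supplied predicate (hypothetical, not from this
 * file): remove only the IPv4 entries.
 *
 *      static int kill_ipv4(struct nf_conn *i, void *data)
 *      {
 *              return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num
 *                     == AF_INET;
 *      }
 *
 *      nf_ct_iterate_cleanup(kill_ipv4, NULL);
 *
 * kill_all() below is the degenerate case: match everything. */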

static int kill_all(struct nf_conn *i, void *data)
{
        return 1;
}

static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
{
        if (vmalloced)
                vfree(hash);
        else
                free_pages((unsigned long)hash,
                           get_order(sizeof(struct list_head) * size));
}
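
/* Counterpart of alloc_hashtable() below: the vmalloced flag records
 * which allocator provided the table, so it is freed the same way. */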

void nf_conntrack_flush(void)
{
        nf_ct_iterate_cleanup(kill_all, NULL);
}
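
/* Flushing drops every entry; nf_conntrack_cleanup() below keeps
 * calling it until nf_conntrack_count finally reaches zero. */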

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void nf_conntrack_cleanup(void)
{
        int i;

        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

        nf_ct_event_cache_flush();
 i_see_dead_people:
        nf_conntrack_flush();
        if (atomic_read(&nf_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }
        /* wait until all references to nf_conntrack_untracked are dropped */
        while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
                schedule();

        for (i = 0; i < NF_CT_F_NUM; i++) {
                if (nf_ct_cache[i].use == 0)
                        continue;

                NF_CT_ASSERT(nf_ct_cache[i].use == 1);
                nf_ct_cache[i].use = 1;
                nf_conntrack_unregister_cache(i);
        }
        kmem_cache_destroy(nf_conntrack_expect_cachep);
        free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
                            nf_conntrack_htable_size);

        /* free l3proto protocol tables */
        for (i = 0; i < PF_MAX; i++)
                if (nf_ct_protos[i]) {
                        kfree(nf_ct_protos[i]);
                        nf_ct_protos[i] = NULL;
                }
}

static struct list_head *alloc_hashtable(int size, int *vmalloced)
{
        struct list_head *hash;
        unsigned int i;

        *vmalloced = 0;
        hash = (void *)__get_free_pages(GFP_KERNEL,
                                        get_order(sizeof(struct list_head)
                                                  * size));
        if (!hash) {
                *vmalloced = 1;
                printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
                hash = vmalloc(sizeof(struct list_head) * size);
        }

        if (hash)
                for (i = 0; i < size; i++)
                        INIT_LIST_HEAD(&hash[i]);

        return hash;
}
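
/* Allocation strategy, as the code above reads: try physically
 * contiguous pages first and fall back to vmalloc() when the page
 * allocator cannot satisfy the request; *vmalloced tells
 * free_conntrack_hash() which path was taken. */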

static int set_hashsize(const char *val, struct kernel_param *kp)
{
        int i, bucket, hashsize, vmalloced;
        int old_vmalloced, old_size;
        int rnd;
        struct list_head *hash, *old_hash;
        struct nf_conntrack_tuple_hash *h;

        /* On boot, we can set this without any fancy locking. */
        if (!nf_conntrack_htable_size)
                return param_set_uint(val, kp);

        hashsize = simple_strtol(val, NULL, 0);
        if (!hashsize)
                return -EINVAL;

        hash = alloc_hashtable(hashsize, &vmalloced);
        if (!hash)
                return -ENOMEM;

        /* We have to rehash every entry for the new table anyway, so
         * we can also use a new random seed */
        get_random_bytes(&rnd, 4);

        write_lock_bh(&nf_conntrack_lock);
        for (i = 0; i < nf_conntrack_htable_size; i++) {
                while (!list_empty(&nf_conntrack_hash[i])) {
                        h = list_entry(nf_conntrack_hash[i].next,
                                       struct nf_conntrack_tuple_hash, list);
                        list_del(&h->list);
                        bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
                        list_add_tail(&h->list, &hash[bucket]);
                }
        }
        old_size = nf_conntrack_htable_size;
        old_vmalloced = nf_conntrack_vmalloc;
        old_hash = nf_conntrack_hash;

        nf_conntrack_htable_size = hashsize;
        nf_conntrack_vmalloc = vmalloced;
        nf_conntrack_hash = hash;
        nf_conntrack_hash_rnd = rnd;
        write_unlock_bh(&nf_conntrack_lock);

        free_conntrack_hash(old_hash, old_vmalloced, old_size);
        return 0;
}

module_param_call(hashsize, set_hashsize, param_get_uint,
                  &nf_conntrack_htable_size, 0600);
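
/* With the 0600 mode above the table can be resized at runtime, e.g.
 * (path assumed from the usual module parameter sysfs layout):
 *
 *      echo 16384 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * or sized once at load time with "modprobe nf_conntrack hashsize=16384". */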

int __init nf_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
         * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
        if (!nf_conntrack_htable_size) {
                nf_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        nf_conntrack_htable_size = 8192;
                if (nf_conntrack_htable_size < 16)
                        nf_conntrack_htable_size = 16;
        }
        nf_conntrack_max = 8 * nf_conntrack_htable_size;

        printk("nf_conntrack version %s (%u buckets, %d max)\n",
               NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
               nf_conntrack_max);

        nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
                                            &nf_conntrack_vmalloc);
        if (!nf_conntrack_hash) {
                printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
                goto err_out;
        }

        ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
                                          sizeof(struct nf_conn), NULL);
        if (ret < 0) {
                printk(KERN_ERR "Unable to create nf_conn slab cache\n");
                goto err_free_hash;
        }

        nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
                                        sizeof(struct nf_conntrack_expect),
                                        0, 0, NULL, NULL);
        if (!nf_conntrack_expect_cachep) {
                printk(KERN_ERR "Unable to create nf_expect slab cache\n");
                goto err_free_conntrack_slab;
        }

        /* Don't NEED lock here, but good form anyway. */
        write_lock_bh(&nf_conntrack_lock);
        for (i = 0; i < PF_MAX; i++)
                nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
        write_unlock_bh(&nf_conntrack_lock);

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
        /*  - and to look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);

        return ret;

err_free_conntrack_slab:
        nf_conntrack_unregister_cache(NF_CT_F_BASIC);
err_free_hash:
        free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
                            nf_conntrack_htable_size);
err_out:
        return -ENOMEM;
}