net/netfilter/nf_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14  *      - new API and handling of conntrack/nat helpers
15  *      - now capable of multiple expectations for one master
16  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17  *      - add usage/reference counts to ip_conntrack_expect
18  *      - export ip_conntrack[_expect]_{find_get,put} functions
19  * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20  *      - generalize L3 protocol dependent part.
21  * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22  *      - add support for various sizes of conntrack structures.
23  * 26 Jan 2006: Harald Welte <laforge@netfilter.org>
24  *      - restructure nf_conn (introduce nf_conn_help)
25  *      - redesign 'features' how they were originally intended
26  * 26 Feb 2006: Pablo Neira Ayuso <pablo@eurodev.net>
27  *      - add support for L3 protocol module load on demand.
28  *
29  * Derived from net/ipv4/netfilter/ip_conntrack_core.c
30  */
31
32 #include <linux/config.h>
33 #include <linux/types.h>
34 #include <linux/netfilter.h>
35 #include <linux/module.h>
36 #include <linux/skbuff.h>
37 #include <linux/proc_fs.h>
38 #include <linux/vmalloc.h>
39 #include <linux/stddef.h>
40 #include <linux/slab.h>
41 #include <linux/random.h>
42 #include <linux/jhash.h>
43 #include <linux/err.h>
44 #include <linux/percpu.h>
45 #include <linux/moduleparam.h>
46 #include <linux/notifier.h>
47 #include <linux/kernel.h>
48 #include <linux/netdevice.h>
49 #include <linux/socket.h>
50
51 /* This rwlock protects the main hash table, protocol/helper/expected
52    registrations, conntrack timers */
53 #define ASSERT_READ_LOCK(x)
54 #define ASSERT_WRITE_LOCK(x)
55
56 #include <net/netfilter/nf_conntrack.h>
57 #include <net/netfilter/nf_conntrack_l3proto.h>
58 #include <net/netfilter/nf_conntrack_protocol.h>
59 #include <net/netfilter/nf_conntrack_helper.h>
60 #include <net/netfilter/nf_conntrack_core.h>
61 #include <linux/netfilter_ipv4/listhelp.h>
62
63 #define NF_CONNTRACK_VERSION    "0.5.0"
64
65 #if 0
66 #define DEBUGP printk
67 #else
68 #define DEBUGP(format, args...)
69 #endif
70
71 DEFINE_RWLOCK(nf_conntrack_lock);
72
73 /* nf_conntrack_standalone needs this */
74 atomic_t nf_conntrack_count = ATOMIC_INIT(0);
75
76 void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
77 LIST_HEAD(nf_conntrack_expect_list);
78 struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
79 struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
80 static LIST_HEAD(helpers);
81 unsigned int nf_conntrack_htable_size = 0;
82 int nf_conntrack_max;
83 struct list_head *nf_conntrack_hash;
84 static kmem_cache_t *nf_conntrack_expect_cachep;
85 struct nf_conn nf_conntrack_untracked;
86 unsigned int nf_ct_log_invalid;
87 static LIST_HEAD(unconfirmed);
88 static int nf_conntrack_vmalloc;
89
90 static unsigned int nf_conntrack_next_id;
91 static unsigned int nf_conntrack_expect_next_id;
92 #ifdef CONFIG_NF_CONNTRACK_EVENTS
93 struct notifier_block *nf_conntrack_chain;
94 struct notifier_block *nf_conntrack_expect_chain;
95
96 DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
97
98 /* deliver cached events and clear cache entry - must be called with locally
99  * disabled softirqs */
100 static inline void
101 __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
102 {
103         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
104         if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
105             && ecache->events)
106                 notifier_call_chain(&nf_conntrack_chain, ecache->events,
107                                     ecache->ct);
108
109         ecache->events = 0;
110         nf_ct_put(ecache->ct);
111         ecache->ct = NULL;
112 }
113
114 /* Deliver all cached events for a particular conntrack. This is called
115  * by code prior to async packet handling for freeing the skb */
116 void nf_ct_deliver_cached_events(const struct nf_conn *ct)
117 {
118         struct nf_conntrack_ecache *ecache;
119
120         local_bh_disable();
121         ecache = &__get_cpu_var(nf_conntrack_ecache);
122         if (ecache->ct == ct)
123                 __nf_ct_deliver_cached_events(ecache);
124         local_bh_enable();
125 }
126
127 /* Deliver cached events for old pending events, if current conntrack != old */
128 void __nf_ct_event_cache_init(struct nf_conn *ct)
129 {
130         struct nf_conntrack_ecache *ecache;
131         
132         /* take care of delivering potentially old events */
133         ecache = &__get_cpu_var(nf_conntrack_ecache);
134         BUG_ON(ecache->ct == ct);
135         if (ecache->ct)
136                 __nf_ct_deliver_cached_events(ecache);
137         /* initialize for this conntrack/packet */
138         ecache->ct = ct;
139         nf_conntrack_get(&ct->ct_general);
140 }
141
142 /* flush the event cache - touches other CPU's data and must not be called
143  * while packets are still passing through the code */
144 static void nf_ct_event_cache_flush(void)
145 {
146         struct nf_conntrack_ecache *ecache;
147         int cpu;
148
149         for_each_cpu(cpu) {
150                 ecache = &per_cpu(nf_conntrack_ecache, cpu);
151                 if (ecache->ct)
152                         nf_ct_put(ecache->ct);
153         }
154 }
155 #else
156 static inline void nf_ct_event_cache_flush(void) {}
157 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
158
159 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
160 EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
161
162 /*
163  * This scheme offers various sizes of "struct nf_conn" depending on
164  * the features (helper, nat, ...)
165  */
166
167 #define NF_CT_FEATURES_NAMELEN  256
168 static struct {
169         /* name of slab cache. printed in /proc/slabinfo */
170         char *name;
171
172         /* size of slab cache */
173         size_t size;
174
175         /* slab cache pointer */
176         kmem_cache_t *cachep;
177
178         /* allocated slab cache + modules which use this slab cache */
179         int use;
180
181         /* Initialization */
182         int (*init_conntrack)(struct nf_conn *, u_int32_t);
183
184 } nf_ct_cache[NF_CT_F_NUM];
185
186 /* protect members of nf_ct_cache except "use" */
187 DEFINE_RWLOCK(nf_ct_cache_lock);
188
189 /* This avoids calling kmem_cache_create() with the same name simultaneously */
190 static DEFINE_MUTEX(nf_ct_cache_mutex);
191
192 extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
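/* Look up the L4 protocol tracker for (l3proto, protocol); falls back to
 * the generic protocol when no table is registered for this L3 family.
 * The caller must keep the tracker from going away, e.g. by disabling
 * preemption as nf_ct_proto_find_get() does. */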
193 struct nf_conntrack_protocol *
194 __nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
195 {
196         if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
197                 return &nf_conntrack_generic_protocol;
198
199         return nf_ct_protos[l3proto][protocol];
200 }
201
202 /* this is guaranteed to always return a valid protocol helper, since
203  * it falls back to generic_protocol */
204 struct nf_conntrack_protocol *
205 nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
206 {
207         struct nf_conntrack_protocol *p;
208
209         preempt_disable();
210         p = __nf_ct_proto_find(l3proto, protocol);
211         if (p) {
212                 if (!try_module_get(p->me))
213                         p = &nf_conntrack_generic_protocol;
214         }
215         preempt_enable();
216         
217         return p;
218 }
219
220 void nf_ct_proto_put(struct nf_conntrack_protocol *p)
221 {
222         module_put(p->me);
223 }
224
225 struct nf_conntrack_l3proto *
226 nf_ct_l3proto_find_get(u_int16_t l3proto)
227 {
228         struct nf_conntrack_l3proto *p;
229
230         preempt_disable();
231         p = __nf_ct_l3proto_find(l3proto);
232         if (p) {
233                 if (!try_module_get(p->me))
234                         p = &nf_conntrack_generic_l3proto;
235         }
236         preempt_enable();
237
238         return p;
239 }
240
241 void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
242 {
243         module_put(p->me);
244 }
245
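/* Take a module reference on the L3 protocol tracker, requesting the
 * "nf_conntrack-<family>" module on demand if only the generic fallback is
 * registered; returns 0 on success or -EPROTOTYPE if no tracker could be
 * loaded. */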
246 int
247 nf_ct_l3proto_try_module_get(unsigned short l3proto)
248 {
249         int ret;
250         struct nf_conntrack_l3proto *p;
251
252 retry:  p = nf_ct_l3proto_find_get(l3proto);
253         if (p == &nf_conntrack_generic_l3proto) {
254                 ret = request_module("nf_conntrack-%d", l3proto);
255                 if (!ret)
256                         goto retry;
257
258                 return -EPROTOTYPE;
259         }
260
261         return 0;
262 }
263
264 void nf_ct_l3proto_module_put(unsigned short l3proto)
265 {
266         struct nf_conntrack_l3proto *p;
267
268         preempt_disable();
269         p = __nf_ct_l3proto_find(l3proto);
270         preempt_enable();
271
272         module_put(p->me);
273 }
274
275 static int nf_conntrack_hash_rnd_initted;
276 static unsigned int nf_conntrack_hash_rnd;
277
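/* Hash a tuple into the conntrack table: jhash the source and destination
 * L3 addresses (keyed with l3num/protonum and the L4 ids respectively),
 * then mix both with the per-boot random seed so that chain placement is
 * hard to predict from outside. */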
278 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
279                                   unsigned int size, unsigned int rnd)
280 {
281         unsigned int a, b;
282         a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
283                   ((tuple->src.l3num) << 16) | tuple->dst.protonum);
284         b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
285                         (tuple->src.u.all << 16) | tuple->dst.u.all);
286
287         return jhash_2words(a, b, rnd) % size;
288 }
289
290 static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
291 {
292         return __hash_conntrack(tuple, nf_conntrack_htable_size,
293                                 nf_conntrack_hash_rnd);
294 }
295
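/* Create (or reuse) the slab cache used for conntracks with the given
 * feature set.  Callers asking for an identical name and size share the
 * existing cache via a use count; a different name or size gets -EBUSY. */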
296 int nf_conntrack_register_cache(u_int32_t features, const char *name,
297                                 size_t size)
298 {
299         int ret = 0;
300         char *cache_name;
301         kmem_cache_t *cachep;
302
303         DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
304                features, name, size);
305
306         if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
307                 DEBUGP("nf_conntrack_register_cache: invalid features: 0x%x\n",
308                         features);
309                 return -EINVAL;
310         }
311
312         mutex_lock(&nf_ct_cache_mutex);
313
314         write_lock_bh(&nf_ct_cache_lock);
315         /* e.g.: multiple helpers are loaded */
316         if (nf_ct_cache[features].use > 0) {
317                 DEBUGP("nf_conntrack_register_cache: already registered.\n");
318                 if ((!strncmp(nf_ct_cache[features].name, name,
319                               NF_CT_FEATURES_NAMELEN))
320                     && nf_ct_cache[features].size == size) {
321                         DEBUGP("nf_conntrack_register_cache: reusing.\n");
322                         nf_ct_cache[features].use++;
323                         ret = 0;
324                 } else
325                         ret = -EBUSY;
326
327                 write_unlock_bh(&nf_ct_cache_lock);
328                 mutex_unlock(&nf_ct_cache_mutex);
329                 return ret;
330         }
331         write_unlock_bh(&nf_ct_cache_lock);
332
333         /*
334          * The memory space for the name of the slab cache must stay alive
335          * until the cache is destroyed.
336          */
337         cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
338         if (cache_name == NULL) {
339                 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
340                 ret = -ENOMEM;
341                 goto out_up_mutex;
342         }
343
344         if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
345                                                 >= NF_CT_FEATURES_NAMELEN) {
346                 printk("nf_conntrack_register_cache: name too long\n");
347                 ret = -EINVAL;
348                 goto out_free_name;
349         }
350
351         cachep = kmem_cache_create(cache_name, size, 0, 0,
352                                    NULL, NULL);
353         if (!cachep) {
354                 printk("nf_conntrack_register_cache: Can't create slab cache "
355                        "for the features = 0x%x\n", features);
356                 ret = -ENOMEM;
357                 goto out_free_name;
358         }
359
360         write_lock_bh(&nf_ct_cache_lock);
361         nf_ct_cache[features].use = 1;
362         nf_ct_cache[features].size = size;
363         nf_ct_cache[features].cachep = cachep;
364         nf_ct_cache[features].name = cache_name;
365         write_unlock_bh(&nf_ct_cache_lock);
366
367         goto out_up_mutex;
368
369 out_free_name:
370         kfree(cache_name);
371 out_up_mutex:
372         mutex_unlock(&nf_ct_cache_mutex);
373         return ret;
374 }
375
376 /* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
377 void nf_conntrack_unregister_cache(u_int32_t features)
378 {
379         kmem_cache_t *cachep;
380         char *name;
381
382         /*
383          * This ensures that kmem_cache_create() isn't called before the
384          * slab cache has been destroyed.
385          */
386         DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
387         mutex_lock(&nf_ct_cache_mutex);
388
389         write_lock_bh(&nf_ct_cache_lock);
390         if (--nf_ct_cache[features].use > 0) {
391                 write_unlock_bh(&nf_ct_cache_lock);
392                 mutex_unlock(&nf_ct_cache_mutex);
393                 return;
394         }
395         cachep = nf_ct_cache[features].cachep;
396         name = nf_ct_cache[features].name;
397         nf_ct_cache[features].cachep = NULL;
398         nf_ct_cache[features].name = NULL;
399         nf_ct_cache[features].size = 0;
400         write_unlock_bh(&nf_ct_cache_lock);
401
402         synchronize_net();
403
404         kmem_cache_destroy(cachep);
405         kfree(name);
406
407         mutex_unlock(&nf_ct_cache_mutex);
408 }
409
410 int
411 nf_ct_get_tuple(const struct sk_buff *skb,
412                 unsigned int nhoff,
413                 unsigned int dataoff,
414                 u_int16_t l3num,
415                 u_int8_t protonum,
416                 struct nf_conntrack_tuple *tuple,
417                 const struct nf_conntrack_l3proto *l3proto,
418                 const struct nf_conntrack_protocol *protocol)
419 {
420         NF_CT_TUPLE_U_BLANK(tuple);
421
422         tuple->src.l3num = l3num;
423         if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
424                 return 0;
425
426         tuple->dst.protonum = protonum;
427         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
428
429         return protocol->pkt_to_tuple(skb, dataoff, tuple);
430 }
431
432 int
433 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
434                    const struct nf_conntrack_tuple *orig,
435                    const struct nf_conntrack_l3proto *l3proto,
436                    const struct nf_conntrack_protocol *protocol)
437 {
438         NF_CT_TUPLE_U_BLANK(inverse);
439
440         inverse->src.l3num = orig->src.l3num;
441         if (l3proto->invert_tuple(inverse, orig) == 0)
442                 return 0;
443
444         inverse->dst.dir = !orig->dst.dir;
445
446         inverse->dst.protonum = orig->dst.protonum;
447         return protocol->invert_tuple(inverse, orig);
448 }
449
450 /* nf_conntrack_expect helper functions */
451 void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
452 {
453         struct nf_conn_help *master_help = nfct_help(exp->master);
454
455         NF_CT_ASSERT(master_help);
456         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
457         NF_CT_ASSERT(!timer_pending(&exp->timeout));
458
459         list_del(&exp->list);
460         NF_CT_STAT_INC(expect_delete);
461         master_help->expecting--;
462         nf_conntrack_expect_put(exp);
463 }
464
465 static void expectation_timed_out(unsigned long ul_expect)
466 {
467         struct nf_conntrack_expect *exp = (void *)ul_expect;
468
469         write_lock_bh(&nf_conntrack_lock);
470         nf_ct_unlink_expect(exp);
471         write_unlock_bh(&nf_conntrack_lock);
472         nf_conntrack_expect_put(exp);
473 }
474
475 struct nf_conntrack_expect *
476 __nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
477 {
478         struct nf_conntrack_expect *i;
479         
480         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
481                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
482                         atomic_inc(&i->use);
483                         return i;
484                 }
485         }
486         return NULL;
487 }
488
489 /* Just find an expectation corresponding to a tuple. */
490 struct nf_conntrack_expect *
491 nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
492 {
493         struct nf_conntrack_expect *i;
494         
495         read_lock_bh(&nf_conntrack_lock);
496         i = __nf_conntrack_expect_find(tuple);
497         read_unlock_bh(&nf_conntrack_lock);
498
499         return i;
500 }
501
502 /* If an expectation for this connection is found, it is deleted from the
503  * global list and then returned. */
504 static struct nf_conntrack_expect *
505 find_expectation(const struct nf_conntrack_tuple *tuple)
506 {
507         struct nf_conntrack_expect *i;
508
509         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
510         /* If master is not in hash table yet (ie. packet hasn't left
511            this machine yet), how can other end know about expected?
512            Hence these are not the droids you are looking for (if
513            master ct never got confirmed, we'd hold a reference to it
514            and weird things would happen to future packets). */
515                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
516                     && nf_ct_is_confirmed(i->master)) {
517                         if (i->flags & NF_CT_EXPECT_PERMANENT) {
518                                 atomic_inc(&i->use);
519                                 return i;
520                         } else if (del_timer(&i->timeout)) {
521                                 nf_ct_unlink_expect(i);
522                                 return i;
523                         }
524                 }
525         }
526         return NULL;
527 }
528
529 /* delete all expectations for this conntrack */
530 void nf_ct_remove_expectations(struct nf_conn *ct)
531 {
532         struct nf_conntrack_expect *i, *tmp;
533         struct nf_conn_help *help = nfct_help(ct);
534
535         /* Optimization: most connections never expect any others. */
536         if (!help || help->expecting == 0)
537                 return;
538
539         list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
540                 if (i->master == ct && del_timer(&i->timeout)) {
541                         nf_ct_unlink_expect(i);
542                         nf_conntrack_expect_put(i);
543                 }
544         }
545 }
546
547 static void
548 clean_from_lists(struct nf_conn *ct)
549 {
550         unsigned int ho, hr;
551         
552         DEBUGP("clean_from_lists(%p)\n", ct);
553         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
554
555         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
556         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
557         LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
558         LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
559
560         /* Destroy all pending expectations */
561         nf_ct_remove_expectations(ct);
562 }
563
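/* Final destructor, called via nf_ct_put() when the last reference is gone:
 * let the L3/L4 protocols and NAT clean up, remove any expectations this
 * connection is master of, drop the reference on our own master and return
 * the entry to its slab cache. */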
564 static void
565 destroy_conntrack(struct nf_conntrack *nfct)
566 {
567         struct nf_conn *ct = (struct nf_conn *)nfct;
568         struct nf_conntrack_l3proto *l3proto;
569         struct nf_conntrack_protocol *proto;
570
571         DEBUGP("destroy_conntrack(%p)\n", ct);
572         NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
573         NF_CT_ASSERT(!timer_pending(&ct->timeout));
574
575         nf_conntrack_event(IPCT_DESTROY, ct);
576         set_bit(IPS_DYING_BIT, &ct->status);
577
578         /* To make sure we don't get any weird locking issues here:
579          * destroy_conntrack() MUST NOT be called with a write lock
580          * to nf_conntrack_lock!!! -HW */
581         l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
582         if (l3proto && l3proto->destroy)
583                 l3proto->destroy(ct);
584
585         proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
586         if (proto && proto->destroy)
587                 proto->destroy(ct);
588
589         if (nf_conntrack_destroyed)
590                 nf_conntrack_destroyed(ct);
591
592         write_lock_bh(&nf_conntrack_lock);
593         /* Expectations will have been removed in clean_from_lists,
594          * except TFTP can create an expectation on the first packet,
595          * before connection is in the list, so we need to clean here,
596          * too. */
597         nf_ct_remove_expectations(ct);
598
599         /* We overload first tuple to link into unconfirmed list. */
600         if (!nf_ct_is_confirmed(ct)) {
601                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
602                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
603         }
604
605         NF_CT_STAT_INC(delete);
606         write_unlock_bh(&nf_conntrack_lock);
607
608         if (ct->master)
609                 nf_ct_put(ct->master);
610
611         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
612         nf_conntrack_free(ct);
613 }
614
615 static void death_by_timeout(unsigned long ul_conntrack)
616 {
617         struct nf_conn *ct = (void *)ul_conntrack;
618
619         write_lock_bh(&nf_conntrack_lock);
620         /* Inside lock so preempt is disabled on module removal path.
621          * Otherwise we can get spurious warnings. */
622         NF_CT_STAT_INC(delete_list);
623         clean_from_lists(ct);
624         write_unlock_bh(&nf_conntrack_lock);
625         nf_ct_put(ct);
626 }
627
628 static inline int
629 conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
630                     const struct nf_conntrack_tuple *tuple,
631                     const struct nf_conn *ignored_conntrack)
632 {
633         ASSERT_READ_LOCK(&nf_conntrack_lock);
634         return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
635                 && nf_ct_tuple_equal(tuple, &i->tuple);
636 }
637
638 struct nf_conntrack_tuple_hash *
639 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
640                     const struct nf_conn *ignored_conntrack)
641 {
642         struct nf_conntrack_tuple_hash *h;
643         unsigned int hash = hash_conntrack(tuple);
644
645         ASSERT_READ_LOCK(&nf_conntrack_lock);
646         list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
647                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
648                         NF_CT_STAT_INC(found);
649                         return h;
650                 }
651                 NF_CT_STAT_INC(searched);
652         }
653
654         return NULL;
655 }
656
657 /* Find a connection corresponding to a tuple. */
658 struct nf_conntrack_tuple_hash *
659 nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
660                       const struct nf_conn *ignored_conntrack)
661 {
662         struct nf_conntrack_tuple_hash *h;
663
664         read_lock_bh(&nf_conntrack_lock);
665         h = __nf_conntrack_find(tuple, ignored_conntrack);
666         if (h)
667                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
668         read_unlock_bh(&nf_conntrack_lock);
669
670         return h;
671 }
672
673 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
674                                        unsigned int hash,
675                                        unsigned int repl_hash) 
676 {
677         ct->id = ++nf_conntrack_next_id;
678         list_prepend(&nf_conntrack_hash[hash],
679                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
680         list_prepend(&nf_conntrack_hash[repl_hash],
681                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
682 }
683
684 void nf_conntrack_hash_insert(struct nf_conn *ct)
685 {
686         unsigned int hash, repl_hash;
687
688         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
689         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
690
691         write_lock_bh(&nf_conntrack_lock);
692         __nf_conntrack_hash_insert(ct, hash, repl_hash);
693         write_unlock_bh(&nf_conntrack_lock);
694 }
695
696 /* Confirm a connection given skb; places it in hash table */
697 int
698 __nf_conntrack_confirm(struct sk_buff **pskb)
699 {
700         unsigned int hash, repl_hash;
701         struct nf_conn *ct;
702         enum ip_conntrack_info ctinfo;
703
704         ct = nf_ct_get(*pskb, &ctinfo);
705
706         /* ipt_REJECT uses nf_conntrack_attach to attach related
707            ICMP/TCP RST packets in other direction.  Actual packet
708            which created connection will be IP_CT_NEW or for an
709            expected connection, IP_CT_RELATED. */
710         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
711                 return NF_ACCEPT;
712
713         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
714         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
715
716         /* We're not in hash table, and we refuse to set up related
717            connections for unconfirmed conns.  But packet copies and
718            REJECT will give spurious warnings here. */
719         /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
720
721         /* No external references means no one else could have
722            confirmed us. */
723         NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
724         DEBUGP("Confirming conntrack %p\n", ct);
725
726         write_lock_bh(&nf_conntrack_lock);
727
728         /* See if there's one in the list already, including reverse:
729            NAT could have grabbed it without realizing, since we're
730            not in the hash.  If there is, we lost the race. */
731         if (!LIST_FIND(&nf_conntrack_hash[hash],
732                        conntrack_tuple_cmp,
733                        struct nf_conntrack_tuple_hash *,
734                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
735             && !LIST_FIND(&nf_conntrack_hash[repl_hash],
736                           conntrack_tuple_cmp,
737                           struct nf_conntrack_tuple_hash *,
738                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
739                 struct nf_conn_help *help;
740                 /* Remove from unconfirmed list */
741                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
742
743                 __nf_conntrack_hash_insert(ct, hash, repl_hash);
744                 /* Timer relative to confirmation time, not original
745                    setting time, otherwise we'd get timer wrap in
746                    weird delay cases. */
747                 ct->timeout.expires += jiffies;
748                 add_timer(&ct->timeout);
749                 atomic_inc(&ct->ct_general.use);
750                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
751                 NF_CT_STAT_INC(insert);
752                 write_unlock_bh(&nf_conntrack_lock);
753                 help = nfct_help(ct);
754                 if (help && help->helper)
755                         nf_conntrack_event_cache(IPCT_HELPER, *pskb);
756 #ifdef CONFIG_NF_NAT_NEEDED
757                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
758                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
759                         nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
760 #endif
761                 nf_conntrack_event_cache(master_ct(ct) ?
762                                          IPCT_RELATED : IPCT_NEW, *pskb);
763                 return NF_ACCEPT;
764         }
765
766         NF_CT_STAT_INC(insert_failed);
767         write_unlock_bh(&nf_conntrack_lock);
768         return NF_DROP;
769 }
770
771 /* Returns true if a connection corresponds to the tuple (required
772    for NAT). */
773 int
774 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
775                          const struct nf_conn *ignored_conntrack)
776 {
777         struct nf_conntrack_tuple_hash *h;
778
779         read_lock_bh(&nf_conntrack_lock);
780         h = __nf_conntrack_find(tuple, ignored_conntrack);
781         read_unlock_bh(&nf_conntrack_lock);
782
783         return h != NULL;
784 }
785
786 /* There's a small race here where we may free a just-assured
787    connection.  Too bad: we're in trouble anyway. */
788 static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
789 {
790         return !(test_bit(IPS_ASSURED_BIT,
791                           &nf_ct_tuplehash_to_ctrack(i)->status));
792 }
793
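/* The table is full: try to free the oldest not-yet-assured conntrack on
 * this hash chain so that a new connection can be tracked.  Returns 1 if an
 * entry was dropped. */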
794 static int early_drop(struct list_head *chain)
795 {
796         /* Traverse backwards: gives us oldest, which is roughly LRU */
797         struct nf_conntrack_tuple_hash *h;
798         struct nf_conn *ct = NULL;
799         int dropped = 0;
800
801         read_lock_bh(&nf_conntrack_lock);
802         h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
803         if (h) {
804                 ct = nf_ct_tuplehash_to_ctrack(h);
805                 atomic_inc(&ct->ct_general.use);
806         }
807         read_unlock_bh(&nf_conntrack_lock);
808
809         if (!ct)
810                 return dropped;
811
812         if (del_timer(&ct->timeout)) {
813                 death_by_timeout((unsigned long)ct);
814                 dropped = 1;
815                 NF_CT_STAT_INC(early_drop);
816         }
817         nf_ct_put(ct);
818         return dropped;
819 }
820
821 static inline int helper_cmp(const struct nf_conntrack_helper *i,
822                              const struct nf_conntrack_tuple *rtuple)
823 {
824         return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
825 }
826
827 static struct nf_conntrack_helper *
828 __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
829 {
830         return LIST_FIND(&helpers, helper_cmp,
831                          struct nf_conntrack_helper *,
832                          tuple);
833 }
834
835 struct nf_conntrack_helper *
836 nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple)
837 {
838         struct nf_conntrack_helper *helper;
839
840         /* need nf_conntrack_lock to assure that helper exists until
841          * try_module_get() is called */
842         read_lock_bh(&nf_conntrack_lock);
843
844         helper = __nf_ct_helper_find(tuple);
845         if (helper) {
846                 /* need to increase module usage count to assure helper will
847                  * not go away while the caller is e.g. busy putting a
848                  * conntrack in the hash that uses the helper */
849                 if (!try_module_get(helper->me))
850                         helper = NULL;
851         }
852
853         read_unlock_bh(&nf_conntrack_lock);
854
855         return helper;
856 }
857
858 void nf_ct_helper_put(struct nf_conntrack_helper *helper)
859 {
860         module_put(helper->me);
861 }
862
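/* Allocate and initialise a conntrack from the slab cache matching the
 * features (helper, NAT, ...) this connection needs.  If the table is full,
 * try early_drop() on the chain first and give up with ERR_PTR(-ENOMEM) if
 * nothing could be evicted. */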
863 static struct nf_conn *
864 __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
865                      const struct nf_conntrack_tuple *repl,
866                      const struct nf_conntrack_l3proto *l3proto)
867 {
868         struct nf_conn *conntrack = NULL;
869         u_int32_t features = 0;
870         struct nf_conntrack_helper *helper;
871
872         if (unlikely(!nf_conntrack_hash_rnd_initted)) {
873                 get_random_bytes(&nf_conntrack_hash_rnd, 4);
874                 nf_conntrack_hash_rnd_initted = 1;
875         }
876
877         if (nf_conntrack_max
878             && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
879                 unsigned int hash = hash_conntrack(orig);
880                 /* Try dropping from this hash chain. */
881                 if (!early_drop(&nf_conntrack_hash[hash])) {
882                         if (net_ratelimit())
883                                 printk(KERN_WARNING
884                                        "nf_conntrack: table full, dropping"
885                                        " packet.\n");
886                         return ERR_PTR(-ENOMEM);
887                 }
888         }
889
890         /*  find features needed by this conntrack. */
891         features = l3proto->get_features(orig);
892
893         /* FIXME: protect helper list per RCU */
894         read_lock_bh(&nf_conntrack_lock);
895         helper = __nf_ct_helper_find(repl);
896         if (helper)
897                 features |= NF_CT_F_HELP;
898         read_unlock_bh(&nf_conntrack_lock);
899
900         DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
901
902         read_lock_bh(&nf_ct_cache_lock);
903
904         if (unlikely(!nf_ct_cache[features].use)) {
905                 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
906                         features);
907                 goto out;
908         }
909
910         conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
911         if (conntrack == NULL) {
912                 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
913                 goto out;
914         }
915
916         memset(conntrack, 0, nf_ct_cache[features].size);
917         conntrack->features = features;
918         if (helper) {
919                 struct nf_conn_help *help = nfct_help(conntrack);
920                 NF_CT_ASSERT(help);
921                 help->helper = helper;
922         }
923
924         atomic_set(&conntrack->ct_general.use, 1);
925         conntrack->ct_general.destroy = destroy_conntrack;
926         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
927         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
928         /* Don't set timer yet: wait for confirmation */
929         init_timer(&conntrack->timeout);
930         conntrack->timeout.data = (unsigned long)conntrack;
931         conntrack->timeout.function = death_by_timeout;
932
933         atomic_inc(&nf_conntrack_count);
934 out:
935         read_unlock_bh(&nf_ct_cache_lock);
936         return conntrack;
937 }
938
939 struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
940                                    const struct nf_conntrack_tuple *repl)
941 {
942         struct nf_conntrack_l3proto *l3proto;
943
944         l3proto = __nf_ct_l3proto_find(orig->src.l3num);
945         return __nf_conntrack_alloc(orig, repl, l3proto);
946 }
947
948 void nf_conntrack_free(struct nf_conn *conntrack)
949 {
950         u_int32_t features = conntrack->features;
951         NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
952         DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
953                conntrack);
954         kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
955         atomic_dec(&nf_conntrack_count);
956 }
957
958 /* Allocate a new conntrack: we return -ENOMEM if classification
959    failed due to stress.  Otherwise it really is unclassifiable. */
960 static struct nf_conntrack_tuple_hash *
961 init_conntrack(const struct nf_conntrack_tuple *tuple,
962                struct nf_conntrack_l3proto *l3proto,
963                struct nf_conntrack_protocol *protocol,
964                struct sk_buff *skb,
965                unsigned int dataoff)
966 {
967         struct nf_conn *conntrack;
968         struct nf_conntrack_tuple repl_tuple;
969         struct nf_conntrack_expect *exp;
970
971         if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
972                 DEBUGP("Can't invert tuple.\n");
973                 return NULL;
974         }
975
976         conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
977         if (conntrack == NULL || IS_ERR(conntrack)) {
978                 DEBUGP("Can't allocate conntrack.\n");
979                 return (struct nf_conntrack_tuple_hash *)conntrack;
980         }
981
982         if (!protocol->new(conntrack, skb, dataoff)) {
983                 nf_conntrack_free(conntrack);
984                 DEBUGP("init conntrack: can't track with proto module\n");
985                 return NULL;
986         }
987
988         write_lock_bh(&nf_conntrack_lock);
989         exp = find_expectation(tuple);
990
991         if (exp) {
992                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
993                         conntrack, exp);
994                 /* Welcome, Mr. Bond.  We've been expecting you... */
995                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
996                 conntrack->master = exp->master;
997 #ifdef CONFIG_NF_CONNTRACK_MARK
998                 conntrack->mark = exp->master->mark;
999 #endif
1000                 nf_conntrack_get(&conntrack->master->ct_general);
1001                 NF_CT_STAT_INC(expect_new);
1002         } else
1003                 NF_CT_STAT_INC(new);
1004
1005         /* Overload tuple linked list to put us in unconfirmed list. */
1006         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
1007
1008         write_unlock_bh(&nf_conntrack_lock);
1009
1010         if (exp) {
1011                 if (exp->expectfn)
1012                         exp->expectfn(conntrack, exp);
1013                 nf_conntrack_expect_put(exp);
1014         }
1015
1016         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
1017 }
1018
1019 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1020 static inline struct nf_conn *
1021 resolve_normal_ct(struct sk_buff *skb,
1022                   unsigned int dataoff,
1023                   u_int16_t l3num,
1024                   u_int8_t protonum,
1025                   struct nf_conntrack_l3proto *l3proto,
1026                   struct nf_conntrack_protocol *proto,
1027                   int *set_reply,
1028                   enum ip_conntrack_info *ctinfo)
1029 {
1030         struct nf_conntrack_tuple tuple;
1031         struct nf_conntrack_tuple_hash *h;
1032         struct nf_conn *ct;
1033
1034         if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
1035                              dataoff, l3num, protonum, &tuple, l3proto,
1036                              proto)) {
1037                 DEBUGP("resolve_normal_ct: Can't get tuple\n");
1038                 return NULL;
1039         }
1040
1041         /* look for tuple match */
1042         h = nf_conntrack_find_get(&tuple, NULL);
1043         if (!h) {
1044                 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
1045                 if (!h)
1046                         return NULL;
1047                 if (IS_ERR(h))
1048                         return (void *)h;
1049         }
1050         ct = nf_ct_tuplehash_to_ctrack(h);
1051
1052         /* It exists; we have a (non-exclusive) reference. */
1053         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1054                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1055                 /* Please set reply bit if this packet is OK */
1056                 *set_reply = 1;
1057         } else {
1058                 /* Once we've had two way comms, always ESTABLISHED. */
1059                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1060                         DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
1061                         *ctinfo = IP_CT_ESTABLISHED;
1062                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1063                         DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
1064                         *ctinfo = IP_CT_RELATED;
1065                 } else {
1066                         DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
1067                         *ctinfo = IP_CT_NEW;
1068                 }
1069                 *set_reply = 0;
1070         }
1071         skb->nfct = &ct->ct_general;
1072         skb->nfctinfo = *ctinfo;
1073         return ct;
1074 }
1075
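/* Netfilter hook entry point: find or create the conntrack for this packet,
 * let the L4 protocol tracker update the connection state and return an
 * NF_* verdict for the core. */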
1076 unsigned int
1077 nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
1078 {
1079         struct nf_conn *ct;
1080         enum ip_conntrack_info ctinfo;
1081         struct nf_conntrack_l3proto *l3proto;
1082         struct nf_conntrack_protocol *proto;
1083         unsigned int dataoff;
1084         u_int8_t protonum;
1085         int set_reply = 0;
1086         int ret;
1087
1088         /* Previously seen (loopback or untracked)?  Ignore. */
1089         if ((*pskb)->nfct) {
1090                 NF_CT_STAT_INC(ignore);
1091                 return NF_ACCEPT;
1092         }
1093
1094         l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
1095         if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
1096                 DEBUGP("not prepared to track yet or error occurred\n");
1097                 return -ret;
1098         }
1099
1100         proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
1101
1102         /* It may be a special packet, error, unclean...
1103          * the inverse of the return code tells the netfilter
1104          * core what to do with the packet. */
1105         if (proto->error != NULL &&
1106             (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
1107                 NF_CT_STAT_INC(error);
1108                 NF_CT_STAT_INC(invalid);
1109                 return -ret;
1110         }
1111
1112         ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
1113                                &set_reply, &ctinfo);
1114         if (!ct) {
1115                 /* Not valid part of a connection */
1116                 NF_CT_STAT_INC(invalid);
1117                 return NF_ACCEPT;
1118         }
1119
1120         if (IS_ERR(ct)) {
1121                 /* Too stressed to deal. */
1122                 NF_CT_STAT_INC(drop);
1123                 return NF_DROP;
1124         }
1125
1126         NF_CT_ASSERT((*pskb)->nfct);
1127
1128         ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
1129         if (ret < 0) {
1130                 /* Invalid: inverse of the return code tells
1131                  * the netfilter core what to do */
1132                 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
1133                 nf_conntrack_put((*pskb)->nfct);
1134                 (*pskb)->nfct = NULL;
1135                 NF_CT_STAT_INC(invalid);
1136                 return -ret;
1137         }
1138
1139         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1140                 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
1141
1142         return ret;
1143 }
1144
1145 int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1146                          const struct nf_conntrack_tuple *orig)
1147 {
1148         return nf_ct_invert_tuple(inverse, orig,
1149                                   __nf_ct_l3proto_find(orig->src.l3num),
1150                                   __nf_ct_proto_find(orig->src.l3num,
1151                                                      orig->dst.protonum));
1152 }
1153
1154 /* Would two expected things clash? */
1155 static inline int expect_clash(const struct nf_conntrack_expect *a,
1156                                const struct nf_conntrack_expect *b)
1157 {
1158         /* Part covered by intersection of masks must be unequal,
1159            otherwise they clash */
1160         struct nf_conntrack_tuple intersect_mask;
1161         int count;
1162
1163         intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1164         intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1165         intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1166         intersect_mask.dst.protonum = a->mask.dst.protonum
1167                                         & b->mask.dst.protonum;
1168
1169         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1170                 intersect_mask.src.u3.all[count] =
1171                         a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1172         }
1173
1174         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1175                 intersect_mask.dst.u3.all[count] =
1176                         a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1177         }
1178
1179         return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1180 }
1181
1182 static inline int expect_matches(const struct nf_conntrack_expect *a,
1183                                  const struct nf_conntrack_expect *b)
1184 {
1185         return a->master == b->master
1186                 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1187                 && nf_ct_tuple_equal(&a->mask, &b->mask);
1188 }
1189
1190 /* Generally a bad idea to call this: could have matched already. */
1191 void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1192 {
1193         struct nf_conntrack_expect *i;
1194
1195         write_lock_bh(&nf_conntrack_lock);
1196         /* choose the oldest expectation to evict */
1197         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1198                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1199                         nf_ct_unlink_expect(i);
1200                         write_unlock_bh(&nf_conntrack_lock);
1201                         nf_conntrack_expect_put(i);
1202                         return;
1203                 }
1204         }
1205         write_unlock_bh(&nf_conntrack_lock);
1206 }
1207
1208 /* We don't increase the master conntrack refcount for non-fulfilled
1209  * conntracks. During the conntrack destruction, the expectations are
1210  * always killed before the conntrack itself */
1211 struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1212 {
1213         struct nf_conntrack_expect *new;
1214
1215         new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1216         if (!new) {
1217                 DEBUGP("expect_related: OOM allocating expect\n");
1218                 return NULL;
1219         }
1220         new->master = me;
1221         atomic_set(&new->use, 1);
1222         return new;
1223 }
1224
1225 void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1226 {
1227         if (atomic_dec_and_test(&exp->use))
1228                 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1229 }
1230
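/* Link a new expectation into the global list and start its timeout, which
 * is taken from the master's helper; references are taken for the list and
 * for the running timer. */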
1231 static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1232 {
1233         struct nf_conn_help *master_help = nfct_help(exp->master);
1234
1235         atomic_inc(&exp->use);
1236         master_help->expecting++;
1237         list_add(&exp->list, &nf_conntrack_expect_list);
1238
1239         init_timer(&exp->timeout);
1240         exp->timeout.data = (unsigned long)exp;
1241         exp->timeout.function = expectation_timed_out;
1242         exp->timeout.expires = jiffies + master_help->helper->timeout * HZ;
1243         add_timer(&exp->timeout);
1244
1245         exp->id = ++nf_conntrack_expect_next_id;
1246         atomic_inc(&exp->use);
1247         NF_CT_STAT_INC(expect_create);
1248 }
1249
1250 /* Race with expectations being used means we could have none to find; OK. */
1251 static void evict_oldest_expect(struct nf_conn *master)
1252 {
1253         struct nf_conntrack_expect *i;
1254
1255         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1256                 if (i->master == master) {
1257                         if (del_timer(&i->timeout)) {
1258                                 nf_ct_unlink_expect(i);
1259                                 nf_conntrack_expect_put(i);
1260                         }
1261                         break;
1262                 }
1263         }
1264 }
1265
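/* Restart an expectation's timeout; returns 0 if the timer had already
 * fired (the expectation is on its way out) and must not be reused. */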
1266 static inline int refresh_timer(struct nf_conntrack_expect *i)
1267 {
1268         struct nf_conn_help *master_help = nfct_help(i->master);
1269
1270         if (!del_timer(&i->timeout))
1271                 return 0;
1272
1273         i->timeout.expires = jiffies + master_help->helper->timeout*HZ;
1274         add_timer(&i->timeout);
1275         return 1;
1276 }
1277
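/* Register an expectation on behalf of a helper: an identical pending
 * expectation just gets its timer refreshed, a clashing one makes us fail
 * with -EBUSY, and the helper's oldest expectation is evicted if its
 * max_expected limit would otherwise be exceeded. */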
1278 int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1279 {
1280         struct nf_conntrack_expect *i;
1281         struct nf_conn *master = expect->master;
1282         struct nf_conn_help *master_help = nfct_help(master);
1283         int ret;
1284
1285         NF_CT_ASSERT(master_help);
1286
1287         DEBUGP("nf_conntrack_expect_related %p\n", expect);
1288         DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1289         DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);
1290
1291         write_lock_bh(&nf_conntrack_lock);
1292         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1293                 if (expect_matches(i, expect)) {
1294                         /* Refresh timer: if it's dying, ignore.. */
1295                         if (refresh_timer(i)) {
1296                                 ret = 0;
1297                                 goto out;
1298                         }
1299                 } else if (expect_clash(i, expect)) {
1300                         ret = -EBUSY;
1301                         goto out;
1302                 }
1303         }
1304         /* Will be over limit? */
1305         if (master_help->helper->max_expected &&
1306             master_help->expecting >= master_help->helper->max_expected)
1307                 evict_oldest_expect(master);
1308
1309         nf_conntrack_expect_insert(expect);
1310         nf_conntrack_expect_event(IPEXP_NEW, expect);
1311         ret = 0;
1312 out:
1313         write_unlock_bh(&nf_conntrack_lock);
1314         return ret;
1315 }
1316
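/* Register a connection tracking helper: make sure the slab cache for
 * conntracks that carry an nf_conn_help area exists, then add the helper to
 * the global helper list. */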
1317 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1318 {
1319         int ret;
1320         BUG_ON(me->timeout == 0);
1321
1322         ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1323                                           sizeof(struct nf_conn)
1324                                           + sizeof(struct nf_conn_help)
1325                                           + __alignof__(struct nf_conn_help));
1326         if (ret < 0) {
1327                 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1328                 return ret;
1329         }
1330         write_lock_bh(&nf_conntrack_lock);
1331         list_prepend(&helpers, me);
1332         write_unlock_bh(&nf_conntrack_lock);
1333
1334         return 0;
1335 }
1336
1337 struct nf_conntrack_helper *
1338 __nf_conntrack_helper_find_byname(const char *name)
1339 {
1340         struct nf_conntrack_helper *h;
1341
1342         list_for_each_entry(h, &helpers, list) {
1343                 if (!strcmp(h->name, name))
1344                         return h;
1345         }
1346
1347         return NULL;
1348 }
1349
1350 static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1351                          const struct nf_conntrack_helper *me)
1352 {
1353         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
1354         struct nf_conn_help *help = nfct_help(ct);
1355
1356         if (help && help->helper == me) {
1357                 nf_conntrack_event(IPCT_HELPER, ct);
1358                 help->helper = NULL;
1359         }
1360         return 0;
1361 }
1362
1363 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1364 {
1365         unsigned int i;
1366         struct nf_conntrack_expect *exp, *tmp;
1367
1368         /* Need write lock here, to delete helper. */
1369         write_lock_bh(&nf_conntrack_lock);
1370         LIST_DELETE(&helpers, me);
1371
1372         /* Get rid of expectations */
1373         list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1374                 struct nf_conn_help *help = nfct_help(exp->master);
1375                 if (help->helper == me && del_timer(&exp->timeout)) {
1376                         nf_ct_unlink_expect(exp);
1377                         nf_conntrack_expect_put(exp);
1378                 }
1379         }
1380
1381         /* Get rid of expecteds, set helpers to NULL. */
1382         LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1383         for (i = 0; i < nf_conntrack_htable_size; i++)
1384                 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1385                             struct nf_conntrack_tuple_hash *, me);
1386         write_unlock_bh(&nf_conntrack_lock);
1387
1388         /* Someone could be still looking at the helper in a bh. */
1389         synchronize_net();
1390 }
1391
1392 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1393 void __nf_ct_refresh_acct(struct nf_conn *ct,
1394                           enum ip_conntrack_info ctinfo,
1395                           const struct sk_buff *skb,
1396                           unsigned long extra_jiffies,
1397                           int do_acct)
1398 {
1399         int event = 0;
1400
1401         NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1402         NF_CT_ASSERT(skb);
1403
1404         write_lock_bh(&nf_conntrack_lock);
1405
1406         /* If not in hash table, timer will not be active yet */
1407         if (!nf_ct_is_confirmed(ct)) {
1408                 ct->timeout.expires = extra_jiffies;
1409                 event = IPCT_REFRESH;
1410         } else {
1411                 /* Need del_timer for race avoidance (may already be dying). */
1412                 if (del_timer(&ct->timeout)) {
1413                         ct->timeout.expires = jiffies + extra_jiffies;
1414                         add_timer(&ct->timeout);
1415                         event = IPCT_REFRESH;
1416                 }
1417         }
1418
1419 #ifdef CONFIG_NF_CT_ACCT
1420         if (do_acct) {
1421                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1422                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1423                         skb->len - (unsigned int)(skb->nh.raw - skb->data);
1424                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1425                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1426                         event |= IPCT_COUNTER_FILLING;
1427         }
1428 #endif
1429
1430         write_unlock_bh(&nf_conntrack_lock);
1431
1432         /* must be unlocked when calling event cache */
1433         if (event)
1434                 nf_conntrack_event_cache(event, skb);
1435 }
1436
1437 #if defined(CONFIG_NF_CT_NETLINK) || \
1438     defined(CONFIG_NF_CT_NETLINK_MODULE)
1439
1440 #include <linux/netfilter/nfnetlink.h>
1441 #include <linux/netfilter/nfnetlink_conntrack.h>
1442 #include <linux/mutex.h>
1443
1444
1445 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1446  * in nf_conntrack_core, since we don't want the protocols to autoload
1447  * or depend on ctnetlink */
1448 int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1449                                const struct nf_conntrack_tuple *tuple)
1450 {
1451         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1452                 &tuple->src.u.tcp.port);
1453         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1454                 &tuple->dst.u.tcp.port);
1455         return 0;
1456
1457 nfattr_failure:
1458         return -1;
1459 }
1460
1461 static const size_t cta_min_proto[CTA_PROTO_MAX] = {
1462         [CTA_PROTO_SRC_PORT-1]  = sizeof(u_int16_t),
1463         [CTA_PROTO_DST_PORT-1]  = sizeof(u_int16_t)
1464 };
1465
1466 int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1467                                struct nf_conntrack_tuple *t)
1468 {
1469         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1470                 return -EINVAL;
1471
1472         if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
1473                 return -EINVAL;
1474
1475         t->src.u.tcp.port =
1476                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1477         t->dst.u.tcp.port =
1478                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1479
1480         return 0;
1481 }
1482 #endif
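
/* Illustrative sketch (editorial addition, not part of this file): an L4
 * tracker such as nf_conntrack_proto_tcp.c is expected to wire the two
 * helpers above into its struct nf_conntrack_protocol so that ctnetlink
 * can dump and parse port-based tuples without the protocol module
 * having to depend on it:
 *
 *	struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 = {
 *		...
 *	#if defined(CONFIG_NF_CT_NETLINK) || \
 *	    defined(CONFIG_NF_CT_NETLINK_MODULE)
 *		.tuple_to_nfattr	= nf_ct_port_tuple_to_nfattr,
 *		.nfattr_to_tuple	= nf_ct_port_nfattr_to_tuple,
 *	#endif
 *	};
 */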
1483
1484 /* Used by ipt_REJECT and ip6t_REJECT. */
1485 void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1486 {
1487         struct nf_conn *ct;
1488         enum ip_conntrack_info ctinfo;
1489
1490         /* This ICMP is in reverse direction to the packet which caused it */
1491         ct = nf_ct_get(skb, &ctinfo);
1492         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1493                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1494         else
1495                 ctinfo = IP_CT_RELATED;
1496
1497         /* Attach to new skbuff, and increment count */
1498         nskb->nfct = &ct->ct_general;
1499         nskb->nfctinfo = ctinfo;
1500         nf_conntrack_get(nskb->nfct);
1501 }
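
/* Illustrative usage (editorial addition, not part of this file):
 * nothing calls __nf_conntrack_attach() directly; ipt_REJECT and
 * ip6t_REJECT go through nf_ct_attach() in net/netfilter/core.c, which
 * dereferences the ip_ct_attach hook assigned in nf_conntrack_init()
 * below:
 *
 *	// nskb is the locally generated RST/ICMP error,
 *	// oldskb the packet that provoked it
 *	nf_ct_attach(nskb, oldskb);
 */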
1502
1503 static inline int
1504 do_iter(const struct nf_conntrack_tuple_hash *i,
1505         int (*iter)(struct nf_conn *i, void *data),
1506         void *data)
1507 {
1508         return iter(nf_ct_tuplehash_to_ctrack(i), data);
1509 }
1510
1511 /* Bring out ya dead! */
1512 static struct nf_conntrack_tuple_hash *
1513 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1514                 void *data, unsigned int *bucket)
1515 {
1516         struct nf_conntrack_tuple_hash *h = NULL;
1517
1518         write_lock_bh(&nf_conntrack_lock);
1519         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1520                 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1521                                 struct nf_conntrack_tuple_hash *, iter, data);
1522                 if (h)
1523                         break;
1524         }
1525         if (!h)
1526                 h = LIST_FIND_W(&unconfirmed, do_iter,
1527                                 struct nf_conntrack_tuple_hash *, iter, data);
1528         if (h)
1529                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1530         write_unlock_bh(&nf_conntrack_lock);
1531
1532         return h;
1533 }
1534
1535 void
1536 nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1537 {
1538         struct nf_conntrack_tuple_hash *h;
1539         unsigned int bucket = 0;
1540
1541         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1542                 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1543                 /* Time to push up daisies... */
1544                 if (del_timer(&ct->timeout))
1545                         death_by_timeout((unsigned long)ct);
1546                 /* ... else the timer will get him soon. */
1547
1548                 nf_ct_put(ct);
1549         }
1550 }
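
/* Illustrative sketch (editorial addition, not part of this file): any
 * predicate over struct nf_conn can be plugged into
 * nf_ct_iterate_cleanup(); a non-zero return marks the entry for
 * removal.  A hypothetical flush of one address family might look like:
 *
 *	static int kill_by_l3num(struct nf_conn *i, void *data)
 *	{
 *		u_int16_t l3num = *(u_int16_t *)data;
 *
 *		return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == l3num;
 *	}
 *
 *	u_int16_t l3num = AF_INET;
 *	nf_ct_iterate_cleanup(kill_by_l3num, &l3num);
 */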
1551
1552 static int kill_all(struct nf_conn *i, void *data)
1553 {
1554         return 1;
1555 }
1556
1557 static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1558 {
1559         if (vmalloced)
1560                 vfree(hash);
1561         else
1562                 free_pages((unsigned long)hash, 
1563                            get_order(sizeof(struct list_head) * size));
1564 }
1565
1566 void nf_conntrack_flush(void)
1567 {
1568         nf_ct_iterate_cleanup(kill_all, NULL);
1569 }
1570
1571 /* Mishearing the voices in his head, our hero wonders how he's
1572    supposed to kill the mall. */
1573 void nf_conntrack_cleanup(void)
1574 {
1575         int i;
1576
1577         ip_ct_attach = NULL;
1578
1579         /* This makes sure all current packets have passed through
1580            netfilter framework.  Roll on, two-stage module
1581            delete... */
1582         synchronize_net();
1583
1584         nf_ct_event_cache_flush();
1585  i_see_dead_people:
1586         nf_conntrack_flush();
1587         if (atomic_read(&nf_conntrack_count) != 0) {
1588                 schedule();
1589                 goto i_see_dead_people;
1590         }
1591         /* wait until all references to nf_conntrack_untracked are dropped */
1592         while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1593                 schedule();
1594
1595         for (i = 0; i < NF_CT_F_NUM; i++) {
1596                 if (nf_ct_cache[i].use == 0)
1597                         continue;
1598
1599                 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1600                 nf_ct_cache[i].use = 1;
1601                 nf_conntrack_unregister_cache(i);
1602         }
1603         kmem_cache_destroy(nf_conntrack_expect_cachep);
1604         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1605                             nf_conntrack_htable_size);
1606
1607         /* free l3proto protocol tables */
1608         for (i = 0; i < PF_MAX; i++)
1609                 if (nf_ct_protos[i]) {
1610                         kfree(nf_ct_protos[i]);
1611                         nf_ct_protos[i] = NULL;
1612                 }
1613 }
1614
1615 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1616 {
1617         struct list_head *hash;
1618         unsigned int i;
1619
1620         *vmalloced = 0; 
1621         hash = (void*)__get_free_pages(GFP_KERNEL, 
1622                                        get_order(sizeof(struct list_head)
1623                                                  * size));
1624         if (!hash) { 
1625                 *vmalloced = 1;
1626                 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1627                 hash = vmalloc(sizeof(struct list_head) * size);
1628         }
1629
1630         if (hash)
1631                 for (i = 0; i < size; i++) 
1632                         INIT_LIST_HEAD(&hash[i]);
1633
1634         return hash;
1635 }
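
/* Sizing note (editorial addition): for an 8192-bucket table on i386,
 * 8192 * sizeof(struct list_head) = 8192 * 8 = 64KB, i.e. an order-4
 * allocation with 4KB pages; only if that contiguous allocation fails
 * does alloc_hashtable() fall back to vmalloc() above. */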
1636
1637 int set_hashsize(const char *val, struct kernel_param *kp)
1638 {
1639         int i, bucket, hashsize, vmalloced;
1640         int old_vmalloced, old_size;
1641         int rnd;
1642         struct list_head *hash, *old_hash;
1643         struct nf_conntrack_tuple_hash *h;
1644
1645         /* On boot, we can set this without any fancy locking. */
1646         if (!nf_conntrack_htable_size)
1647                 return param_set_uint(val, kp);
1648
1649         hashsize = simple_strtol(val, NULL, 0);
1650         if (!hashsize)
1651                 return -EINVAL;
1652
1653         hash = alloc_hashtable(hashsize, &vmalloced);
1654         if (!hash)
1655                 return -ENOMEM;
1656
1657         /* We have to rehash for the new table anyway, so we can also
1658          * use a new random seed */
1659         get_random_bytes(&rnd, 4);
1660
1661         write_lock_bh(&nf_conntrack_lock);
1662         for (i = 0; i < nf_conntrack_htable_size; i++) {
1663                 while (!list_empty(&nf_conntrack_hash[i])) {
1664                         h = list_entry(nf_conntrack_hash[i].next,
1665                                        struct nf_conntrack_tuple_hash, list);
1666                         list_del(&h->list);
1667                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1668                         list_add_tail(&h->list, &hash[bucket]);
1669                 }
1670         }
1671         old_size = nf_conntrack_htable_size;
1672         old_vmalloced = nf_conntrack_vmalloc;
1673         old_hash = nf_conntrack_hash;
1674
1675         nf_conntrack_htable_size = hashsize;
1676         nf_conntrack_vmalloc = vmalloced;
1677         nf_conntrack_hash = hash;
1678         nf_conntrack_hash_rnd = rnd;
1679         write_unlock_bh(&nf_conntrack_lock);
1680
1681         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1682         return 0;
1683 }
1684
1685 module_param_call(hashsize, set_hashsize, param_get_uint,
1686                   &nf_conntrack_htable_size, 0600);
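
/* Illustrative usage (editorial addition, not part of this file): the
 * 0600 permission exposes the parameter read-write to root, so the
 * table can be sized at load time or resized at runtime, e.g.:
 *
 *	modprobe nf_conntrack hashsize=16384
 *	echo 16384 > /sys/module/nf_conntrack/parameters/hashsize
 */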
1687
1688 int __init nf_conntrack_init(void)
1689 {
1690         unsigned int i;
1691         int ret;
1692
1693         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1694          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1695         if (!nf_conntrack_htable_size) {
1696                 nf_conntrack_htable_size
1697                         = (((num_physpages << PAGE_SHIFT) / 16384)
1698                            / sizeof(struct list_head));
1699                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1700                         nf_conntrack_htable_size = 8192;
1701                 if (nf_conntrack_htable_size < 16)
1702                         nf_conntrack_htable_size = 16;
1703         }
1704         nf_conntrack_max = 8 * nf_conntrack_htable_size;
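        /* Worked example (editorial addition): on an i386 box with 32MB of
         * RAM and 8-byte list heads, (32MB / 16384) / 8 = 256 buckets, so
         * nf_conntrack_max defaults to 8 * 256 = 2048 entries; at or above
         * 1GB the table is clamped to 8192 buckets (65536 entries). */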
1705
1706         printk("nf_conntrack version %s (%u buckets, %d max)\n",
1707                NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1708                nf_conntrack_max);
1709
1710         nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1711                                             &nf_conntrack_vmalloc);
1712         if (!nf_conntrack_hash) {
1713                 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1714                 goto err_out;
1715         }
1716
1717         ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1718                                           sizeof(struct nf_conn));
1719         if (ret < 0) {
1720                 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1721                 goto err_free_hash;
1722         }
1723
1724         nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1725                                         sizeof(struct nf_conntrack_expect),
1726                                         0, 0, NULL, NULL);
1727         if (!nf_conntrack_expect_cachep) {
1728                 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1729                 goto err_free_conntrack_slab;
1730         }
1731
1732         /* Don't NEED lock here, but good form anyway. */
1733         write_lock_bh(&nf_conntrack_lock);
1734         for (i = 0; i < PF_MAX; i++)
1735                 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1736         write_unlock_bh(&nf_conntrack_lock);
1737
1738         /* For use by REJECT target */
1739         ip_ct_attach = __nf_conntrack_attach;
1740
1741         /* Set up fake conntrack:
1742             - to never be deleted, not in any hashes */
1743         atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1744         /*  - and to make it look like a confirmed connection */
1745         set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1746
1747         return ret;
1748
1749 err_free_conntrack_slab:
1750         nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1751 err_free_hash:
1752         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1753                             nf_conntrack_htable_size);
1754 err_out:
1755         return -ENOMEM;
1756 }