net/ipv4/netfilter/ip_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expectation
43    registrations, and conntrack timers. */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION    "2.3"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
74 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 static unsigned int ip_conntrack_next_id = 1;
81 static unsigned int ip_conntrack_expect_next_id = 1;
82 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83 struct notifier_block *ip_conntrack_chain;
84 struct notifier_block *ip_conntrack_expect_chain;
85
86 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88 /* deliver cached events and clear cache entry - must be called with locally
89  * disabled softirqs */
90 static inline void
91 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
92 {
93         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
94         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
95                 notifier_call_chain(&ip_conntrack_chain, ecache->events,
96                                     ecache->ct);
97         ecache->events = 0;
98         ip_conntrack_put(ecache->ct);
99         ecache->ct = NULL;
100 }
101
102 /* Deliver all cached events for a particular conntrack. This is called
103  * by code prior to async packet handling or freeing the skb */
104 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
105 {
106         struct ip_conntrack_ecache *ecache;
107         
108         local_bh_disable();
109         ecache = &__get_cpu_var(ip_conntrack_ecache);
110         if (ecache->ct == ct)
111                 __ip_ct_deliver_cached_events(ecache);
112         local_bh_enable();
113 }
114
115 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
116 {
117         struct ip_conntrack_ecache *ecache;
118
119         /* take care of delivering potentially old events */
120         ecache = &__get_cpu_var(ip_conntrack_ecache);
121         BUG_ON(ecache->ct == ct);
122         if (ecache->ct)
123                 __ip_ct_deliver_cached_events(ecache);
124         /* initialize for this conntrack/packet */
125         ecache->ct = ct;
126         nf_conntrack_get(&ct->ct_general);
127 }
128
129 /* flush the event cache - touches other CPUs' data and must not be called while
130  * packets are still passing through the code */
131 static void ip_ct_event_cache_flush(void)
132 {
133         struct ip_conntrack_ecache *ecache;
134         int cpu;
135
136         for_each_cpu(cpu) {
137                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138                 if (ecache->ct)
139                         ip_conntrack_put(ecache->ct);
140         }
141 }
142 #else
143 static inline void ip_ct_event_cache_flush(void) {}
144 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
145
146 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
148 static int ip_conntrack_hash_rnd_initted;
149 static unsigned int ip_conntrack_hash_rnd;
150
151 static u_int32_t
152 hash_conntrack(const struct ip_conntrack_tuple *tuple)
153 {
154 #if 0
155         dump_tuple(tuple);
156 #endif
157         return (jhash_3words(tuple->src.ip,
158                              (tuple->dst.ip ^ tuple->dst.protonum),
159                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
160                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
161 }
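
/* Illustration (a sketch, not used by the code): for a tuple such as
 * 192.168.0.2:1030 -> 10.0.0.1:80/tcp, jhash_3words() mixes saddr,
 * daddr ^ protonum and both 16-bit ports packed into one word, together
 * with the boot-time random seed ip_conntrack_hash_rnd, and the result is
 * reduced modulo ip_conntrack_htable_size to pick a hash chain.  The random
 * seed makes it hard for a remote host to construct deliberately colliding
 * tuples. */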
162
163 int
164 ip_ct_get_tuple(const struct iphdr *iph,
165                 const struct sk_buff *skb,
166                 unsigned int dataoff,
167                 struct ip_conntrack_tuple *tuple,
168                 const struct ip_conntrack_protocol *protocol)
169 {
170         /* Should never happen */
171         if (iph->frag_off & htons(IP_OFFSET)) {
172                 printk("ip_conntrack_core: Frag of proto %u.\n",
173                        iph->protocol);
174                 return 0;
175         }
176
177         tuple->src.ip = iph->saddr;
178         tuple->dst.ip = iph->daddr;
179         tuple->dst.protonum = iph->protocol;
180         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
181
182         return protocol->pkt_to_tuple(skb, dataoff, tuple);
183 }
184
185 int
186 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
187                    const struct ip_conntrack_tuple *orig,
188                    const struct ip_conntrack_protocol *protocol)
189 {
190         inverse->src.ip = orig->dst.ip;
191         inverse->dst.ip = orig->src.ip;
192         inverse->dst.protonum = orig->dst.protonum;
193         inverse->dst.dir = !orig->dst.dir;
194
195         return protocol->invert_tuple(inverse, orig);
196 }
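
/* A concrete example of inversion (illustrative only): the reply tuple of
 * 192.168.0.2:1030 -> 10.0.0.1:21/tcp is 10.0.0.1:21 -> 192.168.0.2:1030/tcp;
 * the per-protocol invert_tuple() callback swaps the protocol-specific part
 * (the port numbers in the TCP/UDP case). */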
197
198
199 /* ip_conntrack_expect helper functions */
200 static void unlink_expect(struct ip_conntrack_expect *exp)
201 {
202         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
203         IP_NF_ASSERT(!timer_pending(&exp->timeout));
204         list_del(&exp->list);
205         CONNTRACK_STAT_INC(expect_delete);
206         exp->master->expecting--;
207         ip_conntrack_expect_put(exp);
208 }
209
210 void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
211 {
212         unlink_expect(exp);
213         ip_conntrack_expect_put(exp);
214 }
215
216 static void expectation_timed_out(unsigned long ul_expect)
217 {
218         struct ip_conntrack_expect *exp = (void *)ul_expect;
219
220         write_lock_bh(&ip_conntrack_lock);
221         unlink_expect(exp);
222         write_unlock_bh(&ip_conntrack_lock);
223         ip_conntrack_expect_put(exp);
224 }
225
226 struct ip_conntrack_expect *
227 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
228 {
229         struct ip_conntrack_expect *i;
230         
231         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
232                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
233                         atomic_inc(&i->use);
234                         return i;
235                 }
236         }
237         return NULL;
238 }
239
240 /* Just find an expectation corresponding to a tuple. */
241 struct ip_conntrack_expect *
242 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
243 {
244         struct ip_conntrack_expect *i;
245         
246         read_lock_bh(&ip_conntrack_lock);
247         i = __ip_conntrack_expect_find(tuple);
248         read_unlock_bh(&ip_conntrack_lock);
249
250         return i;
251 }
252
253 /* If an expectation for this connection is found, it is deleted from
254  * the global list and then returned. */
255 static struct ip_conntrack_expect *
256 find_expectation(const struct ip_conntrack_tuple *tuple)
257 {
258         struct ip_conntrack_expect *i;
259
260         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
261                 /* If master is not in hash table yet (ie. packet hasn't left
262                    this machine yet), how can other end know about expected?
263                    Hence these are not the droids you are looking for (if
264                    master ct never got confirmed, we'd hold a reference to it
265                    and weird things would happen to future packets). */
266                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
267                     && is_confirmed(i->master)
268                     && del_timer(&i->timeout)) {
269                         unlink_expect(i);
270                         return i;
271                 }
272         }
273         return NULL;
274 }
275
276 /* delete all expectations for this conntrack */
277 void ip_ct_remove_expectations(struct ip_conntrack *ct)
278 {
279         struct ip_conntrack_expect *i, *tmp;
280
281         /* Optimization: most connections never expect any others. */
282         if (ct->expecting == 0)
283                 return;
284
285         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
286                 if (i->master == ct && del_timer(&i->timeout)) {
287                         unlink_expect(i);
288                         ip_conntrack_expect_put(i);
289                 }
290         }
291 }
292
293 static void
294 clean_from_lists(struct ip_conntrack *ct)
295 {
296         unsigned int ho, hr;
297         
298         DEBUGP("clean_from_lists(%p)\n", ct);
299         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
300
301         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
302         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
303         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
304         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
305
306         /* Destroy all pending expectations */
307         ip_ct_remove_expectations(ct);
308 }
309
310 static void
311 destroy_conntrack(struct nf_conntrack *nfct)
312 {
313         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
314         struct ip_conntrack_protocol *proto;
315
316         DEBUGP("destroy_conntrack(%p)\n", ct);
317         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
318         IP_NF_ASSERT(!timer_pending(&ct->timeout));
319
320         ip_conntrack_event(IPCT_DESTROY, ct);
321         set_bit(IPS_DYING_BIT, &ct->status);
322
323         /* To make sure we don't get any weird locking issues here:
324          * destroy_conntrack() MUST NOT be called with a write lock
325          * to ip_conntrack_lock!!! -HW */
326         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
327         if (proto && proto->destroy)
328                 proto->destroy(ct);
329
330         if (ip_conntrack_destroyed)
331                 ip_conntrack_destroyed(ct);
332
333         write_lock_bh(&ip_conntrack_lock);
334         /* Expectations will have been removed in clean_from_lists,
335          * except TFTP can create an expectation on the first packet,
336          * before connection is in the list, so we need to clean here,
337          * too. */
338         ip_ct_remove_expectations(ct);
339
340         /* We overload the first tuple to link into the unconfirmed list. */
341         if (!is_confirmed(ct)) {
342                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
343                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
344         }
345
346         CONNTRACK_STAT_INC(delete);
347         write_unlock_bh(&ip_conntrack_lock);
348
349         if (ct->master)
350                 ip_conntrack_put(ct->master);
351
352         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
353         ip_conntrack_free(ct);
354 }
355
356 static void death_by_timeout(unsigned long ul_conntrack)
357 {
358         struct ip_conntrack *ct = (void *)ul_conntrack;
359
360         write_lock_bh(&ip_conntrack_lock);
361         /* Inside lock so preempt is disabled on module removal path.
362          * Otherwise we can get spurious warnings. */
363         CONNTRACK_STAT_INC(delete_list);
364         clean_from_lists(ct);
365         write_unlock_bh(&ip_conntrack_lock);
366         ip_conntrack_put(ct);
367 }
368
369 static inline int
370 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
371                     const struct ip_conntrack_tuple *tuple,
372                     const struct ip_conntrack *ignored_conntrack)
373 {
374         ASSERT_READ_LOCK(&ip_conntrack_lock);
375         return tuplehash_to_ctrack(i) != ignored_conntrack
376                 && ip_ct_tuple_equal(tuple, &i->tuple);
377 }
378
379 struct ip_conntrack_tuple_hash *
380 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
381                     const struct ip_conntrack *ignored_conntrack)
382 {
383         struct ip_conntrack_tuple_hash *h;
384         unsigned int hash = hash_conntrack(tuple);
385
386         ASSERT_READ_LOCK(&ip_conntrack_lock);
387         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
388                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
389                         CONNTRACK_STAT_INC(found);
390                         return h;
391                 }
392                 CONNTRACK_STAT_INC(searched);
393         }
394
395         return NULL;
396 }
397
398 /* Find a connection corresponding to a tuple. */
399 struct ip_conntrack_tuple_hash *
400 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
401                       const struct ip_conntrack *ignored_conntrack)
402 {
403         struct ip_conntrack_tuple_hash *h;
404
405         read_lock_bh(&ip_conntrack_lock);
406         h = __ip_conntrack_find(tuple, ignored_conntrack);
407         if (h)
408                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
409         read_unlock_bh(&ip_conntrack_lock);
410
411         return h;
412 }
413
414 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
415                                         unsigned int hash,
416                                         unsigned int repl_hash) 
417 {
418         ct->id = ++ip_conntrack_next_id;
419         list_prepend(&ip_conntrack_hash[hash],
420                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
421         list_prepend(&ip_conntrack_hash[repl_hash],
422                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
423 }
424
425 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
426 {
427         unsigned int hash, repl_hash;
428
429         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
430         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
431
432         write_lock_bh(&ip_conntrack_lock);
433         __ip_conntrack_hash_insert(ct, hash, repl_hash);
434         write_unlock_bh(&ip_conntrack_lock);
435 }
436
437 /* Confirm a connection given skb; places it in hash table */
438 int
439 __ip_conntrack_confirm(struct sk_buff **pskb)
440 {
441         unsigned int hash, repl_hash;
442         struct ip_conntrack *ct;
443         enum ip_conntrack_info ctinfo;
444
445         ct = ip_conntrack_get(*pskb, &ctinfo);
446
447         /* ipt_REJECT uses ip_conntrack_attach to attach related
448            ICMP/TCP RST packets in the other direction.  The actual packet
449            which created the connection will be IP_CT_NEW or, for an
450            expected connection, IP_CT_RELATED. */
451         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
452                 return NF_ACCEPT;
453
454         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
455         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
456
457         /* We're not in hash table, and we refuse to set up related
458            connections for unconfirmed conns.  But packet copies and
459            REJECT will give spurious warnings here. */
460         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
461
462         /* No external references means no one else could have
463            confirmed us. */
464         IP_NF_ASSERT(!is_confirmed(ct));
465         DEBUGP("Confirming conntrack %p\n", ct);
466
467         write_lock_bh(&ip_conntrack_lock);
468
469         /* See if there's one in the list already, including reverse:
470            NAT could have grabbed it without realizing, since we're
471            not in the hash.  If there is, we lost the race. */
472         if (!LIST_FIND(&ip_conntrack_hash[hash],
473                        conntrack_tuple_cmp,
474                        struct ip_conntrack_tuple_hash *,
475                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
476             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
477                           conntrack_tuple_cmp,
478                           struct ip_conntrack_tuple_hash *,
479                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
480                 /* Remove from unconfirmed list */
481                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
482
483                 __ip_conntrack_hash_insert(ct, hash, repl_hash);
484                 /* Timer relative to confirmation time, not original
485                    setting time, otherwise we'd get timer wrap in
486                    weird delay cases. */
487                 ct->timeout.expires += jiffies;
488                 add_timer(&ct->timeout);
489                 atomic_inc(&ct->ct_general.use);
490                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
491                 CONNTRACK_STAT_INC(insert);
492                 write_unlock_bh(&ip_conntrack_lock);
493                 if (ct->helper)
494                         ip_conntrack_event_cache(IPCT_HELPER, *pskb);
495 #ifdef CONFIG_IP_NF_NAT_NEEDED
496                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
497                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
498                         ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
499 #endif
500                 ip_conntrack_event_cache(master_ct(ct) ?
501                                          IPCT_RELATED : IPCT_NEW, *pskb);
502
503                 return NF_ACCEPT;
504         }
505
506         CONNTRACK_STAT_INC(insert_failed);
507         write_unlock_bh(&ip_conntrack_lock);
508
509         return NF_DROP;
510 }
511
512 /* Returns true if a connection corresponds to the tuple (required
513    for NAT). */
514 int
515 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
516                          const struct ip_conntrack *ignored_conntrack)
517 {
518         struct ip_conntrack_tuple_hash *h;
519
520         read_lock_bh(&ip_conntrack_lock);
521         h = __ip_conntrack_find(tuple, ignored_conntrack);
522         read_unlock_bh(&ip_conntrack_lock);
523
524         return h != NULL;
525 }
526
527 /* There's a small race here where we may free a just-assured
528    connection.  Too bad: we're in trouble anyway. */
529 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
530 {
531         return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
532 }
533
534 static int early_drop(struct list_head *chain)
535 {
536         /* Traverse backwards: gives us oldest, which is roughly LRU */
537         struct ip_conntrack_tuple_hash *h;
538         struct ip_conntrack *ct = NULL;
539         int dropped = 0;
540
541         read_lock_bh(&ip_conntrack_lock);
542         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
543         if (h) {
544                 ct = tuplehash_to_ctrack(h);
545                 atomic_inc(&ct->ct_general.use);
546         }
547         read_unlock_bh(&ip_conntrack_lock);
548
549         if (!ct)
550                 return dropped;
551
552         if (del_timer(&ct->timeout)) {
553                 death_by_timeout((unsigned long)ct);
554                 dropped = 1;
555                 CONNTRACK_STAT_INC(early_drop);
556         }
557         ip_conntrack_put(ct);
558         return dropped;
559 }
560
561 static inline int helper_cmp(const struct ip_conntrack_helper *i,
562                              const struct ip_conntrack_tuple *rtuple)
563 {
564         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
565 }
566
567 static struct ip_conntrack_helper *
568 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
569 {
570         return LIST_FIND(&helpers, helper_cmp,
571                          struct ip_conntrack_helper *,
572                          tuple);
573 }
574
575 struct ip_conntrack_helper *
576 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
577 {
578         struct ip_conntrack_helper *helper;
579
580         /* need ip_conntrack_lock to ensure that the helper exists until
581          * try_module_get() is called */
582         read_lock_bh(&ip_conntrack_lock);
583
584         helper = __ip_conntrack_helper_find(tuple);
585         if (helper) {
586                 /* need to increase module usage count to ensure the helper will
587                  * not go away while the caller is e.g. busy putting a
588                  * conntrack in the hash that uses the helper */
589                 if (!try_module_get(helper->me))
590                         helper = NULL;
591         }
592
593         read_unlock_bh(&ip_conntrack_lock);
594
595         return helper;
596 }
597
598 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
599 {
600         module_put(helper->me);
601 }
602
603 struct ip_conntrack_protocol *
604 __ip_conntrack_proto_find(u_int8_t protocol)
605 {
606         return ip_ct_protos[protocol];
607 }
608
609 /* this is guaranteed to always return a valid protocol helper, since
610  * it falls back to generic_protocol */
611 struct ip_conntrack_protocol *
612 ip_conntrack_proto_find_get(u_int8_t protocol)
613 {
614         struct ip_conntrack_protocol *p;
615
616         preempt_disable();
617         p = __ip_conntrack_proto_find(protocol);
618         if (p) {
619                 if (!try_module_get(p->me))
620                         p = &ip_conntrack_generic_protocol;
621         }
622         preempt_enable();
623         
624         return p;
625 }
626
627 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
628 {
629         module_put(p->me);
630 }
631
632 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
633                                         struct ip_conntrack_tuple *repl)
634 {
635         struct ip_conntrack *conntrack;
636
637         if (!ip_conntrack_hash_rnd_initted) {
638                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
639                 ip_conntrack_hash_rnd_initted = 1;
640         }
641
642         if (ip_conntrack_max
643             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
644                 unsigned int hash = hash_conntrack(orig);
645                 /* Try dropping from this hash chain. */
646                 if (!early_drop(&ip_conntrack_hash[hash])) {
647                         if (net_ratelimit())
648                                 printk(KERN_WARNING
649                                        "ip_conntrack: table full, dropping"
650                                        " packet.\n");
651                         return ERR_PTR(-ENOMEM);
652                 }
653         }
654
655         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
656         if (!conntrack) {
657                 DEBUGP("Can't allocate conntrack.\n");
658                 return ERR_PTR(-ENOMEM);
659         }
660
661         memset(conntrack, 0, sizeof(*conntrack));
662         atomic_set(&conntrack->ct_general.use, 1);
663         conntrack->ct_general.destroy = destroy_conntrack;
664         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
665         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
666         /* Don't set timer yet: wait for confirmation */
667         init_timer(&conntrack->timeout);
668         conntrack->timeout.data = (unsigned long)conntrack;
669         conntrack->timeout.function = death_by_timeout;
670
671         atomic_inc(&ip_conntrack_count);
672
673         return conntrack;
674 }
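
/* Sizing note (a sketch; exact paths depend on the kernel configuration):
 * ip_conntrack_max is set to 8 * ip_conntrack_htable_size in
 * ip_conntrack_init() below, and when the standalone sysctl support is
 * built in it can be raised at runtime via
 * /proc/sys/net/ipv4/ip_conntrack_max if "table full, dropping packet"
 * messages appear under load. */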
675
676 void
677 ip_conntrack_free(struct ip_conntrack *conntrack)
678 {
679         atomic_dec(&ip_conntrack_count);
680         kmem_cache_free(ip_conntrack_cachep, conntrack);
681 }
682
683 /* Allocate a new conntrack: we return -ENOMEM if classification
684  * failed due to stress.   Otherwise it really is unclassifiable */
685 static struct ip_conntrack_tuple_hash *
686 init_conntrack(struct ip_conntrack_tuple *tuple,
687                struct ip_conntrack_protocol *protocol,
688                struct sk_buff *skb)
689 {
690         struct ip_conntrack *conntrack;
691         struct ip_conntrack_tuple repl_tuple;
692         struct ip_conntrack_expect *exp;
693
694         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
695                 DEBUGP("Can't invert tuple.\n");
696                 return NULL;
697         }
698
699         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
700         if (conntrack == NULL || IS_ERR(conntrack))
701                 return (struct ip_conntrack_tuple_hash *)conntrack;
702
703         if (!protocol->new(conntrack, skb)) {
704                 ip_conntrack_free(conntrack);
705                 return NULL;
706         }
707
708         write_lock_bh(&ip_conntrack_lock);
709         exp = find_expectation(tuple);
710
711         if (exp) {
712                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
713                         conntrack, exp);
714                 /* Welcome, Mr. Bond.  We've been expecting you... */
715                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
716                 conntrack->master = exp->master;
717 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
718                 conntrack->mark = exp->master->mark;
719 #endif
720 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
721     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
722                 /* this is ugly, but there is no other place to put it */
723                 conntrack->nat.masq_index = exp->master->nat.masq_index;
724 #endif
725                 nf_conntrack_get(&conntrack->master->ct_general);
726                 CONNTRACK_STAT_INC(expect_new);
727         } else {
728                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
729
730                 CONNTRACK_STAT_INC(new);
731         }
732
733         /* Overload tuple linked list to put us in unconfirmed list. */
734         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
735
736         write_unlock_bh(&ip_conntrack_lock);
737
738         if (exp) {
739                 if (exp->expectfn)
740                         exp->expectfn(conntrack, exp);
741                 ip_conntrack_expect_put(exp);
742         }
743
744         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
745 }
746
747 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
748 static inline struct ip_conntrack *
749 resolve_normal_ct(struct sk_buff *skb,
750                   struct ip_conntrack_protocol *proto,
751                   int *set_reply,
752                   unsigned int hooknum,
753                   enum ip_conntrack_info *ctinfo)
754 {
755         struct ip_conntrack_tuple tuple;
756         struct ip_conntrack_tuple_hash *h;
757         struct ip_conntrack *ct;
758
759         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
760
761         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
762                                 &tuple,proto))
763                 return NULL;
764
765         /* look for tuple match */
766         h = ip_conntrack_find_get(&tuple, NULL);
767         if (!h) {
768                 h = init_conntrack(&tuple, proto, skb);
769                 if (!h)
770                         return NULL;
771                 if (IS_ERR(h))
772                         return (void *)h;
773         }
774         ct = tuplehash_to_ctrack(h);
775
776         /* It exists; we have (non-exclusive) reference. */
777         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
778                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
779                 /* Please set the reply bit if this packet is OK */
780                 *set_reply = 1;
781         } else {
782                 /* Once we've had two-way comms, always ESTABLISHED. */
783                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
784                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
785                                ct);
786                         *ctinfo = IP_CT_ESTABLISHED;
787                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
788                         DEBUGP("ip_conntrack_in: related packet for %p\n",
789                                ct);
790                         *ctinfo = IP_CT_RELATED;
791                 } else {
792                         DEBUGP("ip_conntrack_in: new packet for %p\n",
793                                ct);
794                         *ctinfo = IP_CT_NEW;
795                 }
796                 *set_reply = 0;
797         }
798         skb->nfct = &ct->ct_general;
799         skb->nfctinfo = *ctinfo;
800         return ct;
801 }
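
/* Summary of the ctinfo values chosen above: a packet in the reply
 * direction gets IP_CT_ESTABLISHED + IP_CT_IS_REPLY; in the original
 * direction it gets IP_CT_ESTABLISHED once IPS_SEEN_REPLY is set,
 * IP_CT_RELATED for an expected connection, and IP_CT_NEW otherwise. */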
802
803 /* Netfilter hook itself. */
804 unsigned int ip_conntrack_in(unsigned int hooknum,
805                              struct sk_buff **pskb,
806                              const struct net_device *in,
807                              const struct net_device *out,
808                              int (*okfn)(struct sk_buff *))
809 {
810         struct ip_conntrack *ct;
811         enum ip_conntrack_info ctinfo;
812         struct ip_conntrack_protocol *proto;
813         int set_reply = 0;
814         int ret;
815
816         /* Previously seen (loopback or untracked)?  Ignore. */
817         if ((*pskb)->nfct) {
818                 CONNTRACK_STAT_INC(ignore);
819                 return NF_ACCEPT;
820         }
821
822         /* Should never happen */
823         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
824                 if (net_ratelimit()) {
825                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
826                                (*pskb)->nh.iph->protocol, hooknum);
827                 }
828                 return NF_DROP;
829         }
830
831 /* Doesn't cover locally-generated broadcast, so not worth it. */
832 #if 0
833         /* Ignore broadcast: no `connection'. */
834         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
835                 printk("Broadcast packet!\n");
836                 return NF_ACCEPT;
837         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
838                    == htonl(0x000000FF)) {
839                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
840                        NIPQUAD((*pskb)->nh.iph->saddr),
841                        NIPQUAD((*pskb)->nh.iph->daddr),
842                        (*pskb)->sk, (*pskb)->pkt_type);
843         }
844 #endif
845
846         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
847
848         /* It may be a special packet, error, unclean...
849          * The inverse of the return code tells the netfilter
850          * core what to do with the packet. */
851         if (proto->error != NULL 
852             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
853                 CONNTRACK_STAT_INC(error);
854                 CONNTRACK_STAT_INC(invalid);
855                 return -ret;
856         }
857
858         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
859                 /* Not valid part of a connection */
860                 CONNTRACK_STAT_INC(invalid);
861                 return NF_ACCEPT;
862         }
863
864         if (IS_ERR(ct)) {
865                 /* Too stressed to deal. */
866                 CONNTRACK_STAT_INC(drop);
867                 return NF_DROP;
868         }
869
870         IP_NF_ASSERT((*pskb)->nfct);
871
872         ret = proto->packet(ct, *pskb, ctinfo);
873         if (ret < 0) {
874                 /* Invalid: the inverse of the return code tells
875                  * the netfilter core what to do */
876                 nf_conntrack_put((*pskb)->nfct);
877                 (*pskb)->nfct = NULL;
878                 CONNTRACK_STAT_INC(invalid);
879                 return -ret;
880         }
881
882         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
883                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
884
885         return ret;
886 }
887
888 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
889                    const struct ip_conntrack_tuple *orig)
890 {
891         return ip_ct_invert_tuple(inverse, orig, 
892                                   __ip_conntrack_proto_find(orig->dst.protonum));
893 }
894
895 /* Would two expected things clash? */
896 static inline int expect_clash(const struct ip_conntrack_expect *a,
897                                const struct ip_conntrack_expect *b)
898 {
899         /* The parts covered by the intersection of the masks must differ,
900            otherwise the two expectations clash. */
901         struct ip_conntrack_tuple intersect_mask
902                 = { { a->mask.src.ip & b->mask.src.ip,
903                       { a->mask.src.u.all & b->mask.src.u.all } },
904                     { a->mask.dst.ip & b->mask.dst.ip,
905                       { a->mask.dst.u.all & b->mask.dst.u.all },
906                       a->mask.dst.protonum & b->mask.dst.protonum } };
907
908         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
909 }
910
911 static inline int expect_matches(const struct ip_conntrack_expect *a,
912                                  const struct ip_conntrack_expect *b)
913 {
914         return a->master == b->master
915                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
916                 && ip_ct_tuple_equal(&a->mask, &b->mask);
917 }
918
919 /* Generally a bad idea to call this: could have matched already. */
920 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
921 {
922         struct ip_conntrack_expect *i;
923
924         write_lock_bh(&ip_conntrack_lock);
925         /* choose the oldest expectation to evict */
926         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
927                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
928                         unlink_expect(i);
929                         write_unlock_bh(&ip_conntrack_lock);
930                         ip_conntrack_expect_put(i);
931                         return;
932                 }
933         }
934         write_unlock_bh(&ip_conntrack_lock);
935 }
936
937 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
938 {
939         struct ip_conntrack_expect *new;
940
941         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
942         if (!new) {
943                 DEBUGP("expect_related: OOM allocating expect\n");
944                 return NULL;
945         }
946         new->master = me;
947         atomic_inc(&new->master->ct_general.use);
948         atomic_set(&new->use, 1);
949         return new;
950 }
951
952 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
953 {
954         if (atomic_dec_and_test(&exp->use)) {
955                 ip_conntrack_put(exp->master);
956                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
957         }
958 }
959
960 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
961 {
962         atomic_inc(&exp->use);
963         exp->master->expecting++;
964         list_add(&exp->list, &ip_conntrack_expect_list);
965
966         init_timer(&exp->timeout);
967         exp->timeout.data = (unsigned long)exp;
968         exp->timeout.function = expectation_timed_out;
969         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
970         add_timer(&exp->timeout);
971
972         exp->id = ++ip_conntrack_expect_next_id;
973         atomic_inc(&exp->use);
974         CONNTRACK_STAT_INC(expect_create);
975 }
976
977 /* Race with expectations being used means we could have none to find; OK. */
978 static void evict_oldest_expect(struct ip_conntrack *master)
979 {
980         struct ip_conntrack_expect *i;
981
982         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
983                 if (i->master == master) {
984                         if (del_timer(&i->timeout)) {
985                                 unlink_expect(i);
986                                 ip_conntrack_expect_put(i);
987                         }
988                         break;
989                 }
990         }
991 }
992
993 static inline int refresh_timer(struct ip_conntrack_expect *i)
994 {
995         if (!del_timer(&i->timeout))
996                 return 0;
997
998         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
999         add_timer(&i->timeout);
1000         return 1;
1001 }
1002
1003 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1004 {
1005         struct ip_conntrack_expect *i;
1006         int ret;
1007
1008         DEBUGP("ip_conntrack_expect_related %p\n", expect);
1009         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1010         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1011
1012         write_lock_bh(&ip_conntrack_lock);
1013         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1014                 if (expect_matches(i, expect)) {
1015                         /* Refresh timer: if it's dying, ignore.. */
1016                         if (refresh_timer(i)) {
1017                                 ret = 0;
1018                                 goto out;
1019                         }
1020                 } else if (expect_clash(i, expect)) {
1021                         ret = -EBUSY;
1022                         goto out;
1023                 }
1024         }
1025
1026         /* Will be over limit? */
1027         if (expect->master->helper->max_expected && 
1028             expect->master->expecting >= expect->master->helper->max_expected)
1029                 evict_oldest_expect(expect->master);
1030
1031         ip_conntrack_expect_insert(expect);
1032         ip_conntrack_expect_event(IPEXP_NEW, expect);
1033         ret = 0;
1034 out:
1035         write_unlock_bh(&ip_conntrack_lock);
1036         return ret;
1037 }
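
/* Typical caller pattern (a sketch modelled on the FTP helper; see
 * ip_conntrack_ftp.c for the real code):
 *
 *      exp = ip_conntrack_expect_alloc(ct);
 *      if (exp == NULL)
 *              return NF_DROP;
 *      exp->tuple = ...;                 set to the expected data connection
 *      exp->mask = ...;
 *      exp->expectfn = NULL;
 *      if (ip_conntrack_expect_related(exp) != 0)
 *              ret = NF_DROP;
 *      ip_conntrack_expect_put(exp);     drops the allocation reference
 */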
1038
1039 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1040    implicitly racy: see __ip_conntrack_confirm */
1041 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1042                               const struct ip_conntrack_tuple *newreply)
1043 {
1044         write_lock_bh(&ip_conntrack_lock);
1045         /* Should be unconfirmed, so not in hash table yet */
1046         IP_NF_ASSERT(!is_confirmed(conntrack));
1047
1048         DEBUGP("Altering reply tuple of %p to ", conntrack);
1049         DUMP_TUPLE(newreply);
1050
1051         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1052         if (!conntrack->master && conntrack->expecting == 0)
1053                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1054         write_unlock_bh(&ip_conntrack_lock);
1055 }
1056
1057 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1058 {
1059         BUG_ON(me->timeout == 0);
1060         write_lock_bh(&ip_conntrack_lock);
1061         list_prepend(&helpers, me);
1062         write_unlock_bh(&ip_conntrack_lock);
1063
1064         return 0;
1065 }
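
/* A conntrack helper module (the FTP helper, for instance) typically fills
 * in a static struct ip_conntrack_helper -- .name, .me = THIS_MODULE, the
 * .tuple/.mask describing the control connection it wants to see,
 * .max_expected and .timeout for its expectations, and a ->help() callback
 * -- then calls ip_conntrack_helper_register() from its module init and
 * ip_conntrack_helper_unregister() on exit.  (Illustrative sketch; see
 * ip_conntrack_ftp.c for a real example.) */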
1066
1067 struct ip_conntrack_helper *
1068 __ip_conntrack_helper_find_byname(const char *name)
1069 {
1070         struct ip_conntrack_helper *h;
1071
1072         list_for_each_entry(h, &helpers, list) {
1073                 if (!strcmp(h->name, name))
1074                         return h;
1075         }
1076
1077         return NULL;
1078 }
1079
1080 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1081                          const struct ip_conntrack_helper *me)
1082 {
1083         if (tuplehash_to_ctrack(i)->helper == me) {
1084                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1085                 tuplehash_to_ctrack(i)->helper = NULL;
1086         }
1087         return 0;
1088 }
1089
1090 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1091 {
1092         unsigned int i;
1093         struct ip_conntrack_expect *exp, *tmp;
1094
1095         /* Need write lock here, to delete helper. */
1096         write_lock_bh(&ip_conntrack_lock);
1097         LIST_DELETE(&helpers, me);
1098
1099         /* Get rid of expectations */
1100         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1101                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1102                         unlink_expect(exp);
1103                         ip_conntrack_expect_put(exp);
1104                 }
1105         }
1106         /* Get rid of expecteds, set helpers to NULL. */
1107         LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1108         for (i = 0; i < ip_conntrack_htable_size; i++)
1109                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1110                             struct ip_conntrack_tuple_hash *, me);
1111         write_unlock_bh(&ip_conntrack_lock);
1112
1113         /* Someone could be still looking at the helper in a bh. */
1114         synchronize_net();
1115 }
1116
1117 static inline void ct_add_counters(struct ip_conntrack *ct,
1118                                    enum ip_conntrack_info ctinfo,
1119                                    const struct sk_buff *skb)
1120 {
1121 #ifdef CONFIG_IP_NF_CT_ACCT
1122         if (skb) {
1123                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1124                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1125                                         ntohs(skb->nh.iph->tot_len);
1126         }
1127 #endif
1128 }
1129
1130 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1131 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
1132                         enum ip_conntrack_info ctinfo,
1133                         const struct sk_buff *skb,
1134                         unsigned long extra_jiffies)
1135 {
1136         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1137
1138         /* If not in hash table, timer will not be active yet */
1139         if (!is_confirmed(ct)) {
1140                 ct->timeout.expires = extra_jiffies;
1141                 ct_add_counters(ct, ctinfo, skb);
1142         } else {
1143                 write_lock_bh(&ip_conntrack_lock);
1144                 /* Need del_timer for race avoidance (may already be dying). */
1145                 if (del_timer(&ct->timeout)) {
1146                         ct->timeout.expires = jiffies + extra_jiffies;
1147                         add_timer(&ct->timeout);
1148                         ip_conntrack_event_cache(IPCT_REFRESH, skb);
1149                 }
1150                 ct_add_counters(ct, ctinfo, skb);
1151                 write_unlock_bh(&ip_conntrack_lock);
1152         }
1153 }
1154
1155 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1156     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1157 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1158  * in ip_conntrack_core, since we don't want the protocols to autoload
1159  * or depend on ctnetlink */
1160 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1161                                const struct ip_conntrack_tuple *tuple)
1162 {
1163         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1164                 &tuple->src.u.tcp.port);
1165         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1166                 &tuple->dst.u.tcp.port);
1167         return 0;
1168
1169 nfattr_failure:
1170         return -1;
1171 }
1172
1173 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1174                                struct ip_conntrack_tuple *t)
1175 {
1176         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1177                 return -EINVAL;
1178
1179         t->src.u.tcp.port =
1180                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1181         t->dst.u.tcp.port =
1182                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1183
1184         return 0;
1185 }
1186 #endif
1187
1188 /* Returns new sk_buff, or NULL */
1189 struct sk_buff *
1190 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1191 {
1192         skb_orphan(skb);
1193
1194         local_bh_disable(); 
1195         skb = ip_defrag(skb, user);
1196         local_bh_enable();
1197
1198         if (skb)
1199                 ip_send_check(skb->nh.iph);
1200         return skb;
1201 }
1202
1203 /* Used by ipt_REJECT. */
1204 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1205 {
1206         struct ip_conntrack *ct;
1207         enum ip_conntrack_info ctinfo;
1208
1209         /* This ICMP is in the reverse direction to the packet which caused it */
1210         ct = ip_conntrack_get(skb, &ctinfo);
1211         
1212         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1213                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1214         else
1215                 ctinfo = IP_CT_RELATED;
1216
1217         /* Attach to new skbuff, and increment count */
1218         nskb->nfct = &ct->ct_general;
1219         nskb->nfctinfo = ctinfo;
1220         nf_conntrack_get(nskb->nfct);
1221 }
1222
1223 static inline int
1224 do_iter(const struct ip_conntrack_tuple_hash *i,
1225         int (*iter)(struct ip_conntrack *i, void *data),
1226         void *data)
1227 {
1228         return iter(tuplehash_to_ctrack(i), data);
1229 }
1230
1231 /* Bring out ya dead! */
1232 static struct ip_conntrack_tuple_hash *
1233 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1234                 void *data, unsigned int *bucket)
1235 {
1236         struct ip_conntrack_tuple_hash *h = NULL;
1237
1238         write_lock_bh(&ip_conntrack_lock);
1239         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1240                 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1241                                 struct ip_conntrack_tuple_hash *, iter, data);
1242                 if (h)
1243                         break;
1244         }
1245         if (!h)
1246                 h = LIST_FIND_W(&unconfirmed, do_iter,
1247                                 struct ip_conntrack_tuple_hash *, iter, data);
1248         if (h)
1249                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1250         write_unlock_bh(&ip_conntrack_lock);
1251
1252         return h;
1253 }
1254
1255 void
1256 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1257 {
1258         struct ip_conntrack_tuple_hash *h;
1259         unsigned int bucket = 0;
1260
1261         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1262                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1263                 /* Time to push up daisies... */
1264                 if (del_timer(&ct->timeout))
1265                         death_by_timeout((unsigned long)ct);
1266                 /* ... else the timer will get him soon. */
1267
1268                 ip_conntrack_put(ct);
1269         }
1270 }
1271
1272 /* Fast function for those who don't want to parse /proc (and I don't
1273    blame them). */
1274 /* Reversing the socket's dst/src point of view gives us the reply
1275    mapping. */
1276 static int
1277 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1278 {
1279         struct inet_sock *inet = inet_sk(sk);
1280         struct ip_conntrack_tuple_hash *h;
1281         struct ip_conntrack_tuple tuple;
1282         
1283         IP_CT_TUPLE_U_BLANK(&tuple);
1284         tuple.src.ip = inet->rcv_saddr;
1285         tuple.src.u.tcp.port = inet->sport;
1286         tuple.dst.ip = inet->daddr;
1287         tuple.dst.u.tcp.port = inet->dport;
1288         tuple.dst.protonum = IPPROTO_TCP;
1289
1290         /* We only do TCP at the moment: is there a better way? */
1291         if (strcmp(sk->sk_prot->name, "TCP")) {
1292                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1293                 return -ENOPROTOOPT;
1294         }
1295
1296         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1297                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1298                        *len, sizeof(struct sockaddr_in));
1299                 return -EINVAL;
1300         }
1301
1302         h = ip_conntrack_find_get(&tuple, NULL);
1303         if (h) {
1304                 struct sockaddr_in sin;
1305                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1306
1307                 sin.sin_family = AF_INET;
1308                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1309                         .tuple.dst.u.tcp.port;
1310                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1311                         .tuple.dst.ip;
1312
1313                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1314                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1315                 ip_conntrack_put(ct);
1316                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1317                         return -EFAULT;
1318                 else
1319                         return 0;
1320         }
1321         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1322                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1323                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1324         return -ENOENT;
1325 }
1326
1327 static struct nf_sockopt_ops so_getorigdst = {
1328         .pf             = PF_INET,
1329         .get_optmin     = SO_ORIGINAL_DST,
1330         .get_optmax     = SO_ORIGINAL_DST+1,
1331         .get            = &getorigdst,
1332 };
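
/* Userspace side of SO_ORIGINAL_DST (a sketch, not part of this file): a
 * transparent proxy that accepted a REDIRECTed TCP connection can recover
 * the pre-NAT destination with
 *
 *      struct sockaddr_in dst;
 *      socklen_t len = sizeof(dst);
 *      if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *              connect to dst.sin_addr:dst.sin_port, the original target
 */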
1333
1334 static int kill_all(struct ip_conntrack *i, void *data)
1335 {
1336         return 1;
1337 }
1338
1339 static void free_conntrack_hash(void)
1340 {
1341         if (ip_conntrack_vmalloc)
1342                 vfree(ip_conntrack_hash);
1343         else
1344                 free_pages((unsigned long)ip_conntrack_hash, 
1345                            get_order(sizeof(struct list_head)
1346                                      * ip_conntrack_htable_size));
1347 }
1348
1349 void ip_conntrack_flush(void)
1350 {
1351         /* This makes sure all current packets have passed through
1352            the netfilter framework.  Roll on, two-stage module
1353            delete... */
1354         synchronize_net();
1355
1356         ip_ct_event_cache_flush();
1357  i_see_dead_people:
1358         ip_ct_iterate_cleanup(kill_all, NULL);
1359         if (atomic_read(&ip_conntrack_count) != 0) {
1360                 schedule();
1361                 goto i_see_dead_people;
1362         }
1363         /* wait until all references to ip_conntrack_untracked are dropped */
1364         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1365                 schedule();
1366 }
1367
1368 /* Mishearing the voices in his head, our hero wonders how he's
1369    supposed to kill the mall. */
1370 void ip_conntrack_cleanup(void)
1371 {
1372         ip_ct_attach = NULL;
1373         ip_conntrack_flush();
1374         kmem_cache_destroy(ip_conntrack_cachep);
1375         kmem_cache_destroy(ip_conntrack_expect_cachep);
1376         free_conntrack_hash();
1377         nf_unregister_sockopt(&so_getorigdst);
1378 }
1379
1380 static int hashsize;
1381 module_param(hashsize, int, 0400);
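
/* The bucket count can be overridden when the module is loaded, e.g.
 * "modprobe ip_conntrack hashsize=16384" (illustrative value; the 0400 mode
 * above makes the parameter read-only once loaded).  ip_conntrack_max is
 * then derived from it in ip_conntrack_init() below. */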
1382
1383 int __init ip_conntrack_init(void)
1384 {
1385         unsigned int i;
1386         int ret;
1387
1388         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1389          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1390         if (hashsize) {
1391                 ip_conntrack_htable_size = hashsize;
1392         } else {
1393                 ip_conntrack_htable_size
1394                         = (((num_physpages << PAGE_SHIFT) / 16384)
1395                            / sizeof(struct list_head));
1396                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1397                         ip_conntrack_htable_size = 8192;
1398                 if (ip_conntrack_htable_size < 16)
1399                         ip_conntrack_htable_size = 16;
1400         }
1401         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1402
1403         printk("ip_conntrack version %s (%u buckets, %d max)"
1404                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1405                ip_conntrack_htable_size, ip_conntrack_max,
1406                sizeof(struct ip_conntrack));
1407
1408         ret = nf_register_sockopt(&so_getorigdst);
1409         if (ret != 0) {
1410                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1411                 return ret;
1412         }
1413
1414         /* AK: the hash table is twice as big as needed because it
1415            uses list_head.  It would be much nicer for caches to use a
1416            single-pointer list head here. */
1417         ip_conntrack_vmalloc = 0; 
1418         ip_conntrack_hash 
1419                 = (void *)__get_free_pages(GFP_KERNEL,
1420                                          get_order(sizeof(struct list_head)
1421                                                    *ip_conntrack_htable_size));
1422         if (!ip_conntrack_hash) { 
1423                 ip_conntrack_vmalloc = 1;
1424                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1425                 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1426                                             * ip_conntrack_htable_size);
1427         }
1428         if (!ip_conntrack_hash) {
1429                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1430                 goto err_unreg_sockopt;
1431         }
1432
1433         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1434                                                 sizeof(struct ip_conntrack), 0,
1435                                                 0, NULL, NULL);
1436         if (!ip_conntrack_cachep) {
1437                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1438                 goto err_free_hash;
1439         }
1440
1441         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1442                                         sizeof(struct ip_conntrack_expect),
1443                                         0, 0, NULL, NULL);
1444         if (!ip_conntrack_expect_cachep) {
1445                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1446                 goto err_free_conntrack_slab;
1447         }
1448
1449         /* Don't NEED lock here, but good form anyway. */
1450         write_lock_bh(&ip_conntrack_lock);
1451         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1452                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1453         /* Sew in builtin protocols. */
1454         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1455         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1456         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1457         write_unlock_bh(&ip_conntrack_lock);
1458
1459         for (i = 0; i < ip_conntrack_htable_size; i++)
1460                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1461
1462         /* For use by ipt_REJECT */
1463         ip_ct_attach = ip_conntrack_attach;
1464
1465         /* Set up fake conntrack:
1466             - to never be deleted, not in any hashes */
1467         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1468         /*  - and make it look like a confirmed connection */
1469         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1470
1471         return ret;
1472
1473 err_free_conntrack_slab:
1474         kmem_cache_destroy(ip_conntrack_cachep);
1475 err_free_hash:
1476         free_conntrack_hash();
1477 err_unreg_sockopt:
1478         nf_unregister_sockopt(&so_getorigdst);
1479
1480         return -ENOMEM;
1481 }