/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *      - new API and handling of conntrack/nat helpers
 *      - now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *      - add usage/reference counts to ip_conntrack_expect
 *      - export ip_conntrack[_expect]_{find_get,put} functions
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>

/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
   registrations, conntrack timers. */
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION    "2.4"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(ip_conntrack_lock);

/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep __read_mostly;
static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;

static unsigned int ip_conntrack_next_id;
static unsigned int ip_conntrack_expect_next_id;
#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);

DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);

/* deliver cached events and clear cache entry - must be called with locally
 * disabled softirqs */
static inline void
__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
{
        DEBUGP("ecache: delivering events for %p\n", ecache->ct);
        if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
                atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
                                           ecache->ct);
        ecache->events = 0;
        ip_conntrack_put(ecache->ct);
        ecache->ct = NULL;
}
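
/*
 * Explanatory note on the cache model: each CPU caches at most one
 * conntrack plus a bitmask of events pending for it.  Events are
 * accumulated while packets for that conntrack are being processed,
 * and the whole mask is flushed to the notifier chain in a single
 * call here, keeping notifier overhead off the per-packet fast path.
 */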

/* Deliver all cached events for a particular conntrack.  This is called
 * by code prior to async packet handling or freeing the skb */
void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
{
        struct ip_conntrack_ecache *ecache;

        local_bh_disable();
        ecache = &__get_cpu_var(ip_conntrack_ecache);
        if (ecache->ct == ct)
                __ip_ct_deliver_cached_events(ecache);
        local_bh_enable();
}

void __ip_ct_event_cache_init(struct ip_conntrack *ct)
{
        struct ip_conntrack_ecache *ecache;

        /* take care of delivering potentially old events */
        ecache = &__get_cpu_var(ip_conntrack_ecache);
        BUG_ON(ecache->ct == ct);
        if (ecache->ct)
                __ip_ct_deliver_cached_events(ecache);
        /* initialize for this conntrack/packet */
        ecache->ct = ct;
        nf_conntrack_get(&ct->ct_general);
}

/* Flush the event cache - this touches other CPUs' data and must not be
 * called while packets are still passing through the code */
static void ip_ct_event_cache_flush(void)
{
        struct ip_conntrack_ecache *ecache;
        int cpu;

        for_each_possible_cpu(cpu) {
                ecache = &per_cpu(ip_conntrack_ecache, cpu);
                if (ecache->ct)
                        ip_conntrack_put(ecache->ct);
        }
}
#else
static inline void ip_ct_event_cache_flush(void) {}
#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */

DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
                            unsigned int size, unsigned int rnd)
{
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             rnd) % size);
}

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
        return __hash_conntrack(tuple, ip_conntrack_htable_size,
                                ip_conntrack_hash_rnd);
}
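
/*
 * Explanatory note: the tuple is folded into a bucket index with a
 * seeded Jenkins hash.  The seed (ip_conntrack_hash_rnd) is picked at
 * random on first use, so remote senders cannot predict bucket
 * placement and deliberately overload one hash chain.  A lookup is
 * then a plain chain walk (sketch):
 *
 *      hash = hash_conntrack(tuple);
 *      list_for_each_entry(h, &ip_conntrack_hash[hash], list)
 *              if (ip_ct_tuple_equal(tuple, &h->tuple))
 *                      ...found...
 */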

int
ip_ct_get_tuple(const struct iphdr *iph,
                const struct sk_buff *skb,
                unsigned int dataoff,
                struct ip_conntrack_tuple *tuple,
                const struct ip_conntrack_protocol *protocol)
{
        /* Should never happen: conntrack only sees defragmented packets. */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;
        tuple->dst.dir = IP_CT_DIR_ORIGINAL;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig,
                   const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;
        inverse->dst.dir = !orig->dst.dir;

        return protocol->invert_tuple(inverse, orig);
}

/* ip_conntrack_expect helper functions */
void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
{
        ASSERT_WRITE_LOCK(&ip_conntrack_lock);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));
        list_del(&exp->list);
        CONNTRACK_STAT_INC(expect_delete);
        exp->master->expecting--;
        ip_conntrack_expect_put(exp);
}

static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *exp = (void *)ul_expect;

        write_lock_bh(&ip_conntrack_lock);
        ip_ct_unlink_expect(exp);
        write_unlock_bh(&ip_conntrack_lock);
        ip_conntrack_expect_put(exp);
}

struct ip_conntrack_expect *
__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
                        atomic_inc(&i->use);
                        return i;
                }
        }
        return NULL;
}

/* Just find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        read_lock_bh(&ip_conntrack_lock);
        i = __ip_conntrack_expect_find(tuple);
        read_unlock_bh(&ip_conntrack_lock);

        return i;
}

/* If an expectation for this connection is found, it is deleted from
 * the global list and returned. */
static struct ip_conntrack_expect *
find_expectation(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                /* If the master is not in the hash table yet (ie. the packet
                   hasn't left this machine yet), how could the other end know
                   about the expectation?  Hence these are not the droids you
                   are looking for (if the master ct never got confirmed, we'd
                   hold a reference to it and weird things would happen to
                   future packets). */
                if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
                    && is_confirmed(i->master)) {
                        if (i->flags & IP_CT_EXPECT_PERMANENT) {
                                atomic_inc(&i->use);
                                return i;
                        } else if (del_timer(&i->timeout)) {
                                ip_ct_unlink_expect(i);
                                return i;
                        }
                }
        }
        return NULL;
}
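
/*
 * Explanatory note: a normal expectation is one-shot - the first
 * matching packet claims it, its timer is stopped and it is unlinked
 * before the new connection is set up.  IP_CT_EXPECT_PERMANENT
 * expectations stay on the list and may spawn any number of related
 * connections until the master goes away.
 */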

/* delete all expectations for this conntrack */
void ip_ct_remove_expectations(struct ip_conntrack *ct)
{
        struct ip_conntrack_expect *i, *tmp;

        /* Optimization: most connections never expect any others. */
        if (ct->expecting == 0)
                return;

        list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
                if (i->master == ct && del_timer(&i->timeout)) {
                        ip_ct_unlink_expect(i);
                        ip_conntrack_expect_put(i);
                }
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        unsigned int ho, hr;

        DEBUGP("clean_from_lists(%p)\n", ct);
        ASSERT_WRITE_LOCK(&ip_conntrack_lock);

        ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

        /* Destroy all pending expectations */
        ip_ct_remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        ip_conntrack_event(IPCT_DESTROY, ct);
        set_bit(IPS_DYING_BIT, &ct->status);

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        write_lock_bh(&ip_conntrack_lock);
        /* Expectations will have been removed in clean_from_lists,
         * except TFTP can create an expectation on the first packet,
         * before the connection is in the list, so we need to clean
         * here, too. */
        ip_ct_remove_expectations(ct);

        /* We overload the first tuple to link into the unconfirmed list. */
        if (!is_confirmed(ct)) {
                BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
                list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        }

        CONNTRACK_STAT_INC(delete);
        write_unlock_bh(&ip_conntrack_lock);

        if (ct->master)
                ip_conntrack_put(ct->master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        ip_conntrack_free(ct);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        write_lock_bh(&ip_conntrack_lock);
        /* Inside lock so preempt is disabled on module removal path.
         * Otherwise we can get spurious warnings. */
        CONNTRACK_STAT_INC(delete_list);
        clean_from_lists(ct);
        write_unlock_bh(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        ASSERT_READ_LOCK(&ip_conntrack_lock);
        return tuplehash_to_ctrack(i) != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
}

struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        ASSERT_READ_LOCK(&ip_conntrack_lock);
        list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
                if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
                        CONNTRACK_STAT_INC(found);
                        return h;
                }
                CONNTRACK_STAT_INC(searched);
        }

        return NULL;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        read_lock_bh(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
        read_unlock_bh(&ip_conntrack_lock);

        return h;
}
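
/*
 * Usage sketch: ip_conntrack_find_get() returns the tuplehash with the
 * conntrack's refcount raised, so every successful lookup must be
 * paired with a put (getorigdst() below follows this pattern):
 *
 *      h = ip_conntrack_find_get(&tuple, NULL);
 *      if (h) {
 *              ...use tuplehash_to_ctrack(h)...
 *              ip_conntrack_put(tuplehash_to_ctrack(h));
 *      }
 */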

static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
                                        unsigned int hash,
                                        unsigned int repl_hash)
{
        ct->id = ++ip_conntrack_next_id;
        list_prepend(&ip_conntrack_hash[hash],
                     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        list_prepend(&ip_conntrack_hash[repl_hash],
                     &ct->tuplehash[IP_CT_DIR_REPLY].list);
}

void ip_conntrack_hash_insert(struct ip_conntrack *ct)
{
        unsigned int hash, repl_hash;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        write_lock_bh(&ip_conntrack_lock);
        __ip_conntrack_hash_insert(ct, hash, repl_hash);
        write_unlock_bh(&ip_conntrack_lock);
}

/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = ip_conntrack_get(*pskb, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in the other direction.  The actual packet
           which created the connection will be IP_CT_NEW or, for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        write_lock_bh(&ip_conntrack_lock);

        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost the race. */
        if (!LIST_FIND(&ip_conntrack_hash[hash],
                       conntrack_tuple_cmp,
                       struct ip_conntrack_tuple_hash *,
                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
            && !LIST_FIND(&ip_conntrack_hash[repl_hash],
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
                /* Remove from unconfirmed list */
                list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

                __ip_conntrack_hash_insert(ct, hash, repl_hash);
                /* Timer relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
                ct->timeout.expires += jiffies;
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
                CONNTRACK_STAT_INC(insert);
                write_unlock_bh(&ip_conntrack_lock);
                if (ct->helper)
                        ip_conntrack_event_cache(IPCT_HELPER, *pskb);
#ifdef CONFIG_IP_NF_NAT_NEEDED
                if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
                    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
                        ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
#endif
                ip_conntrack_event_cache(master_ct(ct) ?
                                         IPCT_RELATED : IPCT_NEW, *pskb);

                return NF_ACCEPT;
        }

        CONNTRACK_STAT_INC(insert_failed);
        write_unlock_bh(&ip_conntrack_lock);

        return NF_DROP;
}
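
/*
 * Explanatory note: conntrack entries live in two phases.  A new entry
 * is created when the first packet enters tracking, but it sits on the
 * "unconfirmed" list; only if that packet survives all netfilter hooks
 * is the entry confirmed here and moved into the hash table.  Packets
 * dropped mid-traversal therefore never leave half-set-up connections
 * in the hash.
 */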

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        read_lock_bh(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        read_unlock_bh(&ip_conntrack_lock);

        return h != NULL;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
        return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
}

static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct = NULL;
        int dropped = 0;

        read_lock_bh(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
        if (h) {
                ct = tuplehash_to_ctrack(h);
                atomic_inc(&ct->ct_general.use);
        }
        read_unlock_bh(&ip_conntrack_lock);

        if (!ct)
                return dropped;

        if (del_timer(&ct->timeout)) {
                death_by_timeout((unsigned long)ct);
                dropped = 1;
                CONNTRACK_STAT_INC(early_drop);
        }
        ip_conntrack_put(ct);
        return dropped;
}
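
/*
 * Explanatory note: early_drop() implements the table-full policy.
 * Only entries that never saw a reply (not IPS_ASSURED) are eligible,
 * and only within the hash chain the new connection would occupy, so
 * established traffic is never evicted to make room for a flood of
 * one-way packets.
 */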

static inline int helper_cmp(const struct ip_conntrack_helper *i,
                             const struct ip_conntrack_tuple *rtuple)
{
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

static struct ip_conntrack_helper *
__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
{
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
}

struct ip_conntrack_helper *
ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_helper *helper;

        /* need ip_conntrack_lock to assure that the helper exists until
         * try_module_get() is called */
        read_lock_bh(&ip_conntrack_lock);

        helper = __ip_conntrack_helper_find(tuple);
        if (helper) {
                /* need to increase the module usage count to assure the
                 * helper will not go away while the caller is e.g. busy
                 * putting a conntrack in the hash that uses the helper */
                if (!try_module_get(helper->me))
                        helper = NULL;
        }

        read_unlock_bh(&ip_conntrack_lock);

        return helper;
}

void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
{
        module_put(helper->me);
}

struct ip_conntrack_protocol *
__ip_conntrack_proto_find(u_int8_t protocol)
{
        return ip_ct_protos[protocol];
}

/* this is guaranteed to always return a valid protocol helper, since
 * it falls back to generic_protocol */
struct ip_conntrack_protocol *
ip_conntrack_proto_find_get(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        preempt_disable();
        p = __ip_conntrack_proto_find(protocol);
        if (p) {
                if (!try_module_get(p->me))
                        p = &ip_conntrack_generic_protocol;
        }
        preempt_enable();

        return p;
}

void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
{
        module_put(p->me);
}

struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
                                        struct ip_conntrack_tuple *repl)
{
        struct ip_conntrack *conntrack;

        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        if (ip_conntrack_max
            && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
                unsigned int hash = hash_conntrack(orig);
                /* Try dropping from this hash chain. */
                if (!early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        atomic_inc(&ip_conntrack_count);

        return conntrack;
}
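
/*
 * Usage sketch: ip_conntrack_alloc() reports failure (table full, or
 * slab exhausted) as an ERR_PTR value rather than NULL, so callers
 * such as init_conntrack() below check it with IS_ERR():
 *
 *      ct = ip_conntrack_alloc(&tuple, &repl_tuple);
 *      if (IS_ERR(ct))
 *              ...bail out...
 */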

void
ip_conntrack_free(struct ip_conntrack *conntrack)
{
        atomic_dec(&ip_conntrack_count);
        kmem_cache_free(ip_conntrack_cachep, conntrack);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
 * failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        struct ip_conntrack_expect *exp;

        if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
        if (conntrack == NULL || IS_ERR(conntrack))
                return (struct ip_conntrack_tuple_hash *)conntrack;

        if (!protocol->new(conntrack, skb)) {
                ip_conntrack_free(conntrack);
                return NULL;
        }

        write_lock_bh(&ip_conntrack_lock);
        exp = find_expectation(tuple);

        if (exp) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                        conntrack, exp);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = exp->master;
#ifdef CONFIG_IP_NF_CONNTRACK_MARK
                conntrack->mark = exp->master->mark;
#endif
#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
    defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
                /* this is ugly, but there is no other place to put it */
                conntrack->nat.masq_index = exp->master->nat.masq_index;
#endif
#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
                conntrack->secmark = exp->master->secmark;
#endif
                nf_conntrack_get(&conntrack->master->ct_general);
                CONNTRACK_STAT_INC(expect_new);
        } else {
                conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);

                CONNTRACK_STAT_INC(new);
        }

        /* Overload the tuple linked list to put us in the unconfirmed list. */
        list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

        write_unlock_bh(&ip_conntrack_lock);

        if (exp) {
                if (exp->expectfn)
                        exp->expectfn(conntrack, exp);
                ip_conntrack_expect_put(exp);
        }

        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
                             &tuple, proto))
                return NULL;

        /* look for a tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }
        ct = tuplehash_to_ctrack(h);

        /* It exists; we have a (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Set the reply bit if this packet is OK */
                *set_reply = 1;
        } else {
                /* Once we've had two-way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &ct->ct_general;
        skb->nfctinfo = *ctinfo;
        return ct;
}
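
/*
 * Explanatory note on ctinfo: the value encodes both state and
 * direction.  Original-direction packets get IP_CT_NEW, IP_CT_RELATED
 * or IP_CT_ESTABLISHED; reply-direction packets get the same states
 * offset by IP_CT_IS_REPLY (e.g. IP_CT_ESTABLISHED + IP_CT_IS_REPLY
 * above), which is what CTINFO2DIR() relies on.
 */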

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply = 0;
        int ret;

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct) {
                CONNTRACK_STAT_INC(ignore);
                return NF_ACCEPT;
        }

        /* Should never happen: fragments are reassembled before tracking. */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit())
                        printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                               (*pskb)->nh.iph->protocol, hooknum);
                return NF_DROP;
        }

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);

        /* It may be a special packet: error, unclean...  The inverse of
         * the return code tells the netfilter core what to do with the
         * packet. */
        if (proto->error != NULL
            && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
                CONNTRACK_STAT_INC(error);
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum,
                                     &ctinfo))) {
                /* Not a valid part of a connection */
                CONNTRACK_STAT_INC(invalid);
                return NF_ACCEPT;
        }

        if (IS_ERR(ct)) {
                /* Too stressed to deal. */
                CONNTRACK_STAT_INC(drop);
                return NF_DROP;
        }

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret < 0) {
                /* Invalid: the inverse of the return code tells
                 * the netfilter core what to do */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
                ip_conntrack_event_cache(IPCT_STATUS, *pskb);

        return ret;
}

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return ip_ct_invert_tuple(inverse, orig,
                                  __ip_conntrack_proto_find(orig->dst.protonum));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *a,
                               const struct ip_conntrack_expect *b)
{
        /* If the tuples agree on the part covered by the intersection
           of both masks, the two expectations could match the same
           packet: they clash. */
        struct ip_conntrack_tuple intersect_mask
                = { { a->mask.src.ip & b->mask.src.ip,
                      { a->mask.src.u.all & b->mask.src.u.all } },
                    { a->mask.dst.ip & b->mask.dst.ip,
                      { a->mask.dst.u.all & b->mask.dst.u.all },
                      a->mask.dst.protonum & b->mask.dst.protonum } };

        return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}
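
/*
 * Worked example (illustrative): expectation A matches dst
 * 10.0.0.1:2000 with a full mask on the destination port; expectation
 * B matches dst 10.0.0.1 with the port fully wildcarded (mask 0).
 * The intersection of the masks zeroes out the port, the remaining
 * masked fields compare equal, so a packet to 10.0.0.1:2000 would
 * satisfy both: the two expectations clash.
 */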

static inline int expect_matches(const struct ip_conntrack_expect *a,
                                 const struct ip_conntrack_expect *b)
{
        return a->master == b->master
                && ip_ct_tuple_equal(&a->tuple, &b->tuple)
                && ip_ct_tuple_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
        struct ip_conntrack_expect *i;

        write_lock_bh(&ip_conntrack_lock);
        /* choose the oldest expectation to evict */
        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, exp) && del_timer(&i->timeout)) {
                        ip_ct_unlink_expect(i);
                        write_unlock_bh(&ip_conntrack_lock);
                        ip_conntrack_expect_put(i);
                        return;
                }
        }
        write_unlock_bh(&ip_conntrack_lock);
}

/* We don't increase the master conntrack refcount for unfulfilled
 * expectations.  During conntrack destruction, expectations are
 * always killed before the conntrack itself. */
struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
        struct ip_conntrack_expect *new;

        new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }
        new->master = me;
        atomic_set(&new->use, 1);
        return new;
}

void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
        if (atomic_dec_and_test(&exp->use))
                kmem_cache_free(ip_conntrack_expect_cachep, exp);
}

static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
        atomic_inc(&exp->use);
        exp->master->expecting++;
        list_add(&exp->list, &ip_conntrack_expect_list);

        init_timer(&exp->timeout);
        exp->timeout.data = (unsigned long)exp;
        exp->timeout.function = expectation_timed_out;
        exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
        add_timer(&exp->timeout);

        exp->id = ++ip_conntrack_expect_next_id;
        atomic_inc(&exp->use);
        CONNTRACK_STAT_INC(expect_create);
}
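
/*
 * Note (inference from the refcounting above): the two atomic_inc()
 * calls in ip_conntrack_expect_insert() take one reference for the
 * global expectation list and one for the running timer; the list
 * reference is dropped by ip_ct_unlink_expect(), the timer one by
 * whoever stops the timer (expectation_timed_out() or a successful
 * del_timer() caller).
 */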

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct ip_conntrack *master)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (i->master == master) {
                        if (del_timer(&i->timeout)) {
                                ip_ct_unlink_expect(i);
                                ip_conntrack_expect_put(i);
                        }
                        break;
                }
        }
}

static inline int refresh_timer(struct ip_conntrack_expect *i)
{
        if (!del_timer(&i->timeout))
                return 0;

        i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
        add_timer(&i->timeout);
        return 1;
}

int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
{
        struct ip_conntrack_expect *i;
        int ret;

        DEBUGP("ip_conntrack_expect_related %p\n", expect);
        DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

        write_lock_bh(&ip_conntrack_lock);
        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, expect)) {
                        /* Refresh timer: if it's dying, ignore. */
                        if (refresh_timer(i)) {
                                ret = 0;
                                goto out;
                        }
                } else if (expect_clash(i, expect)) {
                        ret = -EBUSY;
                        goto out;
                }
        }

        /* Will we go over the limit? */
        if (expect->master->helper->max_expected &&
            expect->master->expecting >= expect->master->helper->max_expected)
                evict_oldest_expect(expect->master);

        ip_conntrack_expect_insert(expect);
        ip_conntrack_expect_event(IPEXP_NEW, expect);
        ret = 0;
out:
        write_unlock_bh(&ip_conntrack_lock);
        return ret;
}
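
/*
 * Typical caller pattern (a sketch, modeled on conntrack helpers such
 * as the FTP helper): allocate, fill in tuple/mask/expectfn, register,
 * then drop the allocation reference:
 *
 *      exp = ip_conntrack_expect_alloc(ct);
 *      if (exp == NULL)
 *              return NF_DROP;
 *      ...fill in exp->tuple, exp->mask, exp->expectfn...
 *      if (ip_conntrack_expect_related(exp) != 0)
 *              ...handle -EBUSY...
 *      ip_conntrack_expect_put(exp);
 */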

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __ip_conntrack_confirm */
void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                              const struct ip_conntrack_tuple *newreply)
{
        write_lock_bh(&ip_conntrack_lock);
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master && conntrack->expecting == 0)
                conntrack->helper = __ip_conntrack_helper_find(newreply);
        write_unlock_bh(&ip_conntrack_lock);
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        BUG_ON(me->timeout == 0);
        write_lock_bh(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        write_unlock_bh(&ip_conntrack_lock);

        return 0;
}

struct ip_conntrack_helper *
__ip_conntrack_helper_find_byname(const char *name)
{
        struct ip_conntrack_helper *h;

        list_for_each_entry(h, &helpers, list) {
                if (!strcmp(h->name, name))
                        return h;
        }

        return NULL;
}

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
{
        if (tuplehash_to_ctrack(i)->helper == me) {
                ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
                tuplehash_to_ctrack(i)->helper = NULL;
        }
        return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;
        struct ip_conntrack_expect *exp, *tmp;

        /* Need write lock here, to delete helper. */
        write_lock_bh(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);

        /* Get rid of expectations */
        list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
                if (exp->master->helper == me && del_timer(&exp->timeout)) {
                        ip_ct_unlink_expect(exp);
                        ip_conntrack_expect_put(exp);
                }
        }
        /* Clear the helper in any conntracks that still point at us. */
        LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
        write_unlock_bh(&ip_conntrack_lock);

        /* Someone could still be looking at the helper in a bh. */
        synchronize_net();
}

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __ip_ct_refresh_acct(struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo,
                        const struct sk_buff *skb,
                        unsigned long extra_jiffies,
                        int do_acct)
{
        int event = 0;

        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
        IP_NF_ASSERT(skb);

        write_lock_bh(&ip_conntrack_lock);

        /* Only update if this is not a fixed timeout */
        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
                write_unlock_bh(&ip_conntrack_lock);
                return;
        }

        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
                event = IPCT_REFRESH;
        } else {
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                        event = IPCT_REFRESH;
                }
        }

#ifdef CONFIG_IP_NF_CT_ACCT
        if (do_acct) {
                ct->counters[CTINFO2DIR(ctinfo)].packets++;
                ct->counters[CTINFO2DIR(ctinfo)].bytes +=
                                                ntohs(skb->nh.iph->tot_len);
                if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
                    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
                        event |= IPCT_COUNTER_FILLING;
        }
#endif

        write_unlock_bh(&ip_conntrack_lock);

        /* must be unlocked when calling event cache */
        if (event)
                ip_conntrack_event_cache(event, skb);
}
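
/*
 * Explanatory note on the 0x80000000 test in __ip_ct_refresh_acct()
 * above: once a per-direction packet or byte counter has its top bit
 * set, IPCT_COUNTER_FILLING is raised so that an interested listener
 * (e.g. ctnetlink-based accounting) can read and reset the 32-bit
 * counters before they wrap.
 */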

#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
/* Generic function for tcp/udp/sctp/dccp and alike.  This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
                               const struct ip_conntrack_tuple *tuple)
{
        NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
                &tuple->src.u.tcp.port);
        NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
                &tuple->dst.u.tcp.port);
        return 0;

nfattr_failure:
        return -1;
}

int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
                               struct ip_conntrack_tuple *t)
{
        if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
                return -EINVAL;

        t->src.u.tcp.port =
                *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
        t->dst.u.tcp.port =
                *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);

        return 0;
}
#endif

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
        skb_orphan(skb);

        local_bh_disable();
        skb = ip_defrag(skb, user);
        local_bh_enable();

        if (skb)
                ip_send_check(skb->nh.iph);
        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        /* This ICMP is in reverse direction to the packet which caused it */
        ct = ip_conntrack_get(skb, &ctinfo);

        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach to new skbuff, and increment count */
        nskb->nfct = &ct->ct_general;
        nskb->nfctinfo = ctinfo;
        nf_conntrack_get(nskb->nfct);
}

static inline int
do_iter(const struct ip_conntrack_tuple_hash *i,
        int (*iter)(struct ip_conntrack *i, void *data),
        void *data)
{
        return iter(tuplehash_to_ctrack(i), data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
{
        struct ip_conntrack_tuple_hash *h = NULL;

        write_lock_bh(&ip_conntrack_lock);
        for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
                h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
                                struct ip_conntrack_tuple_hash *, iter, data);
                if (h)
                        break;
        }
        if (!h)
                h = LIST_FIND_W(&unconfirmed, do_iter,
                                struct ip_conntrack_tuple_hash *, iter, data);
        if (h)
                atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
        write_unlock_bh(&ip_conntrack_lock);

        return h;
}

void
ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;

        while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
                struct ip_conntrack *ct = tuplehash_to_ctrack(h);
                /* Time to push up daisies... */
                if (del_timer(&ct->timeout))
                        death_by_timeout((unsigned long)ct);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(ct);
        }
}
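
/*
 * Usage sketch: the iterator takes a predicate that returns non-zero
 * for entries to kill.  A hypothetical example, flushing every
 * conntrack originating from one address:
 *
 *      static int kill_by_saddr(struct ip_conntrack *ct, void *data)
 *      {
 *              return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
 *                      == *(u_int32_t *)data;
 *      }
 *
 *      ip_ct_iterate_cleanup(kill_by_saddr, &saddr);
 *
 * ip_conntrack_flush() below is the same call with an always-true
 * predicate.
 */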

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %Zu\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;
                struct ip_conntrack *ct = tuplehash_to_ctrack(h);

                sin.sin_family = AF_INET;
                sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;
                memset(sin.sin_zero, 0, sizeof(sin.sin_zero));

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(ct);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}

static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};
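
/*
 * Userspace sketch: a transparent proxy that accepted a REDIRECTed
 * connection can recover the pre-NAT destination through this sockopt:
 *
 *      struct sockaddr_in dst;
 *      socklen_t len = sizeof(dst);
 *      if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *              ...dst now holds the original destination...
 */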

static int kill_all(struct ip_conntrack *i, void *data)
{
        return 1;
}

void ip_conntrack_flush(void)
{
        ip_ct_iterate_cleanup(kill_all, NULL);
}

static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
{
        if (vmalloced)
                vfree(hash);
        else
                free_pages((unsigned long)hash,
                           get_order(sizeof(struct list_head) * size));
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;

        /* This makes sure all current packets have passed through
           the netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

        ip_ct_event_cache_flush();
 i_see_dead_people:
        ip_conntrack_flush();
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }
        /* wait until all references to ip_conntrack_untracked are dropped */
        while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
                schedule();

        kmem_cache_destroy(ip_conntrack_cachep);
        kmem_cache_destroy(ip_conntrack_expect_cachep);
        free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
                            ip_conntrack_htable_size);
        nf_unregister_sockopt(&so_getorigdst);
}

static struct list_head *alloc_hashtable(int size, int *vmalloced)
{
        struct list_head *hash;
        unsigned int i;

        *vmalloced = 0;
        hash = (void *)__get_free_pages(GFP_KERNEL,
                                        get_order(sizeof(struct list_head)
                                                  * size));
        if (!hash) {
                *vmalloced = 1;
                printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
                hash = vmalloc(sizeof(struct list_head) * size);
        }

        if (hash)
                for (i = 0; i < size; i++)
                        INIT_LIST_HEAD(&hash[i]);

        return hash;
}

static int set_hashsize(const char *val, struct kernel_param *kp)
{
        int i, bucket, hashsize, vmalloced;
        int old_vmalloced, old_size;
        int rnd;
        struct list_head *hash, *old_hash;
        struct ip_conntrack_tuple_hash *h;

        /* On boot, we can set this without any fancy locking. */
        if (!ip_conntrack_htable_size)
                return param_set_int(val, kp);

        hashsize = simple_strtol(val, NULL, 0);
        if (!hashsize)
                return -EINVAL;

        hash = alloc_hashtable(hashsize, &vmalloced);
        if (!hash)
                return -ENOMEM;

        /* We have to rehash for the new table anyway, so we can also
         * use a new random seed */
        get_random_bytes(&rnd, 4);

        write_lock_bh(&ip_conntrack_lock);
        for (i = 0; i < ip_conntrack_htable_size; i++) {
                while (!list_empty(&ip_conntrack_hash[i])) {
                        h = list_entry(ip_conntrack_hash[i].next,
                                       struct ip_conntrack_tuple_hash, list);
                        list_del(&h->list);
                        bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
                        list_add_tail(&h->list, &hash[bucket]);
                }
        }
        old_size = ip_conntrack_htable_size;
        old_vmalloced = ip_conntrack_vmalloc;
        old_hash = ip_conntrack_hash;

        ip_conntrack_htable_size = hashsize;
        ip_conntrack_vmalloc = vmalloced;
        ip_conntrack_hash = hash;
        ip_conntrack_hash_rnd = rnd;
        write_unlock_bh(&ip_conntrack_lock);

        free_conntrack_hash(old_hash, old_vmalloced, old_size);
        return 0;
}

module_param_call(hashsize, set_hashsize, param_get_uint,
                  &ip_conntrack_htable_size, 0600);
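
/*
 * Note: the 0600 mode plus the set_hashsize() handler above means the
 * bucket count can be given at load time ("modprobe ip_conntrack
 * hashsize=16384") and also rewritten on a live system through
 * /sys/module/ip_conntrack/parameters/hashsize, triggering a full
 * rehash under the write lock.
 */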

int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: a 32MB
         * machine has 256 buckets; >= 1GB machines have 8192 buckets. */
        if (!ip_conntrack_htable_size) {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;

        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
                                            &ip_conntrack_vmalloc);
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                0, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }

        ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
                                        sizeof(struct ip_conntrack_expect),
                                        0, 0, NULL, NULL);
        if (!ip_conntrack_expect_cachep) {
                printk(KERN_ERR "Unable to create ip_expect slab cache\n");
                goto err_free_conntrack_slab;
        }

        /* Don't NEED the lock here, but good form anyway. */
        write_lock_bh(&ip_conntrack_lock);
        for (i = 0; i < MAX_IP_CT_PROTO; i++)
                ip_ct_protos[i] = &ip_conntrack_generic_protocol;
        /* Sew in builtin protocols. */
        ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
        ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
        ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
        write_unlock_bh(&ip_conntrack_lock);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);

        return ret;

err_free_conntrack_slab:
        kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
        free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
                            ip_conntrack_htable_size);
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}