1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_min_delay              = 2 * HZ;
120 static int ip_rt_max_delay              = 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval            = 60 * HZ;
124 static int ip_rt_gc_min_interval        = HZ / 2;
125 static int ip_rt_redirect_number        = 9;
126 static int ip_rt_redirect_load          = HZ / 50;
127 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost             = HZ;
129 static int ip_rt_error_burst            = 5 * HZ;
130 static int ip_rt_gc_elasticity          = 8;
131 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu               = 512 + 20 + 20;
133 static int ip_rt_min_advmss             = 256;
134 static int ip_rt_secret_interval        = 10 * 60 * HZ;
135 static unsigned long rt_deadline;
136
137 #define RTprint(a...)   printk(KERN_DEBUG a)
138
139 static struct timer_list rt_flush_timer;
140 static void rt_check_expire(struct work_struct *work);
141 static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
142 static struct timer_list rt_secret_timer;
143
144 /*
145  *      Interface to generic destination cache.
146  */
147
148 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
149 static void              ipv4_dst_destroy(struct dst_entry *dst);
150 static void              ipv4_dst_ifdown(struct dst_entry *dst,
151                                          struct net_device *dev, int how);
152 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
153 static void              ipv4_link_failure(struct sk_buff *skb);
154 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
155 static int rt_garbage_collect(void);
156
157
158 static struct dst_ops ipv4_dst_ops = {
159         .family =               AF_INET,
160         .protocol =             __constant_htons(ETH_P_IP),
161         .gc =                   rt_garbage_collect,
162         .check =                ipv4_dst_check,
163         .destroy =              ipv4_dst_destroy,
164         .ifdown =               ipv4_dst_ifdown,
165         .negative_advice =      ipv4_negative_advice,
166         .link_failure =         ipv4_link_failure,
167         .update_pmtu =          ip_rt_update_pmtu,
168         .entry_size =           sizeof(struct rtable),
169 };
170
171 #define ECN_OR_COST(class)      TC_PRIO_##class
172
173 const __u8 ip_tos2prio[16] = {
174         TC_PRIO_BESTEFFORT,
175         ECN_OR_COST(FILLER),
176         TC_PRIO_BESTEFFORT,
177         ECN_OR_COST(BESTEFFORT),
178         TC_PRIO_BULK,
179         ECN_OR_COST(BULK),
180         TC_PRIO_BULK,
181         ECN_OR_COST(BULK),
182         TC_PRIO_INTERACTIVE,
183         ECN_OR_COST(INTERACTIVE),
184         TC_PRIO_INTERACTIVE,
185         ECN_OR_COST(INTERACTIVE),
186         TC_PRIO_INTERACTIVE_BULK,
187         ECN_OR_COST(INTERACTIVE_BULK),
188         TC_PRIO_INTERACTIVE_BULK,
189         ECN_OR_COST(INTERACTIVE_BULK)
190 };
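/*
 * Illustrative note (added for exposition): the table above is indexed by
 * the TOS bits of the IP header shifted down by one, roughly
 *
 *	skb->priority = ip_tos2prio[IPTOS_TOS(iph->tos) >> 1];
 *
 * (cf. the rt_tos2priority() helper in include/net/route.h), so e.g.
 * IPTOS_LOWDELAY (0x10) maps to TC_PRIO_INTERACTIVE and
 * IPTOS_THROUGHPUT (0x08) maps to TC_PRIO_BULK.
 */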
191
192
193 /*
194  * Route cache.
195  */
196
197 /* The locking scheme is rather straightforward:
198  *
199  * 1) Read-Copy Update protects the buckets of the central route hash.
200  * 2) Only writers remove entries, and they hold the lock
201  *    as they look at rtable reference counts.
202  * 3) Only readers acquire references to rtable entries,
203  *    they do so with atomic increments and with the
204  *    lock held.
205  */
206
207 struct rt_hash_bucket {
208         struct rtable   *chain;
209 };
210 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
211         defined(CONFIG_PROVE_LOCKING)
212 /*
213  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
214  * The size of this table is a power of two and depends on the number of CPUs.
215  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
216  */
217 #ifdef CONFIG_LOCKDEP
218 # define RT_HASH_LOCK_SZ        256
219 #else
220 # if NR_CPUS >= 32
221 #  define RT_HASH_LOCK_SZ       4096
222 # elif NR_CPUS >= 16
223 #  define RT_HASH_LOCK_SZ       2048
224 # elif NR_CPUS >= 8
225 #  define RT_HASH_LOCK_SZ       1024
226 # elif NR_CPUS >= 4
227 #  define RT_HASH_LOCK_SZ       512
228 # else
229 #  define RT_HASH_LOCK_SZ       256
230 # endif
231 #endif
232
233 static spinlock_t       *rt_hash_locks;
234 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
235 # define rt_hash_lock_init()    { \
236                 int i; \
237                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
238                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
239                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
240                         spin_lock_init(&rt_hash_locks[i]); \
241                 }
242 #else
243 # define rt_hash_lock_addr(slot) NULL
244 # define rt_hash_lock_init()
245 #endif
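/*
 * Illustrative sketch (added for exposition, not part of the original
 * code): a writer unlinking an entry takes only the spinlock covering its
 * bucket and defers the actual free to RCU, along the lines of
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->u.dst.rt_next;		(unlink under the bucket lock)
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 *	rt_free(rth);				(freed after a grace period)
 *
 * This is the pattern used by rt_del(), rt_check_expire() and
 * rt_garbage_collect() below, while readers walk the chains under
 * rcu_read_lock() without taking any bucket lock.
 */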
246
247 static struct rt_hash_bucket    *rt_hash_table;
248 static unsigned                 rt_hash_mask;
249 static unsigned int             rt_hash_log;
250 static unsigned int             rt_hash_rnd;
251
252 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
253 #define RT_CACHE_STAT_INC(field) \
254         (__raw_get_cpu_var(rt_cache_stat).field++)
255
256 static int rt_intern_hash(unsigned hash, struct rtable *rth,
257                                 struct rtable **res);
258
259 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
260 {
261         return (jhash_2words(daddr, saddr, rt_hash_rnd)
262                 & rt_hash_mask);
263 }
264
265 #define rt_hash(daddr, saddr, idx) \
266         rt_hash_code((__force u32)(__be32)(daddr),\
267                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
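/*
 * Illustrative sketch (added for exposition only, not part of the original
 * file): how a lockless reader walks one chain of the cache, in the same
 * style as the fast-path lookup in ip_route_input() further down.  The
 * helper name is hypothetical and the key comparison is simplified.
 */
static inline struct rtable *rt_cache_lookup_example(__be32 daddr,
						     __be32 saddr, int iif)
{
	unsigned hash = rt_hash(daddr, saddr, iif);
	struct rtable *rth;

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif && rth->fl.oif == 0) {
			/* Take a reference before leaving the RCU section. */
			dst_use(&rth->u.dst, jiffies);
			break;
		}
	}
	rcu_read_unlock();
	return rth;
}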
268
269 #ifdef CONFIG_PROC_FS
270 struct rt_cache_iter_state {
271         int bucket;
272 };
273
274 static struct rtable *rt_cache_get_first(struct seq_file *seq)
275 {
276         struct rtable *r = NULL;
277         struct rt_cache_iter_state *st = seq->private;
278
279         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
280                 rcu_read_lock_bh();
281                 r = rt_hash_table[st->bucket].chain;
282                 if (r)
283                         break;
284                 rcu_read_unlock_bh();
285         }
286         return r;
287 }
288
289 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
290 {
291         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
292
293         r = r->u.dst.rt_next;
294         while (!r) {
295                 rcu_read_unlock_bh();
296                 if (--st->bucket < 0)
297                         break;
298                 rcu_read_lock_bh();
299                 r = rt_hash_table[st->bucket].chain;
300         }
301         return r;
302 }
303
304 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
305 {
306         struct rtable *r = rt_cache_get_first(seq);
307
308         if (r)
309                 while (pos && (r = rt_cache_get_next(seq, r)))
310                         --pos;
311         return pos ? NULL : r;
312 }
313
314 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
315 {
316         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
317 }
318
319 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
320 {
321         struct rtable *r = NULL;
322
323         if (v == SEQ_START_TOKEN)
324                 r = rt_cache_get_first(seq);
325         else
326                 r = rt_cache_get_next(seq, v);
327         ++*pos;
328         return r;
329 }
330
331 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
332 {
333         if (v && v != SEQ_START_TOKEN)
334                 rcu_read_unlock_bh();
335 }
336
337 static int rt_cache_seq_show(struct seq_file *seq, void *v)
338 {
339         if (v == SEQ_START_TOKEN)
340                 seq_printf(seq, "%-127s\n",
341                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
342                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
343                            "HHUptod\tSpecDst");
344         else {
345                 struct rtable *r = v;
346                 char temp[256];
347
348                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
349                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
350                         r->u.dst.dev ? r->u.dst.dev->name : "*",
351                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
352                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
353                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
354                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
355                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
356                         dst_metric(&r->u.dst, RTAX_WINDOW),
357                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
358                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
359                         r->fl.fl4_tos,
360                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
361                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
362                                        dev_queue_xmit) : 0,
363                         r->rt_spec_dst);
364                 seq_printf(seq, "%-127s\n", temp);
365         }
366         return 0;
367 }
368
369 static const struct seq_operations rt_cache_seq_ops = {
370         .start  = rt_cache_seq_start,
371         .next   = rt_cache_seq_next,
372         .stop   = rt_cache_seq_stop,
373         .show   = rt_cache_seq_show,
374 };
375
376 static int rt_cache_seq_open(struct inode *inode, struct file *file)
377 {
378         return seq_open_private(file, &rt_cache_seq_ops,
379                         sizeof(struct rt_cache_iter_state));
380 }
381
382 static const struct file_operations rt_cache_seq_fops = {
383         .owner   = THIS_MODULE,
384         .open    = rt_cache_seq_open,
385         .read    = seq_read,
386         .llseek  = seq_lseek,
387         .release = seq_release_private,
388 };
389
390
391 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
392 {
393         int cpu;
394
395         if (*pos == 0)
396                 return SEQ_START_TOKEN;
397
398         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
399                 if (!cpu_possible(cpu))
400                         continue;
401                 *pos = cpu+1;
402                 return &per_cpu(rt_cache_stat, cpu);
403         }
404         return NULL;
405 }
406
407 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
408 {
409         int cpu;
410
411         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
412                 if (!cpu_possible(cpu))
413                         continue;
414                 *pos = cpu+1;
415                 return &per_cpu(rt_cache_stat, cpu);
416         }
417         return NULL;
418
419 }
420
421 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
422 {
423
424 }
425
426 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
427 {
428         struct rt_cache_stat *st = v;
429
430         if (v == SEQ_START_TOKEN) {
431                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
432                 return 0;
433         }
434
435         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
436                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
437                    atomic_read(&ipv4_dst_ops.entries),
438                    st->in_hit,
439                    st->in_slow_tot,
440                    st->in_slow_mc,
441                    st->in_no_route,
442                    st->in_brd,
443                    st->in_martian_dst,
444                    st->in_martian_src,
445
446                    st->out_hit,
447                    st->out_slow_tot,
448                    st->out_slow_mc,
449
450                    st->gc_total,
451                    st->gc_ignored,
452                    st->gc_goal_miss,
453                    st->gc_dst_overflow,
454                    st->in_hlist_search,
455                    st->out_hlist_search
456                 );
457         return 0;
458 }
459
460 static const struct seq_operations rt_cpu_seq_ops = {
461         .start  = rt_cpu_seq_start,
462         .next   = rt_cpu_seq_next,
463         .stop   = rt_cpu_seq_stop,
464         .show   = rt_cpu_seq_show,
465 };
466
467
468 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
469 {
470         return seq_open(file, &rt_cpu_seq_ops);
471 }
472
473 static const struct file_operations rt_cpu_seq_fops = {
474         .owner   = THIS_MODULE,
475         .open    = rt_cpu_seq_open,
476         .read    = seq_read,
477         .llseek  = seq_lseek,
478         .release = seq_release,
479 };
480
481 #endif /* CONFIG_PROC_FS */
482
483 static __inline__ void rt_free(struct rtable *rt)
484 {
485         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
486 }
487
488 static __inline__ void rt_drop(struct rtable *rt)
489 {
490         ip_rt_put(rt);
491         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
492 }
493
494 static __inline__ int rt_fast_clean(struct rtable *rth)
495 {
496         /* Kill broadcast/multicast entries very aggressively, if they
497            collide in the hash table with more useful entries */
498         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
499                 rth->fl.iif && rth->u.dst.rt_next;
500 }
501
502 static __inline__ int rt_valuable(struct rtable *rth)
503 {
504         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
505                 rth->u.dst.expires;
506 }
507
508 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
509 {
510         unsigned long age;
511         int ret = 0;
512
513         if (atomic_read(&rth->u.dst.__refcnt))
514                 goto out;
515
516         ret = 1;
517         if (rth->u.dst.expires &&
518             time_after_eq(jiffies, rth->u.dst.expires))
519                 goto out;
520
521         age = jiffies - rth->u.dst.lastuse;
522         ret = 0;
523         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
524             (age <= tmo2 && rt_valuable(rth)))
525                 goto out;
526         ret = 1;
527 out:    return ret;
528 }
529
530 /* Bits of score are:
531  * 31: very valuable
532  * 30: not quite useless
533  * 29..0: usage counter
534  */
535 static inline u32 rt_score(struct rtable *rt)
536 {
537         u32 score = jiffies - rt->u.dst.lastuse;
538
539         score = ~score & ~(3<<30);
540
541         if (rt_valuable(rt))
542                 score |= (1<<31);
543
544         if (!rt->fl.iif ||
545             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
546                 score |= (1<<30);
547
548         return score;
549 }
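/*
 * Worked example (illustrative): the base score is the bitwise complement
 * of the entry's idle time with the two top bits cleared, so older entries
 * score lower; bit 30 is then set for output routes and plain unicast
 * forwarding entries, and bit 31 for "valuable" (redirected, notify-flagged
 * or expiring) ones.  rt_intern_hash() below remembers the entry with the
 * lowest score in an overlong chain and evicts it, i.e. the oldest, least
 * valuable entry goes first.
 */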
550
551 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
552 {
553         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
554                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
555                 (fl1->mark ^ fl2->mark) |
556                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
557                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
558                 (fl1->oif ^ fl2->oif) |
559                 (fl1->iif ^ fl2->iif)) == 0;
560 }
561
562 static void rt_check_expire(struct work_struct *work)
563 {
564         static unsigned int rover;
565         unsigned int i = rover, goal;
566         struct rtable *rth, **rthp;
567         u64 mult;
568
569         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
570         if (ip_rt_gc_timeout > 1)
571                 do_div(mult, ip_rt_gc_timeout);
572         goal = (unsigned int)mult;
573         if (goal > rt_hash_mask)
574                 goal = rt_hash_mask + 1;
575         for (; goal > 0; goal--) {
576                 unsigned long tmo = ip_rt_gc_timeout;
577
578                 i = (i + 1) & rt_hash_mask;
579                 rthp = &rt_hash_table[i].chain;
580
581                 if (need_resched())
582                         cond_resched();
583
584                 if (*rthp == NULL)
585                         continue;
586                 spin_lock_bh(rt_hash_lock_addr(i));
587                 while ((rth = *rthp) != NULL) {
588                         if (rth->u.dst.expires) {
589                                 /* Entry is expired even if it is in use */
590                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
591                                         tmo >>= 1;
592                                         rthp = &rth->u.dst.rt_next;
593                                         continue;
594                                 }
595                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
596                                 tmo >>= 1;
597                                 rthp = &rth->u.dst.rt_next;
598                                 continue;
599                         }
600
601                         /* Clean up aged-off entries. */
602                         *rthp = rth->u.dst.rt_next;
603                         rt_free(rth);
604                 }
605                 spin_unlock_bh(rt_hash_lock_addr(i));
606         }
607         rover = i;
608         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
609 }
610
611 /* This can run from both BH and non-BH contexts, the latter
612  * in the case of a forced flush event.
613  */
614 static void rt_run_flush(unsigned long dummy)
615 {
616         int i;
617         struct rtable *rth, *next;
618
619         rt_deadline = 0;
620
621         get_random_bytes(&rt_hash_rnd, 4);
622
623         for (i = rt_hash_mask; i >= 0; i--) {
624                 spin_lock_bh(rt_hash_lock_addr(i));
625                 rth = rt_hash_table[i].chain;
626                 if (rth)
627                         rt_hash_table[i].chain = NULL;
628                 spin_unlock_bh(rt_hash_lock_addr(i));
629
630                 for (; rth; rth = next) {
631                         next = rth->u.dst.rt_next;
632                         rt_free(rth);
633                 }
634         }
635 }
636
637 static DEFINE_SPINLOCK(rt_flush_lock);
638
639 void rt_cache_flush(int delay)
640 {
641         unsigned long now = jiffies;
642         int user_mode = !in_softirq();
643
644         if (delay < 0)
645                 delay = ip_rt_min_delay;
646
647         spin_lock_bh(&rt_flush_lock);
648
649         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
650                 long tmo = (long)(rt_deadline - now);
651
652                 /* If the flush timer is already running
653                    and the flush request is not immediate (delay > 0):
654
655                    if the deadline has not been reached yet, prolong the timer to "delay",
656                    otherwise fire it at the deadline time.
657                  */
658
659                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
660                         tmo = 0;
661
662                 if (delay > tmo)
663                         delay = tmo;
664         }
665
666         if (delay <= 0) {
667                 spin_unlock_bh(&rt_flush_lock);
668                 rt_run_flush(0);
669                 return;
670         }
671
672         if (rt_deadline == 0)
673                 rt_deadline = now + ip_rt_max_delay;
674
675         mod_timer(&rt_flush_timer, now+delay);
676         spin_unlock_bh(&rt_flush_lock);
677 }
678
679 static void rt_secret_rebuild(unsigned long dummy)
680 {
681         unsigned long now = jiffies;
682
683         rt_cache_flush(0);
684         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
685 }
686
687 /*
688    Short description of GC goals.
689
690    We want to build an algorithm which keeps the routing cache
691    at an equilibrium point, where the number of aged-off entries
692    is approximately equal to the number of newly generated ones.
693
694    The current expiration strength is the variable "expire".
695    We try to adjust it dynamically, so that when the network is idle
696    "expire" is large enough to keep plenty of warm entries, and when
697    the load increases it shrinks to limit the cache size.
698  */
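/*
 * Worked example (illustrative): with a hash table of 2^17 buckets
 * (rt_hash_log == 17) and the default ip_rt_gc_elasticity of 8, the
 * collector below aims at no more than 8 << 17 = ~1M cached routes;
 * "goal" is simply the current entry count minus that figure, i.e. the
 * number of entries it would like to expire right now.
 */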
699
700 static int rt_garbage_collect(void)
701 {
702         static unsigned long expire = RT_GC_TIMEOUT;
703         static unsigned long last_gc;
704         static int rover;
705         static int equilibrium;
706         struct rtable *rth, **rthp;
707         unsigned long now = jiffies;
708         int goal;
709
710         /*
711          * Garbage collection is pretty expensive,
712          * do not run it too frequently.
713          */
714
715         RT_CACHE_STAT_INC(gc_total);
716
717         if (now - last_gc < ip_rt_gc_min_interval &&
718             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
719                 RT_CACHE_STAT_INC(gc_ignored);
720                 goto out;
721         }
722
723         /* Calculate the number of entries we want to expire now. */
724         goal = atomic_read(&ipv4_dst_ops.entries) -
725                 (ip_rt_gc_elasticity << rt_hash_log);
726         if (goal <= 0) {
727                 if (equilibrium < ipv4_dst_ops.gc_thresh)
728                         equilibrium = ipv4_dst_ops.gc_thresh;
729                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
730                 if (goal > 0) {
731                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
732                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
733                 }
734         } else {
735                 /* We are in a dangerous area. Try to reduce the cache really
736                  * aggressively.
737                  */
738                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
739                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
740         }
741
742         if (now - last_gc >= ip_rt_gc_min_interval)
743                 last_gc = now;
744
745         if (goal <= 0) {
746                 equilibrium += goal;
747                 goto work_done;
748         }
749
750         do {
751                 int i, k;
752
753                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
754                         unsigned long tmo = expire;
755
756                         k = (k + 1) & rt_hash_mask;
757                         rthp = &rt_hash_table[k].chain;
758                         spin_lock_bh(rt_hash_lock_addr(k));
759                         while ((rth = *rthp) != NULL) {
760                                 if (!rt_may_expire(rth, tmo, expire)) {
761                                         tmo >>= 1;
762                                         rthp = &rth->u.dst.rt_next;
763                                         continue;
764                                 }
765                                 *rthp = rth->u.dst.rt_next;
766                                 rt_free(rth);
767                                 goal--;
768                         }
769                         spin_unlock_bh(rt_hash_lock_addr(k));
770                         if (goal <= 0)
771                                 break;
772                 }
773                 rover = k;
774
775                 if (goal <= 0)
776                         goto work_done;
777
778                 /* The goal was not achieved. We stop the process if:
779
780                    - "expire" has been reduced to zero (otherwise it is halved),
781                    - the table is not full,
782                    - we are called from interrupt context.
783                    The jiffies check is just a fallback/debug loop breaker;
784                      we will not spin here for a long time in any case.
785                  */
786
787                 RT_CACHE_STAT_INC(gc_goal_miss);
788
789                 if (expire == 0)
790                         break;
791
792                 expire >>= 1;
793 #if RT_CACHE_DEBUG >= 2
794                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
795                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
796 #endif
797
798                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
799                         goto out;
800         } while (!in_softirq() && time_before_eq(jiffies, now));
801
802         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
803                 goto out;
804         if (net_ratelimit())
805                 printk(KERN_WARNING "dst cache overflow\n");
806         RT_CACHE_STAT_INC(gc_dst_overflow);
807         return 1;
808
809 work_done:
810         expire += ip_rt_gc_min_interval;
811         if (expire > ip_rt_gc_timeout ||
812             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
813                 expire = ip_rt_gc_timeout;
814 #if RT_CACHE_DEBUG >= 2
815         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
816                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
817 #endif
818 out:    return 0;
819 }
820
821 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
822 {
823         struct rtable   *rth, **rthp;
824         unsigned long   now;
825         struct rtable *cand, **candp;
826         u32             min_score;
827         int             chain_length;
828         int attempts = !in_softirq();
829
830 restart:
831         chain_length = 0;
832         min_score = ~(u32)0;
833         cand = NULL;
834         candp = NULL;
835         now = jiffies;
836
837         rthp = &rt_hash_table[hash].chain;
838
839         spin_lock_bh(rt_hash_lock_addr(hash));
840         while ((rth = *rthp) != NULL) {
841                 if (compare_keys(&rth->fl, &rt->fl)) {
842                         /* Put it first */
843                         *rthp = rth->u.dst.rt_next;
844                         /*
845                          * Since lookup is lockfree, the deletion
846                          * must be visible to another weakly ordered CPU before
847                          * the insertion at the start of the hash chain.
848                          */
849                         rcu_assign_pointer(rth->u.dst.rt_next,
850                                            rt_hash_table[hash].chain);
851                         /*
852                          * Since lookup is lockfree, the update writes
853                          * must be ordered for consistency on SMP.
854                          */
855                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
856
857                         dst_use(&rth->u.dst, now);
858                         spin_unlock_bh(rt_hash_lock_addr(hash));
859
860                         rt_drop(rt);
861                         *rp = rth;
862                         return 0;
863                 }
864
865                 if (!atomic_read(&rth->u.dst.__refcnt)) {
866                         u32 score = rt_score(rth);
867
868                         if (score <= min_score) {
869                                 cand = rth;
870                                 candp = rthp;
871                                 min_score = score;
872                         }
873                 }
874
875                 chain_length++;
876
877                 rthp = &rth->u.dst.rt_next;
878         }
879
880         if (cand) {
881                 /* ip_rt_gc_elasticity used to be the average chain length;
882                  * when it is exceeded, gc becomes really aggressive.
883                  *
884                  * The second limit is less certain. At the moment it allows
885                  * only 2 entries per bucket. We will see.
886                  */
887                 if (chain_length > ip_rt_gc_elasticity) {
888                         *candp = cand->u.dst.rt_next;
889                         rt_free(cand);
890                 }
891         }
892
893         /* Try to bind the route to an ARP neighbour only if it is an
894            output route or on the unicast forwarding path.
895          */
896         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
897                 int err = arp_bind_neighbour(&rt->u.dst);
898                 if (err) {
899                         spin_unlock_bh(rt_hash_lock_addr(hash));
900
901                         if (err != -ENOBUFS) {
902                                 rt_drop(rt);
903                                 return err;
904                         }
905
906                         /* The neighbour tables are full and nothing
907                            can be released. Try to shrink the route cache;
908                            it most likely holds some neighbour records.
909                          */
910                         if (attempts-- > 0) {
911                                 int saved_elasticity = ip_rt_gc_elasticity;
912                                 int saved_int = ip_rt_gc_min_interval;
913                                 ip_rt_gc_elasticity     = 1;
914                                 ip_rt_gc_min_interval   = 0;
915                                 rt_garbage_collect();
916                                 ip_rt_gc_min_interval   = saved_int;
917                                 ip_rt_gc_elasticity     = saved_elasticity;
918                                 goto restart;
919                         }
920
921                         if (net_ratelimit())
922                                 printk(KERN_WARNING "Neighbour table overflow.\n");
923                         rt_drop(rt);
924                         return -ENOBUFS;
925                 }
926         }
927
928         rt->u.dst.rt_next = rt_hash_table[hash].chain;
929 #if RT_CACHE_DEBUG >= 2
930         if (rt->u.dst.rt_next) {
931                 struct rtable *trt;
932                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
933                        NIPQUAD(rt->rt_dst));
934                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
935                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
936                 printk("\n");
937         }
938 #endif
939         rt_hash_table[hash].chain = rt;
940         spin_unlock_bh(rt_hash_lock_addr(hash));
941         *rp = rt;
942         return 0;
943 }
944
945 void rt_bind_peer(struct rtable *rt, int create)
946 {
947         static DEFINE_SPINLOCK(rt_peer_lock);
948         struct inet_peer *peer;
949
950         peer = inet_getpeer(rt->rt_dst, create);
951
952         spin_lock_bh(&rt_peer_lock);
953         if (rt->peer == NULL) {
954                 rt->peer = peer;
955                 peer = NULL;
956         }
957         spin_unlock_bh(&rt_peer_lock);
958         if (peer)
959                 inet_putpeer(peer);
960 }
961
962 /*
963  * Peer allocation may fail only in serious out-of-memory conditions.  However,
964  * we can still generate some output.
965  * Random ID selection looks a bit dangerous because we have no chance of
966  * selecting an ID that remains unique for a reasonable period of time.
967  * But a broken packet identifier may be better than no packet at all.
968  */
969 static void ip_select_fb_ident(struct iphdr *iph)
970 {
971         static DEFINE_SPINLOCK(ip_fb_id_lock);
972         static u32 ip_fallback_id;
973         u32 salt;
974
975         spin_lock_bh(&ip_fb_id_lock);
976         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
977         iph->id = htons(salt & 0xFFFF);
978         ip_fallback_id = salt;
979         spin_unlock_bh(&ip_fb_id_lock);
980 }
981
982 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
983 {
984         struct rtable *rt = (struct rtable *) dst;
985
986         if (rt) {
987                 if (rt->peer == NULL)
988                         rt_bind_peer(rt, 1);
989
990                 /* If a peer is attached to the destination, it is never detached,
991                    so we do not need to grab a lock to dereference it.
992                  */
993                 if (rt->peer) {
994                         iph->id = htons(inet_getid(rt->peer, more));
995                         return;
996                 }
997         } else
998                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
999                        __builtin_return_address(0));
1000
1001         ip_select_fb_ident(iph);
1002 }
1003
1004 static void rt_del(unsigned hash, struct rtable *rt)
1005 {
1006         struct rtable **rthp;
1007
1008         spin_lock_bh(rt_hash_lock_addr(hash));
1009         ip_rt_put(rt);
1010         for (rthp = &rt_hash_table[hash].chain; *rthp;
1011              rthp = &(*rthp)->u.dst.rt_next)
1012                 if (*rthp == rt) {
1013                         *rthp = rt->u.dst.rt_next;
1014                         rt_free(rt);
1015                         break;
1016                 }
1017         spin_unlock_bh(rt_hash_lock_addr(hash));
1018 }
1019
1020 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1021                     __be32 saddr, struct net_device *dev)
1022 {
1023         int i, k;
1024         struct in_device *in_dev = in_dev_get(dev);
1025         struct rtable *rth, **rthp;
1026         __be32  skeys[2] = { saddr, 0 };
1027         int  ikeys[2] = { dev->ifindex, 0 };
1028         struct netevent_redirect netevent;
1029
1030         if (!in_dev)
1031                 return;
1032
1033         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1034             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1035                 goto reject_redirect;
1036
1037         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1038                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1039                         goto reject_redirect;
1040                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1041                         goto reject_redirect;
1042         } else {
1043                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1044                         goto reject_redirect;
1045         }
1046
1047         for (i = 0; i < 2; i++) {
1048                 for (k = 0; k < 2; k++) {
1049                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1050
1051                         rthp=&rt_hash_table[hash].chain;
1052
1053                         rcu_read_lock();
1054                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1055                                 struct rtable *rt;
1056
1057                                 if (rth->fl.fl4_dst != daddr ||
1058                                     rth->fl.fl4_src != skeys[i] ||
1059                                     rth->fl.oif != ikeys[k] ||
1060                                     rth->fl.iif != 0) {
1061                                         rthp = &rth->u.dst.rt_next;
1062                                         continue;
1063                                 }
1064
1065                                 if (rth->rt_dst != daddr ||
1066                                     rth->rt_src != saddr ||
1067                                     rth->u.dst.error ||
1068                                     rth->rt_gateway != old_gw ||
1069                                     rth->u.dst.dev != dev)
1070                                         break;
1071
1072                                 dst_hold(&rth->u.dst);
1073                                 rcu_read_unlock();
1074
1075                                 rt = dst_alloc(&ipv4_dst_ops);
1076                                 if (rt == NULL) {
1077                                         ip_rt_put(rth);
1078                                         in_dev_put(in_dev);
1079                                         return;
1080                                 }
1081
1082                                 /* Copy all the information. */
1083                                 *rt = *rth;
1084                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1085                                 rt->u.dst.__use         = 1;
1086                                 atomic_set(&rt->u.dst.__refcnt, 1);
1087                                 rt->u.dst.child         = NULL;
1088                                 if (rt->u.dst.dev)
1089                                         dev_hold(rt->u.dst.dev);
1090                                 if (rt->idev)
1091                                         in_dev_hold(rt->idev);
1092                                 rt->u.dst.obsolete      = 0;
1093                                 rt->u.dst.lastuse       = jiffies;
1094                                 rt->u.dst.path          = &rt->u.dst;
1095                                 rt->u.dst.neighbour     = NULL;
1096                                 rt->u.dst.hh            = NULL;
1097                                 rt->u.dst.xfrm          = NULL;
1098
1099                                 rt->rt_flags            |= RTCF_REDIRECTED;
1100
1101                                 /* Gateway is different ... */
1102                                 rt->rt_gateway          = new_gw;
1103
1104                                 /* Redirect received -> path was valid */
1105                                 dst_confirm(&rth->u.dst);
1106
1107                                 if (rt->peer)
1108                                         atomic_inc(&rt->peer->refcnt);
1109
1110                                 if (arp_bind_neighbour(&rt->u.dst) ||
1111                                     !(rt->u.dst.neighbour->nud_state &
1112                                             NUD_VALID)) {
1113                                         if (rt->u.dst.neighbour)
1114                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1115                                         ip_rt_put(rth);
1116                                         rt_drop(rt);
1117                                         goto do_next;
1118                                 }
1119
1120                                 netevent.old = &rth->u.dst;
1121                                 netevent.new = &rt->u.dst;
1122                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1123                                                         &netevent);
1124
1125                                 rt_del(hash, rth);
1126                                 if (!rt_intern_hash(hash, rt, &rt))
1127                                         ip_rt_put(rt);
1128                                 goto do_next;
1129                         }
1130                         rcu_read_unlock();
1131                 do_next:
1132                         ;
1133                 }
1134         }
1135         in_dev_put(in_dev);
1136         return;
1137
1138 reject_redirect:
1139 #ifdef CONFIG_IP_ROUTE_VERBOSE
1140         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1141                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1142                         "%u.%u.%u.%u ignored.\n"
1143                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1144                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1145                        NIPQUAD(saddr), NIPQUAD(daddr));
1146 #endif
1147         in_dev_put(in_dev);
1148 }
1149
1150 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1151 {
1152         struct rtable *rt = (struct rtable*)dst;
1153         struct dst_entry *ret = dst;
1154
1155         if (rt) {
1156                 if (dst->obsolete) {
1157                         ip_rt_put(rt);
1158                         ret = NULL;
1159                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1160                            rt->u.dst.expires) {
1161                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1162                                                 rt->fl.oif);
1163 #if RT_CACHE_DEBUG >= 1
1164                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1165                                           "%u.%u.%u.%u/%02x dropped\n",
1166                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1167 #endif
1168                         rt_del(hash, rt);
1169                         ret = NULL;
1170                 }
1171         }
1172         return ret;
1173 }
1174
1175 /*
1176  * Algorithm:
1177  *      1. The first ip_rt_redirect_number redirects are sent
1178  *         with exponential backoff, then we stop sending them at all,
1179  *         assuming that the host ignores our redirects.
1180  *      2. If we did not see packets requiring redirects
1181  *         during ip_rt_redirect_silence, we assume that the host
1182  *         forgot the redirected route and we start sending redirects again.
1183  *
1184  * This algorithm is much cheaper and more intelligent than dumb load limiting
1185  * in icmp.c.
1186  *
1187  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1188  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1189  */
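/*
 * Worked example (illustrative): with HZ == 1000 the defaults at the top of
 * this file give ip_rt_redirect_load == 20 jiffies, so after the first
 * redirect each further one is held back by ip_rt_redirect_load <<
 * rate_tokens, i.e. the interval doubles every time (40ms, 80ms, 160ms, ...).
 * Once ip_rt_redirect_number == 9 redirects have been ignored we go silent,
 * and only after ip_rt_redirect_silence == 20 << 10 jiffies (about 20
 * seconds) without such traffic is the counter reset and the cycle restarted.
 */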
1190
1191 void ip_rt_send_redirect(struct sk_buff *skb)
1192 {
1193         struct rtable *rt = (struct rtable*)skb->dst;
1194         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1195
1196         if (!in_dev)
1197                 return;
1198
1199         if (!IN_DEV_TX_REDIRECTS(in_dev))
1200                 goto out;
1201
1202         /* No redirected packets during ip_rt_redirect_silence;
1203          * reset the algorithm.
1204          */
1205         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1206                 rt->u.dst.rate_tokens = 0;
1207
1208         /* Too many ignored redirects; do not send anything, just
1209          * set u.dst.rate_last to the time of the last seen redirected packet.
1210          */
1211         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1212                 rt->u.dst.rate_last = jiffies;
1213                 goto out;
1214         }
1215
1216         /* Check for load limit; set rate_last to the latest sent
1217          * redirect.
1218          */
1219         if (rt->u.dst.rate_tokens == 0 ||
1220             time_after(jiffies,
1221                        (rt->u.dst.rate_last +
1222                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1223                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1224                 rt->u.dst.rate_last = jiffies;
1225                 ++rt->u.dst.rate_tokens;
1226 #ifdef CONFIG_IP_ROUTE_VERBOSE
1227                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1228                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1229                     net_ratelimit())
1230                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1231                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1232                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1233                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1234 #endif
1235         }
1236 out:
1237         in_dev_put(in_dev);
1238 }
1239
1240 static int ip_error(struct sk_buff *skb)
1241 {
1242         struct rtable *rt = (struct rtable*)skb->dst;
1243         unsigned long now;
1244         int code;
1245
1246         switch (rt->u.dst.error) {
1247                 case EINVAL:
1248                 default:
1249                         goto out;
1250                 case EHOSTUNREACH:
1251                         code = ICMP_HOST_UNREACH;
1252                         break;
1253                 case ENETUNREACH:
1254                         code = ICMP_NET_UNREACH;
1255                         break;
1256                 case EACCES:
1257                         code = ICMP_PKT_FILTERED;
1258                         break;
1259         }
1260
1261         now = jiffies;
1262         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1263         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1264                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1265         rt->u.dst.rate_last = now;
1266         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1267                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1268                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1269         }
1270
1271 out:    kfree_skb(skb);
1272         return 0;
1273 }
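/*
 * Worked example (illustrative): ip_error() above implements a classic
 * token bucket.  Tokens accrue at one per jiffy since the last error and
 * are capped at ip_rt_error_burst (5 * HZ); each ICMP destination
 * unreachable sent costs ip_rt_error_cost (HZ) tokens.  So at most five
 * errors can go out back to back, after which the rate settles to roughly
 * one per second.
 */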
1274
1275 /*
1276  *      The last two values are not from the RFC but
1277  *      are needed for AMPRnet AX.25 paths.
1278  */
1279
1280 static const unsigned short mtu_plateau[] =
1281 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1282
1283 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1284 {
1285         int i;
1286
1287         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1288                 if (old_mtu > mtu_plateau[i])
1289                         return mtu_plateau[i];
1290         return 68;
1291 }
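/*
 * Worked example (illustrative): guess_mtu() returns the first plateau
 * strictly below the old packet size, so when a router reports a missing
 * or implausible next-hop MTU for a 1500-byte packet we fall back to 1492,
 * for a 576-byte packet to 296, and ultimately to the IPv4 minimum of 68.
 */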
1292
1293 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1294 {
1295         int i;
1296         unsigned short old_mtu = ntohs(iph->tot_len);
1297         struct rtable *rth;
1298         __be32  skeys[2] = { iph->saddr, 0, };
1299         __be32  daddr = iph->daddr;
1300         unsigned short est_mtu = 0;
1301
1302         if (ipv4_config.no_pmtu_disc)
1303                 return 0;
1304
1305         for (i = 0; i < 2; i++) {
1306                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1307
1308                 rcu_read_lock();
1309                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1310                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1311                         if (rth->fl.fl4_dst == daddr &&
1312                             rth->fl.fl4_src == skeys[i] &&
1313                             rth->rt_dst  == daddr &&
1314                             rth->rt_src  == iph->saddr &&
1315                             rth->fl.iif == 0 &&
1316                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1317                                 unsigned short mtu = new_mtu;
1318
1319                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1320
1321                                         /* BSD 4.2 compatibility hack :-( */
1322                                         if (mtu == 0 &&
1323                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1324                                             old_mtu >= 68 + (iph->ihl << 2))
1325                                                 old_mtu -= iph->ihl << 2;
1326
1327                                         mtu = guess_mtu(old_mtu);
1328                                 }
1329                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1330                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1331                                                 dst_confirm(&rth->u.dst);
1332                                                 if (mtu < ip_rt_min_pmtu) {
1333                                                         mtu = ip_rt_min_pmtu;
1334                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1335                                                                 (1 << RTAX_MTU);
1336                                                 }
1337                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1338                                                 dst_set_expires(&rth->u.dst,
1339                                                         ip_rt_mtu_expires);
1340                                         }
1341                                         est_mtu = mtu;
1342                                 }
1343                         }
1344                 }
1345                 rcu_read_unlock();
1346         }
1347         return est_mtu ? : new_mtu;
1348 }
1349
1350 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1351 {
1352         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1353             !(dst_metric_locked(dst, RTAX_MTU))) {
1354                 if (mtu < ip_rt_min_pmtu) {
1355                         mtu = ip_rt_min_pmtu;
1356                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1357                 }
1358                 dst->metrics[RTAX_MTU-1] = mtu;
1359                 dst_set_expires(dst, ip_rt_mtu_expires);
1360                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1361         }
1362 }
1363
1364 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1365 {
1366         return NULL;
1367 }
1368
1369 static void ipv4_dst_destroy(struct dst_entry *dst)
1370 {
1371         struct rtable *rt = (struct rtable *) dst;
1372         struct inet_peer *peer = rt->peer;
1373         struct in_device *idev = rt->idev;
1374
1375         if (peer) {
1376                 rt->peer = NULL;
1377                 inet_putpeer(peer);
1378         }
1379
1380         if (idev) {
1381                 rt->idev = NULL;
1382                 in_dev_put(idev);
1383         }
1384 }
1385
1386 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1387                             int how)
1388 {
1389         struct rtable *rt = (struct rtable *) dst;
1390         struct in_device *idev = rt->idev;
1391         if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1392                 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1393                 if (loopback_idev) {
1394                         rt->idev = loopback_idev;
1395                         in_dev_put(idev);
1396                 }
1397         }
1398 }
1399
1400 static void ipv4_link_failure(struct sk_buff *skb)
1401 {
1402         struct rtable *rt;
1403
1404         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1405
1406         rt = (struct rtable *) skb->dst;
1407         if (rt)
1408                 dst_set_expires(&rt->u.dst, 0);
1409 }
1410
1411 static int ip_rt_bug(struct sk_buff *skb)
1412 {
1413         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1414                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1415                 skb->dev ? skb->dev->name : "?");
1416         kfree_skb(skb);
1417         return 0;
1418 }
1419
1420 /*
1421    We do not cache the source address of the outgoing interface,
1422    because it is used only by the IP RR, TS and SRR options,
1423    so it is out of the fast path.
1424
1425    BTW remember: "addr" is allowed to be unaligned
1426    in IP options!
1427  */
1428
1429 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1430 {
1431         __be32 src;
1432         struct fib_result res;
1433
1434         if (rt->fl.iif == 0)
1435                 src = rt->rt_src;
1436         else if (fib_lookup(&rt->fl, &res) == 0) {
1437                 src = FIB_RES_PREFSRC(res);
1438                 fib_res_put(&res);
1439         } else
1440                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1441                                         RT_SCOPE_UNIVERSE);
1442         memcpy(addr, &src, 4);
1443 }
1444
1445 #ifdef CONFIG_NET_CLS_ROUTE
1446 static void set_class_tag(struct rtable *rt, u32 tag)
1447 {
1448         if (!(rt->u.dst.tclassid & 0xFFFF))
1449                 rt->u.dst.tclassid |= tag & 0xFFFF;
1450         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1451                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1452 }
1453 #endif
1454
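/*
 * Copy nexthop information from the FIB result into a new cache entry:
 * gateway, metrics and (optionally) the tclassid.  Metrics the route did
 * not specify are filled with defaults derived from the output device
 * (MTU, default TTL, advmss = MTU - 40 bounded by ip_rt_min_advmss).
 */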
1455 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1456 {
1457         struct fib_info *fi = res->fi;
1458
1459         if (fi) {
1460                 if (FIB_RES_GW(*res) &&
1461                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1462                         rt->rt_gateway = FIB_RES_GW(*res);
1463                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1464                        sizeof(rt->u.dst.metrics));
1465                 if (fi->fib_mtu == 0) {
1466                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1467                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1468                             rt->rt_gateway != rt->rt_dst &&
1469                             rt->u.dst.dev->mtu > 576)
1470                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1471                 }
1472 #ifdef CONFIG_NET_CLS_ROUTE
1473                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1474 #endif
1475         } else
1476                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1477
1478         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1479                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1480         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1481                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1482         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1483                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1484                                        ip_rt_min_advmss);
1485         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1486                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1487
1488 #ifdef CONFIG_NET_CLS_ROUTE
1489 #ifdef CONFIG_IP_MULTIPLE_TABLES
1490         set_class_tag(rt, fib_rules_tclass(res));
1491 #endif
1492         set_class_tag(rt, itag);
1493 #endif
1494         rt->rt_type = res->type;
1495 }
1496
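/*
 * Build a route cache entry for an incoming multicast packet.  Input is
 * ip_local_deliver() for groups we are a member of ("our"), and
 * ip_mr_input() when multicast forwarding is configured for non-local
 * groups.
 */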
1497 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1498                                 u8 tos, struct net_device *dev, int our)
1499 {
1500         unsigned hash;
1501         struct rtable *rth;
1502         __be32 spec_dst;
1503         struct in_device *in_dev = in_dev_get(dev);
1504         u32 itag = 0;
1505
1506         /* Primary sanity checks. */
1507
1508         if (in_dev == NULL)
1509                 return -EINVAL;
1510
1511         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1512             skb->protocol != htons(ETH_P_IP))
1513                 goto e_inval;
1514
1515         if (ZERONET(saddr)) {
1516                 if (!LOCAL_MCAST(daddr))
1517                         goto e_inval;
1518                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1519         } else if (fib_validate_source(saddr, 0, tos, 0,
1520                                         dev, &spec_dst, &itag) < 0)
1521                 goto e_inval;
1522
1523         rth = dst_alloc(&ipv4_dst_ops);
1524         if (!rth)
1525                 goto e_nobufs;
1526
1527         rth->u.dst.output= ip_rt_bug;
1528
1529         atomic_set(&rth->u.dst.__refcnt, 1);
1530         rth->u.dst.flags= DST_HOST;
1531         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1532                 rth->u.dst.flags |= DST_NOPOLICY;
1533         rth->fl.fl4_dst = daddr;
1534         rth->rt_dst     = daddr;
1535         rth->fl.fl4_tos = tos;
1536         rth->fl.mark    = skb->mark;
1537         rth->fl.fl4_src = saddr;
1538         rth->rt_src     = saddr;
1539 #ifdef CONFIG_NET_CLS_ROUTE
1540         rth->u.dst.tclassid = itag;
1541 #endif
1542         rth->rt_iif     =
1543         rth->fl.iif     = dev->ifindex;
1544         rth->u.dst.dev  = init_net.loopback_dev;
1545         dev_hold(rth->u.dst.dev);
1546         rth->idev       = in_dev_get(rth->u.dst.dev);
1547         rth->fl.oif     = 0;
1548         rth->rt_gateway = daddr;
1549         rth->rt_spec_dst= spec_dst;
1550         rth->rt_type    = RTN_MULTICAST;
1551         rth->rt_flags   = RTCF_MULTICAST;
1552         if (our) {
1553                 rth->u.dst.input= ip_local_deliver;
1554                 rth->rt_flags |= RTCF_LOCAL;
1555         }
1556
1557 #ifdef CONFIG_IP_MROUTE
1558         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1559                 rth->u.dst.input = ip_mr_input;
1560 #endif
1561         RT_CACHE_STAT_INC(in_slow_mc);
1562
1563         in_dev_put(in_dev);
1564         hash = rt_hash(daddr, saddr, dev->ifindex);
1565         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1566
1567 e_nobufs:
1568         in_dev_put(in_dev);
1569         return -ENOBUFS;
1570
1571 e_inval:
1572         in_dev_put(in_dev);
1573         return -EINVAL;
1574 }
1575
1576
1577 static void ip_handle_martian_source(struct net_device *dev,
1578                                      struct in_device *in_dev,
1579                                      struct sk_buff *skb,
1580                                      __be32 daddr,
1581                                      __be32 saddr)
1582 {
1583         RT_CACHE_STAT_INC(in_martian_src);
1584 #ifdef CONFIG_IP_ROUTE_VERBOSE
1585         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1586                 /*
1587                  *      RFC1812 recommendation: if the source is martian,
1588                  *      the only hint is the MAC header.
1589                  */
1590                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1591                         "%u.%u.%u.%u, on dev %s\n",
1592                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1593                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1594                         int i;
1595                         const unsigned char *p = skb_mac_header(skb);
1596                         printk(KERN_WARNING "ll header: ");
1597                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1598                                 printk("%02x", *p);
1599                                 if (i < (dev->hard_header_len - 1))
1600                                         printk(":");
1601                         }
1602                         printk("\n");
1603                 }
1604         }
1605 #endif
1606 }
1607
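/*
 * Construct the forwarding cache entry for ip_route_input_slow():
 * validate the source address against the FIB, decide whether an ICMP
 * redirect should later be sent (RTCF_DOREDIRECT), and wire dst.input
 * and dst.output to ip_forward()/ip_output().
 */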
1608 static inline int __mkroute_input(struct sk_buff *skb,
1609                                   struct fib_result* res,
1610                                   struct in_device *in_dev,
1611                                   __be32 daddr, __be32 saddr, u32 tos,
1612                                   struct rtable **result)
1613 {
1614
1615         struct rtable *rth;
1616         int err;
1617         struct in_device *out_dev;
1618         unsigned flags = 0;
1619         __be32 spec_dst;
1620         u32 itag;
1621
1622         /* get a working reference to the output device */
1623         out_dev = in_dev_get(FIB_RES_DEV(*res));
1624         if (out_dev == NULL) {
1625                 if (net_ratelimit())
1626                         printk(KERN_CRIT "Bug in ip_route_input" \
1627                                "_slow(). Please, report\n");
1628                 return -EINVAL;
1629         }
1630
1631
1632         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1633                                   in_dev->dev, &spec_dst, &itag);
1634         if (err < 0) {
1635                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1636                                          saddr);
1637
1638                 err = -EINVAL;
1639                 goto cleanup;
1640         }
1641
1642         if (err)
1643                 flags |= RTCF_DIRECTSRC;
1644
1645         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1646             (IN_DEV_SHARED_MEDIA(out_dev) ||
1647              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1648                 flags |= RTCF_DOREDIRECT;
1649
1650         if (skb->protocol != htons(ETH_P_IP)) {
1651                 /* Not IP (i.e. ARP). Do not create a route if it is
1652                  * invalid for proxy arp. DNAT routes are always valid.
1653                  */
1654                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1655                         err = -EINVAL;
1656                         goto cleanup;
1657                 }
1658         }
1659
1660
1661         rth = dst_alloc(&ipv4_dst_ops);
1662         if (!rth) {
1663                 err = -ENOBUFS;
1664                 goto cleanup;
1665         }
1666
1667         atomic_set(&rth->u.dst.__refcnt, 1);
1668         rth->u.dst.flags= DST_HOST;
1669         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1670                 rth->u.dst.flags |= DST_NOPOLICY;
1671         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1672                 rth->u.dst.flags |= DST_NOXFRM;
1673         rth->fl.fl4_dst = daddr;
1674         rth->rt_dst     = daddr;
1675         rth->fl.fl4_tos = tos;
1676         rth->fl.mark    = skb->mark;
1677         rth->fl.fl4_src = saddr;
1678         rth->rt_src     = saddr;
1679         rth->rt_gateway = daddr;
1680         rth->rt_iif     =
1681                 rth->fl.iif     = in_dev->dev->ifindex;
1682         rth->u.dst.dev  = (out_dev)->dev;
1683         dev_hold(rth->u.dst.dev);
1684         rth->idev       = in_dev_get(rth->u.dst.dev);
1685         rth->fl.oif     = 0;
1686         rth->rt_spec_dst= spec_dst;
1687
1688         rth->u.dst.input = ip_forward;
1689         rth->u.dst.output = ip_output;
1690
1691         rt_set_nexthop(rth, res, itag);
1692
1693         rth->rt_flags = flags;
1694
1695         *result = rth;
1696         err = 0;
1697  cleanup:
1698         /* release the working reference to the output device */
1699         in_dev_put(out_dev);
1700         return err;
1701 }
1702
1703 static inline int ip_mkroute_input(struct sk_buff *skb,
1704                                    struct fib_result* res,
1705                                    const struct flowi *fl,
1706                                    struct in_device *in_dev,
1707                                    __be32 daddr, __be32 saddr, u32 tos)
1708 {
1709         struct rtable* rth = NULL;
1710         int err;
1711         unsigned hash;
1712
1713 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1714         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1715                 fib_select_multipath(fl, res);
1716 #endif
1717
1718         /* create a routing cache entry */
1719         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1720         if (err)
1721                 return err;
1722
1723         /* put it into the cache */
1724         hash = rt_hash(daddr, saddr, fl->iif);
1725         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1726 }
1727
1728 /*
1729  *      NOTE. We drop all packets that have a local source
1730  *      address, because every properly looped-back packet
1731  *      must already have the correct destination attached by the output routine.
1732  *
1733  *      This approach solves two big problems:
1734  *      1. Non-simplex devices are handled properly.
1735  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1736  */
1737
1738 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1739                                u8 tos, struct net_device *dev)
1740 {
1741         struct fib_result res;
1742         struct in_device *in_dev = in_dev_get(dev);
1743         struct flowi fl = { .nl_u = { .ip4_u =
1744                                       { .daddr = daddr,
1745                                         .saddr = saddr,
1746                                         .tos = tos,
1747                                         .scope = RT_SCOPE_UNIVERSE,
1748                                       } },
1749                             .mark = skb->mark,
1750                             .iif = dev->ifindex };
1751         unsigned        flags = 0;
1752         u32             itag = 0;
1753         struct rtable * rth;
1754         unsigned        hash;
1755         __be32          spec_dst;
1756         int             err = -EINVAL;
1757         int             free_res = 0;
1758
1759         /* IP on this device is disabled. */
1760
1761         if (!in_dev)
1762                 goto out;
1763
1764         /* Check for the most weird martians, which cannot be detected
1765            by fib_lookup.
1766          */
1767
1768         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1769                 goto martian_source;
1770
1771         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1772                 goto brd_input;
1773
1774         /* Accept a zero source address only for limited broadcast;
1775          * I do not even know whether to fix this or not. Waiting for complaints :-)
1776          */
1777         if (ZERONET(saddr))
1778                 goto martian_source;
1779
1780         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1781                 goto martian_destination;
1782
1783         /*
1784          *      Now we are ready to route packet.
1785          */
1786         if ((err = fib_lookup(&fl, &res)) != 0) {
1787                 if (!IN_DEV_FORWARD(in_dev))
1788                         goto e_hostunreach;
1789                 goto no_route;
1790         }
1791         free_res = 1;
1792
1793         RT_CACHE_STAT_INC(in_slow_tot);
1794
1795         if (res.type == RTN_BROADCAST)
1796                 goto brd_input;
1797
1798         if (res.type == RTN_LOCAL) {
1799                 int result;
1800                 result = fib_validate_source(saddr, daddr, tos,
1801                                              init_net.loopback_dev->ifindex,
1802                                              dev, &spec_dst, &itag);
1803                 if (result < 0)
1804                         goto martian_source;
1805                 if (result)
1806                         flags |= RTCF_DIRECTSRC;
1807                 spec_dst = daddr;
1808                 goto local_input;
1809         }
1810
1811         if (!IN_DEV_FORWARD(in_dev))
1812                 goto e_hostunreach;
1813         if (res.type != RTN_UNICAST)
1814                 goto martian_destination;
1815
1816         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1817 done:
1818         in_dev_put(in_dev);
1819         if (free_res)
1820                 fib_res_put(&res);
1821 out:    return err;
1822
1823 brd_input:
1824         if (skb->protocol != htons(ETH_P_IP))
1825                 goto e_inval;
1826
1827         if (ZERONET(saddr))
1828                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1829         else {
1830                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1831                                           &itag);
1832                 if (err < 0)
1833                         goto martian_source;
1834                 if (err)
1835                         flags |= RTCF_DIRECTSRC;
1836         }
1837         flags |= RTCF_BROADCAST;
1838         res.type = RTN_BROADCAST;
1839         RT_CACHE_STAT_INC(in_brd);
1840
1841 local_input:
1842         rth = dst_alloc(&ipv4_dst_ops);
1843         if (!rth)
1844                 goto e_nobufs;
1845
1846         rth->u.dst.output= ip_rt_bug;
1847
1848         atomic_set(&rth->u.dst.__refcnt, 1);
1849         rth->u.dst.flags= DST_HOST;
1850         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1851                 rth->u.dst.flags |= DST_NOPOLICY;
1852         rth->fl.fl4_dst = daddr;
1853         rth->rt_dst     = daddr;
1854         rth->fl.fl4_tos = tos;
1855         rth->fl.mark    = skb->mark;
1856         rth->fl.fl4_src = saddr;
1857         rth->rt_src     = saddr;
1858 #ifdef CONFIG_NET_CLS_ROUTE
1859         rth->u.dst.tclassid = itag;
1860 #endif
1861         rth->rt_iif     =
1862         rth->fl.iif     = dev->ifindex;
1863         rth->u.dst.dev  = init_net.loopback_dev;
1864         dev_hold(rth->u.dst.dev);
1865         rth->idev       = in_dev_get(rth->u.dst.dev);
1866         rth->rt_gateway = daddr;
1867         rth->rt_spec_dst= spec_dst;
1868         rth->u.dst.input= ip_local_deliver;
1869         rth->rt_flags   = flags|RTCF_LOCAL;
1870         if (res.type == RTN_UNREACHABLE) {
1871                 rth->u.dst.input= ip_error;
1872                 rth->u.dst.error= -err;
1873                 rth->rt_flags   &= ~RTCF_LOCAL;
1874         }
1875         rth->rt_type    = res.type;
1876         hash = rt_hash(daddr, saddr, fl.iif);
1877         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1878         goto done;
1879
1880 no_route:
1881         RT_CACHE_STAT_INC(in_no_route);
1882         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1883         res.type = RTN_UNREACHABLE;
1884         goto local_input;
1885
1886         /*
1887          *      Do not cache martian addresses: they should be logged (RFC1812)
1888          */
1889 martian_destination:
1890         RT_CACHE_STAT_INC(in_martian_dst);
1891 #ifdef CONFIG_IP_ROUTE_VERBOSE
1892         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1893                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1894                         "%u.%u.%u.%u, dev %s\n",
1895                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1896 #endif
1897
1898 e_hostunreach:
1899         err = -EHOSTUNREACH;
1900         goto done;
1901
1902 e_inval:
1903         err = -EINVAL;
1904         goto done;
1905
1906 e_nobufs:
1907         err = -ENOBUFS;
1908         goto done;
1909
1910 martian_source:
1911         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1912         goto e_inval;
1913 }
1914
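/*
 * Input routing fast path: hash (daddr, saddr, iif), scan the cache
 * chain under RCU and reuse a matching entry; otherwise fall back to
 * multicast handling or ip_route_input_slow().
 */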
1915 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1916                    u8 tos, struct net_device *dev)
1917 {
1918         struct rtable * rth;
1919         unsigned        hash;
1920         int iif = dev->ifindex;
1921
1922         tos &= IPTOS_RT_MASK;
1923         hash = rt_hash(daddr, saddr, iif);
1924
1925         rcu_read_lock();
1926         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1927              rth = rcu_dereference(rth->u.dst.rt_next)) {
1928                 if (rth->fl.fl4_dst == daddr &&
1929                     rth->fl.fl4_src == saddr &&
1930                     rth->fl.iif == iif &&
1931                     rth->fl.oif == 0 &&
1932                     rth->fl.mark == skb->mark &&
1933                     rth->fl.fl4_tos == tos) {
1934                         dst_use(&rth->u.dst, jiffies);
1935                         RT_CACHE_STAT_INC(in_hit);
1936                         rcu_read_unlock();
1937                         skb->dst = (struct dst_entry*)rth;
1938                         return 0;
1939                 }
1940                 RT_CACHE_STAT_INC(in_hlist_search);
1941         }
1942         rcu_read_unlock();
1943
1944         /* Multicast recognition logic has been moved from the route cache to here.
1945            The problem was that too many Ethernet cards have broken/missing
1946            hardware multicast filters :-( As a result, a host on a multicast
1947            network acquires a lot of useless route cache entries, e.g. for
1948            SDR messages from all over the world. Now we try to get rid of them.
1949            Really, provided the software IP multicast filter is organized
1950            reasonably (at least, hashed), this does not result in a slowdown
1951            compared with route cache reject entries.
1952            Note that multicast routers are not affected, because a
1953            route cache entry is created eventually.
1954          */
1955         if (MULTICAST(daddr)) {
1956                 struct in_device *in_dev;
1957
1958                 rcu_read_lock();
1959                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1960                         int our = ip_check_mc(in_dev, daddr, saddr,
1961                                 ip_hdr(skb)->protocol);
1962                         if (our
1963 #ifdef CONFIG_IP_MROUTE
1964                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1965 #endif
1966                             ) {
1967                                 rcu_read_unlock();
1968                                 return ip_route_input_mc(skb, daddr, saddr,
1969                                                          tos, dev, our);
1970                         }
1971                 }
1972                 rcu_read_unlock();
1973                 return -EINVAL;
1974         }
1975         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1976 }
1977
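/*
 * Output counterpart of __mkroute_input(): classify the destination
 * (broadcast/multicast/local), allocate the rtable and pick the
 * dst.input/dst.output handlers accordingly before rt_set_nexthop()
 * fills in the metrics.
 */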
1978 static inline int __mkroute_output(struct rtable **result,
1979                                    struct fib_result* res,
1980                                    const struct flowi *fl,
1981                                    const struct flowi *oldflp,
1982                                    struct net_device *dev_out,
1983                                    unsigned flags)
1984 {
1985         struct rtable *rth;
1986         struct in_device *in_dev;
1987         u32 tos = RT_FL_TOS(oldflp);
1988         int err = 0;
1989
1990         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1991                 return -EINVAL;
1992
1993         if (fl->fl4_dst == htonl(0xFFFFFFFF))
1994                 res->type = RTN_BROADCAST;
1995         else if (MULTICAST(fl->fl4_dst))
1996                 res->type = RTN_MULTICAST;
1997         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
1998                 return -EINVAL;
1999
2000         if (dev_out->flags & IFF_LOOPBACK)
2001                 flags |= RTCF_LOCAL;
2002
2003         /* get a working reference to the inet device */
2004         in_dev = in_dev_get(dev_out);
2005         if (!in_dev)
2006                 return -EINVAL;
2007
2008         if (res->type == RTN_BROADCAST) {
2009                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2010                 if (res->fi) {
2011                         fib_info_put(res->fi);
2012                         res->fi = NULL;
2013                 }
2014         } else if (res->type == RTN_MULTICAST) {
2015                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2016                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2017                                  oldflp->proto))
2018                         flags &= ~RTCF_LOCAL;
2019                 /* If a multicast route does not exist, use the
2020                    default one, but do not gateway in this case.
2021                    Yes, it is a hack.
2022                  */
2023                 if (res->fi && res->prefixlen < 4) {
2024                         fib_info_put(res->fi);
2025                         res->fi = NULL;
2026                 }
2027         }
2028
2029
2030         rth = dst_alloc(&ipv4_dst_ops);
2031         if (!rth) {
2032                 err = -ENOBUFS;
2033                 goto cleanup;
2034         }
2035
2036         atomic_set(&rth->u.dst.__refcnt, 1);
2037         rth->u.dst.flags= DST_HOST;
2038         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2039                 rth->u.dst.flags |= DST_NOXFRM;
2040         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2041                 rth->u.dst.flags |= DST_NOPOLICY;
2042
2043         rth->fl.fl4_dst = oldflp->fl4_dst;
2044         rth->fl.fl4_tos = tos;
2045         rth->fl.fl4_src = oldflp->fl4_src;
2046         rth->fl.oif     = oldflp->oif;
2047         rth->fl.mark    = oldflp->mark;
2048         rth->rt_dst     = fl->fl4_dst;
2049         rth->rt_src     = fl->fl4_src;
2050         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2051         /* get references to the devices that are to be held by the routing
2052            cache entry */
2053         rth->u.dst.dev  = dev_out;
2054         dev_hold(dev_out);
2055         rth->idev       = in_dev_get(dev_out);
2056         rth->rt_gateway = fl->fl4_dst;
2057         rth->rt_spec_dst= fl->fl4_src;
2058
2059         rth->u.dst.output=ip_output;
2060
2061         RT_CACHE_STAT_INC(out_slow_tot);
2062
2063         if (flags & RTCF_LOCAL) {
2064                 rth->u.dst.input = ip_local_deliver;
2065                 rth->rt_spec_dst = fl->fl4_dst;
2066         }
2067         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2068                 rth->rt_spec_dst = fl->fl4_src;
2069                 if (flags & RTCF_LOCAL &&
2070                     !(dev_out->flags & IFF_LOOPBACK)) {
2071                         rth->u.dst.output = ip_mc_output;
2072                         RT_CACHE_STAT_INC(out_slow_mc);
2073                 }
2074 #ifdef CONFIG_IP_MROUTE
2075                 if (res->type == RTN_MULTICAST) {
2076                         if (IN_DEV_MFORWARD(in_dev) &&
2077                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2078                                 rth->u.dst.input = ip_mr_input;
2079                                 rth->u.dst.output = ip_mc_output;
2080                         }
2081                 }
2082 #endif
2083         }
2084
2085         rt_set_nexthop(rth, res, 0);
2086
2087         rth->rt_flags = flags;
2088
2089         *result = rth;
2090  cleanup:
2091         /* release the working reference to the inet device */
2092         in_dev_put(in_dev);
2093
2094         return err;
2095 }
2096
2097 static inline int ip_mkroute_output(struct rtable **rp,
2098                                     struct fib_result* res,
2099                                     const struct flowi *fl,
2100                                     const struct flowi *oldflp,
2101                                     struct net_device *dev_out,
2102                                     unsigned flags)
2103 {
2104         struct rtable *rth = NULL;
2105         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2106         unsigned hash;
2107         if (err == 0) {
2108                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2109                 err = rt_intern_hash(hash, rth, rp);
2110         }
2111
2112         return err;
2113 }
2114
2115 /*
2116  * Major route resolver routine.
2117  */
2118
2119 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2120 {
2121         u32 tos = RT_FL_TOS(oldflp);
2122         struct flowi fl = { .nl_u = { .ip4_u =
2123                                       { .daddr = oldflp->fl4_dst,
2124                                         .saddr = oldflp->fl4_src,
2125                                         .tos = tos & IPTOS_RT_MASK,
2126                                         .scope = ((tos & RTO_ONLINK) ?
2127                                                   RT_SCOPE_LINK :
2128                                                   RT_SCOPE_UNIVERSE),
2129                                       } },
2130                             .mark = oldflp->mark,
2131                             .iif = init_net.loopback_dev->ifindex,
2132                             .oif = oldflp->oif };
2133         struct fib_result res;
2134         unsigned flags = 0;
2135         struct net_device *dev_out = NULL;
2136         int free_res = 0;
2137         int err;
2138
2139
2140         res.fi          = NULL;
2141 #ifdef CONFIG_IP_MULTIPLE_TABLES
2142         res.r           = NULL;
2143 #endif
2144
2145         if (oldflp->fl4_src) {
2146                 err = -EINVAL;
2147                 if (MULTICAST(oldflp->fl4_src) ||
2148                     BADCLASS(oldflp->fl4_src) ||
2149                     ZERONET(oldflp->fl4_src))
2150                         goto out;
2151
2152                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2153                 dev_out = ip_dev_find(oldflp->fl4_src);
2154                 if (dev_out == NULL)
2155                         goto out;
2156
2157                 /* I removed check for oif == dev_out->oif here.
2158                    It was wrong for two reasons:
2159                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2160                       assigned to multiple interfaces.
2161                    2. Moreover, we are allowed to send packets with saddr
2162                       of another iface. --ANK
2163                  */
2164
2165                 if (oldflp->oif == 0
2166                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2167                         /* Special hack: the user can direct multicasts
2168                            and limited broadcast via the necessary interface
2169                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2170                            This hack is not just for fun, it allows
2171                            vic, vat and friends to work.
2172                            They bind the socket to loopback, set the ttl to zero
2173                            and expect that it will work.
2174                            From the viewpoint of the routing cache they are broken,
2175                            because we are not allowed to build a multicast path
2176                            with a loopback source addr (look, the routing cache
2177                            cannot know that the ttl is zero, so that the packet
2178                            will not leave this host and the route is valid).
2179                            Luckily, this hack is a good workaround.
2180                          */
2181
2182                         fl.oif = dev_out->ifindex;
2183                         goto make_route;
2184                 }
2185                 if (dev_out)
2186                         dev_put(dev_out);
2187                 dev_out = NULL;
2188         }
2189
2190
2191         if (oldflp->oif) {
2192                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2193                 err = -ENODEV;
2194                 if (dev_out == NULL)
2195                         goto out;
2196
2197                 /* RACE: Check return value of inet_select_addr instead. */
2198                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2199                         dev_put(dev_out);
2200                         goto out;       /* Wrong error code */
2201                 }
2202
2203                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2204                         if (!fl.fl4_src)
2205                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2206                                                               RT_SCOPE_LINK);
2207                         goto make_route;
2208                 }
2209                 if (!fl.fl4_src) {
2210                         if (MULTICAST(oldflp->fl4_dst))
2211                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2212                                                               fl.fl4_scope);
2213                         else if (!oldflp->fl4_dst)
2214                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2215                                                               RT_SCOPE_HOST);
2216                 }
2217         }
2218
2219         if (!fl.fl4_dst) {
2220                 fl.fl4_dst = fl.fl4_src;
2221                 if (!fl.fl4_dst)
2222                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2223                 if (dev_out)
2224                         dev_put(dev_out);
2225                 dev_out = init_net.loopback_dev;
2226                 dev_hold(dev_out);
2227                 fl.oif = init_net.loopback_dev->ifindex;
2228                 res.type = RTN_LOCAL;
2229                 flags |= RTCF_LOCAL;
2230                 goto make_route;
2231         }
2232
2233         if (fib_lookup(&fl, &res)) {
2234                 res.fi = NULL;
2235                 if (oldflp->oif) {
2236                         /* Apparently, the routing tables are wrong. Assume
2237                            that the destination is on-link.
2238
2239                            WHY? DW.
2240                            Because we are allowed to send to an iface
2241                            even if it has NO routes and NO assigned
2242                            addresses. When oif is specified, the routing
2243                            tables are looked up with only one purpose:
2244                            to catch whether the destination is gatewayed, rather than
2245                            direct. Moreover, if MSG_DONTROUTE is set,
2246                            we send the packet, ignoring both routing tables
2247                            and ifaddr state. --ANK
2248
2249
2250                            We could do this even when oif is unknown
2251                            (likely as IPv6 does), but we do not.
2252                          */
2253
2254                         if (fl.fl4_src == 0)
2255                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2256                                                               RT_SCOPE_LINK);
2257                         res.type = RTN_UNICAST;
2258                         goto make_route;
2259                 }
2260                 if (dev_out)
2261                         dev_put(dev_out);
2262                 err = -ENETUNREACH;
2263                 goto out;
2264         }
2265         free_res = 1;
2266
2267         if (res.type == RTN_LOCAL) {
2268                 if (!fl.fl4_src)
2269                         fl.fl4_src = fl.fl4_dst;
2270                 if (dev_out)
2271                         dev_put(dev_out);
2272                 dev_out = init_net.loopback_dev;
2273                 dev_hold(dev_out);
2274                 fl.oif = dev_out->ifindex;
2275                 if (res.fi)
2276                         fib_info_put(res.fi);
2277                 res.fi = NULL;
2278                 flags |= RTCF_LOCAL;
2279                 goto make_route;
2280         }
2281
2282 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2283         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2284                 fib_select_multipath(&fl, &res);
2285         else
2286 #endif
2287         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2288                 fib_select_default(&fl, &res);
2289
2290         if (!fl.fl4_src)
2291                 fl.fl4_src = FIB_RES_PREFSRC(res);
2292
2293         if (dev_out)
2294                 dev_put(dev_out);
2295         dev_out = FIB_RES_DEV(res);
2296         dev_hold(dev_out);
2297         fl.oif = dev_out->ifindex;
2298
2299
2300 make_route:
2301         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2302
2303
2304         if (free_res)
2305                 fib_res_put(&res);
2306         if (dev_out)
2307                 dev_put(dev_out);
2308 out:    return err;
2309 }
2310
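/*
 * Output routing fast path: look the flow up in the cache under
 * rcu_read_lock_bh() and fall through to ip_route_output_slow() on a
 * miss.
 */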
2311 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2312 {
2313         unsigned hash;
2314         struct rtable *rth;
2315
2316         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2317
2318         rcu_read_lock_bh();
2319         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2320                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2321                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2322                     rth->fl.fl4_src == flp->fl4_src &&
2323                     rth->fl.iif == 0 &&
2324                     rth->fl.oif == flp->oif &&
2325                     rth->fl.mark == flp->mark &&
2326                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2327                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2328                         dst_use(&rth->u.dst, jiffies);
2329                         RT_CACHE_STAT_INC(out_hit);
2330                         rcu_read_unlock_bh();
2331                         *rp = rth;
2332                         return 0;
2333                 }
2334                 RT_CACHE_STAT_INC(out_hlist_search);
2335         }
2336         rcu_read_unlock_bh();
2337
2338         return ip_route_output_slow(rp, flp);
2339 }
2340
2341 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2342
2343 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2344 {
2345 }
2346
2347 static struct dst_ops ipv4_dst_blackhole_ops = {
2348         .family                 =       AF_INET,
2349         .protocol               =       __constant_htons(ETH_P_IP),
2350         .destroy                =       ipv4_dst_destroy,
2351         .check                  =       ipv4_dst_check,
2352         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2353         .entry_size             =       sizeof(struct rtable),
2354 };
2355
2356
2357 static int ipv4_blackhole_output(struct sk_buff *skb)
2358 {
2359         kfree_skb(skb);
2360         return 0;
2361 }
2362
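/*
 * Clone an existing route into a "blackhole" entry whose input and
 * output handlers simply free the skb.  Used by ip_route_output_flow()
 * when the xfrm lookup returns -EREMOTE.
 */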
2363 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2364 {
2365         struct rtable *ort = *rp;
2366         struct rtable *rt = (struct rtable *)
2367                 dst_alloc(&ipv4_dst_blackhole_ops);
2368
2369         if (rt) {
2370                 struct dst_entry *new = &rt->u.dst;
2371
2372                 atomic_set(&new->__refcnt, 1);
2373                 new->__use = 1;
2374                 new->input = ipv4_blackhole_output;
2375                 new->output = ipv4_blackhole_output;
2376                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2377
2378                 new->dev = ort->u.dst.dev;
2379                 if (new->dev)
2380                         dev_hold(new->dev);
2381
2382                 rt->fl = ort->fl;
2383
2384                 rt->idev = ort->idev;
2385                 if (rt->idev)
2386                         in_dev_hold(rt->idev);
2387                 rt->rt_flags = ort->rt_flags;
2388                 rt->rt_type = ort->rt_type;
2389                 rt->rt_dst = ort->rt_dst;
2390                 rt->rt_src = ort->rt_src;
2391                 rt->rt_iif = ort->rt_iif;
2392                 rt->rt_gateway = ort->rt_gateway;
2393                 rt->rt_spec_dst = ort->rt_spec_dst;
2394                 rt->peer = ort->peer;
2395                 if (rt->peer)
2396                         atomic_inc(&rt->peer->refcnt);
2397
2398                 dst_free(new);
2399         }
2400
2401         dst_release(&(*rp)->u.dst);
2402         *rp = rt;
2403         return (rt ? 0 : -ENOMEM);
2404 }
2405
2406 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2407 {
2408         int err;
2409
2410         if ((err = __ip_route_output_key(rp, flp)) != 0)
2411                 return err;
2412
2413         if (flp->proto) {
2414                 if (!flp->fl4_src)
2415                         flp->fl4_src = (*rp)->rt_src;
2416                 if (!flp->fl4_dst)
2417                         flp->fl4_dst = (*rp)->rt_dst;
2418                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2419                 if (err == -EREMOTE)
2420                         err = ipv4_dst_blackhole(rp, flp, sk);
2421
2422                 return err;
2423         }
2424
2425         return 0;
2426 }
2427
2428 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2429
2430 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2431 {
2432         return ip_route_output_flow(rp, flp, NULL, 0);
2433 }
2434
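/*
 * Fill a netlink RTM_NEWROUTE message describing the cached route
 * attached to the skb; used by inet_rtm_getroute() and ip_rt_dump().
 */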
2435 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2436                         int nowait, unsigned int flags)
2437 {
2438         struct rtable *rt = (struct rtable*)skb->dst;
2439         struct rtmsg *r;
2440         struct nlmsghdr *nlh;
2441         long expires;
2442         u32 id = 0, ts = 0, tsage = 0, error;
2443
2444         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2445         if (nlh == NULL)
2446                 return -EMSGSIZE;
2447
2448         r = nlmsg_data(nlh);
2449         r->rtm_family    = AF_INET;
2450         r->rtm_dst_len  = 32;
2451         r->rtm_src_len  = 0;
2452         r->rtm_tos      = rt->fl.fl4_tos;
2453         r->rtm_table    = RT_TABLE_MAIN;
2454         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2455         r->rtm_type     = rt->rt_type;
2456         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2457         r->rtm_protocol = RTPROT_UNSPEC;
2458         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2459         if (rt->rt_flags & RTCF_NOTIFY)
2460                 r->rtm_flags |= RTM_F_NOTIFY;
2461
2462         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2463
2464         if (rt->fl.fl4_src) {
2465                 r->rtm_src_len = 32;
2466                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2467         }
2468         if (rt->u.dst.dev)
2469                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2470 #ifdef CONFIG_NET_CLS_ROUTE
2471         if (rt->u.dst.tclassid)
2472                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2473 #endif
2474         if (rt->fl.iif)
2475                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2476         else if (rt->rt_src != rt->fl.fl4_src)
2477                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2478
2479         if (rt->rt_dst != rt->rt_gateway)
2480                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2481
2482         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2483                 goto nla_put_failure;
2484
2485         error = rt->u.dst.error;
2486         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2487         if (rt->peer) {
2488                 id = rt->peer->ip_id_count;
2489                 if (rt->peer->tcp_ts_stamp) {
2490                         ts = rt->peer->tcp_ts;
2491                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2492                 }
2493         }
2494
2495         if (rt->fl.iif) {
2496 #ifdef CONFIG_IP_MROUTE
2497                 __be32 dst = rt->rt_dst;
2498
2499                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2500                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2501                         int err = ipmr_get_route(skb, r, nowait);
2502                         if (err <= 0) {
2503                                 if (!nowait) {
2504                                         if (err == 0)
2505                                                 return 0;
2506                                         goto nla_put_failure;
2507                                 } else {
2508                                         if (err == -EMSGSIZE)
2509                                                 goto nla_put_failure;
2510                                         error = err;
2511                                 }
2512                         }
2513                 } else
2514 #endif
2515                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2516         }
2517
2518         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2519                                expires, error) < 0)
2520                 goto nla_put_failure;
2521
2522         return nlmsg_end(skb, nlh);
2523
2524 nla_put_failure:
2525         nlmsg_cancel(skb, nlh);
2526         return -EMSGSIZE;
2527 }
2528
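/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the route with
 * ip_route_input() when RTA_IIF is given or ip_route_output_key()
 * otherwise, and reply with rt_fill_info().
 */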
2529 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2530 {
2531         struct rtmsg *rtm;
2532         struct nlattr *tb[RTA_MAX+1];
2533         struct rtable *rt = NULL;
2534         __be32 dst = 0;
2535         __be32 src = 0;
2536         u32 iif;
2537         int err;
2538         struct sk_buff *skb;
2539
2540         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2541         if (err < 0)
2542                 goto errout;
2543
2544         rtm = nlmsg_data(nlh);
2545
2546         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2547         if (skb == NULL) {
2548                 err = -ENOBUFS;
2549                 goto errout;
2550         }
2551
2552         /* Reserve room for dummy headers; this skb can pass
2553            through a good chunk of the routing engine.
2554          */
2555         skb_reset_mac_header(skb);
2556         skb_reset_network_header(skb);
2557
2558         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2559         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2560         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2561
2562         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2563         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2564         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2565
2566         if (iif) {
2567                 struct net_device *dev;
2568
2569                 dev = __dev_get_by_index(&init_net, iif);
2570                 if (dev == NULL) {
2571                         err = -ENODEV;
2572                         goto errout_free;
2573                 }
2574
2575                 skb->protocol   = htons(ETH_P_IP);
2576                 skb->dev        = dev;
2577                 local_bh_disable();
2578                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2579                 local_bh_enable();
2580
2581                 rt = (struct rtable*) skb->dst;
2582                 if (err == 0 && rt->u.dst.error)
2583                         err = -rt->u.dst.error;
2584         } else {
2585                 struct flowi fl = {
2586                         .nl_u = {
2587                                 .ip4_u = {
2588                                         .daddr = dst,
2589                                         .saddr = src,
2590                                         .tos = rtm->rtm_tos,
2591                                 },
2592                         },
2593                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2594                 };
2595                 err = ip_route_output_key(&rt, &fl);
2596         }
2597
2598         if (err)
2599                 goto errout_free;
2600
2601         skb->dst = &rt->u.dst;
2602         if (rtm->rtm_flags & RTM_F_NOTIFY)
2603                 rt->rt_flags |= RTCF_NOTIFY;
2604
2605         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2606                                 RTM_NEWROUTE, 0, 0);
2607         if (err <= 0)
2608                 goto errout_free;
2609
2610         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2611 errout:
2612         return err;
2613
2614 errout_free:
2615         kfree_skb(skb);
2616         goto errout;
2617 }
2618
2619 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2620 {
2621         struct rtable *rt;
2622         int h, s_h;
2623         int idx, s_idx;
2624
2625         s_h = cb->args[0];
2626         s_idx = idx = cb->args[1];
2627         for (h = 0; h <= rt_hash_mask; h++) {
2628                 if (h < s_h) continue;
2629                 if (h > s_h)
2630                         s_idx = 0;
2631                 rcu_read_lock_bh();
2632                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2633                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2634                         if (idx < s_idx)
2635                                 continue;
2636                         skb->dst = dst_clone(&rt->u.dst);
2637                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2638                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2639                                          1, NLM_F_MULTI) <= 0) {
2640                                 dst_release(xchg(&skb->dst, NULL));
2641                                 rcu_read_unlock_bh();
2642                                 goto done;
2643                         }
2644                         dst_release(xchg(&skb->dst, NULL));
2645                 }
2646                 rcu_read_unlock_bh();
2647         }
2648
2649 done:
2650         cb->args[0] = h;
2651         cb->args[1] = idx;
2652         return skb->len;
2653 }
2654
2655 void ip_rt_multicast_event(struct in_device *in_dev)
2656 {
2657         rt_cache_flush(0);
2658 }
2659
2660 #ifdef CONFIG_SYSCTL
2661 static int flush_delay;
2662
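/*
 * Writing to the "flush" sysctl triggers an immediate route cache
 * flush; the written integer is interpreted as the flush delay.
 */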
2663 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2664                                         struct file *filp, void __user *buffer,
2665                                         size_t *lenp, loff_t *ppos)
2666 {
2667         if (write) {
2668                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2669                 rt_cache_flush(flush_delay);
2670                 return 0;
2671         }
2672
2673         return -EINVAL;
2674 }
2675
2676 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2677                                                 int __user *name,
2678                                                 int nlen,
2679                                                 void __user *oldval,
2680                                                 size_t __user *oldlenp,
2681                                                 void __user *newval,
2682                                                 size_t newlen)
2683 {
2684         int delay;
2685         if (newlen != sizeof(int))
2686                 return -EINVAL;
2687         if (get_user(delay, (int __user *)newval))
2688                 return -EFAULT;
2689         rt_cache_flush(delay);
2690         return 0;
2691 }
2692
2693 ctl_table ipv4_route_table[] = {
2694         {
2695                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2696                 .procname       = "flush",
2697                 .data           = &flush_delay,
2698                 .maxlen         = sizeof(int),
2699                 .mode           = 0200,
2700                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2701                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2702         },
2703         {
2704                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2705                 .procname       = "min_delay",
2706                 .data           = &ip_rt_min_delay,
2707                 .maxlen         = sizeof(int),
2708                 .mode           = 0644,
2709                 .proc_handler   = &proc_dointvec_jiffies,
2710                 .strategy       = &sysctl_jiffies,
2711         },
2712         {
2713                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2714                 .procname       = "max_delay",
2715                 .data           = &ip_rt_max_delay,
2716                 .maxlen         = sizeof(int),
2717                 .mode           = 0644,
2718                 .proc_handler   = &proc_dointvec_jiffies,
2719                 .strategy       = &sysctl_jiffies,
2720         },
2721         {
2722                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2723                 .procname       = "gc_thresh",
2724                 .data           = &ipv4_dst_ops.gc_thresh,
2725                 .maxlen         = sizeof(int),
2726                 .mode           = 0644,
2727                 .proc_handler   = &proc_dointvec,
2728         },
2729         {
2730                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2731                 .procname       = "max_size",
2732                 .data           = &ip_rt_max_size,
2733                 .maxlen         = sizeof(int),
2734                 .mode           = 0644,
2735                 .proc_handler   = &proc_dointvec,
2736         },
2737         {
2738                 /*  Deprecated. Use gc_min_interval_ms */
2739
2740                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2741                 .procname       = "gc_min_interval",
2742                 .data           = &ip_rt_gc_min_interval,
2743                 .maxlen         = sizeof(int),
2744                 .mode           = 0644,
2745                 .proc_handler   = &proc_dointvec_jiffies,
2746                 .strategy       = &sysctl_jiffies,
2747         },
2748         {
2749                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2750                 .procname       = "gc_min_interval_ms",
2751                 .data           = &ip_rt_gc_min_interval,
2752                 .maxlen         = sizeof(int),
2753                 .mode           = 0644,
2754                 .proc_handler   = &proc_dointvec_ms_jiffies,
2755                 .strategy       = &sysctl_ms_jiffies,
2756         },
2757         {
2758                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2759                 .procname       = "gc_timeout",
2760                 .data           = &ip_rt_gc_timeout,
2761                 .maxlen         = sizeof(int),
2762                 .mode           = 0644,
2763                 .proc_handler   = &proc_dointvec_jiffies,
2764                 .strategy       = &sysctl_jiffies,
2765         },
2766         {
2767                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2768                 .procname       = "gc_interval",
2769                 .data           = &ip_rt_gc_interval,
2770                 .maxlen         = sizeof(int),
2771                 .mode           = 0644,
2772                 .proc_handler   = &proc_dointvec_jiffies,
2773                 .strategy       = &sysctl_jiffies,
2774         },
2775         {
2776                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2777                 .procname       = "redirect_load",
2778                 .data           = &ip_rt_redirect_load,
2779                 .maxlen         = sizeof(int),
2780                 .mode           = 0644,
2781                 .proc_handler   = &proc_dointvec,
2782         },
2783         {
2784                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2785                 .procname       = "redirect_number",
2786                 .data           = &ip_rt_redirect_number,
2787                 .maxlen         = sizeof(int),
2788                 .mode           = 0644,
2789                 .proc_handler   = &proc_dointvec,
2790         },
2791         {
2792                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2793                 .procname       = "redirect_silence",
2794                 .data           = &ip_rt_redirect_silence,
2795                 .maxlen         = sizeof(int),
2796                 .mode           = 0644,
2797                 .proc_handler   = &proc_dointvec,
2798         },
2799         {
2800                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2801                 .procname       = "error_cost",
2802                 .data           = &ip_rt_error_cost,
2803                 .maxlen         = sizeof(int),
2804                 .mode           = 0644,
2805                 .proc_handler   = &proc_dointvec,
2806         },
2807         {
2808                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2809                 .procname       = "error_burst",
2810                 .data           = &ip_rt_error_burst,
2811                 .maxlen         = sizeof(int),
2812                 .mode           = 0644,
2813                 .proc_handler   = &proc_dointvec,
2814         },
2815         {
2816                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2817                 .procname       = "gc_elasticity",
2818                 .data           = &ip_rt_gc_elasticity,
2819                 .maxlen         = sizeof(int),
2820                 .mode           = 0644,
2821                 .proc_handler   = &proc_dointvec,
2822         },
2823         {
2824                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2825                 .procname       = "mtu_expires",
2826                 .data           = &ip_rt_mtu_expires,
2827                 .maxlen         = sizeof(int),
2828                 .mode           = 0644,
2829                 .proc_handler   = &proc_dointvec_jiffies,
2830                 .strategy       = &sysctl_jiffies,
2831         },
2832         {
2833                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2834                 .procname       = "min_pmtu",
2835                 .data           = &ip_rt_min_pmtu,
2836                 .maxlen         = sizeof(int),
2837                 .mode           = 0644,
2838                 .proc_handler   = &proc_dointvec,
2839         },
2840         {
2841                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2842                 .procname       = "min_adv_mss",
2843                 .data           = &ip_rt_min_advmss,
2844                 .maxlen         = sizeof(int),
2845                 .mode           = 0644,
2846                 .proc_handler   = &proc_dointvec,
2847         },
2848         {
2849                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2850                 .procname       = "secret_interval",
2851                 .data           = &ip_rt_secret_interval,
2852                 .maxlen         = sizeof(int),
2853                 .mode           = 0644,
2854                 .proc_handler   = &proc_dointvec_jiffies,
2855                 .strategy       = &sysctl_jiffies,
2856         },
2857         { .ctl_name = 0 }
2858 };
2859 #endif
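/*
 * The ctl_table above shows up under /proc/sys/net/ipv4/route/.  Entries
 * handled by proc_dointvec_jiffies are stored internally in jiffies but are
 * read and written in seconds from userspace; e.g. (illustrative shell
 * usage only, not part of this file):
 *
 *     echo 300 > /proc/sys/net/ipv4/route/mtu_expires
 */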
2860
2861 #ifdef CONFIG_NET_CLS_ROUTE
2862 struct ip_rt_acct *ip_rt_acct;
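/*
 * Flat accounting area: roughly, one struct ip_rt_acct slot per realm/classid
 * value (0-255) for each possible cpu, all carved out of a single allocation
 * made in ip_rt_init() below.
 */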
2863
2864 /* This code sucks.  But you should have seen it before! --RR */
2865
2866 /* IP route accounting ptr for this logical cpu number. */
2867 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
2868
2869 #ifdef CONFIG_PROC_FS
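/*
 * Read handler for /proc/net/rt_acct: userspace sees a single table that is
 * the element-wise sum of every cpu's counters.  Both offset and length must
 * be 32-bit aligned, otherwise -EIO is returned.
 */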
2870 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2871                            int length, int *eof, void *data)
2872 {
2873         unsigned int i;
2874
2875         if ((offset & 3) || (length & 3))
2876                 return -EIO;
2877
2878         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2879                 *eof = 1;
2880                 return 0;
2881         }
2882
2883         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2884                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2885                 *eof = 1;
2886         }
2887
2888         offset /= sizeof(u32);
2889
2890         if (length > 0) {
2891                 u32 *src;
2892                 u32 *dst = (u32 *) buffer;
2893 
2894                 /* Zero the output buffer first. */
2895                 *start = buffer;
2896                 memset(dst, 0, length);
2897 
2898                 /* Sum in each cpu's counters, one int at a time */
2899                 for_each_possible_cpu(i) {
2900                         unsigned int j;
2901
2902                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2903
2904                         for (j = 0; j < length/4; j++)
2905                                 dst[j] += src[j];
2906                 }
2907         }
2908         return length;
2909 }
2910 #endif /* CONFIG_PROC_FS */
2911 #endif /* CONFIG_NET_CLS_ROUTE */
2912
2913 static __initdata unsigned long rhash_entries;
2914 static int __init set_rhash_entries(char *str)
2915 {
2916         if (!str)
2917                 return 0;
2918         rhash_entries = simple_strtoul(str, &str, 0);
2919         return 1;
2920 }
2921 __setup("rhash_entries=", set_rhash_entries);
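/*
 * "rhash_entries=N" on the kernel command line overrides the automatic route
 * cache hash sizing done in ip_rt_init() below; for example, booting with
 * rhash_entries=262144 (value purely illustrative) requests that many buckets.
 */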
2922
2923 int __init ip_rt_init(void)
2924 {
2925         int rc = 0;
2926
2927         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2928                              (jiffies ^ (jiffies >> 7)));
2929
2930 #ifdef CONFIG_NET_CLS_ROUTE
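        /*
         * Pick the smallest page order whose allocation can hold 256
         * struct ip_rt_acct counters for every possible cpu, then grab the
         * whole accounting area as one contiguous, zeroed block.
         */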
2931         {
2932         int order;
2933         for (order = 0;
2934              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2935                 /* NOTHING */;
2936         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2937         if (!ip_rt_acct)
2938                 panic("IP: failed to allocate ip_rt_acct\n");
2939         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2940         }
2941 #endif
2942
2943         ipv4_dst_ops.kmem_cachep =
2944                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2945                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2946
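        /*
         * Blackhole dst_ops reuse the same "ip_dst_cache" slab; SLAB_PANIC
         * above means boot has already failed if the cache could not be
         * created, so no error check is needed here.
         */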
2947         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2948
2949         rt_hash_table = (struct rt_hash_bucket *)
2950                 alloc_large_system_hash("IP route cache",
2951                                         sizeof(struct rt_hash_bucket),
2952                                         rhash_entries,
2953                                         (num_physpages >= 128 * 1024) ?
2954                                         15 : 17,
2955                                         0,
2956                                         &rt_hash_log,
2957                                         &rt_hash_mask,
2958                                         0);
2959         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2960         rt_hash_lock_init();
2961
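        /*
         * Start garbage collection once the cache holds, on average, one
         * route per hash bucket (gc_thresh), and never let it grow beyond
         * 16 entries per bucket (ip_rt_max_size).
         */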
2962         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2963         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2964
2965         devinet_init();
2966         ip_fib_init();
2967
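        /*
         * rt_flush_timer drives deferred flushes of the route cache via
         * rt_run_flush(); rt_secret_timer periodically forces a full flush
         * (picking a fresh rt_hash_rnd along the way) so that hash chain
         * placement stays hard to predict.
         */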
2968         init_timer(&rt_flush_timer);
2969         rt_flush_timer.function = rt_run_flush;
2970         init_timer(&rt_secret_timer);
2971         rt_secret_timer.function = rt_secret_rebuild;
2972
2973         /* All the timers, started at system startup, tend
2974            to synchronize. Perturb them a bit.
2975          */
2976         schedule_delayed_work(&expires_work,
2977                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
2978
2979         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2980                 ip_rt_secret_interval;
2981         add_timer(&rt_secret_timer);
2982
2983 #ifdef CONFIG_PROC_FS
2984         {
2985         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
2986         if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2987             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
2988                                              init_net.proc_net_stat))) {
2989                 return -ENOMEM;
2990         }
2991         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
2992         }
2993 #ifdef CONFIG_NET_CLS_ROUTE
2994         create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
2995 #endif
2996 #endif
2997 #ifdef CONFIG_XFRM
2998         xfrm_init();
2999         xfrm4_init();
3000 #endif
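        /*
         * Answer RTM_GETROUTE requests (e.g. what "ip route get <addr>"
         * sends over netlink) with inet_rtm_getroute().
         */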
3001         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3002
3003         return rc;
3004 }
3005
3006 EXPORT_SYMBOL(__ip_select_ident);
3007 EXPORT_SYMBOL(ip_route_input);
3008 EXPORT_SYMBOL(ip_route_output_key);