/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
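
/*
 * Illustrative note (not in the original file): consumers index this table
 * with the four TOS bits shifted down by one, so each TC_PRIO_* entry is
 * paired with its ECN_OR_COST() twin. rt_tos2priority() in <net/route.h>
 * does essentially:
 *
 *      static inline char rt_tos2priority(u8 tos)
 *      {
 *              return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *      }
 */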

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

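/*
 * The IPv4 routing cache itself was removed, so /proc/net/rt_cache now
 * only ever emits the legacy header line below; the file is kept so that
 * userspace which still opens it keeps working.
 */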
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However,
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
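
/*
 * Caller-side sketch (paraphrased from ip_select_ident() in include/net/ip.h
 * of this era; consult that header for the authoritative version): atomic
 * datagrams with DF set do not need a unique ID, so __ip_select_ident() is
 * only reached for fragmentable traffic, roughly:
 *
 *      if ((iph->frag_off & htons(IP_DF)) && !skb->local_df)
 *              iph->id = (sk && inet_sk(sk)->inet_daddr) ?
 *                        htons(inet_sk(sk)->inet_id++) : 0;
 *      else
 *              __ip_select_ident(iph, dst, 0);
 */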

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}
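
/*
 * Illustrative note (not in the original file): the two shifts fold the
 * upper bits of the address into the low bits before masking, so hosts
 * that differ only in their high-order bytes still land in different
 * buckets. Assuming FNHE_HASH_SIZE is 1 << 11 (its value in this era),
 * the mask keeps exactly the 11 bits that the >> 11 and >> 22 folds mix.
 */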

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

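/*
 * Worked example (illustrative, not in the original file), assuming HZ=1000
 * and the defaults above: ip_rt_redirect_load is HZ/50 == 20 jiffies, so
 * after the k-th redirect the next one is held back for at least
 * 20 << k jiffies (40ms, 80ms, 160ms, ...). After ip_rt_redirect_number (9)
 * unanswered redirects we go silent, and only start over once the peer has
 * triggered no redirects for ip_rt_redirect_silence (20 << 10 jiffies,
 * roughly 20 seconds).
 */
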
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
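
/*
 * Illustrative note (not in the original file): the peer block above is a
 * token bucket. Tokens accrue at one per jiffy since rate_last, capped at
 * ip_rt_error_burst (5 * HZ), and each ICMP error costs ip_rt_error_cost
 * (HZ) tokens, so a misbehaving source is answered at a sustained rate of
 * about one error per second, with bursts of up to five.
 */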

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
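
/*
 * Usage sketch (illustrative, not in the original file): tunnel error
 * handlers use this export to propagate an ICMP_FRAG_NEEDED seen on the
 * outer header. E.g. the ipip driver of this era does roughly:
 *
 *      ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 *                       t->parms.link, 0, IPPROTO_IPIP, 0);
 *
 * where "info" is the MTU from the ICMP header and "t" the tunnel; see
 * net/ipv4/ipip.c for the authoritative call site.
 */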

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *dst;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        rt = (struct rtable *) __sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !rt) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!__sk_dst_check(sk, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        dst = dst_check(&rt->dst, 0);
        if (!dst) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                __sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK, which always forces validation calls down
         * into this function.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}
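
/*
 * Illustrative note (not in the original file): the selection order above
 * is (1) a learned, unexpired PMTU exception, (2) an explicit RTAX_MTU
 * metric, (3) the device MTU, clamped to 576 when the metric is locked on
 * a gateway route, and finally capped at IP_MAX_MTU.
 */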

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
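
/*
 * Illustrative note (not in the original file): rt_cache_route() is a
 * lockless single-slot cache. Readers fetch the slot under RCU; the
 * writer publishes with cmpxchg(), so if two CPUs race only one wins and
 * the loser returns false. The caller (rt_set_nexthop() below) then marks
 * the losing route DST_NOCACHE and puts it on the uncached list instead.
 */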

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      Per the RFC 1812 recommendation: if the source is
                 *      martian, the only hint is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}
1509
1510 /* called in rcu_read_lock() section */
1511 static int __mkroute_input(struct sk_buff *skb,
1512                            const struct fib_result *res,
1513                            struct in_device *in_dev,
1514                            __be32 daddr, __be32 saddr, u32 tos)
1515 {
1516         struct fib_nh_exception *fnhe;
1517         struct rtable *rth;
1518         int err;
1519         struct in_device *out_dev;
1520         unsigned int flags = 0;
1521         bool do_cache;
1522         u32 itag = 0;
1523
1524         /* get a working reference to the output device */
1525         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1526         if (out_dev == NULL) {
1527                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1528                 return -EINVAL;
1529         }
1530
1531         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1532                                   in_dev->dev, in_dev, &itag);
1533         if (err < 0) {
1534                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1535                                          saddr);
1536
1537                 goto cleanup;
1538         }
1539
1540         do_cache = res->fi && !itag;
1541         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1542             (IN_DEV_SHARED_MEDIA(out_dev) ||
1543              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1544                 flags |= RTCF_DOREDIRECT;
1545                 do_cache = false;
1546         }
1547
1548         if (skb->protocol != htons(ETH_P_IP)) {
1549                 /* Not IP (i.e. ARP). Do not create a route if it is
1550                  * invalid for proxy arp. DNAT routes are always valid.
1551                  *
1552                  * The proxy arp feature has been extended to allow ARP
1553                  * replies back out the same interface, to support
1554                  * Private VLAN switch technologies. See arp.c.
1555                  */
1556                 if (out_dev == in_dev &&
1557                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1558                         err = -EINVAL;
1559                         goto cleanup;
1560                 }
1561         }
1562
1563         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1564         if (do_cache) {
1565                 if (fnhe != NULL)
1566                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1567                 else
1568                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1569
1570                 if (rt_cache_valid(rth)) {
1571                         skb_dst_set_noref(skb, &rth->dst);
1572                         goto out;
1573                 }
1574         }
1575
1576         rth = rt_dst_alloc(out_dev->dev,
1577                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1578                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1579         if (!rth) {
1580                 err = -ENOBUFS;
1581                 goto cleanup;
1582         }
1583
1584         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1585         rth->rt_flags = flags;
1586         rth->rt_type = res->type;
1587         rth->rt_is_input = 1;
1588         rth->rt_iif     = 0;
1589         rth->rt_pmtu    = 0;
1590         rth->rt_gateway = 0;
1591         rth->rt_uses_gateway = 0;
1592         INIT_LIST_HEAD(&rth->rt_uncached);
1593         RT_CACHE_STAT_INC(in_slow_tot);
1594
1595         rth->dst.input = ip_forward;
1596         rth->dst.output = ip_output;
1597
1598         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1599         skb_dst_set(skb, &rth->dst);
1600 out:
1601         err = 0;
1602  cleanup:
1603         return err;
1604 }
1605
1606 static int ip_mkroute_input(struct sk_buff *skb,
1607                             struct fib_result *res,
1608                             const struct flowi4 *fl4,
1609                             struct in_device *in_dev,
1610                             __be32 daddr, __be32 saddr, u32 tos)
1611 {
1612 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1613         if (res->fi && res->fi->fib_nhs > 1)
1614                 fib_select_multipath(res);
1615 #endif
1616
1617         /* create a routing cache entry */
1618         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1619 }
1620
1621 /*
1622  *      NOTE. We drop all packets that have a local source
1623  *      address, because every properly looped-back packet
1624  *      must already have the correct destination attached by the output routine.
1625  *
1626  *      This approach solves two big problems:
1627  *      1. Non-simplex devices are handled properly.
1628  *      2. IP spoofing attempts are filtered with 100% guarantee.
1629  *      called with rcu_read_lock()
1630  */
1631
1632 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1633                                u8 tos, struct net_device *dev)
1634 {
1635         struct fib_result res;
1636         struct in_device *in_dev = __in_dev_get_rcu(dev);
1637         struct flowi4   fl4;
1638         unsigned int    flags = 0;
1639         u32             itag = 0;
1640         struct rtable   *rth;
1641         int             err = -EINVAL;
1642         struct net    *net = dev_net(dev);
1643         bool do_cache;
1644
1645         /* IP on this device is disabled. */
1646
1647         if (!in_dev)
1648                 goto out;
1649
1650         /* Check for the weirdest martians, which may not be detected
1651            by fib_lookup.
1652          */
1653
1654         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1655                 goto martian_source;
1656
1657         res.fi = NULL;
1658         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1659                 goto brd_input;
1660
1661         /* Accept zero addresses only for limited broadcast;
1662          * I do not even know whether to fix this or not. Waiting for complaints :-)
1663          */
1664         if (ipv4_is_zeronet(saddr))
1665                 goto martian_source;
1666
1667         if (ipv4_is_zeronet(daddr))
1668                 goto martian_destination;
1669
1670         /* The following code avoids calling IN_DEV_NET_ROUTE_LOCALNET() unless
1671          * necessary, invoking it at most once when daddr and/or saddr is loopback
1672          */
1673         if (ipv4_is_loopback(daddr)) {
1674                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1675                         goto martian_destination;
1676         } else if (ipv4_is_loopback(saddr)) {
1677                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1678                         goto martian_source;
1679         }
1680
1681         /*
1682          *      Now we are ready to route the packet.
1683          */
1684         fl4.flowi4_oif = 0;
1685         fl4.flowi4_iif = dev->ifindex;
1686         fl4.flowi4_mark = skb->mark;
1687         fl4.flowi4_tos = tos;
1688         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1689         fl4.daddr = daddr;
1690         fl4.saddr = saddr;
1691         err = fib_lookup(net, &fl4, &res);
1692         if (err != 0) {
1693                 if (!IN_DEV_FORWARD(in_dev))
1694                         err = -EHOSTUNREACH;
1695                 goto no_route;
1696         }
1697
1698         if (res.type == RTN_BROADCAST)
1699                 goto brd_input;
1700
1701         if (res.type == RTN_LOCAL) {
1702                 err = fib_validate_source(skb, saddr, daddr, tos,
1703                                           0, dev, in_dev, &itag);
1704                 if (err < 0)
1705                         goto martian_source_keep_err;
1706                 goto local_input;
1707         }
1708
1709         if (!IN_DEV_FORWARD(in_dev)) {
1710                 err = -EHOSTUNREACH;
1711                 goto no_route;
1712         }
1713         if (res.type != RTN_UNICAST)
1714                 goto martian_destination;
1715
1716         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1717 out:    return err;
1718
1719 brd_input:
1720         if (skb->protocol != htons(ETH_P_IP))
1721                 goto e_inval;
1722
1723         if (!ipv4_is_zeronet(saddr)) {
1724                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1725                                           in_dev, &itag);
1726                 if (err < 0)
1727                         goto martian_source_keep_err;
1728         }
1729         flags |= RTCF_BROADCAST;
1730         res.type = RTN_BROADCAST;
1731         RT_CACHE_STAT_INC(in_brd);
1732
1733 local_input:
1734         do_cache = false;
1735         if (res.fi) {
1736                 if (!itag) {
1737                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1738                         if (rt_cache_valid(rth)) {
1739                                 skb_dst_set_noref(skb, &rth->dst);
1740                                 err = 0;
1741                                 goto out;
1742                         }
1743                         do_cache = true;
1744                 }
1745         }
1746
1747         rth = rt_dst_alloc(net->loopback_dev,
1748                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1749         if (!rth)
1750                 goto e_nobufs;
1751
1752         rth->dst.input= ip_local_deliver;
1753         rth->dst.output= ip_rt_bug;
1754 #ifdef CONFIG_IP_ROUTE_CLASSID
1755         rth->dst.tclassid = itag;
1756 #endif
1757
1758         rth->rt_genid = rt_genid_ipv4(net);
1759         rth->rt_flags   = flags|RTCF_LOCAL;
1760         rth->rt_type    = res.type;
1761         rth->rt_is_input = 1;
1762         rth->rt_iif     = 0;
1763         rth->rt_pmtu    = 0;
1764         rth->rt_gateway = 0;
1765         rth->rt_uses_gateway = 0;
1766         INIT_LIST_HEAD(&rth->rt_uncached);
1767         RT_CACHE_STAT_INC(in_slow_tot);
1768         if (res.type == RTN_UNREACHABLE) {
1769                 rth->dst.input= ip_error;
1770                 rth->dst.error= -err;
1771                 rth->rt_flags   &= ~RTCF_LOCAL;
1772         }
1773         if (do_cache) {
1774                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1775                         rth->dst.flags |= DST_NOCACHE;
1776                         rt_add_uncached_list(rth);
1777                 }
1778         }
1779         skb_dst_set(skb, &rth->dst);
1780         err = 0;
1781         goto out;
1782
1783 no_route:
1784         RT_CACHE_STAT_INC(in_no_route);
1785         res.type = RTN_UNREACHABLE;
1786         if (err == -ESRCH)
1787                 err = -ENETUNREACH;
1788         goto local_input;
1789
1790         /*
1791          *      Do not cache martian addresses: they should be logged (RFC1812)
1792          */
1793 martian_destination:
1794         RT_CACHE_STAT_INC(in_martian_dst);
1795 #ifdef CONFIG_IP_ROUTE_VERBOSE
1796         if (IN_DEV_LOG_MARTIANS(in_dev))
1797                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1798                                      &daddr, &saddr, dev->name);
1799 #endif
1800
1801 e_inval:
1802         err = -EINVAL;
1803         goto out;
1804
1805 e_nobufs:
1806         err = -ENOBUFS;
1807         goto out;
1808
1809 martian_source:
1810         err = -EINVAL;
1811 martian_source_keep_err:
1812         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1813         goto out;
1814 }
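
/*
 * Illustrative sketch (compiled out, not used by the kernel; the helper
 * name is made up): the unconditional source sanity checks above,
 * condensed into a predicate.  Note that the code above still allows a
 * zeronet source when the packet is destined to limited broadcast (the
 * brd_input path, e.g. DHCP), and additionally rejects loopback sources
 * unless route_localnet is enabled; this simplification ignores both.
 */
#if 0
static bool example_obvious_martian_source(__be32 saddr)
{
        return ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
               ipv4_is_zeronet(saddr);
}
#endif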
1815
1816 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1817                          u8 tos, struct net_device *dev)
1818 {
1819         int res;
1820
1821         rcu_read_lock();
1822
1823         /* Multicast recognition logic was moved from the route cache to here.
1824            The problem was that too many Ethernet cards have broken/missing
1825            hardware multicast filters :-( As a result, a host on a multicast
1826            network acquires a lot of useless route cache entries, e.g. for
1827            SDR messages from all over the world. Now we try to get rid of them.
1828            Really, provided the software IP multicast filter is organized
1829            reasonably (at least, hashed), it does not result in a slowdown
1830            compared with route cache reject entries.
1831            Note that multicast routers are not affected, because a
1832            route cache entry is created eventually.
1833          */
1834         if (ipv4_is_multicast(daddr)) {
1835                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1836
1837                 if (in_dev) {
1838                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1839                                                   ip_hdr(skb)->protocol);
1840                         if (our
1841 #ifdef CONFIG_IP_MROUTE
1842                                 ||
1843                             (!ipv4_is_local_multicast(daddr) &&
1844                              IN_DEV_MFORWARD(in_dev))
1845 #endif
1846                            ) {
1847                                 int res = ip_route_input_mc(skb, daddr, saddr,
1848                                                             tos, dev, our);
1849                                 rcu_read_unlock();
1850                                 return res;
1851                         }
1852                 }
1853                 rcu_read_unlock();
1854                 return -EINVAL;
1855         }
1856         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1857         rcu_read_unlock();
1858         return res;
1859 }
1860 EXPORT_SYMBOL(ip_route_input_noref);
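
/*
 * Example (illustrative sketch, compiled out; the helper name is made
 * up): how the rx path typically uses ip_route_input_noref(), mirroring
 * what ip_rcv_finish() does.
 */
#if 0
static int example_route_incoming(struct sk_buff *skb, struct net_device *dev)
{
        const struct iphdr *iph = ip_hdr(skb);
        int err;

        /* Classify the packet; on success a (possibly noref) dst is
         * attached to the skb. */
        err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                   iph->tos, dev);
        if (err)
                return err;     /* martian, no route, or -ENOBUFS */

        /* dst->input() is ip_local_deliver, ip_forward, ip_mr_input
         * or ip_error, as selected by the lookup above. */
        return dst_input(skb);
}
#endif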
1861
1862 /* called with rcu_read_lock() */
1863 static struct rtable *__mkroute_output(const struct fib_result *res,
1864                                        const struct flowi4 *fl4, int orig_oif,
1865                                        struct net_device *dev_out,
1866                                        unsigned int flags)
1867 {
1868         struct fib_info *fi = res->fi;
1869         struct fib_nh_exception *fnhe;
1870         struct in_device *in_dev;
1871         u16 type = res->type;
1872         struct rtable *rth;
1873         bool do_cache;
1874
1875         in_dev = __in_dev_get_rcu(dev_out);
1876         if (!in_dev)
1877                 return ERR_PTR(-EINVAL);
1878
1879         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1880                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1881                         return ERR_PTR(-EINVAL);
1882
1883         if (ipv4_is_lbcast(fl4->daddr))
1884                 type = RTN_BROADCAST;
1885         else if (ipv4_is_multicast(fl4->daddr))
1886                 type = RTN_MULTICAST;
1887         else if (ipv4_is_zeronet(fl4->daddr))
1888                 return ERR_PTR(-EINVAL);
1889
1890         if (dev_out->flags & IFF_LOOPBACK)
1891                 flags |= RTCF_LOCAL;
1892
1893         do_cache = true;
1894         if (type == RTN_BROADCAST) {
1895                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1896                 fi = NULL;
1897         } else if (type == RTN_MULTICAST) {
1898                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1899                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1900                                      fl4->flowi4_proto))
1901                         flags &= ~RTCF_LOCAL;
1902                 else
1903                         do_cache = false;
1904                 /* If a multicast route does not exist, use
1905                  * the default one, but do not gateway in this case.
1906                  * Yes, it is a hack.
1907                  */
1908                 if (fi && res->prefixlen < 4)
1909                         fi = NULL;
1910         }
1911
1912         fnhe = NULL;
1913         do_cache &= fi != NULL;
1914         if (do_cache) {
1915                 struct rtable __rcu **prth;
1916                 struct fib_nh *nh = &FIB_RES_NH(*res);
1917
1918                 fnhe = find_exception(nh, fl4->daddr);
1919                 if (fnhe)
1920                         prth = &fnhe->fnhe_rth_output;
1921                 else {
1922                         if (unlikely(fl4->flowi4_flags &
1923                                      FLOWI_FLAG_KNOWN_NH &&
1924                                      !(nh->nh_gw &&
1925                                        nh->nh_scope == RT_SCOPE_LINK))) {
1926                                 do_cache = false;
1927                                 goto add;
1928                         }
1929                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1930                 }
1931                 rth = rcu_dereference(*prth);
1932                 if (rt_cache_valid(rth)) {
1933                         dst_hold(&rth->dst);
1934                         return rth;
1935                 }
1936         }
1937
1938 add:
1939         rth = rt_dst_alloc(dev_out,
1940                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1941                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1942                            do_cache);
1943         if (!rth)
1944                 return ERR_PTR(-ENOBUFS);
1945
1946         rth->dst.output = ip_output;
1947
1948         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1949         rth->rt_flags   = flags;
1950         rth->rt_type    = type;
1951         rth->rt_is_input = 0;
1952         rth->rt_iif     = orig_oif ? : 0;
1953         rth->rt_pmtu    = 0;
1954         rth->rt_gateway = 0;
1955         rth->rt_uses_gateway = 0;
1956         INIT_LIST_HEAD(&rth->rt_uncached);
1957
1958         RT_CACHE_STAT_INC(out_slow_tot);
1959
1960         if (flags & RTCF_LOCAL)
1961                 rth->dst.input = ip_local_deliver;
1962         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1963                 if (flags & RTCF_LOCAL &&
1964                     !(dev_out->flags & IFF_LOOPBACK)) {
1965                         rth->dst.output = ip_mc_output;
1966                         RT_CACHE_STAT_INC(out_slow_mc);
1967                 }
1968 #ifdef CONFIG_IP_MROUTE
1969                 if (type == RTN_MULTICAST) {
1970                         if (IN_DEV_MFORWARD(in_dev) &&
1971                             !ipv4_is_local_multicast(fl4->daddr)) {
1972                                 rth->dst.input = ip_mr_input;
1973                                 rth->dst.output = ip_mc_output;
1974                         }
1975                 }
1976 #endif
1977         }
1978
1979         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1980
1981         return rth;
1982 }
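
/*
 * Illustrative sketch (compiled out, a fragment rather than a complete
 * function): the output caching policy of __mkroute_output() above,
 * condensed.  A nexthop exception (fnhe), created by ICMP redirects or
 * PMTU updates, carries its own cached route; otherwise a per-cpu route
 * hanging off the nexthop is reused while rt_cache_valid() holds.
 */
#if 0
        prth = fnhe ? &fnhe->fnhe_rth_output
                    : __this_cpu_ptr(nh->nh_pcpu_rth_output);
        rth = rcu_dereference(*prth);
        if (rt_cache_valid(rth)) {
                dst_hold(&rth->dst);    /* cache hit: no allocation */
                return rth;
        }
        /* else fall through to rt_dst_alloc() and rt_set_nexthop() */
#endif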
1983
1984 /*
1985  * Major route resolver routine.
1986  */
1987
1988 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1989 {
1990         struct net_device *dev_out = NULL;
1991         __u8 tos = RT_FL_TOS(fl4);
1992         unsigned int flags = 0;
1993         struct fib_result res;
1994         struct rtable *rth;
1995         int orig_oif;
1996
1997         res.tclassid    = 0;
1998         res.fi          = NULL;
1999         res.table       = NULL;
2000
2001         orig_oif = fl4->flowi4_oif;
2002
2003         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2004         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2005         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2006                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2007
2008         rcu_read_lock();
2009         if (fl4->saddr) {
2010                 rth = ERR_PTR(-EINVAL);
2011                 if (ipv4_is_multicast(fl4->saddr) ||
2012                     ipv4_is_lbcast(fl4->saddr) ||
2013                     ipv4_is_zeronet(fl4->saddr))
2014                         goto out;
2015
2016                 /* I removed the check for oif == dev_out->oif here.
2017                    It was wrong for two reasons:
2018                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2019                       is assigned to multiple interfaces.
2020                    2. Moreover, we are allowed to send packets with the saddr
2021                       of another iface. --ANK
2022                  */
2023
2024                 if (fl4->flowi4_oif == 0 &&
2025                     (ipv4_is_multicast(fl4->daddr) ||
2026                      ipv4_is_lbcast(fl4->daddr))) {
2027                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2028                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2029                         if (dev_out == NULL)
2030                                 goto out;
2031
2032                         /* Special hack: the user can direct multicasts
2033                            and limited broadcast via the necessary interface
2034                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2035                            This hack is not just for fun, it allows
2036                            vic, vat and friends to work.
2037                            They bind a socket to loopback, set ttl to zero
2038                            and expect that it will work.
2039                            From the viewpoint of the routing cache they are broken,
2040                            because we are not allowed to build a multicast path
2041                            with a loopback source addr (look: the routing cache
2042                            cannot know that ttl is zero, so the packet
2043                            will not leave this host and the route is valid).
2044                            Luckily, this hack is a good workaround.
2045                          */
2046
2047                         fl4->flowi4_oif = dev_out->ifindex;
2048                         goto make_route;
2049                 }
2050
2051                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2052                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2053                         if (!__ip_dev_find(net, fl4->saddr, false))
2054                                 goto out;
2055                 }
2056         }
2057
2058
2059         if (fl4->flowi4_oif) {
2060                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2061                 rth = ERR_PTR(-ENODEV);
2062                 if (dev_out == NULL)
2063                         goto out;
2064
2065                 /* RACE: Check return value of inet_select_addr instead. */
2066                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2067                         rth = ERR_PTR(-ENETUNREACH);
2068                         goto out;
2069                 }
2070                 if (ipv4_is_local_multicast(fl4->daddr) ||
2071                     ipv4_is_lbcast(fl4->daddr)) {
2072                         if (!fl4->saddr)
2073                                 fl4->saddr = inet_select_addr(dev_out, 0,
2074                                                               RT_SCOPE_LINK);
2075                         goto make_route;
2076                 }
2077                 if (!fl4->saddr) {
2078                         if (ipv4_is_multicast(fl4->daddr))
2079                                 fl4->saddr = inet_select_addr(dev_out, 0,
2080                                                               fl4->flowi4_scope);
2081                         else if (!fl4->daddr)
2082                                 fl4->saddr = inet_select_addr(dev_out, 0,
2083                                                               RT_SCOPE_HOST);
2084                 }
2085         }
2086
2087         if (!fl4->daddr) {
2088                 fl4->daddr = fl4->saddr;
2089                 if (!fl4->daddr)
2090                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2091                 dev_out = net->loopback_dev;
2092                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2093                 res.type = RTN_LOCAL;
2094                 flags |= RTCF_LOCAL;
2095                 goto make_route;
2096         }
2097
2098         if (fib_lookup(net, fl4, &res)) {
2099                 res.fi = NULL;
2100                 res.table = NULL;
2101                 if (fl4->flowi4_oif) {
2102                         /* Apparently, the routing tables are wrong. Assume
2103                            that the destination is on-link.
2104
2105                            WHY? DW.
2106                            Because we are allowed to send to an iface
2107                            even if it has NO routes and NO assigned
2108                            addresses. When oif is specified, the routing
2109                            tables are looked up with only one purpose:
2110                            to catch whether the destination is gatewayed,
2111                            rather than direct. Moreover, if MSG_DONTROUTE is
2112                            set, we send the packet, ignoring both the routing
2113                            tables and the ifaddr state. --ANK
2114
2115
2116                            We could do this even when oif is unknown
2117                            (IPv6 likely does), but we do not.
2118                          */
2119
2120                         if (fl4->saddr == 0)
2121                                 fl4->saddr = inet_select_addr(dev_out, 0,
2122                                                               RT_SCOPE_LINK);
2123                         res.type = RTN_UNICAST;
2124                         goto make_route;
2125                 }
2126                 rth = ERR_PTR(-ENETUNREACH);
2127                 goto out;
2128         }
2129
2130         if (res.type == RTN_LOCAL) {
2131                 if (!fl4->saddr) {
2132                         if (res.fi->fib_prefsrc)
2133                                 fl4->saddr = res.fi->fib_prefsrc;
2134                         else
2135                                 fl4->saddr = fl4->daddr;
2136                 }
2137                 dev_out = net->loopback_dev;
2138                 fl4->flowi4_oif = dev_out->ifindex;
2139                 flags |= RTCF_LOCAL;
2140                 goto make_route;
2141         }
2142
2143 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2144         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2145                 fib_select_multipath(&res);
2146         else
2147 #endif
2148         if (!res.prefixlen &&
2149             res.table->tb_num_default > 1 &&
2150             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2151                 fib_select_default(&res);
2152
2153         if (!fl4->saddr)
2154                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2155
2156         dev_out = FIB_RES_DEV(res);
2157         fl4->flowi4_oif = dev_out->ifindex;
2158
2159
2160 make_route:
2161         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2162
2163 out:
2164         rcu_read_unlock();
2165         return rth;
2166 }
2167 EXPORT_SYMBOL_GPL(__ip_route_output_key);
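
/*
 * Example (illustrative sketch, compiled out; the helper name is made
 * up): a minimal output lookup.  Key fields left zero (saddr, oif) are
 * filled in by the resolver above.
 */
#if 0
static struct rtable *example_output_lookup(struct net *net, __be32 daddr)
{
        struct flowi4 fl4;

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = daddr;      /* saddr/oif left 0: auto-selected */

        /* Returns an ERR_PTR() such as -ENETUNREACH on failure,
         * never NULL. */
        return __ip_route_output_key(net, &fl4);
}
#endif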
2168
2169 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2170 {
2171         return NULL;
2172 }
2173
2174 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2175 {
2176         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2177
2178         return mtu ? : dst->dev->mtu;
2179 }
2180
2181 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2182                                           struct sk_buff *skb, u32 mtu)
2183 {
2184 }
2185
2186 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2187                                        struct sk_buff *skb)
2188 {
2189 }
2190
2191 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2192                                           unsigned long old)
2193 {
2194         return NULL;
2195 }
2196
2197 static struct dst_ops ipv4_dst_blackhole_ops = {
2198         .family                 =       AF_INET,
2199         .protocol               =       cpu_to_be16(ETH_P_IP),
2200         .check                  =       ipv4_blackhole_dst_check,
2201         .mtu                    =       ipv4_blackhole_mtu,
2202         .default_advmss         =       ipv4_default_advmss,
2203         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2204         .redirect               =       ipv4_rt_blackhole_redirect,
2205         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2206         .neigh_lookup           =       ipv4_neigh_lookup,
2207 };
2208
2209 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2210 {
2211         struct rtable *ort = (struct rtable *) dst_orig;
2212         struct rtable *rt;
2213
2214         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2215         if (rt) {
2216                 struct dst_entry *new = &rt->dst;
2217
2218                 new->__use = 1;
2219                 new->input = dst_discard;
2220                 new->output = dst_discard_sk;
2221
2222                 new->dev = ort->dst.dev;
2223                 if (new->dev)
2224                         dev_hold(new->dev);
2225
2226                 rt->rt_is_input = ort->rt_is_input;
2227                 rt->rt_iif = ort->rt_iif;
2228                 rt->rt_pmtu = ort->rt_pmtu;
2229
2230                 rt->rt_genid = rt_genid_ipv4(net);
2231                 rt->rt_flags = ort->rt_flags;
2232                 rt->rt_type = ort->rt_type;
2233                 rt->rt_gateway = ort->rt_gateway;
2234                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2235
2236                 INIT_LIST_HEAD(&rt->rt_uncached);
2237
2238                 dst_free(new);
2239         }
2240
2241         dst_release(dst_orig);
2242
2243         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2244 }
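
/*
 * Context (illustrative fragment, compiled out): xfrm_lookup() is one
 * caller of ipv4_blackhole_route(), swapping in this dst while IPsec
 * SAs are still being negotiated so packets are silently discarded
 * rather than sent unprotected.
 */
#if 0
        dst = ipv4_blackhole_route(net, dst_orig);
        /* dst->output is dst_discard_sk and ->check() always returns
         * NULL, so the next dst_check() forces a fresh route lookup. */
#endif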
2245
2246 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2247                                     struct sock *sk)
2248 {
2249         struct rtable *rt = __ip_route_output_key(net, flp4);
2250
2251         if (IS_ERR(rt))
2252                 return rt;
2253
2254         if (flp4->flowi4_proto)
2255                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2256                                                    flowi4_to_flowi(flp4),
2257                                                    sk, 0);
2258
2259         return rt;
2260 }
2261 EXPORT_SYMBOL_GPL(ip_route_output_flow);
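
/*
 * Example (illustrative sketch, compiled out; the helper name is made
 * up): an output lookup that also consults xfrm policy, because
 * flowi4_proto is non-zero and ip_route_output_flow() above then calls
 * xfrm_lookup().
 */
#if 0
static struct rtable *example_udp_route(struct net *net, struct sock *sk,
                                        __be32 daddr, __be16 dport)
{
        struct flowi4 fl4;

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr        = daddr;
        fl4.flowi4_proto = IPPROTO_UDP;
        fl4.fl4_dport    = dport;       /* lets xfrm match port selectors */

        return ip_route_output_flow(net, &fl4, sk);     /* ERR_PTR() on error */
}
#endif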
2262
2263 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2264                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2265                         u32 seq, int event, int nowait, unsigned int flags)
2266 {
2267         struct rtable *rt = skb_rtable(skb);
2268         struct rtmsg *r;
2269         struct nlmsghdr *nlh;
2270         unsigned long expires = 0;
2271         u32 error;
2272         u32 metrics[RTAX_MAX];
2273
2274         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2275         if (nlh == NULL)
2276                 return -EMSGSIZE;
2277
2278         r = nlmsg_data(nlh);
2279         r->rtm_family    = AF_INET;
2280         r->rtm_dst_len  = 32;
2281         r->rtm_src_len  = 0;
2282         r->rtm_tos      = fl4->flowi4_tos;
2283         r->rtm_table    = RT_TABLE_MAIN;
2284         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2285                 goto nla_put_failure;
2286         r->rtm_type     = rt->rt_type;
2287         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2288         r->rtm_protocol = RTPROT_UNSPEC;
2289         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2290         if (rt->rt_flags & RTCF_NOTIFY)
2291                 r->rtm_flags |= RTM_F_NOTIFY;
2292
2293         if (nla_put_be32(skb, RTA_DST, dst))
2294                 goto nla_put_failure;
2295         if (src) {
2296                 r->rtm_src_len = 32;
2297                 if (nla_put_be32(skb, RTA_SRC, src))
2298                         goto nla_put_failure;
2299         }
2300         if (rt->dst.dev &&
2301             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2302                 goto nla_put_failure;
2303 #ifdef CONFIG_IP_ROUTE_CLASSID
2304         if (rt->dst.tclassid &&
2305             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2306                 goto nla_put_failure;
2307 #endif
2308         if (!rt_is_input_route(rt) &&
2309             fl4->saddr != src) {
2310                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2311                         goto nla_put_failure;
2312         }
2313         if (rt->rt_uses_gateway &&
2314             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2315                 goto nla_put_failure;
2316
2317         expires = rt->dst.expires;
2318         if (expires) {
2319                 unsigned long now = jiffies;
2320
2321                 if (time_before(now, expires))
2322                         expires -= now;
2323                 else
2324                         expires = 0;
2325         }
2326
2327         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2328         if (rt->rt_pmtu && expires)
2329                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2330         if (rtnetlink_put_metrics(skb, metrics) < 0)
2331                 goto nla_put_failure;
2332
2333         if (fl4->flowi4_mark &&
2334             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2335                 goto nla_put_failure;
2336
2337         error = rt->dst.error;
2338
2339         if (rt_is_input_route(rt)) {
2340 #ifdef CONFIG_IP_MROUTE
2341                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2342                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2343                         int err = ipmr_get_route(net, skb,
2344                                                  fl4->saddr, fl4->daddr,
2345                                                  r, nowait);
2346                         if (err <= 0) {
2347                                 if (!nowait) {
2348                                         if (err == 0)
2349                                                 return 0;
2350                                         goto nla_put_failure;
2351                                 } else {
2352                                         if (err == -EMSGSIZE)
2353                                                 goto nla_put_failure;
2354                                         error = err;
2355                                 }
2356                         }
2357                 } else
2358 #endif
2359                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2360                                 goto nla_put_failure;
2361         }
2362
2363         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2364                 goto nla_put_failure;
2365
2366         return nlmsg_end(skb, nlh);
2367
2368 nla_put_failure:
2369         nlmsg_cancel(skb, nlh);
2370         return -EMSGSIZE;
2371 }
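
/*
 * Userspace view (illustrative sketch; helper name made up): the
 * RTM_NEWROUTE message built by rt_fill_info() is parsed by walking
 * the rtattrs that follow the rtmsg header.  Standard uapi macros
 * only; error handling omitted.
 */
#if 0
#include <linux/rtnetlink.h>

static void example_parse_route_reply(struct nlmsghdr *nlh)
{
        struct rtmsg *r = NLMSG_DATA(nlh);
        struct rtattr *rta = RTM_RTA(r);
        int len = RTM_PAYLOAD(nlh);

        for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
                switch (rta->rta_type) {
                case RTA_DST:           /* __be32, always present */
                case RTA_GATEWAY:       /* only if rt_uses_gateway */
                case RTA_OIF:           /* u32 ifindex */
                case RTA_IIF:           /* input routes only */
                        break;          /* consume as needed */
                }
        }
}
#endif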
2372
2373 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2374 {
2375         struct net *net = sock_net(in_skb->sk);
2376         struct rtmsg *rtm;
2377         struct nlattr *tb[RTA_MAX+1];
2378         struct rtable *rt = NULL;
2379         struct flowi4 fl4;
2380         __be32 dst = 0;
2381         __be32 src = 0;
2382         u32 iif;
2383         int err;
2384         int mark;
2385         struct sk_buff *skb;
2386
2387         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2388         if (err < 0)
2389                 goto errout;
2390
2391         rtm = nlmsg_data(nlh);
2392
2393         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2394         if (skb == NULL) {
2395                 err = -ENOBUFS;
2396                 goto errout;
2397         }
2398
2399         /* Reserve room for dummy headers; this skb can pass
2400            through a good chunk of the routing engine.
2401          */
2402         skb_reset_mac_header(skb);
2403         skb_reset_network_header(skb);
2404
2405         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2406         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2407         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2408
2409         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2410         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2411         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2412         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2413
2414         memset(&fl4, 0, sizeof(fl4));
2415         fl4.daddr = dst;
2416         fl4.saddr = src;
2417         fl4.flowi4_tos = rtm->rtm_tos;
2418         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2419         fl4.flowi4_mark = mark;
2420
2421         if (iif) {
2422                 struct net_device *dev;
2423
2424                 dev = __dev_get_by_index(net, iif);
2425                 if (dev == NULL) {
2426                         err = -ENODEV;
2427                         goto errout_free;
2428                 }
2429
2430                 skb->protocol   = htons(ETH_P_IP);
2431                 skb->dev        = dev;
2432                 skb->mark       = mark;
2433                 local_bh_disable();
2434                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2435                 local_bh_enable();
2436
2437                 rt = skb_rtable(skb);
2438                 if (err == 0 && rt->dst.error)
2439                         err = -rt->dst.error;
2440         } else {
2441                 rt = ip_route_output_key(net, &fl4);
2442
2443                 err = 0;
2444                 if (IS_ERR(rt))
2445                         err = PTR_ERR(rt);
2446         }
2447
2448         if (err)
2449                 goto errout_free;
2450
2451         skb_dst_set(skb, &rt->dst);
2452         if (rtm->rtm_flags & RTM_F_NOTIFY)
2453                 rt->rt_flags |= RTCF_NOTIFY;
2454
2455         err = rt_fill_info(net, dst, src, &fl4, skb,
2456                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2457                            RTM_NEWROUTE, 0, 0);
2458         if (err <= 0)
2459                 goto errout_free;
2460
2461         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2462 errout:
2463         return err;
2464
2465 errout_free:
2466         kfree_skb(skb);
2467         goto errout;
2468 }
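
/*
 * Userspace view (illustrative sketch; helper name made up): the query
 * that inet_rtm_getroute() answers; "ip route get" sends the same
 * message.  fd is a socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)
 * socket; error handling omitted.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int example_route_get(int fd, __be32 daddr)
{
        struct {
                struct nlmsghdr nlh;
                struct rtmsg    rtm;
                char            attrs[64];
        } req;
        struct rtattr *rta;

        memset(&req, 0, sizeof(req));
        req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
        req.nlh.nlmsg_type  = RTM_GETROUTE;
        req.nlh.nlmsg_flags = NLM_F_REQUEST;
        req.rtm.rtm_family  = AF_INET;

        /* Append an RTA_DST attribute holding the destination. */
        rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
        rta->rta_type = RTA_DST;
        rta->rta_len  = RTA_LENGTH(sizeof(daddr));
        memcpy(RTA_DATA(rta), &daddr, sizeof(daddr));
        req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;

        return send(fd, &req, req.nlh.nlmsg_len, 0);
}
#endif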
2469
2470 void ip_rt_multicast_event(struct in_device *in_dev)
2471 {
2472         rt_cache_flush(dev_net(in_dev->dev));
2473 }
2474
2475 #ifdef CONFIG_SYSCTL
2476 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2477 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2478 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2479 static int ip_rt_gc_elasticity __read_mostly    = 8;
2480
2481 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2482                                         void __user *buffer,
2483                                         size_t *lenp, loff_t *ppos)
2484 {
2485         struct net *net = (struct net *)__ctl->extra1;
2486
2487         if (write) {
2488                 rt_cache_flush(net);
2489                 fnhe_genid_bump(net);
2490                 return 0;
2491         }
2492
2493         return -EINVAL;
2494 }
2495
2496 static struct ctl_table ipv4_route_table[] = {
2497         {
2498                 .procname       = "gc_thresh",
2499                 .data           = &ipv4_dst_ops.gc_thresh,
2500                 .maxlen         = sizeof(int),
2501                 .mode           = 0644,
2502                 .proc_handler   = proc_dointvec,
2503         },
2504         {
2505                 .procname       = "max_size",
2506                 .data           = &ip_rt_max_size,
2507                 .maxlen         = sizeof(int),
2508                 .mode           = 0644,
2509                 .proc_handler   = proc_dointvec,
2510         },
2511         {
2512                 /*  Deprecated. Use gc_min_interval_ms */
2513
2514                 .procname       = "gc_min_interval",
2515                 .data           = &ip_rt_gc_min_interval,
2516                 .maxlen         = sizeof(int),
2517                 .mode           = 0644,
2518                 .proc_handler   = proc_dointvec_jiffies,
2519         },
2520         {
2521                 .procname       = "gc_min_interval_ms",
2522                 .data           = &ip_rt_gc_min_interval,
2523                 .maxlen         = sizeof(int),
2524                 .mode           = 0644,
2525                 .proc_handler   = proc_dointvec_ms_jiffies,
2526         },
2527         {
2528                 .procname       = "gc_timeout",
2529                 .data           = &ip_rt_gc_timeout,
2530                 .maxlen         = sizeof(int),
2531                 .mode           = 0644,
2532                 .proc_handler   = proc_dointvec_jiffies,
2533         },
2534         {
2535                 .procname       = "gc_interval",
2536                 .data           = &ip_rt_gc_interval,
2537                 .maxlen         = sizeof(int),
2538                 .mode           = 0644,
2539                 .proc_handler   = proc_dointvec_jiffies,
2540         },
2541         {
2542                 .procname       = "redirect_load",
2543                 .data           = &ip_rt_redirect_load,
2544                 .maxlen         = sizeof(int),
2545                 .mode           = 0644,
2546                 .proc_handler   = proc_dointvec,
2547         },
2548         {
2549                 .procname       = "redirect_number",
2550                 .data           = &ip_rt_redirect_number,
2551                 .maxlen         = sizeof(int),
2552                 .mode           = 0644,
2553                 .proc_handler   = proc_dointvec,
2554         },
2555         {
2556                 .procname       = "redirect_silence",
2557                 .data           = &ip_rt_redirect_silence,
2558                 .maxlen         = sizeof(int),
2559                 .mode           = 0644,
2560                 .proc_handler   = proc_dointvec,
2561         },
2562         {
2563                 .procname       = "error_cost",
2564                 .data           = &ip_rt_error_cost,
2565                 .maxlen         = sizeof(int),
2566                 .mode           = 0644,
2567                 .proc_handler   = proc_dointvec,
2568         },
2569         {
2570                 .procname       = "error_burst",
2571                 .data           = &ip_rt_error_burst,
2572                 .maxlen         = sizeof(int),
2573                 .mode           = 0644,
2574                 .proc_handler   = proc_dointvec,
2575         },
2576         {
2577                 .procname       = "gc_elasticity",
2578                 .data           = &ip_rt_gc_elasticity,
2579                 .maxlen         = sizeof(int),
2580                 .mode           = 0644,
2581                 .proc_handler   = proc_dointvec,
2582         },
2583         {
2584                 .procname       = "mtu_expires",
2585                 .data           = &ip_rt_mtu_expires,
2586                 .maxlen         = sizeof(int),
2587                 .mode           = 0644,
2588                 .proc_handler   = proc_dointvec_jiffies,
2589         },
2590         {
2591                 .procname       = "min_pmtu",
2592                 .data           = &ip_rt_min_pmtu,
2593                 .maxlen         = sizeof(int),
2594                 .mode           = 0644,
2595                 .proc_handler   = proc_dointvec,
2596         },
2597         {
2598                 .procname       = "min_adv_mss",
2599                 .data           = &ip_rt_min_advmss,
2600                 .maxlen         = sizeof(int),
2601                 .mode           = 0644,
2602                 .proc_handler   = proc_dointvec,
2603         },
2604         { }
2605 };
2606
2607 static struct ctl_table ipv4_route_flush_table[] = {
2608         {
2609                 .procname       = "flush",
2610                 .maxlen         = sizeof(int),
2611                 .mode           = 0200,
2612                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2613         },
2614         { },
2615 };
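
/*
 * Userspace view (illustrative sketch; helper name made up): any write
 * to the "flush" sysctl above invalidates cached routes via
 * rt_cache_flush()/fnhe_genid_bump(); the classic form is
 * "echo 1 > /proc/sys/net/ipv4/route/flush".
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int example_flush_route_cache(void)
{
        int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, "1\n", 2) != 2) {
                close(fd);
                return -1;
        }
        return close(fd);
}
#endif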
2616
2617 static __net_init int sysctl_route_net_init(struct net *net)
2618 {
2619         struct ctl_table *tbl;
2620
2621         tbl = ipv4_route_flush_table;
2622         if (!net_eq(net, &init_net)) {
2623                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2624                 if (tbl == NULL)
2625                         goto err_dup;
2626
2627                 /* Don't export sysctls to unprivileged users */
2628                 if (net->user_ns != &init_user_ns)
2629                         tbl[0].procname = NULL;
2630         }
2631         tbl[0].extra1 = net;
2632
2633         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2634         if (net->ipv4.route_hdr == NULL)
2635                 goto err_reg;
2636         return 0;
2637
2638 err_reg:
2639         if (tbl != ipv4_route_flush_table)
2640                 kfree(tbl);
2641 err_dup:
2642         return -ENOMEM;
2643 }
2644
2645 static __net_exit void sysctl_route_net_exit(struct net *net)
2646 {
2647         struct ctl_table *tbl;
2648
2649         tbl = net->ipv4.route_hdr->ctl_table_arg;
2650         unregister_net_sysctl_table(net->ipv4.route_hdr);
2651         BUG_ON(tbl == ipv4_route_flush_table);
2652         kfree(tbl);
2653 }
2654
2655 static __net_initdata struct pernet_operations sysctl_route_ops = {
2656         .init = sysctl_route_net_init,
2657         .exit = sysctl_route_net_exit,
2658 };
2659 #endif
2660
2661 static __net_init int rt_genid_init(struct net *net)
2662 {
2663         atomic_set(&net->ipv4.rt_genid, 0);
2664         atomic_set(&net->fnhe_genid, 0);
2665         get_random_bytes(&net->ipv4.dev_addr_genid,
2666                          sizeof(net->ipv4.dev_addr_genid));
2667         return 0;
2668 }
2669
2670 static __net_initdata struct pernet_operations rt_genid_ops = {
2671         .init = rt_genid_init,
2672 };
2673
2674 static int __net_init ipv4_inetpeer_init(struct net *net)
2675 {
2676         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2677
2678         if (!bp)
2679                 return -ENOMEM;
2680         inet_peer_base_init(bp);
2681         net->ipv4.peers = bp;
2682         return 0;
2683 }
2684
2685 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2686 {
2687         struct inet_peer_base *bp = net->ipv4.peers;
2688
2689         net->ipv4.peers = NULL;
2690         inetpeer_invalidate_tree(bp);
2691         kfree(bp);
2692 }
2693
2694 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2695         .init   =       ipv4_inetpeer_init,
2696         .exit   =       ipv4_inetpeer_exit,
2697 };
2698
2699 #ifdef CONFIG_IP_ROUTE_CLASSID
2700 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2701 #endif /* CONFIG_IP_ROUTE_CLASSID */
2702
2703 int __init ip_rt_init(void)
2704 {
2705         int rc = 0;
2706
2707 #ifdef CONFIG_IP_ROUTE_CLASSID
2708         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2709         if (!ip_rt_acct)
2710                 panic("IP: failed to allocate ip_rt_acct\n");
2711 #endif
2712
2713         ipv4_dst_ops.kmem_cachep =
2714                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2715                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2716
2717         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2718
2719         if (dst_entries_init(&ipv4_dst_ops) < 0)
2720                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2721
2722         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2723                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2724
2725         ipv4_dst_ops.gc_thresh = ~0;
2726         ip_rt_max_size = INT_MAX;
2727
2728         devinet_init();
2729         ip_fib_init();
2730
2731         if (ip_rt_proc_init())
2732                 pr_err("Unable to create route proc files\n");
2733 #ifdef CONFIG_XFRM
2734         xfrm_init();
2735         xfrm4_init();
2736 #endif
2737         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2738
2739 #ifdef CONFIG_SYSCTL
2740         register_pernet_subsys(&sysctl_route_ops);
2741 #endif
2742         register_pernet_subsys(&rt_genid_ops);
2743         register_pernet_subsys(&ipv4_inetpeer_ops);
2744         return rc;
2745 }
2746
2747 #ifdef CONFIG_SYSCTL
2748 /*
2749  * We really need to sanitize the damn ipv4 init order, then all
2750  * this nonsense will go away.
2751  */
2752 void __init ip_static_sysctl_init(void)
2753 {
2754         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2755 }
2756 #endif