ipv4: add a sock pointer to dst->output() path.
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
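
/* Editorial example (not part of the original source): callers reach this
 * table through rt_tos2priority() in <net/route.h>, which indexes it with
 * the four TOS bits, i.e. ip_tos2prio[IPTOS_TOS(tos) >> 1].  A minimal
 * sketch:
 *
 *      u8 tos = IPTOS_LOWDELAY;                // 0x10
 *      char prio = rt_tos2priority(tos);       // index 8 -> TC_PRIO_INTERACTIVE
 *
 * so low-delay traffic is queued in the interactive pktsched band.
 */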

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.
 * However, we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no way to
 * guarantee the selected ID is unique over a reasonable period of time.
 * But a broken packet identifier is still better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
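
/* Hedged usage sketch (assumed caller shape, cf. ip_select_ident() in
 * <net/ip.h>): for DF packets the IP ID is not needed for reassembly, so
 * a wrapper of roughly this form picks the cheap path first:
 *
 *      if (iph->frag_off & htons(IP_DF))
 *              iph->id = 0;                     // atomic datagram, ID unused
 *      else
 *              __ip_select_ident(iph, dst, 0);  // inetpeer-backed counter
 */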

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}
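
/* Editorial note: skb may be NULL here when a PMTU or redirect update
 * originates from the socket layer rather than from a received packet;
 * in that case the flow key is rebuilt from the connected socket's
 * addresses via build_sk_flow_key() instead of the packet headers.
 */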
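
/* Worked example (illustrative only): the two shifted XORs fold the high
 * bits of the destination address into the low bits before masking, so
 * for daddr = 0xc0a80101 (192.168.1.1):
 *
 *      hval = 0xc0a80101 ^ (0xc0a80101 >> 11) ^ (0xc0a80101 >> 22)
 *
 * and only the low log2(FNHE_HASH_SIZE) bits survive the final mask.
 */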

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent with
 *         exponential backoff; then we stop sending them entirely,
 *         assuming that the host ignores our redirects.
 *      2. If we see no packets requiring redirects for
 *         ip_rt_redirect_silence, we assume that the host has
 *         forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE: do not forget to inhibit load limiting for redirects (it is
 * redundant) and for "frag. needed" (it breaks PMTU discovery) in icmp.c.
 */
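
/* Back-of-the-envelope with the defaults above (editorial, illustrative
 * only): ip_rt_redirect_load is HZ/50 (20ms), so the k-th consecutive
 * redirect is allowed once jiffies pass rate_last + (HZ/50 << k), i.e.
 * after 20ms, 40ms, 80ms, ...  After ip_rt_redirect_number (9) ignored
 * redirects we go silent, and ip_rt_redirect_silence ((HZ/50) << 10,
 * roughly 20s) without redirected packets resets the token count.
 */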

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything and
         * set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP may be disabled on this device; guard against a NULL in_dev
         * before dereferencing it below.
         */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *dst;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        rt = (struct rtable *) __sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !rt) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!__sk_dst_check(sk, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        dst = dst_check(&rt->dst, 0);
        if (!dst) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                __sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

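
/* Hedged usage sketch: a typical caller is a tunnel or ICMP error handler
 * that replays the embedded header of the failed packet, along the lines
 * of (argument values assumed for illustration):
 *
 *      ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 *                       0, 0, IPPROTO_IPIP, 0);
 *
 * where "info" carries the next-hop MTU from the ICMP frag-needed
 * message; the route is re-looked-up with __ip_route_output_key() and the
 * exception stored via update_or_create_fnhe() above.
 */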
        /* All IPv4 dsts are created with ->obsolete set to
         * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
         * into this function in all cases.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL
         * (or to DST_OBSOLETE_DEAD by dst_free()).
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
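
/* Selection order above, restated as a worked example (editorial note):
 * an unexpired exception PMTU (rt_pmtu, e.g. 1400 learned from an ICMP
 * frag-needed) wins; otherwise an explicit RTAX_MTU metric wins; else the
 * device MTU is used, clamped to 576 when the metric is locked and the
 * route goes via a gateway, and never above IP_MAX_MTU.
 */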
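
/* Editorial note: the cmpxchg() above publishes the new route into the
 * nexthop cache only if no other CPU raced us; on success the previous
 * entry is freed through RCU, and on failure the caller falls back to
 * DST_NOCACHE plus the uncached list (see rt_set_nexthop() below).
 */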
                /*
                 *      RFC1812 recommendation: if the source is martian,
                 *      the only hint we can give is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct fib_nh_exception *fnhe;
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
1549                 /* Not IP (e.g. ARP). Do not create a route if it is
1550                  * invalid for proxy arp. DNAT routes are always valid.
1551                  *
1552                  * The proxy arp feature has been extended to allow ARP
1553                  * replies back on the same interface, to support
1554                  * Private VLAN switch technologies. See arp.c.
1555                  */
1556                 if (out_dev == in_dev &&
1557                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1558                         err = -EINVAL;
1559                         goto cleanup;
1560                 }
1561         }
1562
1563         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1564         if (do_cache) {
1565                 if (fnhe != NULL)
1566                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1567                 else
1568                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1569
1570                 if (rt_cache_valid(rth)) {
1571                         skb_dst_set_noref(skb, &rth->dst);
1572                         goto out;
1573                 }
1574         }
1575
1576         rth = rt_dst_alloc(out_dev->dev,
1577                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1578                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1579         if (!rth) {
1580                 err = -ENOBUFS;
1581                 goto cleanup;
1582         }
1583
1584         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1585         rth->rt_flags = flags;
1586         rth->rt_type = res->type;
1587         rth->rt_is_input = 1;
1588         rth->rt_iif     = 0;
1589         rth->rt_pmtu    = 0;
1590         rth->rt_gateway = 0;
1591         rth->rt_uses_gateway = 0;
1592         INIT_LIST_HEAD(&rth->rt_uncached);
1593         RT_CACHE_STAT_INC(in_slow_tot);
1594
1595         rth->dst.input = ip_forward;
1596         rth->dst.output = ip_output;
1597
1598         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1599         skb_dst_set(skb, &rth->dst);
1600 out:
1601         err = 0;
1602  cleanup:
1603         return err;
1604 }
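/* A note on the caching in __mkroute_input() above: when do_cache is set,
 * the new dst is remembered (via rt_set_nexthop()) in the nexthop's
 * nh_rth_input, or in the matching fib_nh_exception's fnhe_rth_input, so
 * subsequent packets hit the rt_cache_valid() fast path and attach the
 * dst reference-free with skb_dst_set_noref(). Redirect candidates
 * (RTCF_DOREDIRECT) and source-tagged lookups (itag != 0) deliberately
 * skip the cache.
 */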
1605
1606 static int ip_mkroute_input(struct sk_buff *skb,
1607                             struct fib_result *res,
1608                             const struct flowi4 *fl4,
1609                             struct in_device *in_dev,
1610                             __be32 daddr, __be32 saddr, u32 tos)
1611 {
1612 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1613         if (res->fi && res->fi->fib_nhs > 1)
1614                 fib_select_multipath(res);
1615 #endif
1616
1617         /* create a routing cache entry */
1618         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1619 }
1620
1621 /*
1622  *      NOTE. We drop all packets that have a local source
1623  *      address, because every properly looped-back packet
1624  *      must already have the correct destination attached by the output routine.
1625  *
1626  *      This approach solves two big problems:
1627  *      1. Non-simplex devices are handled properly.
1628  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1629  *      called with rcu_read_lock()
1630  */
1631
1632 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1633                                u8 tos, struct net_device *dev)
1634 {
1635         struct fib_result res;
1636         struct in_device *in_dev = __in_dev_get_rcu(dev);
1637         struct flowi4   fl4;
1638         unsigned int    flags = 0;
1639         u32             itag = 0;
1640         struct rtable   *rth;
1641         int             err = -EINVAL;
1642         struct net    *net = dev_net(dev);
1643         bool do_cache;
1644
1645         /* IP on this device is disabled. */
1646
1647         if (!in_dev)
1648                 goto out;
1649
1650         /* Check for the weirdest martians, which cannot be detected
1651            by fib_lookup.
1652          */
1653
1654         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1655                 goto martian_source;
1656
1657         res.fi = NULL;
1658         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1659                 goto brd_input;
1660
1661         /* Accept zero addresses only for limited broadcast;
1662          * I do not even know whether to fix this or not. Waiting for complaints :-)
1663          */
1664         if (ipv4_is_zeronet(saddr))
1665                 goto martian_source;
1666
1667         if (ipv4_is_zeronet(daddr))
1668                 goto martian_destination;
1669
1670         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1671          * and calls it at most once, only if daddr and/or saddr is a loopback address
1672          */
1673         if (ipv4_is_loopback(daddr)) {
1674                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1675                         goto martian_destination;
1676         } else if (ipv4_is_loopback(saddr)) {
1677                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1678                         goto martian_source;
1679         }
1680
1681         /*
1682          *      Now we are ready to route the packet.
1683          */
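        /* The input flow key uses iif = the receiving device's ifindex
         * and oif = 0; tos, mark and the addresses come straight from
         * the packet.
         */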
1684         fl4.flowi4_oif = 0;
1685         fl4.flowi4_iif = dev->ifindex;
1686         fl4.flowi4_mark = skb->mark;
1687         fl4.flowi4_tos = tos;
1688         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1689         fl4.daddr = daddr;
1690         fl4.saddr = saddr;
1691         err = fib_lookup(net, &fl4, &res);
1692         if (err != 0) {
1693                 if (!IN_DEV_FORWARD(in_dev))
1694                         err = -EHOSTUNREACH;
1695                 goto no_route;
1696         }
1697
1698         if (res.type == RTN_BROADCAST)
1699                 goto brd_input;
1700
1701         if (res.type == RTN_LOCAL) {
1702                 err = fib_validate_source(skb, saddr, daddr, tos,
1703                                           LOOPBACK_IFINDEX,
1704                                           dev, in_dev, &itag);
1705                 if (err < 0)
1706                         goto martian_source_keep_err;
1707                 goto local_input;
1708         }
1709
1710         if (!IN_DEV_FORWARD(in_dev)) {
1711                 err = -EHOSTUNREACH;
1712                 goto no_route;
1713         }
1714         if (res.type != RTN_UNICAST)
1715                 goto martian_destination;
1716
1717         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1718 out:    return err;
1719
1720 brd_input:
1721         if (skb->protocol != htons(ETH_P_IP))
1722                 goto e_inval;
1723
1724         if (!ipv4_is_zeronet(saddr)) {
1725                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1726                                           in_dev, &itag);
1727                 if (err < 0)
1728                         goto martian_source_keep_err;
1729         }
1730         flags |= RTCF_BROADCAST;
1731         res.type = RTN_BROADCAST;
1732         RT_CACHE_STAT_INC(in_brd);
1733
1734 local_input:
1735         do_cache = false;
1736         if (res.fi) {
1737                 if (!itag) {
1738                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1739                         if (rt_cache_valid(rth)) {
1740                                 skb_dst_set_noref(skb, &rth->dst);
1741                                 err = 0;
1742                                 goto out;
1743                         }
1744                         do_cache = true;
1745                 }
1746         }
1747
1748         rth = rt_dst_alloc(net->loopback_dev,
1749                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1750         if (!rth)
1751                 goto e_nobufs;
1752
1753         rth->dst.input = ip_local_deliver;
1754         rth->dst.output = ip_rt_bug;
1755 #ifdef CONFIG_IP_ROUTE_CLASSID
1756         rth->dst.tclassid = itag;
1757 #endif
1758
1759         rth->rt_genid = rt_genid_ipv4(net);
1760         rth->rt_flags   = flags|RTCF_LOCAL;
1761         rth->rt_type    = res.type;
1762         rth->rt_is_input = 1;
1763         rth->rt_iif     = 0;
1764         rth->rt_pmtu    = 0;
1765         rth->rt_gateway = 0;
1766         rth->rt_uses_gateway = 0;
1767         INIT_LIST_HEAD(&rth->rt_uncached);
1768         RT_CACHE_STAT_INC(in_slow_tot);
1769         if (res.type == RTN_UNREACHABLE) {
1770                 rth->dst.input = ip_error;
1771                 rth->dst.error = -err;
1772                 rth->rt_flags   &= ~RTCF_LOCAL;
1773         }
1774         if (do_cache) {
1775                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1776                         rth->dst.flags |= DST_NOCACHE;
1777                         rt_add_uncached_list(rth);
1778                 }
1779         }
1780         skb_dst_set(skb, &rth->dst);
1781         err = 0;
1782         goto out;
1783
1784 no_route:
1785         RT_CACHE_STAT_INC(in_no_route);
1786         res.type = RTN_UNREACHABLE;
1787         if (err == -ESRCH)
1788                 err = -ENETUNREACH;
1789         goto local_input;
1790
1791         /*
1792          *      Do not cache martian addresses: they should be logged (RFC1812)
1793          */
1794 martian_destination:
1795         RT_CACHE_STAT_INC(in_martian_dst);
1796 #ifdef CONFIG_IP_ROUTE_VERBOSE
1797         if (IN_DEV_LOG_MARTIANS(in_dev))
1798                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1799                                      &daddr, &saddr, dev->name);
1800 #endif
1801
1802 e_inval:
1803         err = -EINVAL;
1804         goto out;
1805
1806 e_nobufs:
1807         err = -ENOBUFS;
1808         goto out;
1809
1810 martian_source:
1811         err = -EINVAL;
1812 martian_source_keep_err:
1813         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1814         goto out;
1815 }
1816
1817 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1818                          u8 tos, struct net_device *dev)
1819 {
1820         int res;
1821
1822         rcu_read_lock();
1823
1824         /* Multicast recognition logic was moved from the route cache
1825            to here. The problem was that too many Ethernet cards have
1826            broken/missing hardware multicast filters :-( As a result, a
1827            host on a multicast network acquires a lot of useless route
1828            cache entries, e.g. for SDR messages from all over the world.
1829            Now we try to get rid of them. Really, provided the software
1830            IP multicast filter is organized reasonably (at least, hashed),
1831            it does not result in a slowdown compared with route cache
1832            reject entries. Note that multicast routers are not affected,
1833            because a route cache entry is created eventually.
1834          */
1835         if (ipv4_is_multicast(daddr)) {
1836                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1837
1838                 if (in_dev) {
1839                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1840                                                   ip_hdr(skb)->protocol);
1841                         if (our
1842 #ifdef CONFIG_IP_MROUTE
1843                                 ||
1844                             (!ipv4_is_local_multicast(daddr) &&
1845                              IN_DEV_MFORWARD(in_dev))
1846 #endif
1847                            ) {
1848                                 int res = ip_route_input_mc(skb, daddr, saddr,
1849                                                             tos, dev, our);
1850                                 rcu_read_unlock();
1851                                 return res;
1852                         }
1853                 }
1854                 rcu_read_unlock();
1855                 return -EINVAL;
1856         }
1857         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1858         rcu_read_unlock();
1859         return res;
1860 }
1861 EXPORT_SYMBOL(ip_route_input_noref);
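/* For illustration, the receive path typically enters here roughly as
 * ip_rcv_finish() does (a sketch with abbreviated error handling; the
 * local names are illustrative):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	return dst_input(skb);	(dispatches to ip_local_deliver,
 *				 ip_forward or ip_mr_input via dst->input)
 */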
1862
1863 /* called with rcu_read_lock() */
1864 static struct rtable *__mkroute_output(const struct fib_result *res,
1865                                        const struct flowi4 *fl4, int orig_oif,
1866                                        struct net_device *dev_out,
1867                                        unsigned int flags)
1868 {
1869         struct fib_info *fi = res->fi;
1870         struct fib_nh_exception *fnhe;
1871         struct in_device *in_dev;
1872         u16 type = res->type;
1873         struct rtable *rth;
1874         bool do_cache;
1875
1876         in_dev = __in_dev_get_rcu(dev_out);
1877         if (!in_dev)
1878                 return ERR_PTR(-EINVAL);
1879
1880         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1881                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1882                         return ERR_PTR(-EINVAL);
1883
1884         if (ipv4_is_lbcast(fl4->daddr))
1885                 type = RTN_BROADCAST;
1886         else if (ipv4_is_multicast(fl4->daddr))
1887                 type = RTN_MULTICAST;
1888         else if (ipv4_is_zeronet(fl4->daddr))
1889                 return ERR_PTR(-EINVAL);
1890
1891         if (dev_out->flags & IFF_LOOPBACK)
1892                 flags |= RTCF_LOCAL;
1893
1894         do_cache = true;
1895         if (type == RTN_BROADCAST) {
1896                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1897                 fi = NULL;
1898         } else if (type == RTN_MULTICAST) {
1899                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1900                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1901                                      fl4->flowi4_proto))
1902                         flags &= ~RTCF_LOCAL;
1903                 else
1904                         do_cache = false;
1905                 /* If a multicast route does not exist, use
1906                  * the default one, but do not gateway in this case.
1907                  * Yes, it is a hack.
1908                  */
1909                 if (fi && res->prefixlen < 4)
1910                         fi = NULL;
1911         }
1912
1913         fnhe = NULL;
1914         do_cache &= fi != NULL;
1915         if (do_cache) {
1916                 struct rtable __rcu **prth;
1917                 struct fib_nh *nh = &FIB_RES_NH(*res);
1918
1919                 fnhe = find_exception(nh, fl4->daddr);
1920                 if (fnhe)
1921                         prth = &fnhe->fnhe_rth_output;
1922                 else {
1923                         if (unlikely(fl4->flowi4_flags &
1924                                      FLOWI_FLAG_KNOWN_NH &&
1925                                      !(nh->nh_gw &&
1926                                        nh->nh_scope == RT_SCOPE_LINK))) {
1927                                 do_cache = false;
1928                                 goto add;
1929                         }
1930                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1931                 }
1932                 rth = rcu_dereference(*prth);
1933                 if (rt_cache_valid(rth)) {
1934                         dst_hold(&rth->dst);
1935                         return rth;
1936                 }
1937         }
1938
1939 add:
1940         rth = rt_dst_alloc(dev_out,
1941                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1942                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1943                            do_cache);
1944         if (!rth)
1945                 return ERR_PTR(-ENOBUFS);
1946
1947         rth->dst.output = ip_output;
1948
1949         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1950         rth->rt_flags   = flags;
1951         rth->rt_type    = type;
1952         rth->rt_is_input = 0;
1953         rth->rt_iif     = orig_oif ? : 0;
1954         rth->rt_pmtu    = 0;
1955         rth->rt_gateway = 0;
1956         rth->rt_uses_gateway = 0;
1957         INIT_LIST_HEAD(&rth->rt_uncached);
1958
1959         RT_CACHE_STAT_INC(out_slow_tot);
1960
1961         if (flags & RTCF_LOCAL)
1962                 rth->dst.input = ip_local_deliver;
1963         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1964                 if (flags & RTCF_LOCAL &&
1965                     !(dev_out->flags & IFF_LOOPBACK)) {
1966                         rth->dst.output = ip_mc_output;
1967                         RT_CACHE_STAT_INC(out_slow_mc);
1968                 }
1969 #ifdef CONFIG_IP_MROUTE
1970                 if (type == RTN_MULTICAST) {
1971                         if (IN_DEV_MFORWARD(in_dev) &&
1972                             !ipv4_is_local_multicast(fl4->daddr)) {
1973                                 rth->dst.input = ip_mr_input;
1974                                 rth->dst.output = ip_mc_output;
1975                         }
1976                 }
1977 #endif
1978         }
1979
1980         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1981
1982         return rth;
1983 }
1984
1985 /*
1986  * Major route resolver routine.
1987  */
1988
1989 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1990 {
1991         struct net_device *dev_out = NULL;
1992         __u8 tos = RT_FL_TOS(fl4);
1993         unsigned int flags = 0;
1994         struct fib_result res;
1995         struct rtable *rth;
1996         int orig_oif;
1997
1998         res.tclassid    = 0;
1999         res.fi          = NULL;
2000         res.table       = NULL;
2001
2002         orig_oif = fl4->flowi4_oif;
2003
2004         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2005         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2006         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2007                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2008
2009         rcu_read_lock();
2010         if (fl4->saddr) {
2011                 rth = ERR_PTR(-EINVAL);
2012                 if (ipv4_is_multicast(fl4->saddr) ||
2013                     ipv4_is_lbcast(fl4->saddr) ||
2014                     ipv4_is_zeronet(fl4->saddr))
2015                         goto out;
2016
2017                 /* I removed the check for oif == dev_out->oif here.
2018                    It was wrong for two reasons:
2019                    1. ip_dev_find(net, saddr) can return the wrong iface if
2020                       saddr is assigned to multiple interfaces.
2021                    2. Moreover, we are allowed to send packets with the
2022                       saddr of another iface. --ANK
2023                  */
2024
2025                 if (fl4->flowi4_oif == 0 &&
2026                     (ipv4_is_multicast(fl4->daddr) ||
2027                      ipv4_is_lbcast(fl4->daddr))) {
2028                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2029                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2030                         if (dev_out == NULL)
2031                                 goto out;
2032
2033                         /* Special hack: the user can direct multicasts
2034                            and limited broadcast via the necessary interface
2035                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2036                            This hack is not just for fun, it allows
2037                            vic, vat and friends to work.
2038                            They bind a socket to loopback, set ttl to zero
2039                            and expect that it will work.
2040                            From the viewpoint of the routing cache they are
2041                            broken, because we are not allowed to build a
2042                            multicast path with a loopback source addr (look,
2043                            the routing cache cannot know that ttl is zero, so
2044                            the packet will not leave this host and the route
2045                            is valid). Luckily, this hack is a good workaround.
2046                          */
2047
2048                         fl4->flowi4_oif = dev_out->ifindex;
2049                         goto make_route;
2050                 }
2051
2052                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2053                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2054                         if (!__ip_dev_find(net, fl4->saddr, false))
2055                                 goto out;
2056                 }
2057         }
2058
2059
2060         if (fl4->flowi4_oif) {
2061                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2062                 rth = ERR_PTR(-ENODEV);
2063                 if (dev_out == NULL)
2064                         goto out;
2065
2066                 /* RACE: Check return value of inet_select_addr instead. */
2067                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2068                         rth = ERR_PTR(-ENETUNREACH);
2069                         goto out;
2070                 }
2071                 if (ipv4_is_local_multicast(fl4->daddr) ||
2072                     ipv4_is_lbcast(fl4->daddr)) {
2073                         if (!fl4->saddr)
2074                                 fl4->saddr = inet_select_addr(dev_out, 0,
2075                                                               RT_SCOPE_LINK);
2076                         goto make_route;
2077                 }
2078                 if (!fl4->saddr) {
2079                         if (ipv4_is_multicast(fl4->daddr))
2080                                 fl4->saddr = inet_select_addr(dev_out, 0,
2081                                                               fl4->flowi4_scope);
2082                         else if (!fl4->daddr)
2083                                 fl4->saddr = inet_select_addr(dev_out, 0,
2084                                                               RT_SCOPE_HOST);
2085                 }
2086         }
2087
2088         if (!fl4->daddr) {
2089                 fl4->daddr = fl4->saddr;
2090                 if (!fl4->daddr)
2091                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2092                 dev_out = net->loopback_dev;
2093                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2094                 res.type = RTN_LOCAL;
2095                 flags |= RTCF_LOCAL;
2096                 goto make_route;
2097         }
2098
2099         if (fib_lookup(net, fl4, &res)) {
2100                 res.fi = NULL;
2101                 res.table = NULL;
2102                 if (fl4->flowi4_oif) {
2103                         /* Apparently, the routing tables are wrong.
2104                            Assume that the destination is on-link.
2105
2106                            WHY? DW.
2107                            Because we are allowed to send to an iface
2108                            even if it has NO routes and NO assigned
2109                            addresses. When oif is specified, routing
2110                            tables are looked up with only one purpose:
2111                            to check whether the destination is gatewayed,
2112                            rather than direct. Moreover, if MSG_DONTROUTE
2113                            is set, we send the packet, ignoring both the
2114                            routing tables and the ifaddr state. --ANK
2115
2116
2117                            We could do this even when oif is unknown
2118                            (as IPv6 likely does), but we do not.
2119                          */
2120
2121                         if (fl4->saddr == 0)
2122                                 fl4->saddr = inet_select_addr(dev_out, 0,
2123                                                               RT_SCOPE_LINK);
2124                         res.type = RTN_UNICAST;
2125                         goto make_route;
2126                 }
2127                 rth = ERR_PTR(-ENETUNREACH);
2128                 goto out;
2129         }
2130
2131         if (res.type == RTN_LOCAL) {
2132                 if (!fl4->saddr) {
2133                         if (res.fi->fib_prefsrc)
2134                                 fl4->saddr = res.fi->fib_prefsrc;
2135                         else
2136                                 fl4->saddr = fl4->daddr;
2137                 }
2138                 dev_out = net->loopback_dev;
2139                 fl4->flowi4_oif = dev_out->ifindex;
2140                 flags |= RTCF_LOCAL;
2141                 goto make_route;
2142         }
2143
2144 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2145         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2146                 fib_select_multipath(&res);
2147         else
2148 #endif
2149         if (!res.prefixlen &&
2150             res.table->tb_num_default > 1 &&
2151             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2152                 fib_select_default(&res);
2153
2154         if (!fl4->saddr)
2155                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2156
2157         dev_out = FIB_RES_DEV(res);
2158         fl4->flowi4_oif = dev_out->ifindex;
2159
2160
2161 make_route:
2162         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2163
2164 out:
2165         rcu_read_unlock();
2166         return rth;
2167 }
2168 EXPORT_SYMBOL_GPL(__ip_route_output_key);
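/* Note that __ip_route_output_key() performs the bare FIB lookup and does
 * not consult the xfrm layer. Callers typically use the
 * ip_route_output_key()/ip_route_output_flow() wrappers instead (see
 * ip_route_output_flow() below, and inet_rtm_getroute() for an in-file
 * user of ip_route_output_key()).
 */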
2169
2170 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2171 {
2172         return NULL;
2173 }
2174
2175 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2176 {
2177         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2178
2179         return mtu ? : dst->dev->mtu;
2180 }
2181
2182 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2183                                           struct sk_buff *skb, u32 mtu)
2184 {
2185 }
2186
2187 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2188                                        struct sk_buff *skb)
2189 {
2190 }
2191
2192 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2193                                           unsigned long old)
2194 {
2195         return NULL;
2196 }
2197
2198 static struct dst_ops ipv4_dst_blackhole_ops = {
2199         .family                 =       AF_INET,
2200         .protocol               =       cpu_to_be16(ETH_P_IP),
2201         .check                  =       ipv4_blackhole_dst_check,
2202         .mtu                    =       ipv4_blackhole_mtu,
2203         .default_advmss         =       ipv4_default_advmss,
2204         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2205         .redirect               =       ipv4_rt_blackhole_redirect,
2206         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2207         .neigh_lookup           =       ipv4_neigh_lookup,
2208 };
2209
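/* Replace a dst with a "blackhole" copy: the copy preserves the original's
 * device, type, gateway and PMTU information, but the ops above stub out
 * check/update_pmtu/redirect/cow_metrics, so the entry never changes and
 * (with input/output set to the discard routines below) never transmits.
 * This is presumably for callers, such as the xfrm code, that need to park
 * a flow on a harmless dst; that usage is an assumption, as the callers
 * are outside this file.
 */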
2210 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2211 {
2212         struct rtable *ort = (struct rtable *) dst_orig;
2213         struct rtable *rt;
2214
2215         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2216         if (rt) {
2217                 struct dst_entry *new = &rt->dst;
2218
2219                 new->__use = 1;
2220                 new->input = dst_discard;
2221                 new->output = dst_discard_sk;
2222
2223                 new->dev = ort->dst.dev;
2224                 if (new->dev)
2225                         dev_hold(new->dev);
2226
2227                 rt->rt_is_input = ort->rt_is_input;
2228                 rt->rt_iif = ort->rt_iif;
2229                 rt->rt_pmtu = ort->rt_pmtu;
2230
2231                 rt->rt_genid = rt_genid_ipv4(net);
2232                 rt->rt_flags = ort->rt_flags;
2233                 rt->rt_type = ort->rt_type;
2234                 rt->rt_gateway = ort->rt_gateway;
2235                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2236
2237                 INIT_LIST_HEAD(&rt->rt_uncached);
2238
2239                 dst_free(new);
2240         }
2241
2242         dst_release(dst_orig);
2243
2244         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2245 }
2246
2247 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2248                                     struct sock *sk)
2249 {
2250         struct rtable *rt = __ip_route_output_key(net, flp4);
2251
2252         if (IS_ERR(rt))
2253                 return rt;
2254
2255         if (flp4->flowi4_proto)
2256                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2257                                                    flowi4_to_flowi(flp4),
2258                                                    sk, 0);
2259
2260         return rt;
2261 }
2262 EXPORT_SYMBOL_GPL(ip_route_output_flow);
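/* A minimal usage sketch (daddr, saddr, oif and proto are illustrative
 * placeholders; real callers usually fill in more of the flow key):
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = daddr;
 *	fl4.saddr = saddr;		(may be zero; the lookup picks one)
 *	fl4.flowi4_oif = oif;
 *	fl4.flowi4_proto = proto;	(non-zero routes through xfrm_lookup())
 *
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...	(use rt, e.g. via &rt->dst, then release it:)
 *	ip_rt_put(rt);
 */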
2263
2264 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2265                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2266                         u32 seq, int event, int nowait, unsigned int flags)
2267 {
2268         struct rtable *rt = skb_rtable(skb);
2269         struct rtmsg *r;
2270         struct nlmsghdr *nlh;
2271         unsigned long expires = 0;
2272         u32 error;
2273         u32 metrics[RTAX_MAX];
2274
2275         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2276         if (nlh == NULL)
2277                 return -EMSGSIZE;
2278
2279         r = nlmsg_data(nlh);
2280         r->rtm_family    = AF_INET;
2281         r->rtm_dst_len  = 32;
2282         r->rtm_src_len  = 0;
2283         r->rtm_tos      = fl4->flowi4_tos;
2284         r->rtm_table    = RT_TABLE_MAIN;
2285         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2286                 goto nla_put_failure;
2287         r->rtm_type     = rt->rt_type;
2288         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2289         r->rtm_protocol = RTPROT_UNSPEC;
2290         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2291         if (rt->rt_flags & RTCF_NOTIFY)
2292                 r->rtm_flags |= RTM_F_NOTIFY;
2293
2294         if (nla_put_be32(skb, RTA_DST, dst))
2295                 goto nla_put_failure;
2296         if (src) {
2297                 r->rtm_src_len = 32;
2298                 if (nla_put_be32(skb, RTA_SRC, src))
2299                         goto nla_put_failure;
2300         }
2301         if (rt->dst.dev &&
2302             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2303                 goto nla_put_failure;
2304 #ifdef CONFIG_IP_ROUTE_CLASSID
2305         if (rt->dst.tclassid &&
2306             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2307                 goto nla_put_failure;
2308 #endif
2309         if (!rt_is_input_route(rt) &&
2310             fl4->saddr != src) {
2311                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2312                         goto nla_put_failure;
2313         }
2314         if (rt->rt_uses_gateway &&
2315             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2316                 goto nla_put_failure;
2317
2318         expires = rt->dst.expires;
2319         if (expires) {
2320                 unsigned long now = jiffies;
2321
2322                 if (time_before(now, expires))
2323                         expires -= now;
2324                 else
2325                         expires = 0;
2326         }
2327
2328         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2329         if (rt->rt_pmtu && expires)
2330                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2331         if (rtnetlink_put_metrics(skb, metrics) < 0)
2332                 goto nla_put_failure;
2333
2334         if (fl4->flowi4_mark &&
2335             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2336                 goto nla_put_failure;
2337
2338         error = rt->dst.error;
2339
2340         if (rt_is_input_route(rt)) {
2341 #ifdef CONFIG_IP_MROUTE
2342                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2343                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2344                         int err = ipmr_get_route(net, skb,
2345                                                  fl4->saddr, fl4->daddr,
2346                                                  r, nowait);
2347                         if (err <= 0) {
2348                                 if (!nowait) {
2349                                         if (err == 0)
2350                                                 return 0;
2351                                         goto nla_put_failure;
2352                                 } else {
2353                                         if (err == -EMSGSIZE)
2354                                                 goto nla_put_failure;
2355                                         error = err;
2356                                 }
2357                         }
2358                 } else
2359 #endif
2360                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2361                                 goto nla_put_failure;
2362         }
2363
2364         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2365                 goto nla_put_failure;
2366
2367         return nlmsg_end(skb, nlh);
2368
2369 nla_put_failure:
2370         nlmsg_cancel(skb, nlh);
2371         return -EMSGSIZE;
2372 }
2373
2374 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2375 {
2376         struct net *net = sock_net(in_skb->sk);
2377         struct rtmsg *rtm;
2378         struct nlattr *tb[RTA_MAX+1];
2379         struct rtable *rt = NULL;
2380         struct flowi4 fl4;
2381         __be32 dst = 0;
2382         __be32 src = 0;
2383         u32 iif;
2384         int err;
2385         int mark;
2386         struct sk_buff *skb;
2387
2388         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2389         if (err < 0)
2390                 goto errout;
2391
2392         rtm = nlmsg_data(nlh);
2393
2394         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2395         if (skb == NULL) {
2396                 err = -ENOBUFS;
2397                 goto errout;
2398         }
2399
2400         /* Reserve room for dummy headers; this skb can pass
2401            through a good chunk of the routing engine.
2402          */
2403         skb_reset_mac_header(skb);
2404         skb_reset_network_header(skb);
2405
2406         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2407         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2408         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2409
2410         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2411         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2412         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2413         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2414
2415         memset(&fl4, 0, sizeof(fl4));
2416         fl4.daddr = dst;
2417         fl4.saddr = src;
2418         fl4.flowi4_tos = rtm->rtm_tos;
2419         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2420         fl4.flowi4_mark = mark;
2421
2422         if (iif) {
2423                 struct net_device *dev;
2424
2425                 dev = __dev_get_by_index(net, iif);
2426                 if (dev == NULL) {
2427                         err = -ENODEV;
2428                         goto errout_free;
2429                 }
2430
2431                 skb->protocol   = htons(ETH_P_IP);
2432                 skb->dev        = dev;
2433                 skb->mark       = mark;
2434                 local_bh_disable();
2435                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2436                 local_bh_enable();
2437
2438                 rt = skb_rtable(skb);
2439                 if (err == 0 && rt->dst.error)
2440                         err = -rt->dst.error;
2441         } else {
2442                 rt = ip_route_output_key(net, &fl4);
2443
2444                 err = 0;
2445                 if (IS_ERR(rt))
2446                         err = PTR_ERR(rt);
2447         }
2448
2449         if (err)
2450                 goto errout_free;
2451
2452         skb_dst_set(skb, &rt->dst);
2453         if (rtm->rtm_flags & RTM_F_NOTIFY)
2454                 rt->rt_flags |= RTCF_NOTIFY;
2455
2456         err = rt_fill_info(net, dst, src, &fl4, skb,
2457                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2458                            RTM_NEWROUTE, 0, 0);
2459         if (err <= 0)
2460                 goto errout_free;
2461
2462         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2463 errout:
2464         return err;
2465
2466 errout_free:
2467         kfree_skb(skb);
2468         goto errout;
2469 }
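/* inet_rtm_getroute() services RTM_GETROUTE requests, which is what
 * iproute2's "ip route get" issues. Illustrative session (addresses and
 * device invented):
 *
 *	$ ip route get 192.0.2.1
 *	192.0.2.1 via 198.51.100.1 dev eth0  src 198.51.100.2
 *	    cache
 *
 * When RTA_IIF is supplied, the handler instead simulates input routing
 * of a dummy ICMP packet on that interface, as the code above shows.
 */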
2470
2471 void ip_rt_multicast_event(struct in_device *in_dev)
2472 {
2473         rt_cache_flush(dev_net(in_dev->dev));
2474 }
2475
2476 #ifdef CONFIG_SYSCTL
2477 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2478 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2479 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2480 static int ip_rt_gc_elasticity __read_mostly    = 8;
2481
2482 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2483                                         void __user *buffer,
2484                                         size_t *lenp, loff_t *ppos)
2485 {
2486         struct net *net = (struct net *)__ctl->extra1;
2487
2488         if (write) {
2489                 rt_cache_flush(net);
2490                 fnhe_genid_bump(net);
2491                 return 0;
2492         }
2493
2494         return -EINVAL;
2495 }
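/* Any write to the (write-only, see mode 0200 below) flush file triggers
 * the handler above, e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * This invalidates every cached route in the netns by bumping the rt
 * genid, and every cached nexthop exception by bumping the fnhe genid.
 */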
2496
2497 static struct ctl_table ipv4_route_table[] = {
2498         {
2499                 .procname       = "gc_thresh",
2500                 .data           = &ipv4_dst_ops.gc_thresh,
2501                 .maxlen         = sizeof(int),
2502                 .mode           = 0644,
2503                 .proc_handler   = proc_dointvec,
2504         },
2505         {
2506                 .procname       = "max_size",
2507                 .data           = &ip_rt_max_size,
2508                 .maxlen         = sizeof(int),
2509                 .mode           = 0644,
2510                 .proc_handler   = proc_dointvec,
2511         },
2512         {
2513                 /*  Deprecated. Use gc_min_interval_ms */
2514
2515                 .procname       = "gc_min_interval",
2516                 .data           = &ip_rt_gc_min_interval,
2517                 .maxlen         = sizeof(int),
2518                 .mode           = 0644,
2519                 .proc_handler   = proc_dointvec_jiffies,
2520         },
2521         {
2522                 .procname       = "gc_min_interval_ms",
2523                 .data           = &ip_rt_gc_min_interval,
2524                 .maxlen         = sizeof(int),
2525                 .mode           = 0644,
2526                 .proc_handler   = proc_dointvec_ms_jiffies,
2527         },
2528         {
2529                 .procname       = "gc_timeout",
2530                 .data           = &ip_rt_gc_timeout,
2531                 .maxlen         = sizeof(int),
2532                 .mode           = 0644,
2533                 .proc_handler   = proc_dointvec_jiffies,
2534         },
2535         {
2536                 .procname       = "gc_interval",
2537                 .data           = &ip_rt_gc_interval,
2538                 .maxlen         = sizeof(int),
2539                 .mode           = 0644,
2540                 .proc_handler   = proc_dointvec_jiffies,
2541         },
2542         {
2543                 .procname       = "redirect_load",
2544                 .data           = &ip_rt_redirect_load,
2545                 .maxlen         = sizeof(int),
2546                 .mode           = 0644,
2547                 .proc_handler   = proc_dointvec,
2548         },
2549         {
2550                 .procname       = "redirect_number",
2551                 .data           = &ip_rt_redirect_number,
2552                 .maxlen         = sizeof(int),
2553                 .mode           = 0644,
2554                 .proc_handler   = proc_dointvec,
2555         },
2556         {
2557                 .procname       = "redirect_silence",
2558                 .data           = &ip_rt_redirect_silence,
2559                 .maxlen         = sizeof(int),
2560                 .mode           = 0644,
2561                 .proc_handler   = proc_dointvec,
2562         },
2563         {
2564                 .procname       = "error_cost",
2565                 .data           = &ip_rt_error_cost,
2566                 .maxlen         = sizeof(int),
2567                 .mode           = 0644,
2568                 .proc_handler   = proc_dointvec,
2569         },
2570         {
2571                 .procname       = "error_burst",
2572                 .data           = &ip_rt_error_burst,
2573                 .maxlen         = sizeof(int),
2574                 .mode           = 0644,
2575                 .proc_handler   = proc_dointvec,
2576         },
2577         {
2578                 .procname       = "gc_elasticity",
2579                 .data           = &ip_rt_gc_elasticity,
2580                 .maxlen         = sizeof(int),
2581                 .mode           = 0644,
2582                 .proc_handler   = proc_dointvec,
2583         },
2584         {
2585                 .procname       = "mtu_expires",
2586                 .data           = &ip_rt_mtu_expires,
2587                 .maxlen         = sizeof(int),
2588                 .mode           = 0644,
2589                 .proc_handler   = proc_dointvec_jiffies,
2590         },
2591         {
2592                 .procname       = "min_pmtu",
2593                 .data           = &ip_rt_min_pmtu,
2594                 .maxlen         = sizeof(int),
2595                 .mode           = 0644,
2596                 .proc_handler   = proc_dointvec,
2597         },
2598         {
2599                 .procname       = "min_adv_mss",
2600                 .data           = &ip_rt_min_advmss,
2601                 .maxlen         = sizeof(int),
2602                 .mode           = 0644,
2603                 .proc_handler   = proc_dointvec,
2604         },
2605         { }
2606 };
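/* This table is registered under "net/ipv4/route" (see
 * ip_static_sysctl_init() at the bottom of this file), so e.g. gc_thresh
 * appears as /proc/sys/net/ipv4/route/gc_thresh and can be tuned with
 * "sysctl -w net.ipv4.route.gc_thresh=...".
 */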
2607
2608 static struct ctl_table ipv4_route_flush_table[] = {
2609         {
2610                 .procname       = "flush",
2611                 .maxlen         = sizeof(int),
2612                 .mode           = 0200,
2613                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2614         },
2615         { },
2616 };
2617
2618 static __net_init int sysctl_route_net_init(struct net *net)
2619 {
2620         struct ctl_table *tbl;
2621
2622         tbl = ipv4_route_flush_table;
2623         if (!net_eq(net, &init_net)) {
2624                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2625                 if (tbl == NULL)
2626                         goto err_dup;
2627
2628                 /* Don't export sysctls to unprivileged users */
2629                 if (net->user_ns != &init_user_ns)
2630                         tbl[0].procname = NULL;
2631         }
2632         tbl[0].extra1 = net;
2633
2634         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2635         if (net->ipv4.route_hdr == NULL)
2636                 goto err_reg;
2637         return 0;
2638
2639 err_reg:
2640         if (tbl != ipv4_route_flush_table)
2641                 kfree(tbl);
2642 err_dup:
2643         return -ENOMEM;
2644 }
2645
2646 static __net_exit void sysctl_route_net_exit(struct net *net)
2647 {
2648         struct ctl_table *tbl;
2649
2650         tbl = net->ipv4.route_hdr->ctl_table_arg;
2651         unregister_net_sysctl_table(net->ipv4.route_hdr);
2652         BUG_ON(tbl == ipv4_route_flush_table);
2653         kfree(tbl);
2654 }
2655
2656 static __net_initdata struct pernet_operations sysctl_route_ops = {
2657         .init = sysctl_route_net_init,
2658         .exit = sysctl_route_net_exit,
2659 };
2660 #endif
2661
2662 static __net_init int rt_genid_init(struct net *net)
2663 {
2664         atomic_set(&net->ipv4.rt_genid, 0);
2665         atomic_set(&net->fnhe_genid, 0);
2666         get_random_bytes(&net->ipv4.dev_addr_genid,
2667                          sizeof(net->ipv4.dev_addr_genid));
2668         return 0;
2669 }
2670
2671 static __net_initdata struct pernet_operations rt_genid_ops = {
2672         .init = rt_genid_init,
2673 };
2674
2675 static int __net_init ipv4_inetpeer_init(struct net *net)
2676 {
2677         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2678
2679         if (!bp)
2680                 return -ENOMEM;
2681         inet_peer_base_init(bp);
2682         net->ipv4.peers = bp;
2683         return 0;
2684 }
2685
2686 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2687 {
2688         struct inet_peer_base *bp = net->ipv4.peers;
2689
2690         net->ipv4.peers = NULL;
2691         inetpeer_invalidate_tree(bp);
2692         kfree(bp);
2693 }
2694
2695 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2696         .init   =       ipv4_inetpeer_init,
2697         .exit   =       ipv4_inetpeer_exit,
2698 };
2699
2700 #ifdef CONFIG_IP_ROUTE_CLASSID
2701 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2702 #endif /* CONFIG_IP_ROUTE_CLASSID */
2703
2704 int __init ip_rt_init(void)
2705 {
2706         int rc = 0;
2707
2708 #ifdef CONFIG_IP_ROUTE_CLASSID
2709         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2710         if (!ip_rt_acct)
2711                 panic("IP: failed to allocate ip_rt_acct\n");
2712 #endif
2713
2714         ipv4_dst_ops.kmem_cachep =
2715                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2716                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2717
2718         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2719
2720         if (dst_entries_init(&ipv4_dst_ops) < 0)
2721                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2722
2723         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2724                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2725
2726         ipv4_dst_ops.gc_thresh = ~0;
2727         ip_rt_max_size = INT_MAX;
2728
2729         devinet_init();
2730         ip_fib_init();
2731
2732         if (ip_rt_proc_init())
2733                 pr_err("Unable to create route proc files\n");
2734 #ifdef CONFIG_XFRM
2735         xfrm_init();
2736         xfrm4_init();
2737 #endif
2738         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2739
2740 #ifdef CONFIG_SYSCTL
2741         register_pernet_subsys(&sysctl_route_ops);
2742 #endif
2743         register_pernet_subsys(&rt_genid_ops);
2744         register_pernet_subsys(&ipv4_inetpeer_ops);
2745         return rc;
2746 }
2747
2748 #ifdef CONFIG_SYSCTL
2749 /*
2750  * We really need to sanitize the damn ipv4 init order, then all
2751  * this nonsense will go away.
2752  */
2753 void __init ip_static_sysctl_init(void)
2754 {
2755         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2756 }
2757 #endif