Merge tag 'pwm/for-4.1-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/thierry...
[sfrench/cifs-2.6.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #include <linux/kmemleak.h>
110 #endif
111 #include <net/secure_seq.h>
112
/* Reduce a flow's TOS field to the IPTOS_RT_MASK bits plus RTO_ONLINK. */
#define RT_FL_TOS(oldflp4)						\
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* 300 seconds, expressed in jiffies. */
#define RT_GC_TIMEOUT	(300*HZ)
117
118 static int ip_rt_max_size;
119 static int ip_rt_redirect_number __read_mostly  = 9;
120 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
121 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
122 static int ip_rt_error_cost __read_mostly       = HZ;
123 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
124 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
125 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
126 static int ip_rt_min_advmss __read_mostly       = 256;
127
128 /*
129  *      Interface to generic destination cache.
130  */
131
132 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
133 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
134 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
135 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
136 static void              ipv4_link_failure(struct sk_buff *skb);
137 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
138                                            struct sk_buff *skb, u32 mtu);
139 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
140                                         struct sk_buff *skb);
141 static void             ipv4_dst_destroy(struct dst_entry *dst);
142
143 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
144 {
145         WARN_ON(1);
146         return NULL;
147 }
148
149 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
150                                            struct sk_buff *skb,
151                                            const void *daddr);
152
153 static struct dst_ops ipv4_dst_ops = {
154         .family =               AF_INET,
155         .check =                ipv4_dst_check,
156         .default_advmss =       ipv4_default_advmss,
157         .mtu =                  ipv4_mtu,
158         .cow_metrics =          ipv4_cow_metrics,
159         .destroy =              ipv4_dst_destroy,
160         .negative_advice =      ipv4_negative_advice,
161         .link_failure =         ipv4_link_failure,
162         .update_pmtu =          ip_rt_update_pmtu,
163         .redirect =             ip_do_redirect,
164         .local_out =            __ip_local_out,
165         .neigh_lookup =         ipv4_neigh_lookup,
166 };
167
168 #define ECN_OR_COST(class)      TC_PRIO_##class
169
170 const __u8 ip_tos2prio[16] = {
171         TC_PRIO_BESTEFFORT,
172         ECN_OR_COST(BESTEFFORT),
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(BESTEFFORT),
175         TC_PRIO_BULK,
176         ECN_OR_COST(BULK),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_INTERACTIVE,
180         ECN_OR_COST(INTERACTIVE),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE_BULK,
184         ECN_OR_COST(INTERACTIVE_BULK),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK)
187 };
188 EXPORT_SYMBOL(ip_tos2prio);
189
190 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
191 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
192
193 #ifdef CONFIG_PROC_FS
194 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
195 {
196         if (*pos)
197                 return NULL;
198         return SEQ_START_TOKEN;
199 }
200
201 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
202 {
203         ++*pos;
204         return NULL;
205 }
206
/* seq_file ->stop: intentionally empty, ->start acquires nothing. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
210
211 static int rt_cache_seq_show(struct seq_file *seq, void *v)
212 {
213         if (v == SEQ_START_TOKEN)
214                 seq_printf(seq, "%-127s\n",
215                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
216                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
217                            "HHUptod\tSpecDst");
218         return 0;
219 }
220
221 static const struct seq_operations rt_cache_seq_ops = {
222         .start  = rt_cache_seq_start,
223         .next   = rt_cache_seq_next,
224         .stop   = rt_cache_seq_stop,
225         .show   = rt_cache_seq_show,
226 };
227
228 static int rt_cache_seq_open(struct inode *inode, struct file *file)
229 {
230         return seq_open(file, &rt_cache_seq_ops);
231 }
232
233 static const struct file_operations rt_cache_seq_fops = {
234         .owner   = THIS_MODULE,
235         .open    = rt_cache_seq_open,
236         .read    = seq_read,
237         .llseek  = seq_lseek,
238         .release = seq_release,
239 };
240
241
242 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
243 {
244         int cpu;
245
246         if (*pos == 0)
247                 return SEQ_START_TOKEN;
248
249         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
250                 if (!cpu_possible(cpu))
251                         continue;
252                 *pos = cpu+1;
253                 return &per_cpu(rt_cache_stat, cpu);
254         }
255         return NULL;
256 }
257
258 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
259 {
260         int cpu;
261
262         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
263                 if (!cpu_possible(cpu))
264                         continue;
265                 *pos = cpu+1;
266                 return &per_cpu(rt_cache_stat, cpu);
267         }
268         return NULL;
269
270 }
271
/* seq_file ->stop: intentionally empty, ->start acquires nothing. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
276
277 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
278 {
279         struct rt_cache_stat *st = v;
280
281         if (v == SEQ_START_TOKEN) {
282                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
283                 return 0;
284         }
285
286         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
287                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
288                    dst_entries_get_slow(&ipv4_dst_ops),
289                    0, /* st->in_hit */
290                    st->in_slow_tot,
291                    st->in_slow_mc,
292                    st->in_no_route,
293                    st->in_brd,
294                    st->in_martian_dst,
295                    st->in_martian_src,
296
297                    0, /* st->out_hit */
298                    st->out_slow_tot,
299                    st->out_slow_mc,
300
301                    0, /* st->gc_total */
302                    0, /* st->gc_ignored */
303                    0, /* st->gc_goal_miss */
304                    0, /* st->gc_dst_overflow */
305                    0, /* st->in_hlist_search */
306                    0  /* st->out_hlist_search */
307                 );
308         return 0;
309 }
310
311 static const struct seq_operations rt_cpu_seq_ops = {
312         .start  = rt_cpu_seq_start,
313         .next   = rt_cpu_seq_next,
314         .stop   = rt_cpu_seq_stop,
315         .show   = rt_cpu_seq_show,
316 };
317
318
319 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
320 {
321         return seq_open(file, &rt_cpu_seq_ops);
322 }
323
324 static const struct file_operations rt_cpu_seq_fops = {
325         .owner   = THIS_MODULE,
326         .open    = rt_cpu_seq_open,
327         .read    = seq_read,
328         .llseek  = seq_lseek,
329         .release = seq_release,
330 };
331
332 #ifdef CONFIG_IP_ROUTE_CLASSID
333 static int rt_acct_proc_show(struct seq_file *m, void *v)
334 {
335         struct ip_rt_acct *dst, *src;
336         unsigned int i, j;
337
338         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
339         if (!dst)
340                 return -ENOMEM;
341
342         for_each_possible_cpu(i) {
343                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
344                 for (j = 0; j < 256; j++) {
345                         dst[j].o_bytes   += src[j].o_bytes;
346                         dst[j].o_packets += src[j].o_packets;
347                         dst[j].i_bytes   += src[j].i_bytes;
348                         dst[j].i_packets += src[j].i_packets;
349                 }
350         }
351
352         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
353         kfree(dst);
354         return 0;
355 }
356
357 static int rt_acct_proc_open(struct inode *inode, struct file *file)
358 {
359         return single_open(file, rt_acct_proc_show, NULL);
360 }
361
362 static const struct file_operations rt_acct_proc_fops = {
363         .owner          = THIS_MODULE,
364         .open           = rt_acct_proc_open,
365         .read           = seq_read,
366         .llseek         = seq_lseek,
367         .release        = single_release,
368 };
369 #endif
370
371 static int __net_init ip_rt_do_proc_init(struct net *net)
372 {
373         struct proc_dir_entry *pde;
374
375         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
376                           &rt_cache_seq_fops);
377         if (!pde)
378                 goto err1;
379
380         pde = proc_create("rt_cache", S_IRUGO,
381                           net->proc_net_stat, &rt_cpu_seq_fops);
382         if (!pde)
383                 goto err2;
384
385 #ifdef CONFIG_IP_ROUTE_CLASSID
386         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
387         if (!pde)
388                 goto err3;
389 #endif
390         return 0;
391
392 #ifdef CONFIG_IP_ROUTE_CLASSID
393 err3:
394         remove_proc_entry("rt_cache", net->proc_net_stat);
395 #endif
396 err2:
397         remove_proc_entry("rt_cache", net->proc_net);
398 err1:
399         return -ENOMEM;
400 }
401
402 static void __net_exit ip_rt_do_proc_exit(struct net *net)
403 {
404         remove_proc_entry("rt_cache", net->proc_net_stat);
405         remove_proc_entry("rt_cache", net->proc_net);
406 #ifdef CONFIG_IP_ROUTE_CLASSID
407         remove_proc_entry("rt_acct", net->proc_net);
408 #endif
409 }
410
411 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
412         .init = ip_rt_do_proc_init,
413         .exit = ip_rt_do_proc_exit,
414 };
415
416 static int __init ip_rt_proc_init(void)
417 {
418         return register_pernet_subsys(&ip_rt_proc_ops);
419 }
420
421 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
426 #endif /* CONFIG_PROC_FS */
427
428 static inline bool rt_is_expired(const struct rtable *rth)
429 {
430         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
431 }
432
/*
 * Flush cached routes of @net by bumping the namespace's IPv4 route
 * generation id; rt_is_expired() then reports older routes as expired.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
437
438 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
439                                            struct sk_buff *skb,
440                                            const void *daddr)
441 {
442         struct net_device *dev = dst->dev;
443         const __be32 *pkey = daddr;
444         const struct rtable *rt;
445         struct neighbour *n;
446
447         rt = (const struct rtable *) dst;
448         if (rt->rt_gateway)
449                 pkey = (const __be32 *) &rt->rt_gateway;
450         else if (skb)
451                 pkey = &ip_hdr(skb)->daddr;
452
453         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
454         if (n)
455                 return n;
456         return neigh_create(&arp_tbl, pkey, dev);
457 }
458
459 #define IP_IDENTS_SZ 2048u
460 struct ip_ident_bucket {
461         atomic_t        id;
462         u32             stamp32;
463 };
464
465 static struct ip_ident_bucket *ip_idents __read_mostly;
466
467 /* In order to protect privacy, we add a perturbation to identifiers
468  * if one generator is seldom used. This makes hard for an attacker
469  * to infer how many packets were sent between two points in time.
470  */
471 u32 ip_idents_reserve(u32 hash, int segs)
472 {
473         struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
474         u32 old = ACCESS_ONCE(bucket->stamp32);
475         u32 now = (u32)jiffies;
476         u32 delta = 0;
477
478         if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
479                 delta = prandom_u32_max(now - old);
480
481         return atomic_add_return(segs + delta, &bucket->id) - segs;
482 }
483 EXPORT_SYMBOL(ip_idents_reserve);
484
485 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
486 {
487         static u32 ip_idents_hashrnd __read_mostly;
488         u32 hash, id;
489
490         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
491
492         hash = jhash_3words((__force u32)iph->daddr,
493                             (__force u32)iph->saddr,
494                             iph->protocol ^ net_hash_mix(net),
495                             ip_idents_hashrnd);
496         id = ip_idents_reserve(hash, segs);
497         iph->id = htons(id);
498 }
499 EXPORT_SYMBOL(__ip_select_ident);
500
501 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
502                              const struct iphdr *iph,
503                              int oif, u8 tos,
504                              u8 prot, u32 mark, int flow_flags)
505 {
506         if (sk) {
507                 const struct inet_sock *inet = inet_sk(sk);
508
509                 oif = sk->sk_bound_dev_if;
510                 mark = sk->sk_mark;
511                 tos = RT_CONN_FLAGS(sk);
512                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
513         }
514         flowi4_init_output(fl4, oif, mark, tos,
515                            RT_SCOPE_UNIVERSE, prot,
516                            flow_flags,
517                            iph->daddr, iph->saddr, 0, 0);
518 }
519
520 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
521                                const struct sock *sk)
522 {
523         const struct iphdr *iph = ip_hdr(skb);
524         int oif = skb->dev->ifindex;
525         u8 tos = RT_TOS(iph->tos);
526         u8 prot = iph->protocol;
527         u32 mark = skb->mark;
528
529         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
530 }
531
532 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
533 {
534         const struct inet_sock *inet = inet_sk(sk);
535         const struct ip_options_rcu *inet_opt;
536         __be32 daddr = inet->inet_daddr;
537
538         rcu_read_lock();
539         inet_opt = rcu_dereference(inet->inet_opt);
540         if (inet_opt && inet_opt->opt.srr)
541                 daddr = inet_opt->opt.faddr;
542         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
543                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
544                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
545                            inet_sk_flowi_flags(sk),
546                            daddr, inet->inet_saddr, 0, 0);
547         rcu_read_unlock();
548 }
549
/*
 * Build a flow key from @skb when one is supplied, otherwise from the
 * socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
558
559 static inline void rt_free(struct rtable *rt)
560 {
561         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
562 }
563
/* Held by update_or_create_fnhe() while it modifies nexthop exceptions. */
static DEFINE_SPINLOCK(fnhe_lock);
565
566 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
567 {
568         struct rtable *rt;
569
570         rt = rcu_dereference(fnhe->fnhe_rth_input);
571         if (rt) {
572                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
573                 rt_free(rt);
574         }
575         rt = rcu_dereference(fnhe->fnhe_rth_output);
576         if (rt) {
577                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
578                 rt_free(rt);
579         }
580 }
581
582 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
583 {
584         struct fib_nh_exception *fnhe, *oldest;
585
586         oldest = rcu_dereference(hash->chain);
587         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
588              fnhe = rcu_dereference(fnhe->fnhe_next)) {
589                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
590                         oldest = fnhe;
591         }
592         fnhe_flush_routes(oldest);
593         return oldest;
594 }
595
596 static inline u32 fnhe_hashfun(__be32 daddr)
597 {
598         static u32 fnhe_hashrnd __read_mostly;
599         u32 hval;
600
601         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
602         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
603         return hash_32(hval, FNHE_HASH_SHIFT);
604 }
605
606 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
607 {
608         rt->rt_pmtu = fnhe->fnhe_pmtu;
609         rt->dst.expires = fnhe->fnhe_expires;
610
611         if (fnhe->fnhe_gw) {
612                 rt->rt_flags |= RTCF_REDIRECTED;
613                 rt->rt_gateway = fnhe->fnhe_gw;
614                 rt->rt_uses_gateway = 1;
615         }
616 }
617
618 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
619                                   u32 pmtu, unsigned long expires)
620 {
621         struct fnhe_hash_bucket *hash;
622         struct fib_nh_exception *fnhe;
623         struct rtable *rt;
624         unsigned int i;
625         int depth;
626         u32 hval = fnhe_hashfun(daddr);
627
628         spin_lock_bh(&fnhe_lock);
629
630         hash = rcu_dereference(nh->nh_exceptions);
631         if (!hash) {
632                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
633                 if (!hash)
634                         goto out_unlock;
635                 rcu_assign_pointer(nh->nh_exceptions, hash);
636         }
637
638         hash += hval;
639
640         depth = 0;
641         for (fnhe = rcu_dereference(hash->chain); fnhe;
642              fnhe = rcu_dereference(fnhe->fnhe_next)) {
643                 if (fnhe->fnhe_daddr == daddr)
644                         break;
645                 depth++;
646         }
647
648         if (fnhe) {
649                 if (gw)
650                         fnhe->fnhe_gw = gw;
651                 if (pmtu) {
652                         fnhe->fnhe_pmtu = pmtu;
653                         fnhe->fnhe_expires = max(1UL, expires);
654                 }
655                 /* Update all cached dsts too */
656                 rt = rcu_dereference(fnhe->fnhe_rth_input);
657                 if (rt)
658                         fill_route_from_fnhe(rt, fnhe);
659                 rt = rcu_dereference(fnhe->fnhe_rth_output);
660                 if (rt)
661                         fill_route_from_fnhe(rt, fnhe);
662         } else {
663                 if (depth > FNHE_RECLAIM_DEPTH)
664                         fnhe = fnhe_oldest(hash);
665                 else {
666                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
667                         if (!fnhe)
668                                 goto out_unlock;
669
670                         fnhe->fnhe_next = hash->chain;
671                         rcu_assign_pointer(hash->chain, fnhe);
672                 }
673                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
674                 fnhe->fnhe_daddr = daddr;
675                 fnhe->fnhe_gw = gw;
676                 fnhe->fnhe_pmtu = pmtu;
677                 fnhe->fnhe_expires = expires;
678
679                 /* Exception created; mark the cached routes for the nexthop
680                  * stale, so anyone caching it rechecks if this exception
681                  * applies to them.
682                  */
683                 rt = rcu_dereference(nh->nh_rth_input);
684                 if (rt)
685                         rt->dst.obsolete = DST_OBSOLETE_KILL;
686
687                 for_each_possible_cpu(i) {
688                         struct rtable __rcu **prt;
689                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
690                         rt = rcu_dereference(*prt);
691                         if (rt)
692                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
693                 }
694         }
695
696         fnhe->fnhe_stamp = jiffies;
697
698 out_unlock:
699         spin_unlock_bh(&fnhe_lock);
700 }
701
702 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
703                              bool kill_route)
704 {
705         __be32 new_gw = icmp_hdr(skb)->un.gateway;
706         __be32 old_gw = ip_hdr(skb)->saddr;
707         struct net_device *dev = skb->dev;
708         struct in_device *in_dev;
709         struct fib_result res;
710         struct neighbour *n;
711         struct net *net;
712
713         switch (icmp_hdr(skb)->code & 7) {
714         case ICMP_REDIR_NET:
715         case ICMP_REDIR_NETTOS:
716         case ICMP_REDIR_HOST:
717         case ICMP_REDIR_HOSTTOS:
718                 break;
719
720         default:
721                 return;
722         }
723
724         if (rt->rt_gateway != old_gw)
725                 return;
726
727         in_dev = __in_dev_get_rcu(dev);
728         if (!in_dev)
729                 return;
730
731         net = dev_net(dev);
732         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
733             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
734             ipv4_is_zeronet(new_gw))
735                 goto reject_redirect;
736
737         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
738                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
739                         goto reject_redirect;
740                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
741                         goto reject_redirect;
742         } else {
743                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
744                         goto reject_redirect;
745         }
746
747         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
748         if (!IS_ERR(n)) {
749                 if (!(n->nud_state & NUD_VALID)) {
750                         neigh_event_send(n, NULL);
751                 } else {
752                         if (fib_lookup(net, fl4, &res) == 0) {
753                                 struct fib_nh *nh = &FIB_RES_NH(res);
754
755                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
756                                                       0, 0);
757                         }
758                         if (kill_route)
759                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
760                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
761                 }
762                 neigh_release(n);
763         }
764         return;
765
766 reject_redirect:
767 #ifdef CONFIG_IP_ROUTE_VERBOSE
768         if (IN_DEV_LOG_MARTIANS(in_dev)) {
769                 const struct iphdr *iph = (const struct iphdr *) skb->data;
770                 __be32 daddr = iph->daddr;
771                 __be32 saddr = iph->saddr;
772
773                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
774                                      "  Advised path = %pI4 -> %pI4\n",
775                                      &old_gw, dev->name, &new_gw,
776                                      &saddr, &daddr);
777         }
778 #endif
779         ;
780 }
781
782 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
783 {
784         struct rtable *rt;
785         struct flowi4 fl4;
786         const struct iphdr *iph = (const struct iphdr *) skb->data;
787         int oif = skb->dev->ifindex;
788         u8 tos = RT_TOS(iph->tos);
789         u8 prot = iph->protocol;
790         u32 mark = skb->mark;
791
792         rt = (struct rtable *) dst;
793
794         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
795         __ip_do_redirect(rt, skb, &fl4, true);
796 }
797
798 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
799 {
800         struct rtable *rt = (struct rtable *)dst;
801         struct dst_entry *ret = dst;
802
803         if (rt) {
804                 if (dst->obsolete > 0) {
805                         ip_rt_put(rt);
806                         ret = NULL;
807                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
808                            rt->dst.expires) {
809                         ip_rt_put(rt);
810                         ret = NULL;
811                 }
812         }
813         return ret;
814 }
815
816 /*
817  * Algorithm:
818  *      1. The first ip_rt_redirect_number redirects are sent
819  *         with exponential backoff, then we stop sending them at all,
820  *         assuming that the host ignores our redirects.
821  *      2. If we did not see packets requiring redirects
822  *         during ip_rt_redirect_silence, we assume that the host
823  *         forgot redirected route and start to send redirects again.
824  *
825  * This algorithm is much cheaper and more intelligent than dumb load limiting
826  * in icmp.c.
827  *
828  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
829  * and "frag. need" (breaks PMTU discovery) in icmp.c.
830  */
831
832 void ip_rt_send_redirect(struct sk_buff *skb)
833 {
834         struct rtable *rt = skb_rtable(skb);
835         struct in_device *in_dev;
836         struct inet_peer *peer;
837         struct net *net;
838         int log_martians;
839
840         rcu_read_lock();
841         in_dev = __in_dev_get_rcu(rt->dst.dev);
842         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
843                 rcu_read_unlock();
844                 return;
845         }
846         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
847         rcu_read_unlock();
848
849         net = dev_net(rt->dst.dev);
850         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
851         if (!peer) {
852                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
853                           rt_nexthop(rt, ip_hdr(skb)->daddr));
854                 return;
855         }
856
857         /* No redirected packets during ip_rt_redirect_silence;
858          * reset the algorithm.
859          */
860         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
861                 peer->rate_tokens = 0;
862
863         /* Too many ignored redirects; do not send anything
864          * set dst.rate_last to the last seen redirected packet.
865          */
866         if (peer->rate_tokens >= ip_rt_redirect_number) {
867                 peer->rate_last = jiffies;
868                 goto out_put_peer;
869         }
870
871         /* Check for load limit; set rate_last to the latest sent
872          * redirect.
873          */
874         if (peer->rate_tokens == 0 ||
875             time_after(jiffies,
876                        (peer->rate_last +
877                         (ip_rt_redirect_load << peer->rate_tokens)))) {
878                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
879
880                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
881                 peer->rate_last = jiffies;
882                 ++peer->rate_tokens;
883 #ifdef CONFIG_IP_ROUTE_VERBOSE
884                 if (log_martians &&
885                     peer->rate_tokens == ip_rt_redirect_number)
886                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
887                                              &ip_hdr(skb)->saddr, inet_iif(skb),
888                                              &ip_hdr(skb)->daddr, &gw);
889 #endif
890         }
891 out_put_peer:
892         inet_putpeer(peer);
893 }
894
895 static int ip_error(struct sk_buff *skb)
896 {
897         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
898         struct rtable *rt = skb_rtable(skb);
899         struct inet_peer *peer;
900         unsigned long now;
901         struct net *net;
902         bool send;
903         int code;
904
905         net = dev_net(rt->dst.dev);
906         if (!IN_DEV_FORWARD(in_dev)) {
907                 switch (rt->dst.error) {
908                 case EHOSTUNREACH:
909                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
910                         break;
911
912                 case ENETUNREACH:
913                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
914                         break;
915                 }
916                 goto out;
917         }
918
919         switch (rt->dst.error) {
920         case EINVAL:
921         default:
922                 goto out;
923         case EHOSTUNREACH:
924                 code = ICMP_HOST_UNREACH;
925                 break;
926         case ENETUNREACH:
927                 code = ICMP_NET_UNREACH;
928                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
929                 break;
930         case EACCES:
931                 code = ICMP_PKT_FILTERED;
932                 break;
933         }
934
935         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
936
937         send = true;
938         if (peer) {
939                 now = jiffies;
940                 peer->rate_tokens += now - peer->rate_last;
941                 if (peer->rate_tokens > ip_rt_error_burst)
942                         peer->rate_tokens = ip_rt_error_burst;
943                 peer->rate_last = now;
944                 if (peer->rate_tokens >= ip_rt_error_cost)
945                         peer->rate_tokens -= ip_rt_error_cost;
946                 else
947                         send = false;
948                 inet_putpeer(peer);
949         }
950         if (send)
951                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
952
953 out:    kfree_skb(skb);
954         return 0;
955 }
956
957 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
958 {
959         struct dst_entry *dst = &rt->dst;
960         struct fib_result res;
961
962         if (dst_metric_locked(dst, RTAX_MTU))
963                 return;
964
965         if (ipv4_mtu(dst) < mtu)
966                 return;
967
968         if (mtu < ip_rt_min_pmtu)
969                 mtu = ip_rt_min_pmtu;
970
971         if (rt->rt_pmtu == mtu &&
972             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
973                 return;
974
975         rcu_read_lock();
976         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
977                 struct fib_nh *nh = &FIB_RES_NH(res);
978
979                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
980                                       jiffies + ip_rt_mtu_expires);
981         }
982         rcu_read_unlock();
983 }
984
985 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
986                               struct sk_buff *skb, u32 mtu)
987 {
988         struct rtable *rt = (struct rtable *) dst;
989         struct flowi4 fl4;
990
991         ip_rt_build_flow_key(&fl4, sk, skb);
992         __ip_rt_update_pmtu(rt, &fl4, mtu);
993 }
994
995 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
996                       int oif, u32 mark, u8 protocol, int flow_flags)
997 {
998         const struct iphdr *iph = (const struct iphdr *) skb->data;
999         struct flowi4 fl4;
1000         struct rtable *rt;
1001
1002         if (!mark)
1003                 mark = IP4_REPLY_MARK(net, skb->mark);
1004
1005         __build_flow_key(&fl4, NULL, iph, oif,
1006                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1007         rt = __ip_route_output_key(net, &fl4);
1008         if (!IS_ERR(rt)) {
1009                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1010                 ip_rt_put(rt);
1011         }
1012 }
1013 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1014
1015 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1016 {
1017         const struct iphdr *iph = (const struct iphdr *) skb->data;
1018         struct flowi4 fl4;
1019         struct rtable *rt;
1020
1021         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1022
1023         if (!fl4.flowi4_mark)
1024                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1025
1026         rt = __ip_route_output_key(sock_net(sk), &fl4);
1027         if (!IS_ERR(rt)) {
1028                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1029                 ip_rt_put(rt);
1030         }
1031 }
1032
1033 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1034 {
1035         const struct iphdr *iph = (const struct iphdr *) skb->data;
1036         struct flowi4 fl4;
1037         struct rtable *rt;
1038         struct dst_entry *odst = NULL;
1039         bool new = false;
1040
1041         bh_lock_sock(sk);
1042
1043         if (!ip_sk_accept_pmtu(sk))
1044                 goto out;
1045
1046         odst = sk_dst_get(sk);
1047
1048         if (sock_owned_by_user(sk) || !odst) {
1049                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1050                 goto out;
1051         }
1052
1053         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1054
1055         rt = (struct rtable *)odst;
1056         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1057                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1058                 if (IS_ERR(rt))
1059                         goto out;
1060
1061                 new = true;
1062         }
1063
1064         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1065
1066         if (!dst_check(&rt->dst, 0)) {
1067                 if (new)
1068                         dst_release(&rt->dst);
1069
1070                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1071                 if (IS_ERR(rt))
1072                         goto out;
1073
1074                 new = true;
1075         }
1076
1077         if (new)
1078                 sk_dst_set(sk, &rt->dst);
1079
1080 out:
1081         bh_unlock_sock(sk);
1082         dst_release(odst);
1083 }
1084 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1085
1086 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1087                    int oif, u32 mark, u8 protocol, int flow_flags)
1088 {
1089         const struct iphdr *iph = (const struct iphdr *) skb->data;
1090         struct flowi4 fl4;
1091         struct rtable *rt;
1092
1093         __build_flow_key(&fl4, NULL, iph, oif,
1094                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1095         rt = __ip_route_output_key(net, &fl4);
1096         if (!IS_ERR(rt)) {
1097                 __ip_do_redirect(rt, skb, &fl4, false);
1098                 ip_rt_put(rt);
1099         }
1100 }
1101 EXPORT_SYMBOL_GPL(ipv4_redirect);
1102
1103 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1104 {
1105         const struct iphdr *iph = (const struct iphdr *) skb->data;
1106         struct flowi4 fl4;
1107         struct rtable *rt;
1108
1109         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1110         rt = __ip_route_output_key(sock_net(sk), &fl4);
1111         if (!IS_ERR(rt)) {
1112                 __ip_do_redirect(rt, skb, &fl4, false);
1113                 ip_rt_put(rt);
1114         }
1115 }
1116 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1117
1118 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1119 {
1120         struct rtable *rt = (struct rtable *) dst;
1121
1122         /* All IPV4 dsts are created with ->obsolete set to the value
1123          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1124          * into this function always.
1125          *
1126          * When a PMTU/redirect information update invalidates a route,
1127          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1128          * DST_OBSOLETE_DEAD by dst_free().
1129          */
1130         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1131                 return NULL;
1132         return dst;
1133 }
1134
1135 static void ipv4_link_failure(struct sk_buff *skb)
1136 {
1137         struct rtable *rt;
1138
1139         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1140
1141         rt = skb_rtable(skb);
1142         if (rt)
1143                 dst_set_expires(&rt->dst, 0);
1144 }
1145
1146 static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
1147 {
1148         pr_debug("%s: %pI4 -> %pI4, %s\n",
1149                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1150                  skb->dev ? skb->dev->name : "?");
1151         kfree_skb(skb);
1152         WARN_ON(1);
1153         return 0;
1154 }
1155
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */
1164
1165 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1166 {
1167         __be32 src;
1168
1169         if (rt_is_output_route(rt))
1170                 src = ip_hdr(skb)->saddr;
1171         else {
1172                 struct fib_result res;
1173                 struct flowi4 fl4;
1174                 struct iphdr *iph;
1175
1176                 iph = ip_hdr(skb);
1177
1178                 memset(&fl4, 0, sizeof(fl4));
1179                 fl4.daddr = iph->daddr;
1180                 fl4.saddr = iph->saddr;
1181                 fl4.flowi4_tos = RT_TOS(iph->tos);
1182                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1183                 fl4.flowi4_iif = skb->dev->ifindex;
1184                 fl4.flowi4_mark = skb->mark;
1185
1186                 rcu_read_lock();
1187                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1188                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1189                 else
1190                         src = inet_select_addr(rt->dst.dev,
1191                                                rt_nexthop(rt, iph->daddr),
1192                                                RT_SCOPE_UNIVERSE);
1193                 rcu_read_unlock();
1194         }
1195         memcpy(addr, &src, 4);
1196 }
1197
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into rt->dst.tclassid one 16-bit half at a time: a half
 * is taken from @tag only if that half of the class id is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;
        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1207
1208 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1209 {
1210         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1211
1212         if (advmss == 0) {
1213                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1214                                ip_rt_min_advmss);
1215                 if (advmss > 65535 - 40)
1216                         advmss = 65535 - 40;
1217         }
1218         return advmss;
1219 }
1220
1221 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1222 {
1223         const struct rtable *rt = (const struct rtable *) dst;
1224         unsigned int mtu = rt->rt_pmtu;
1225
1226         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1227                 mtu = dst_metric_raw(dst, RTAX_MTU);
1228
1229         if (mtu)
1230                 return mtu;
1231
1232         mtu = dst->dev->mtu;
1233
1234         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1235                 if (rt->rt_uses_gateway && mtu > 576)
1236                         mtu = 576;
1237         }
1238
1239         return min_t(unsigned int, mtu, IP_MAX_MTU);
1240 }
1241
1242 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1243 {
1244         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1245         struct fib_nh_exception *fnhe;
1246         u32 hval;
1247
1248         if (!hash)
1249                 return NULL;
1250
1251         hval = fnhe_hashfun(daddr);
1252
1253         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1254              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1255                 if (fnhe->fnhe_daddr == daddr)
1256                         return fnhe;
1257         }
1258         return NULL;
1259 }
1260
1261 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1262                               __be32 daddr)
1263 {
1264         bool ret = false;
1265
1266         spin_lock_bh(&fnhe_lock);
1267
1268         if (daddr == fnhe->fnhe_daddr) {
1269                 struct rtable __rcu **porig;
1270                 struct rtable *orig;
1271                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1272
1273                 if (rt_is_input_route(rt))
1274                         porig = &fnhe->fnhe_rth_input;
1275                 else
1276                         porig = &fnhe->fnhe_rth_output;
1277                 orig = rcu_dereference(*porig);
1278
1279                 if (fnhe->fnhe_genid != genid) {
1280                         fnhe->fnhe_genid = genid;
1281                         fnhe->fnhe_gw = 0;
1282                         fnhe->fnhe_pmtu = 0;
1283                         fnhe->fnhe_expires = 0;
1284                         fnhe_flush_routes(fnhe);
1285                         orig = NULL;
1286                 }
1287                 fill_route_from_fnhe(rt, fnhe);
1288                 if (!rt->rt_gateway)
1289                         rt->rt_gateway = daddr;
1290
1291                 if (!(rt->dst.flags & DST_NOCACHE)) {
1292                         rcu_assign_pointer(*porig, rt);
1293                         if (orig)
1294                                 rt_free(orig);
1295                         ret = true;
1296                 }
1297
1298                 fnhe->fnhe_stamp = jiffies;
1299         }
1300         spin_unlock_bh(&fnhe_lock);
1301
1302         return ret;
1303 }
1304
1305 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1306 {
1307         struct rtable *orig, *prev, **p;
1308         bool ret = true;
1309
1310         if (rt_is_input_route(rt)) {
1311                 p = (struct rtable **)&nh->nh_rth_input;
1312         } else {
1313                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1314         }
1315         orig = *p;
1316
1317         prev = cmpxchg(p, orig, rt);
1318         if (prev == orig) {
1319                 if (orig)
1320                         rt_free(orig);
1321         } else
1322                 ret = false;
1323
1324         return ret;
1325 }
1326
/*
 * List of routes that are not stored in a nexthop or exception cache.
 * Routes are linked through their rt_uncached member; rt_flush_dev()
 * walks these lists to retarget routes of a device to loopback.
 */
struct uncached_list {
        spinlock_t              lock;   /* taken (bh-safe) around every list access */
        struct list_head        head;   /* rtables, linked via rt_uncached */
};

/* One list per possible CPU; a route remembers its list in rt_uncached_list. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1333
1334 static void rt_add_uncached_list(struct rtable *rt)
1335 {
1336         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1337
1338         rt->rt_uncached_list = ul;
1339
1340         spin_lock_bh(&ul->lock);
1341         list_add_tail(&rt->rt_uncached, &ul->head);
1342         spin_unlock_bh(&ul->lock);
1343 }
1344
1345 static void ipv4_dst_destroy(struct dst_entry *dst)
1346 {
1347         struct rtable *rt = (struct rtable *) dst;
1348
1349         if (!list_empty(&rt->rt_uncached)) {
1350                 struct uncached_list *ul = rt->rt_uncached_list;
1351
1352                 spin_lock_bh(&ul->lock);
1353                 list_del(&rt->rt_uncached);
1354                 spin_unlock_bh(&ul->lock);
1355         }
1356 }
1357
1358 void rt_flush_dev(struct net_device *dev)
1359 {
1360         struct net *net = dev_net(dev);
1361         struct rtable *rt;
1362         int cpu;
1363
1364         for_each_possible_cpu(cpu) {
1365                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1366
1367                 spin_lock_bh(&ul->lock);
1368                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1369                         if (rt->dst.dev != dev)
1370                                 continue;
1371                         rt->dst.dev = net->loopback_dev;
1372                         dev_hold(rt->dst.dev);
1373                         dev_put(dev);
1374                 }
1375                 spin_unlock_bh(&ul->lock);
1376         }
1377 }
1378
1379 static bool rt_cache_valid(const struct rtable *rt)
1380 {
1381         return  rt &&
1382                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1383                 !rt_is_expired(rt);
1384 }
1385
1386 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1387                            const struct fib_result *res,
1388                            struct fib_nh_exception *fnhe,
1389                            struct fib_info *fi, u16 type, u32 itag)
1390 {
1391         bool cached = false;
1392
1393         if (fi) {
1394                 struct fib_nh *nh = &FIB_RES_NH(*res);
1395
1396                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1397                         rt->rt_gateway = nh->nh_gw;
1398                         rt->rt_uses_gateway = 1;
1399                 }
1400                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1401 #ifdef CONFIG_IP_ROUTE_CLASSID
1402                 rt->dst.tclassid = nh->nh_tclassid;
1403 #endif
1404                 if (unlikely(fnhe))
1405                         cached = rt_bind_exception(rt, fnhe, daddr);
1406                 else if (!(rt->dst.flags & DST_NOCACHE))
1407                         cached = rt_cache_route(nh, rt);
1408                 if (unlikely(!cached)) {
1409                         /* Routes we intend to cache in nexthop exception or
1410                          * FIB nexthop have the DST_NOCACHE bit clear.
1411                          * However, if we are unsuccessful at storing this
1412                          * route into the cache we really need to set it.
1413                          */
1414                         rt->dst.flags |= DST_NOCACHE;
1415                         if (!rt->rt_gateway)
1416                                 rt->rt_gateway = daddr;
1417                         rt_add_uncached_list(rt);
1418                 }
1419         } else
1420                 rt_add_uncached_list(rt);
1421
1422 #ifdef CONFIG_IP_ROUTE_CLASSID
1423 #ifdef CONFIG_IP_MULTIPLE_TABLES
1424         set_class_tag(rt, res->tclassid);
1425 #endif
1426         set_class_tag(rt, itag);
1427 #endif
1428 }
1429
1430 static struct rtable *rt_dst_alloc(struct net_device *dev,
1431                                    bool nopolicy, bool noxfrm, bool will_cache)
1432 {
1433         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1434                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1435                          (nopolicy ? DST_NOPOLICY : 0) |
1436                          (noxfrm ? DST_NOXFRM : 0));
1437 }
1438
1439 /* called in rcu_read_lock() section */
1440 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1441                                 u8 tos, struct net_device *dev, int our)
1442 {
1443         struct rtable *rth;
1444         struct in_device *in_dev = __in_dev_get_rcu(dev);
1445         u32 itag = 0;
1446         int err;
1447
1448         /* Primary sanity checks. */
1449
1450         if (!in_dev)
1451                 return -EINVAL;
1452
1453         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1454             skb->protocol != htons(ETH_P_IP))
1455                 goto e_inval;
1456
1457         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1458                 if (ipv4_is_loopback(saddr))
1459                         goto e_inval;
1460
1461         if (ipv4_is_zeronet(saddr)) {
1462                 if (!ipv4_is_local_multicast(daddr))
1463                         goto e_inval;
1464         } else {
1465                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1466                                           in_dev, &itag);
1467                 if (err < 0)
1468                         goto e_err;
1469         }
1470         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1471                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1472         if (!rth)
1473                 goto e_nobufs;
1474
1475 #ifdef CONFIG_IP_ROUTE_CLASSID
1476         rth->dst.tclassid = itag;
1477 #endif
1478         rth->dst.output = ip_rt_bug;
1479
1480         rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
1481         rth->rt_flags   = RTCF_MULTICAST;
1482         rth->rt_type    = RTN_MULTICAST;
1483         rth->rt_is_input= 1;
1484         rth->rt_iif     = 0;
1485         rth->rt_pmtu    = 0;
1486         rth->rt_gateway = 0;
1487         rth->rt_uses_gateway = 0;
1488         INIT_LIST_HEAD(&rth->rt_uncached);
1489         if (our) {
1490                 rth->dst.input= ip_local_deliver;
1491                 rth->rt_flags |= RTCF_LOCAL;
1492         }
1493
1494 #ifdef CONFIG_IP_MROUTE
1495         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1496                 rth->dst.input = ip_mr_input;
1497 #endif
1498         RT_CACHE_STAT_INC(in_slow_mc);
1499
1500         skb_dst_set(skb, &rth->dst);
1501         return 0;
1502
1503 e_nobufs:
1504         return -ENOBUFS;
1505 e_inval:
1506         return -EINVAL;
1507 e_err:
1508         return err;
1509 }
1510
1511
1512 static void ip_handle_martian_source(struct net_device *dev,
1513                                      struct in_device *in_dev,
1514                                      struct sk_buff *skb,
1515                                      __be32 daddr,
1516                                      __be32 saddr)
1517 {
1518         RT_CACHE_STAT_INC(in_martian_src);
1519 #ifdef CONFIG_IP_ROUTE_VERBOSE
1520         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1521                 /*
1522                  *      RFC1812 recommendation, if source is martian,
1523                  *      the only hint is MAC header.
1524                  */
1525                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1526                         &daddr, &saddr, dev->name);
1527                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1528                         print_hex_dump(KERN_WARNING, "ll header: ",
1529                                        DUMP_PREFIX_OFFSET, 16, 1,
1530                                        skb_mac_header(skb),
1531                                        dev->hard_header_len, true);
1532                 }
1533         }
1534 #endif
1535 }
1536
1537 /* called in rcu_read_lock() section */
1538 static int __mkroute_input(struct sk_buff *skb,
1539                            const struct fib_result *res,
1540                            struct in_device *in_dev,
1541                            __be32 daddr, __be32 saddr, u32 tos)
1542 {
1543         struct fib_nh_exception *fnhe;
1544         struct rtable *rth;
1545         int err;
1546         struct in_device *out_dev;
1547         unsigned int flags = 0;
1548         bool do_cache;
1549         u32 itag = 0;
1550
1551         /* get a working reference to the output device */
1552         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1553         if (!out_dev) {
1554                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1555                 return -EINVAL;
1556         }
1557
1558         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1559                                   in_dev->dev, in_dev, &itag);
1560         if (err < 0) {
1561                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1562                                          saddr);
1563
1564                 goto cleanup;
1565         }
1566
1567         do_cache = res->fi && !itag;
1568         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1569             skb->protocol == htons(ETH_P_IP) &&
1570             (IN_DEV_SHARED_MEDIA(out_dev) ||
1571              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1572                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1573
1574         if (skb->protocol != htons(ETH_P_IP)) {
1575                 /* Not IP (i.e. ARP). Do not create route, if it is
1576                  * invalid for proxy arp. DNAT routes are always valid.
1577                  *
1578                  * Proxy arp feature have been extended to allow, ARP
1579                  * replies back to the same interface, to support
1580                  * Private VLAN switch technologies. See arp.c.
1581                  */
1582                 if (out_dev == in_dev &&
1583                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1584                         err = -EINVAL;
1585                         goto cleanup;
1586                 }
1587         }
1588
1589         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1590         if (do_cache) {
1591                 if (fnhe)
1592                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1593                 else
1594                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1595
1596                 if (rt_cache_valid(rth)) {
1597                         skb_dst_set_noref(skb, &rth->dst);
1598                         goto out;
1599                 }
1600         }
1601
1602         rth = rt_dst_alloc(out_dev->dev,
1603                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1604                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1605         if (!rth) {
1606                 err = -ENOBUFS;
1607                 goto cleanup;
1608         }
1609
1610         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1611         rth->rt_flags = flags;
1612         rth->rt_type = res->type;
1613         rth->rt_is_input = 1;
1614         rth->rt_iif     = 0;
1615         rth->rt_pmtu    = 0;
1616         rth->rt_gateway = 0;
1617         rth->rt_uses_gateway = 0;
1618         INIT_LIST_HEAD(&rth->rt_uncached);
1619         RT_CACHE_STAT_INC(in_slow_tot);
1620
1621         rth->dst.input = ip_forward;
1622         rth->dst.output = ip_output;
1623
1624         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1625         skb_dst_set(skb, &rth->dst);
1626 out:
1627         err = 0;
1628  cleanup:
1629         return err;
1630 }
1631
1632 static int ip_mkroute_input(struct sk_buff *skb,
1633                             struct fib_result *res,
1634                             const struct flowi4 *fl4,
1635                             struct in_device *in_dev,
1636                             __be32 daddr, __be32 saddr, u32 tos)
1637 {
1638 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1639         if (res->fi && res->fi->fib_nhs > 1)
1640                 fib_select_multipath(res);
1641 #endif
1642
1643         /* create a routing cache entry */
1644         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1645 }
1646
/*
 *      NOTE. We drop all packets that have local source
 *      addresses, because every properly looped-back packet
 *      must already have the correct destination attached by the
 *      output routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 *      Called with rcu_read_lock() held.
 */
1657
1658 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1659                                u8 tos, struct net_device *dev)
1660 {
1661         struct fib_result res;
1662         struct in_device *in_dev = __in_dev_get_rcu(dev);
1663         struct flowi4   fl4;
1664         unsigned int    flags = 0;
1665         u32             itag = 0;
1666         struct rtable   *rth;
1667         int             err = -EINVAL;
1668         struct net    *net = dev_net(dev);
1669         bool do_cache;
1670
1671         /* IP on this device is disabled. */
1672
1673         if (!in_dev)
1674                 goto out;
1675
1676         /* Check for the most weird martians, which can be not detected
1677            by fib_lookup.
1678          */
1679
1680         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1681                 goto martian_source;
1682
1683         res.fi = NULL;
1684         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1685                 goto brd_input;
1686
1687         /* Accept zero addresses only to limited broadcast;
1688          * I even do not know to fix it or not. Waiting for complains :-)
1689          */
1690         if (ipv4_is_zeronet(saddr))
1691                 goto martian_source;
1692
1693         if (ipv4_is_zeronet(daddr))
1694                 goto martian_destination;
1695
1696         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1697          * and call it once if daddr or/and saddr are loopback addresses
1698          */
1699         if (ipv4_is_loopback(daddr)) {
1700                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1701                         goto martian_destination;
1702         } else if (ipv4_is_loopback(saddr)) {
1703                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1704                         goto martian_source;
1705         }
1706
1707         /*
1708          *      Now we are ready to route packet.
1709          */
1710         fl4.flowi4_oif = 0;
1711         fl4.flowi4_iif = dev->ifindex;
1712         fl4.flowi4_mark = skb->mark;
1713         fl4.flowi4_tos = tos;
1714         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1715         fl4.daddr = daddr;
1716         fl4.saddr = saddr;
1717         err = fib_lookup(net, &fl4, &res);
1718         if (err != 0) {
1719                 if (!IN_DEV_FORWARD(in_dev))
1720                         err = -EHOSTUNREACH;
1721                 goto no_route;
1722         }
1723
1724         if (res.type == RTN_BROADCAST)
1725                 goto brd_input;
1726
1727         if (res.type == RTN_LOCAL) {
1728                 err = fib_validate_source(skb, saddr, daddr, tos,
1729                                           0, dev, in_dev, &itag);
1730                 if (err < 0)
1731                         goto martian_source_keep_err;
1732                 goto local_input;
1733         }
1734
1735         if (!IN_DEV_FORWARD(in_dev)) {
1736                 err = -EHOSTUNREACH;
1737                 goto no_route;
1738         }
1739         if (res.type != RTN_UNICAST)
1740                 goto martian_destination;
1741
1742         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1743 out:    return err;
1744
1745 brd_input:
1746         if (skb->protocol != htons(ETH_P_IP))
1747                 goto e_inval;
1748
1749         if (!ipv4_is_zeronet(saddr)) {
1750                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1751                                           in_dev, &itag);
1752                 if (err < 0)
1753                         goto martian_source_keep_err;
1754         }
1755         flags |= RTCF_BROADCAST;
1756         res.type = RTN_BROADCAST;
1757         RT_CACHE_STAT_INC(in_brd);
1758
1759 local_input:
1760         do_cache = false;
1761         if (res.fi) {
1762                 if (!itag) {
1763                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1764                         if (rt_cache_valid(rth)) {
1765                                 skb_dst_set_noref(skb, &rth->dst);
1766                                 err = 0;
1767                                 goto out;
1768                         }
1769                         do_cache = true;
1770                 }
1771         }
1772
1773         rth = rt_dst_alloc(net->loopback_dev,
1774                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1775         if (!rth)
1776                 goto e_nobufs;
1777
1778         rth->dst.input= ip_local_deliver;
1779         rth->dst.output= ip_rt_bug;
1780 #ifdef CONFIG_IP_ROUTE_CLASSID
1781         rth->dst.tclassid = itag;
1782 #endif
1783
1784         rth->rt_genid = rt_genid_ipv4(net);
1785         rth->rt_flags   = flags|RTCF_LOCAL;
1786         rth->rt_type    = res.type;
1787         rth->rt_is_input = 1;
1788         rth->rt_iif     = 0;
1789         rth->rt_pmtu    = 0;
1790         rth->rt_gateway = 0;
1791         rth->rt_uses_gateway = 0;
1792         INIT_LIST_HEAD(&rth->rt_uncached);
1793         RT_CACHE_STAT_INC(in_slow_tot);
1794         if (res.type == RTN_UNREACHABLE) {
1795                 rth->dst.input= ip_error;
1796                 rth->dst.error= -err;
1797                 rth->rt_flags   &= ~RTCF_LOCAL;
1798         }
1799         if (do_cache) {
1800                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1801                         rth->dst.flags |= DST_NOCACHE;
1802                         rt_add_uncached_list(rth);
1803                 }
1804         }
1805         skb_dst_set(skb, &rth->dst);
1806         err = 0;
1807         goto out;
1808
1809 no_route:
1810         RT_CACHE_STAT_INC(in_no_route);
1811         res.type = RTN_UNREACHABLE;
1812         res.fi = NULL;
1813         goto local_input;
1814
1815         /*
1816          *      Do not cache martian addresses: they should be logged (RFC1812)
1817          */
1818 martian_destination:
1819         RT_CACHE_STAT_INC(in_martian_dst);
1820 #ifdef CONFIG_IP_ROUTE_VERBOSE
1821         if (IN_DEV_LOG_MARTIANS(in_dev))
1822                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1823                                      &daddr, &saddr, dev->name);
1824 #endif
1825
1826 e_inval:
1827         err = -EINVAL;
1828         goto out;
1829
1830 e_nobufs:
1831         err = -ENOBUFS;
1832         goto out;
1833
1834 martian_source:
1835         err = -EINVAL;
1836 martian_source_keep_err:
1837         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1838         goto out;
1839 }
1840
1841 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1842                          u8 tos, struct net_device *dev)
1843 {
1844         int res;
1845
1846         rcu_read_lock();
1847
1848         /* Multicast recognition logic is moved from route cache to here.
1849            The problem was that too many Ethernet cards have broken/missing
1850            hardware multicast filters :-( As result the host on multicasting
1851            network acquires a lot of useless route cache entries, sort of
1852            SDR messages from all the world. Now we try to get rid of them.
1853            Really, provided software IP multicast filter is organized
1854            reasonably (at least, hashed), it does not result in a slowdown
1855            comparing with route cache reject entries.
1856            Note, that multicast routers are not affected, because
1857            route cache entry is created eventually.
1858          */
1859         if (ipv4_is_multicast(daddr)) {
1860                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1861
1862                 if (in_dev) {
1863                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1864                                                   ip_hdr(skb)->protocol);
1865                         if (our
1866 #ifdef CONFIG_IP_MROUTE
1867                                 ||
1868                             (!ipv4_is_local_multicast(daddr) &&
1869                              IN_DEV_MFORWARD(in_dev))
1870 #endif
1871                            ) {
1872                                 int res = ip_route_input_mc(skb, daddr, saddr,
1873                                                             tos, dev, our);
1874                                 rcu_read_unlock();
1875                                 return res;
1876                         }
1877                 }
1878                 rcu_read_unlock();
1879                 return -EINVAL;
1880         }
1881         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1882         rcu_read_unlock();
1883         return res;
1884 }
1885 EXPORT_SYMBOL(ip_route_input_noref);
1886
1887 /* called with rcu_read_lock() */
1888 static struct rtable *__mkroute_output(const struct fib_result *res,
1889                                        const struct flowi4 *fl4, int orig_oif,
1890                                        struct net_device *dev_out,
1891                                        unsigned int flags)
1892 {
1893         struct fib_info *fi = res->fi;
1894         struct fib_nh_exception *fnhe;
1895         struct in_device *in_dev;
1896         u16 type = res->type;
1897         struct rtable *rth;
1898         bool do_cache;
1899
1900         in_dev = __in_dev_get_rcu(dev_out);
1901         if (!in_dev)
1902                 return ERR_PTR(-EINVAL);
1903
1904         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1905                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1906                         return ERR_PTR(-EINVAL);
1907
1908         if (ipv4_is_lbcast(fl4->daddr))
1909                 type = RTN_BROADCAST;
1910         else if (ipv4_is_multicast(fl4->daddr))
1911                 type = RTN_MULTICAST;
1912         else if (ipv4_is_zeronet(fl4->daddr))
1913                 return ERR_PTR(-EINVAL);
1914
1915         if (dev_out->flags & IFF_LOOPBACK)
1916                 flags |= RTCF_LOCAL;
1917
1918         do_cache = true;
1919         if (type == RTN_BROADCAST) {
1920                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1921                 fi = NULL;
1922         } else if (type == RTN_MULTICAST) {
1923                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1924                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1925                                      fl4->flowi4_proto))
1926                         flags &= ~RTCF_LOCAL;
1927                 else
1928                         do_cache = false;
1929                 /* If multicast route do not exist use
1930                  * default one, but do not gateway in this case.
1931                  * Yes, it is hack.
1932                  */
1933                 if (fi && res->prefixlen < 4)
1934                         fi = NULL;
1935         }
1936
1937         fnhe = NULL;
1938         do_cache &= fi != NULL;
1939         if (do_cache) {
1940                 struct rtable __rcu **prth;
1941                 struct fib_nh *nh = &FIB_RES_NH(*res);
1942
1943                 fnhe = find_exception(nh, fl4->daddr);
1944                 if (fnhe)
1945                         prth = &fnhe->fnhe_rth_output;
1946                 else {
1947                         if (unlikely(fl4->flowi4_flags &
1948                                      FLOWI_FLAG_KNOWN_NH &&
1949                                      !(nh->nh_gw &&
1950                                        nh->nh_scope == RT_SCOPE_LINK))) {
1951                                 do_cache = false;
1952                                 goto add;
1953                         }
1954                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
1955                 }
1956                 rth = rcu_dereference(*prth);
1957                 if (rt_cache_valid(rth)) {
1958                         dst_hold(&rth->dst);
1959                         return rth;
1960                 }
1961         }
1962
1963 add:
1964         rth = rt_dst_alloc(dev_out,
1965                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1966                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1967                            do_cache);
1968         if (!rth)
1969                 return ERR_PTR(-ENOBUFS);
1970
1971         rth->dst.output = ip_output;
1972
1973         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1974         rth->rt_flags   = flags;
1975         rth->rt_type    = type;
1976         rth->rt_is_input = 0;
1977         rth->rt_iif     = orig_oif ? : 0;
1978         rth->rt_pmtu    = 0;
1979         rth->rt_gateway = 0;
1980         rth->rt_uses_gateway = 0;
1981         INIT_LIST_HEAD(&rth->rt_uncached);
1982
1983         RT_CACHE_STAT_INC(out_slow_tot);
1984
1985         if (flags & RTCF_LOCAL)
1986                 rth->dst.input = ip_local_deliver;
1987         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1988                 if (flags & RTCF_LOCAL &&
1989                     !(dev_out->flags & IFF_LOOPBACK)) {
1990                         rth->dst.output = ip_mc_output;
1991                         RT_CACHE_STAT_INC(out_slow_mc);
1992                 }
1993 #ifdef CONFIG_IP_MROUTE
1994                 if (type == RTN_MULTICAST) {
1995                         if (IN_DEV_MFORWARD(in_dev) &&
1996                             !ipv4_is_local_multicast(fl4->daddr)) {
1997                                 rth->dst.input = ip_mr_input;
1998                                 rth->dst.output = ip_mc_output;
1999                         }
2000                 }
2001 #endif
2002         }
2003
2004         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2005
2006         return rth;
2007 }
2008
2009 /*
2010  * Major route resolver routine.
2011  */
2012
2013 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2014 {
2015         struct net_device *dev_out = NULL;
2016         __u8 tos = RT_FL_TOS(fl4);
2017         unsigned int flags = 0;
2018         struct fib_result res;
2019         struct rtable *rth;
2020         int orig_oif;
2021
2022         res.tclassid    = 0;
2023         res.fi          = NULL;
2024         res.table       = NULL;
2025
2026         orig_oif = fl4->flowi4_oif;
2027
2028         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2029         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2030         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2031                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2032
2033         rcu_read_lock();
2034         if (fl4->saddr) {
2035                 rth = ERR_PTR(-EINVAL);
2036                 if (ipv4_is_multicast(fl4->saddr) ||
2037                     ipv4_is_lbcast(fl4->saddr) ||
2038                     ipv4_is_zeronet(fl4->saddr))
2039                         goto out;
2040
2041                 /* I removed check for oif == dev_out->oif here.
2042                    It was wrong for two reasons:
2043                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2044                       is assigned to multiple interfaces.
2045                    2. Moreover, we are allowed to send packets with saddr
2046                       of another iface. --ANK
2047                  */
2048
2049                 if (fl4->flowi4_oif == 0 &&
2050                     (ipv4_is_multicast(fl4->daddr) ||
2051                      ipv4_is_lbcast(fl4->daddr))) {
2052                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2053                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2054                         if (!dev_out)
2055                                 goto out;
2056
2057                         /* Special hack: user can direct multicasts
2058                            and limited broadcast via necessary interface
2059                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2060                            This hack is not just for fun, it allows
2061                            vic,vat and friends to work.
2062                            They bind socket to loopback, set ttl to zero
2063                            and expect that it will work.
2064                            From the viewpoint of routing cache they are broken,
2065                            because we are not allowed to build multicast path
2066                            with loopback source addr (look, routing cache
2067                            cannot know, that ttl is zero, so that packet
2068                            will not leave this host and route is valid).
2069                            Luckily, this hack is good workaround.
2070                          */
2071
2072                         fl4->flowi4_oif = dev_out->ifindex;
2073                         goto make_route;
2074                 }
2075
2076                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2077                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2078                         if (!__ip_dev_find(net, fl4->saddr, false))
2079                                 goto out;
2080                 }
2081         }
2082
2083
2084         if (fl4->flowi4_oif) {
2085                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2086                 rth = ERR_PTR(-ENODEV);
2087                 if (!dev_out)
2088                         goto out;
2089
2090                 /* RACE: Check return value of inet_select_addr instead. */
2091                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2092                         rth = ERR_PTR(-ENETUNREACH);
2093                         goto out;
2094                 }
2095                 if (ipv4_is_local_multicast(fl4->daddr) ||
2096                     ipv4_is_lbcast(fl4->daddr)) {
2097                         if (!fl4->saddr)
2098                                 fl4->saddr = inet_select_addr(dev_out, 0,
2099                                                               RT_SCOPE_LINK);
2100                         goto make_route;
2101                 }
2102                 if (!fl4->saddr) {
2103                         if (ipv4_is_multicast(fl4->daddr))
2104                                 fl4->saddr = inet_select_addr(dev_out, 0,
2105                                                               fl4->flowi4_scope);
2106                         else if (!fl4->daddr)
2107                                 fl4->saddr = inet_select_addr(dev_out, 0,
2108                                                               RT_SCOPE_HOST);
2109                 }
2110         }
2111
2112         if (!fl4->daddr) {
2113                 fl4->daddr = fl4->saddr;
2114                 if (!fl4->daddr)
2115                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2116                 dev_out = net->loopback_dev;
2117                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2118                 res.type = RTN_LOCAL;
2119                 flags |= RTCF_LOCAL;
2120                 goto make_route;
2121         }
2122
2123         if (fib_lookup(net, fl4, &res)) {
2124                 res.fi = NULL;
2125                 res.table = NULL;
2126                 if (fl4->flowi4_oif) {
2127                         /* Apparently, routing tables are wrong. Assume,
2128                            that the destination is on link.
2129
2130                            WHY? DW.
2131                            Because we are allowed to send to iface
2132                            even if it has NO routes and NO assigned
2133                            addresses. When oif is specified, routing
2134                            tables are looked up with only one purpose:
2135                            to catch if destination is gatewayed, rather than
2136                            direct. Moreover, if MSG_DONTROUTE is set,
2137                            we send packet, ignoring both routing tables
2138                            and ifaddr state. --ANK
2139
2140
2141                            We could make it even if oif is unknown,
2142                            likely IPv6, but we do not.
2143                          */
2144
2145                         if (fl4->saddr == 0)
2146                                 fl4->saddr = inet_select_addr(dev_out, 0,
2147                                                               RT_SCOPE_LINK);
2148                         res.type = RTN_UNICAST;
2149                         goto make_route;
2150                 }
2151                 rth = ERR_PTR(-ENETUNREACH);
2152                 goto out;
2153         }
2154
2155         if (res.type == RTN_LOCAL) {
2156                 if (!fl4->saddr) {
2157                         if (res.fi->fib_prefsrc)
2158                                 fl4->saddr = res.fi->fib_prefsrc;
2159                         else
2160                                 fl4->saddr = fl4->daddr;
2161                 }
2162                 dev_out = net->loopback_dev;
2163                 fl4->flowi4_oif = dev_out->ifindex;
2164                 flags |= RTCF_LOCAL;
2165                 goto make_route;
2166         }
2167
2168 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2169         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2170                 fib_select_multipath(&res);
2171         else
2172 #endif
2173         if (!res.prefixlen &&
2174             res.table->tb_num_default > 1 &&
2175             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2176                 fib_select_default(&res);
2177
2178         if (!fl4->saddr)
2179                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2180
2181         dev_out = FIB_RES_DEV(res);
2182         fl4->flowi4_oif = dev_out->ifindex;
2183
2184
2185 make_route:
2186         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2187
2188 out:
2189         rcu_read_unlock();
2190         return rth;
2191 }
2192 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2193
2194 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2195 {
2196         return NULL;
2197 }
2198
2199 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2200 {
2201         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2202
2203         return mtu ? : dst->dev->mtu;
2204 }
2205
2206 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2207                                           struct sk_buff *skb, u32 mtu)
2208 {
2209 }
2210
/* .redirect hook of ipv4_dst_blackhole_ops: deliberately does nothing. */
static void
ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
			   struct sk_buff *skb)
{
	/* intentionally empty */
}
2215
2216 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2217                                           unsigned long old)
2218 {
2219         return NULL;
2220 }
2221
2222 static struct dst_ops ipv4_dst_blackhole_ops = {
2223         .family                 =       AF_INET,
2224         .check                  =       ipv4_blackhole_dst_check,
2225         .mtu                    =       ipv4_blackhole_mtu,
2226         .default_advmss         =       ipv4_default_advmss,
2227         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2228         .redirect               =       ipv4_rt_blackhole_redirect,
2229         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2230         .neigh_lookup           =       ipv4_neigh_lookup,
2231 };
2232
2233 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2234 {
2235         struct rtable *ort = (struct rtable *) dst_orig;
2236         struct rtable *rt;
2237
2238         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2239         if (rt) {
2240                 struct dst_entry *new = &rt->dst;
2241
2242                 new->__use = 1;
2243                 new->input = dst_discard;
2244                 new->output = dst_discard_sk;
2245
2246                 new->dev = ort->dst.dev;
2247                 if (new->dev)
2248                         dev_hold(new->dev);
2249
2250                 rt->rt_is_input = ort->rt_is_input;
2251                 rt->rt_iif = ort->rt_iif;
2252                 rt->rt_pmtu = ort->rt_pmtu;
2253
2254                 rt->rt_genid = rt_genid_ipv4(net);
2255                 rt->rt_flags = ort->rt_flags;
2256                 rt->rt_type = ort->rt_type;
2257                 rt->rt_gateway = ort->rt_gateway;
2258                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2259
2260                 INIT_LIST_HEAD(&rt->rt_uncached);
2261
2262                 dst_free(new);
2263         }
2264
2265         dst_release(dst_orig);
2266
2267         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2268 }
2269
2270 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2271                                     struct sock *sk)
2272 {
2273         struct rtable *rt = __ip_route_output_key(net, flp4);
2274
2275         if (IS_ERR(rt))
2276                 return rt;
2277
2278         if (flp4->flowi4_proto)
2279                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2280                                                         flowi4_to_flowi(flp4),
2281                                                         sk, 0);
2282
2283         return rt;
2284 }
2285 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2286
2287 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2288                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2289                         u32 seq, int event, int nowait, unsigned int flags)
2290 {
2291         struct rtable *rt = skb_rtable(skb);
2292         struct rtmsg *r;
2293         struct nlmsghdr *nlh;
2294         unsigned long expires = 0;
2295         u32 error;
2296         u32 metrics[RTAX_MAX];
2297
2298         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2299         if (!nlh)
2300                 return -EMSGSIZE;
2301
2302         r = nlmsg_data(nlh);
2303         r->rtm_family    = AF_INET;
2304         r->rtm_dst_len  = 32;
2305         r->rtm_src_len  = 0;
2306         r->rtm_tos      = fl4->flowi4_tos;
2307         r->rtm_table    = RT_TABLE_MAIN;
2308         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2309                 goto nla_put_failure;
2310         r->rtm_type     = rt->rt_type;
2311         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2312         r->rtm_protocol = RTPROT_UNSPEC;
2313         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2314         if (rt->rt_flags & RTCF_NOTIFY)
2315                 r->rtm_flags |= RTM_F_NOTIFY;
2316         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2317                 r->rtm_flags |= RTCF_DOREDIRECT;
2318
2319         if (nla_put_in_addr(skb, RTA_DST, dst))
2320                 goto nla_put_failure;
2321         if (src) {
2322                 r->rtm_src_len = 32;
2323                 if (nla_put_in_addr(skb, RTA_SRC, src))
2324                         goto nla_put_failure;
2325         }
2326         if (rt->dst.dev &&
2327             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2328                 goto nla_put_failure;
2329 #ifdef CONFIG_IP_ROUTE_CLASSID
2330         if (rt->dst.tclassid &&
2331             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2332                 goto nla_put_failure;
2333 #endif
2334         if (!rt_is_input_route(rt) &&
2335             fl4->saddr != src) {
2336                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2337                         goto nla_put_failure;
2338         }
2339         if (rt->rt_uses_gateway &&
2340             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2341                 goto nla_put_failure;
2342
2343         expires = rt->dst.expires;
2344         if (expires) {
2345                 unsigned long now = jiffies;
2346
2347                 if (time_before(now, expires))
2348                         expires -= now;
2349                 else
2350                         expires = 0;
2351         }
2352
2353         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2354         if (rt->rt_pmtu && expires)
2355                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2356         if (rtnetlink_put_metrics(skb, metrics) < 0)
2357                 goto nla_put_failure;
2358
2359         if (fl4->flowi4_mark &&
2360             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2361                 goto nla_put_failure;
2362
2363         error = rt->dst.error;
2364
2365         if (rt_is_input_route(rt)) {
2366 #ifdef CONFIG_IP_MROUTE
2367                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2368                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2369                         int err = ipmr_get_route(net, skb,
2370                                                  fl4->saddr, fl4->daddr,
2371                                                  r, nowait);
2372                         if (err <= 0) {
2373                                 if (!nowait) {
2374                                         if (err == 0)
2375                                                 return 0;
2376                                         goto nla_put_failure;
2377                                 } else {
2378                                         if (err == -EMSGSIZE)
2379                                                 goto nla_put_failure;
2380                                         error = err;
2381                                 }
2382                         }
2383                 } else
2384 #endif
2385                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2386                                 goto nla_put_failure;
2387         }
2388
2389         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2390                 goto nla_put_failure;
2391
2392         nlmsg_end(skb, nlh);
2393         return 0;
2394
2395 nla_put_failure:
2396         nlmsg_cancel(skb, nlh);
2397         return -EMSGSIZE;
2398 }
2399
2400 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2401 {
2402         struct net *net = sock_net(in_skb->sk);
2403         struct rtmsg *rtm;
2404         struct nlattr *tb[RTA_MAX+1];
2405         struct rtable *rt = NULL;
2406         struct flowi4 fl4;
2407         __be32 dst = 0;
2408         __be32 src = 0;
2409         u32 iif;
2410         int err;
2411         int mark;
2412         struct sk_buff *skb;
2413
2414         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2415         if (err < 0)
2416                 goto errout;
2417
2418         rtm = nlmsg_data(nlh);
2419
2420         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2421         if (!skb) {
2422                 err = -ENOBUFS;
2423                 goto errout;
2424         }
2425
2426         /* Reserve room for dummy headers, this skb can pass
2427            through good chunk of routing engine.
2428          */
2429         skb_reset_mac_header(skb);
2430         skb_reset_network_header(skb);
2431
2432         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2433         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2434         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2435
2436         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2437         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2438         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2439         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2440
2441         memset(&fl4, 0, sizeof(fl4));
2442         fl4.daddr = dst;
2443         fl4.saddr = src;
2444         fl4.flowi4_tos = rtm->rtm_tos;
2445         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2446         fl4.flowi4_mark = mark;
2447
2448         if (iif) {
2449                 struct net_device *dev;
2450
2451                 dev = __dev_get_by_index(net, iif);
2452                 if (!dev) {
2453                         err = -ENODEV;
2454                         goto errout_free;
2455                 }
2456
2457                 skb->protocol   = htons(ETH_P_IP);
2458                 skb->dev        = dev;
2459                 skb->mark       = mark;
2460                 local_bh_disable();
2461                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2462                 local_bh_enable();
2463
2464                 rt = skb_rtable(skb);
2465                 if (err == 0 && rt->dst.error)
2466                         err = -rt->dst.error;
2467         } else {
2468                 rt = ip_route_output_key(net, &fl4);
2469
2470                 err = 0;
2471                 if (IS_ERR(rt))
2472                         err = PTR_ERR(rt);
2473         }
2474
2475         if (err)
2476                 goto errout_free;
2477
2478         skb_dst_set(skb, &rt->dst);
2479         if (rtm->rtm_flags & RTM_F_NOTIFY)
2480                 rt->rt_flags |= RTCF_NOTIFY;
2481
2482         err = rt_fill_info(net, dst, src, &fl4, skb,
2483                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2484                            RTM_NEWROUTE, 0, 0);
2485         if (err < 0)
2486                 goto errout_free;
2487
2488         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2489 errout:
2490         return err;
2491
2492 errout_free:
2493         kfree_skb(skb);
2494         goto errout;
2495 }
2496
2497 void ip_rt_multicast_event(struct in_device *in_dev)
2498 {
2499         rt_cache_flush(dev_net(in_dev->dev));
2500 }
2501
2502 #ifdef CONFIG_SYSCTL
2503 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2504 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2505 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2506 static int ip_rt_gc_elasticity __read_mostly    = 8;
2507
2508 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2509                                         void __user *buffer,
2510                                         size_t *lenp, loff_t *ppos)
2511 {
2512         struct net *net = (struct net *)__ctl->extra1;
2513
2514         if (write) {
2515                 rt_cache_flush(net);
2516                 fnhe_genid_bump(net);
2517                 return 0;
2518         }
2519
2520         return -EINVAL;
2521 }
2522
2523 static struct ctl_table ipv4_route_table[] = {
2524         {
2525                 .procname       = "gc_thresh",
2526                 .data           = &ipv4_dst_ops.gc_thresh,
2527                 .maxlen         = sizeof(int),
2528                 .mode           = 0644,
2529                 .proc_handler   = proc_dointvec,
2530         },
2531         {
2532                 .procname       = "max_size",
2533                 .data           = &ip_rt_max_size,
2534                 .maxlen         = sizeof(int),
2535                 .mode           = 0644,
2536                 .proc_handler   = proc_dointvec,
2537         },
2538         {
2539                 /*  Deprecated. Use gc_min_interval_ms */
2540
2541                 .procname       = "gc_min_interval",
2542                 .data           = &ip_rt_gc_min_interval,
2543                 .maxlen         = sizeof(int),
2544                 .mode           = 0644,
2545                 .proc_handler   = proc_dointvec_jiffies,
2546         },
2547         {
2548                 .procname       = "gc_min_interval_ms",
2549                 .data           = &ip_rt_gc_min_interval,
2550                 .maxlen         = sizeof(int),
2551                 .mode           = 0644,
2552                 .proc_handler   = proc_dointvec_ms_jiffies,
2553         },
2554         {
2555                 .procname       = "gc_timeout",
2556                 .data           = &ip_rt_gc_timeout,
2557                 .maxlen         = sizeof(int),
2558                 .mode           = 0644,
2559                 .proc_handler   = proc_dointvec_jiffies,
2560         },
2561         {
2562                 .procname       = "gc_interval",
2563                 .data           = &ip_rt_gc_interval,
2564                 .maxlen         = sizeof(int),
2565                 .mode           = 0644,
2566                 .proc_handler   = proc_dointvec_jiffies,
2567         },
2568         {
2569                 .procname       = "redirect_load",
2570                 .data           = &ip_rt_redirect_load,
2571                 .maxlen         = sizeof(int),
2572                 .mode           = 0644,
2573                 .proc_handler   = proc_dointvec,
2574         },
2575         {
2576                 .procname       = "redirect_number",
2577                 .data           = &ip_rt_redirect_number,
2578                 .maxlen         = sizeof(int),
2579                 .mode           = 0644,
2580                 .proc_handler   = proc_dointvec,
2581         },
2582         {
2583                 .procname       = "redirect_silence",
2584                 .data           = &ip_rt_redirect_silence,
2585                 .maxlen         = sizeof(int),
2586                 .mode           = 0644,
2587                 .proc_handler   = proc_dointvec,
2588         },
2589         {
2590                 .procname       = "error_cost",
2591                 .data           = &ip_rt_error_cost,
2592                 .maxlen         = sizeof(int),
2593                 .mode           = 0644,
2594                 .proc_handler   = proc_dointvec,
2595         },
2596         {
2597                 .procname       = "error_burst",
2598                 .data           = &ip_rt_error_burst,
2599                 .maxlen         = sizeof(int),
2600                 .mode           = 0644,
2601                 .proc_handler   = proc_dointvec,
2602         },
2603         {
2604                 .procname       = "gc_elasticity",
2605                 .data           = &ip_rt_gc_elasticity,
2606                 .maxlen         = sizeof(int),
2607                 .mode           = 0644,
2608                 .proc_handler   = proc_dointvec,
2609         },
2610         {
2611                 .procname       = "mtu_expires",
2612                 .data           = &ip_rt_mtu_expires,
2613                 .maxlen         = sizeof(int),
2614                 .mode           = 0644,
2615                 .proc_handler   = proc_dointvec_jiffies,
2616         },
2617         {
2618                 .procname       = "min_pmtu",
2619                 .data           = &ip_rt_min_pmtu,
2620                 .maxlen         = sizeof(int),
2621                 .mode           = 0644,
2622                 .proc_handler   = proc_dointvec,
2623         },
2624         {
2625                 .procname       = "min_adv_mss",
2626                 .data           = &ip_rt_min_advmss,
2627                 .maxlen         = sizeof(int),
2628                 .mode           = 0644,
2629                 .proc_handler   = proc_dointvec,
2630         },
2631         { }
2632 };
2633
/*
 * Template for the per-namespace "flush" entry (mode 0200: owner
 * write-only).  sysctl_route_net_init() registers this table itself for
 * init_net and a kmemdup()ed copy for every other namespace, after
 * pointing ->extra1 at the owning struct net so that
 * ipv4_sysctl_rtcache_flush() knows which namespace to flush.
 */
static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { },
};
2643
2644 static __net_init int sysctl_route_net_init(struct net *net)
2645 {
2646         struct ctl_table *tbl;
2647
2648         tbl = ipv4_route_flush_table;
2649         if (!net_eq(net, &init_net)) {
2650                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2651                 if (!tbl)
2652                         goto err_dup;
2653
2654                 /* Don't export sysctls to unprivileged users */
2655                 if (net->user_ns != &init_user_ns)
2656                         tbl[0].procname = NULL;
2657         }
2658         tbl[0].extra1 = net;
2659
2660         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2661         if (!net->ipv4.route_hdr)
2662                 goto err_reg;
2663         return 0;
2664
2665 err_reg:
2666         if (tbl != ipv4_route_flush_table)
2667                 kfree(tbl);
2668 err_dup:
2669         return -ENOMEM;
2670 }
2671
2672 static __net_exit void sysctl_route_net_exit(struct net *net)
2673 {
2674         struct ctl_table *tbl;
2675
2676         tbl = net->ipv4.route_hdr->ctl_table_arg;
2677         unregister_net_sysctl_table(net->ipv4.route_hdr);
2678         BUG_ON(tbl == ipv4_route_flush_table);
2679         kfree(tbl);
2680 }
2681
/*
 * Per-namespace hooks for the "net/ipv4/route" flush sysctl; registered
 * from ip_rt_init() when CONFIG_SYSCTL is enabled.
 */
static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
2686 #endif
2687
/*
 * Per-namespace initialisation of the generation ids: rt_genid and
 * fnhe_genid start at 0, dev_addr_genid starts at a random value.
 * Always returns 0.
 */
static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        /* Seed dev_addr_genid with random bytes rather than a fixed value. */
        get_random_bytes(&net->ipv4.dev_addr_genid,
                         sizeof(net->ipv4.dev_addr_genid));
        return 0;
}
2696
/* Init-only pernet hook: the generation ids need no teardown. */
static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};
2700
2701 static int __net_init ipv4_inetpeer_init(struct net *net)
2702 {
2703         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2704
2705         if (!bp)
2706                 return -ENOMEM;
2707         inet_peer_base_init(bp);
2708         net->ipv4.peers = bp;
2709         return 0;
2710 }
2711
2712 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2713 {
2714         struct inet_peer_base *bp = net->ipv4.peers;
2715
2716         net->ipv4.peers = NULL;
2717         inetpeer_invalidate_tree(bp);
2718         kfree(bp);
2719 }
2720
/* Per-namespace lifetime hooks for net->ipv4.peers. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};
2725
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU route accounting area; ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries per CPU.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2729
2730 int __init ip_rt_init(void)
2731 {
2732         int rc = 0;
2733         int cpu;
2734
2735         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2736         if (!ip_idents)
2737                 panic("IP: failed to allocate ip_idents\n");
2738
2739         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2740
2741         for_each_possible_cpu(cpu) {
2742                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2743
2744                 INIT_LIST_HEAD(&ul->head);
2745                 spin_lock_init(&ul->lock);
2746         }
2747 #ifdef CONFIG_IP_ROUTE_CLASSID
2748         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2749         if (!ip_rt_acct)
2750                 panic("IP: failed to allocate ip_rt_acct\n");
2751 #endif
2752
2753         ipv4_dst_ops.kmem_cachep =
2754                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2755                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2756
2757         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2758
2759         if (dst_entries_init(&ipv4_dst_ops) < 0)
2760                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2761
2762         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2763                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2764
2765         ipv4_dst_ops.gc_thresh = ~0;
2766         ip_rt_max_size = INT_MAX;
2767
2768         devinet_init();
2769         ip_fib_init();
2770
2771         if (ip_rt_proc_init())
2772                 pr_err("Unable to create route proc files\n");
2773 #ifdef CONFIG_XFRM
2774         xfrm_init();
2775         xfrm4_init();
2776 #endif
2777         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2778
2779 #ifdef CONFIG_SYSCTL
2780         register_pernet_subsys(&sysctl_route_ops);
2781 #endif
2782         register_pernet_subsys(&rt_genid_ops);
2783         register_pernet_subsys(&ipv4_inetpeer_ops);
2784         return rc;
2785 }
2786
2787 #ifdef CONFIG_SYSCTL
/*
 * Register the static route tunables (ipv4_route_table) under
 * "net/ipv4/route" for init_net.
 *
 * This lives in a separate entry point from ip_rt_init() only because of
 * the current IPv4 init ordering; once that ordering is cleaned up this
 * helper can go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
2796 #endif