/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
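
/* Illustration of the defaults above (derived arithmetic, not original
 * text): ip_rt_redirect_load is HZ/50 jiffies = 20 ms,
 * ip_rt_redirect_silence is (HZ/50) << 10 = 20.48 s, ip_rt_mtu_expires is
 * ten minutes, and ip_rt_min_pmtu is 512 + 20 + 20 = 552 bytes, i.e. 512
 * bytes of payload plus 20-byte IP and TCP headers.
 */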
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
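
/* Usage sketch (assuming rt_tos2priority() from include/net/route.h, which
 * indexes this table with the four TOS bits):
 *
 *      skb->priority = rt_tos2priority(ip_hdr(skb)->tos);
 *
 * e.g. IPTOS_LOWDELAY (0x10) gives index (0x10 & IPTOS_TOS_MASK) >> 1 == 8,
 * i.e. TC_PRIO_INTERACTIVE.
 */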

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}
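
/* With the old routing cache gone, /proc/net/rt_cache emits only the
 * header line above; the file is kept so tools that still open it keep
 * working, e.g.:
 *
 *      $ cat /proc/net/rt_cache
 *      Iface   Destination     Gateway ...
 */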

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
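
/* Usage sketch: a sender with a GSO burst reserves one ID per segment in a
 * single call; iph->id receives the first ID of the run (3 segments here):
 *
 *      __ip_select_ident(net, iph, 3);
 */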

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}
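
/* For reference: FNHE_HASH_SHIFT is 11 (see include/net/ip_fib.h), so
 * hash_32() folds the jhash into one of FNHE_HASH_SIZE = 2048 exception
 * buckets per nexthop.
 */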

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
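
/* Worked example with the defaults above: redirect k (0 <= k < 9) becomes
 * eligible once jiffies passes rate_last + (ip_rt_redirect_load << k),
 * i.e. after 20 ms, 40 ms, 80 ms, ... 5.12 s. Once nine redirects have
 * been ignored we go quiet until ip_rt_redirect_silence (20.48 s) without
 * redirect-worthy traffic resets rate_tokens.
 */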

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
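
/* The token bucket above accrues one token per jiffy up to a burst of
 * ip_rt_error_burst (5 * HZ); each ICMP error sent costs ip_rt_error_cost
 * (HZ) tokens. Net effect: roughly one ICMP error per second in steady
 * state, with bursts of up to five.
 */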

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}
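
/* E.g. a 1500-byte device MTU with no RTAX_ADVMSS metric advertises an MSS
 * of 1500 - 40 = 1460 (leaving room for 20-byte IP and TCP headers),
 * floored at ip_rt_min_advmss and capped at 65535 - 40.
 */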

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
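
/* E.g. a learned PMTU of 1400 is honoured until rt->dst.expires passes;
 * after that the MTU metric or the device MTU (say 1500) is used again,
 * clamped to 576 for locked-MTU routes via a gateway and reduced by any
 * lwtunnel encapsulation headroom.
 */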

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
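
/* rt_cache_route() publishes lock-free: cmpxchg() installs the new route
 * only if the slot still holds the value sampled above; if another CPU won
 * the race we return false and the caller falls back to the uncached list
 * rather than retrying.
 */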

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
                            unsigned int flags, u16 type,
                            bool nopolicy, bool noxfrm, bool will_cache)
{
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                       (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                       (nopolicy ? DST_NOPOLICY : 0) |
                       (noxfrm ? DST_NOXFRM : 0));

        if (rt) {
                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
                rt->rt_flags = flags;
                rt->rt_type = type;
                rt->rt_is_input = 0;
                rt->rt_iif = 0;
                rt->rt_pmtu = 0;
                rt->rt_gateway = 0;
                rt->rt_uses_gateway = 0;
                rt->rt_table_id = 0;
                INIT_LIST_HEAD(&rt->rt_uncached);

                rt->dst.output = ip_output;
                if (flags & RTCF_LOCAL)
                        rt->dst.input = ip_local_deliver;
        }

        return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
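
/* With DUMP_PREFIX_OFFSET the dump above emits one line per 16 bytes, e.g.
 * for a hypothetical 14-byte Ethernet header (values purely illustrative:
 * broadcast dst MAC, some src MAC, ETH_P_IP ethertype; an ASCII column
 * follows on the right):
 *
 *   ll header: 00000000: ff ff ff ff ff ff 00 11 22 33 44 55 08 00
 */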

static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

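	/* Walk the RCU-protected chain keeping fnhe_p pointing at the
	 * previous link's next-pointer, so a match can be unlinked with a
	 * single rcu_assign_pointer() while readers traverse concurrently;
	 * the node itself is freed only after a grace period, via
	 * kfree_rcu() below.
	 */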
	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

static void set_lwt_redirect(struct rtable *rth)
{
	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
		rth->dst.lwtstate->orig_output = rth->dst.output;
		rth->dst.output = lwtunnel_output;
	}

	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
		rth->dst.lwtstate->orig_input = rth->dst.input;
		rth->dst.input = lwtunnel_input;
	}
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe) {
			rth = rcu_dereference(fnhe->fnhe_rth_input);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

rt_cache:
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
	set_lwt_redirect(rth);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses in reverse order.
 */
static int ip_multipath_icmp_hash(struct sk_buff *skb)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	struct icmphdr _icmph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	const struct iphdr *inner_iph;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto standard_hash;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto standard_hash;

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB) {
		goto standard_hash;
	}

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto standard_hash;

	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);

standard_hash:
	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
}
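
/* A worked example of the comment above: a flow A -> B is hashed here as
 * fib_multipath_hash(A, B) on the forward path, and its return traffic
 * B -> A as fib_multipath_hash(B, A).  An ICMP error about the flow
 * travels toward A and carries the offending A -> B header inside, so
 * hashing (inner daddr, inner saddr) = (B, A) makes the error take the
 * same nexthop as the rest of the B -> A traffic it relates to.
 */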

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h;

		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
			h = ip_multipath_icmp_hash(skb);
		else
			h = fib_multipath_hash(saddr, daddr);
		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all packets that have a local source address,
 *	because every properly looped-back packet must already have the
 *	correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	Called with rcu_read_lock().
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct ip_tunnel_info *tun_info;
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net	*net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the weirdest martians, which cannot be detected
	 * by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	res.table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
	 * more than once when daddr and/or saddr are loopback addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res.type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;
	if (res.table)
		rth->rt_table_id = res.table->tb_id;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nh, rth))) {
			rth->dst.flags |= DST_NOCACHE;
			rt_add_uncached_list(rth);
		}
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	res.fi = NULL;
	res.table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812).
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic was moved from the route cache to
	   here. The problem was that too many Ethernet cards have
	   broken/missing hardware multicast filters :-( As a result, a
	   host on a multicast network acquires a lot of useless route
	   cache entries, e.g. from SDR messages from all over the world.
	   Now we try to get rid of them. Really, provided the software IP
	   multicast filter is organized reasonably (at least, hashed), it
	   does not result in a slowdown compared with route cache reject
	   entries. Note that multicast routers are not affected, because
	   a route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;

		if (in_dev)
			our = ip_check_mc_rcu(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		res = -EINVAL;
		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			res = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		rcu_read_unlock();
		return res;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);
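
/* A hypothetical sketch (not called anywhere in this file) mirroring what
 * ip_rcv_finish() in ip_input.c does: route an incoming IPv4 skb that has
 * no dst attached yet.  On success skb_dst(skb) is valid and the skb can
 * be handed to dst_input().
 */
static int __maybe_unused example_route_incoming(struct sk_buff *skb,
						 struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
				   iph->tos, dev);
	if (err)
		return err;		/* e.g. -EINVAL for martians */

	return dst_input(skb);		/* ip_local_deliver or ip_forward */
}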

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use the default
		 * one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet, it won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
			rth = rcu_dereference(*prth);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(nh, fl4->daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		if (unlikely(fl4->flowi4_flags &
			     FLOWI_FLAG_KNOWN_NH &&
			     !(nh->nh_gw &&
			       nh->nh_scope == RT_SCOPE_LINK))) {
			do_cache = false;
			goto add;
		}
		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		rth = rcu_dereference(*prth);

rt_cache:
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif	= orig_oif ? : 0;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
	set_lwt_redirect(rth);

	return rth;
}

/*
 * Major route resolver routine.
 */

struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					  int mp_hash)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	int orig_oif;
	int err = -ENETUNREACH;

	res.tclassid	= 0;
	res.fi		= NULL;
	res.table	= NULL;

	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with a saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   the packet will not leave this host and the route
			   is valid). Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, &res, 0);
	if (err) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) ||
		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, the routing
			   tables are looked up with only one purpose:
			   to catch whether the destination is gatewayed,
			   rather than direct. Moreover, if MSG_DONTROUTE
			   is set, we send the packet, ignoring both the
			   routing tables and the ifaddr state. --ANK

			   We could do this even if oif is unknown,
			   as IPv6 likely does, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	fib_select_path(net, &res, fl4, mp_hash);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);

out:
	rcu_read_unlock();
	return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
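
/* A blackhole route is what xfrm_lookup() hands back when a packet cannot
 * be transmitted yet (e.g. an IPsec SA is still being negotiated for a
 * non-blocking socket): it keeps a valid dst attached while
 * dst_discard()/dst_discard_out() silently drop anything sent through it.
 */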

struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
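
/* A hypothetical sketch (not called anywhere in this file) of a minimal
 * output-route lookup: fill a flowi4 key by hand -- inet_rtm_getroute()
 * below does the same from netlink attributes -- and resolve it via the
 * ip_route_output_key() wrapper.  The returned rtable holds a reference
 * which the caller must drop with ip_rt_put().
 */
static struct rtable * __maybe_unused example_lookup_output(struct net *net,
							    __be32 daddr)
{
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;	/* saddr/oif left zero: let the kernel choose */

	return ip_route_output_key(net, &fl4);	/* rtable or ERR_PTR() */
}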

static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= table_id;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
	    nla_put_u32(skb, RTA_UID,
			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, nowait, portid);

			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
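
/* The message built above, roughly (the exact attribute set varies with
 * the route being dumped):
 *
 *	nlmsghdr (RTM_NEWROUTE)
 *	  rtmsg: family, dst_len, tos, table, type, scope, flags
 *	  RTA_TABLE, RTA_DST			always
 *	  RTA_SRC, RTA_IIF			input routes / when src is set
 *	  RTA_OIF, RTA_PREFSRC, RTA_GATEWAY	when applicable
 *	  RTA_FLOW, RTA_METRICS, RTA_MARK,
 *	  RTA_UID, RTA_CACHEINFO		when applicable
 */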

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;
	u32 table_id = RT_TABLE_MAIN;
	kuid_t uid;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = rt->rt_table_id;

	err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err < 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
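
/* This is the kernel side of "ip route get": e.g. "ip route get 8.8.8.8"
 * sends an RTM_GETROUTE request whose RTA_DST (and optional RTA_IIF,
 * RTA_OIF, RTA_MARK, RTA_UID) attributes land here, and the RTM_NEWROUTE
 * reply is built by rt_fill_info() above.
 */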

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}
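
/* The handler above is write-only (the table entry below uses mode 0200):
 * writing any value to /proc/sys/net/ipv4/route/flush, e.g.
 * "echo 1 > /proc/sys/net/ipv4/route/flush", invalidates all cached routes
 * and next-hop exceptions for that netns; non-write accesses fail with
 * -EINVAL.
 */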

static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

int __init ip_rt_init(void)
{
	int rc = 0;
	int cpu;

	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif