// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

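/* Defaults for the route sysctls exposed under /proc/sys/net/ipv4/route/
 * (redirect_number, redirect_load, redirect_silence, error_cost,
 * error_burst, mtu_expires, min_pmtu, min_adv_mss, gc_timeout).
 */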
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (n && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note the following code is not safe, but this is okay. */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

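/* Build a flow key from an IPv4 header. When a socket is supplied, its
 * bound device, mark, TOS and protocol take precedence over the values
 * derived from the packet.
 */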
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

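/* Drop the input and output routes cached on a nexthop exception.
 * All callers hold fnhe_lock.
 */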
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

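/* Record (or refresh) a per-destination nexthop exception carrying
 * learned state such as a redirect gateway or a discovered PMTU.
 * Chains deeper than FNHE_RECLAIM_DEPTH recycle their oldest entry
 * instead of growing further.
 */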
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

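/* Validate an ICMP redirect and, if acceptable, record the new gateway
 * as a nexthop exception. Redirects to multicast, limited-broadcast or
 * zeronet addresses, or from a host that is not our current gateway,
 * are rejected (and optionally logged as martians).
 */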
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set dst.rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

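/* Handle a packet that could not be forwarded: bump the SNMP counters,
 * map the dst error to an ICMP code and, subject to a token-bucket rate
 * limit kept in the inet_peer cache, send the ICMP error.
 */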
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

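/* Apply a learned path MTU to a route. Values below ip_rt_min_pmtu are
 * clamped and the entry is locked against further reduction. The result
 * is stored as a nexthop exception expiring after ip_rt_mtu_expires.
 */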
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so that it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

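/* Effective MTU of a route: a non-expired learned PMTU wins, then an
 * explicit RTAX_MTU metric, then the device MTU, capped at IP_MAX_MTU
 * and reduced by any lwtunnel encapsulation headroom.
 */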
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_gw_family && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

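/* Bind a route to a nexthop exception so later lookups can reuse it.
 * Runs under fnhe_lock; a genid mismatch means the exception is stale,
 * in which case its cached state is cleared first.
 */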
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

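/* Cache a route on the nexthop: the input slot or this CPU's output
 * slot. The slot is claimed with cmpxchg(); if the race is lost, the
 * reference taken here is dropped and false is returned so the caller
 * can fall back to the uncached list.
 */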
1465 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1466 {
1467         struct rtable *orig, *prev, **p;
1468         bool ret = true;
1469
1470         if (rt_is_input_route(rt)) {
1471                 p = (struct rtable **)&nhc->nhc_rth_input;
1472         } else {
1473                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1474         }
1475         orig = *p;
1476
1477         /* hold dst before doing cmpxchg() to avoid race condition
1478          * on this dst
1479          */
1480         dst_hold(&rt->dst);
1481         prev = cmpxchg(p, orig, rt);
1482         if (prev == orig) {
1483                 if (orig) {
1484                         dst_dev_put(&orig->dst);
1485                         dst_release(&orig->dst);
1486                 }
1487         } else {
1488                 dst_release(&rt->dst);
1489                 ret = false;
1490         }
1491
1492         return ret;
1493 }
1494
1495 struct uncached_list {
1496         spinlock_t              lock;
1497         struct list_head        head;
1498 };
1499
1500 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1501
1502 void rt_add_uncached_list(struct rtable *rt)
1503 {
1504         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1505
1506         rt->rt_uncached_list = ul;
1507
1508         spin_lock_bh(&ul->lock);
1509         list_add_tail(&rt->rt_uncached, &ul->head);
1510         spin_unlock_bh(&ul->lock);
1511 }
1512
1513 void rt_del_uncached_list(struct rtable *rt)
1514 {
1515         if (!list_empty(&rt->rt_uncached)) {
1516                 struct uncached_list *ul = rt->rt_uncached_list;
1517
1518                 spin_lock_bh(&ul->lock);
1519                 list_del(&rt->rt_uncached);
1520                 spin_unlock_bh(&ul->lock);
1521         }
1522 }
1523
1524 static void ipv4_dst_destroy(struct dst_entry *dst)
1525 {
1526         struct rtable *rt = (struct rtable *)dst;
1527
1528         ip_dst_metrics_put(dst);
1529         rt_del_uncached_list(rt);
1530 }
1531
1532 void rt_flush_dev(struct net_device *dev)
1533 {
1534         struct net *net = dev_net(dev);
1535         struct rtable *rt;
1536         int cpu;
1537
1538         for_each_possible_cpu(cpu) {
1539                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1540
1541                 spin_lock_bh(&ul->lock);
1542                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1543                         if (rt->dst.dev != dev)
1544                                 continue;
1545                         rt->dst.dev = net->loopback_dev;
1546                         dev_hold(rt->dst.dev);
1547                         dev_put(dev);
1548                 }
1549                 spin_unlock_bh(&ul->lock);
1550         }
1551 }
1552
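/* A cached route is reusable only while it is marked
 * DST_OBSOLETE_FORCE_CHK and its generation id still matches the
 * namespace's current one (checked via rt_is_expired()).
 */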
1553 static bool rt_cache_valid(const struct rtable *rt)
1554 {
1555         return  rt &&
1556                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557                 !rt_is_expired(rt);
1558 }
1559
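/* Finish initializing @rt from the FIB result: inherit the gateway,
 * metrics, classid and lwtunnel state from the selected nexthop, then
 * try to cache the route in the matching exception (@fnhe) or in the
 * nexthop itself; if caching fails, track it on the uncached list.
 */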
1560 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1561                            const struct fib_result *res,
1562                            struct fib_nh_exception *fnhe,
1563                            struct fib_info *fi, u16 type, u32 itag,
1564                            const bool do_cache)
1565 {
1566         bool cached = false;
1567
1568         if (fi) {
1569                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1570
1571                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1572                         rt->rt_gw_family = nhc->nhc_gw_family;
1573                         /* only INET and INET6 are supported */
1574                         if (likely(nhc->nhc_gw_family == AF_INET))
1575                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1576                         else
1577                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1578                 }
1579
1580                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1581
1582 #ifdef CONFIG_IP_ROUTE_CLASSID
1583                 {
1584                         struct fib_nh *nh;
1585
1586                         nh = container_of(nhc, struct fib_nh, nh_common);
1587                         rt->dst.tclassid = nh->nh_tclassid;
1588                 }
1589 #endif
1590                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1591                 if (unlikely(fnhe))
1592                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1593                 else if (do_cache)
1594                         cached = rt_cache_route(nhc, rt);
1595                 if (unlikely(!cached)) {
1596                         /* Routes we intend to cache in the nexthop exception or
1597                          * FIB nexthop have the DST_NOCACHE bit clear.
1598                          * However, if we are unsuccessful at storing this
1599                          * route into the cache, we really need to set it.
1600                          */
1601                         if (!rt->rt_gw4) {
1602                                 rt->rt_gw_family = AF_INET;
1603                                 rt->rt_gw4 = daddr;
1604                         }
1605                         rt_add_uncached_list(rt);
1606                 }
1607         } else
1608                 rt_add_uncached_list(rt);
1609
1610 #ifdef CONFIG_IP_ROUTE_CLASSID
1611 #ifdef CONFIG_IP_MULTIPLE_TABLES
1612         set_class_tag(rt, res->tclassid);
1613 #endif
1614         set_class_tag(rt, itag);
1615 #endif
1616 }
1617
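/* Allocate and minimally initialize a struct rtable.  Routes that will
 * be cached are allocated without DST_HOST; @nopolicy and @noxfrm map to
 * DST_NOPOLICY and DST_NOXFRM.  dst.output defaults to ip_output, and
 * dst.input becomes ip_local_deliver for RTCF_LOCAL routes; callers
 * override either as needed.
 */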
1618 struct rtable *rt_dst_alloc(struct net_device *dev,
1619                             unsigned int flags, u16 type,
1620                             bool nopolicy, bool noxfrm, bool will_cache)
1621 {
1622         struct rtable *rt;
1623
1624         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1625                        (will_cache ? 0 : DST_HOST) |
1626                        (nopolicy ? DST_NOPOLICY : 0) |
1627                        (noxfrm ? DST_NOXFRM : 0));
1628
1629         if (rt) {
1630                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1631                 rt->rt_flags = flags;
1632                 rt->rt_type = type;
1633                 rt->rt_is_input = 0;
1634                 rt->rt_iif = 0;
1635                 rt->rt_pmtu = 0;
1636                 rt->rt_mtu_locked = 0;
1637                 rt->rt_gw_family = 0;
1638                 rt->rt_gw4 = 0;
1639                 INIT_LIST_HEAD(&rt->rt_uncached);
1640
1641                 rt->dst.output = ip_output;
1642                 if (flags & RTCF_LOCAL)
1643                         rt->dst.input = ip_local_deliver;
1644         }
1645
1646         return rt;
1647 }
1648 EXPORT_SYMBOL(rt_dst_alloc);
1649
1650 /* called in rcu_read_lock() section */
1651 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1652                           u8 tos, struct net_device *dev,
1653                           struct in_device *in_dev, u32 *itag)
1654 {
1655         int err;
1656
1657         /* Primary sanity checks. */
1658         if (!in_dev)
1659                 return -EINVAL;
1660
1661         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1662             skb->protocol != htons(ETH_P_IP))
1663                 return -EINVAL;
1664
1665         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1666                 return -EINVAL;
1667
1668         if (ipv4_is_zeronet(saddr)) {
1669                 if (!ipv4_is_local_multicast(daddr) &&
1670                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1671                         return -EINVAL;
1672         } else {
1673                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1674                                           in_dev, itag);
1675                 if (err < 0)
1676                         return err;
1677         }
1678         return 0;
1679 }
1680
1681 /* called in rcu_read_lock() section */
1682 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1683                              u8 tos, struct net_device *dev, int our)
1684 {
1685         struct in_device *in_dev = __in_dev_get_rcu(dev);
1686         unsigned int flags = RTCF_MULTICAST;
1687         struct rtable *rth;
1688         u32 itag = 0;
1689         int err;
1690
1691         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1692         if (err)
1693                 return err;
1694
1695         if (our)
1696                 flags |= RTCF_LOCAL;
1697
1698         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1699                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1700         if (!rth)
1701                 return -ENOBUFS;
1702
1703 #ifdef CONFIG_IP_ROUTE_CLASSID
1704         rth->dst.tclassid = itag;
1705 #endif
1706         rth->dst.output = ip_rt_bug;
1707         rth->rt_is_input = 1;
1708
1709 #ifdef CONFIG_IP_MROUTE
1710         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1711                 rth->dst.input = ip_mr_input;
1712 #endif
1713         RT_CACHE_STAT_INC(in_slow_mc);
1714
1715         skb_dst_set(skb, &rth->dst);
1716         return 0;
1717 }
1718
1719
1720 static void ip_handle_martian_source(struct net_device *dev,
1721                                      struct in_device *in_dev,
1722                                      struct sk_buff *skb,
1723                                      __be32 daddr,
1724                                      __be32 saddr)
1725 {
1726         RT_CACHE_STAT_INC(in_martian_src);
1727 #ifdef CONFIG_IP_ROUTE_VERBOSE
1728         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1729                 /*
1730                  *      RFC1812 recommendation: if the source is martian,
1731                  *      the only hint is the MAC header.
1732                  */
1733                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1734                         &daddr, &saddr, dev->name);
1735                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1736                         print_hex_dump(KERN_WARNING, "ll header: ",
1737                                        DUMP_PREFIX_OFFSET, 16, 1,
1738                                        skb_mac_header(skb),
1739                                        dev->hard_header_len, false);
1740                 }
1741         }
1742 #endif
1743 }
1744
1745 /* called in rcu_read_lock() section */
1746 static int __mkroute_input(struct sk_buff *skb,
1747                            const struct fib_result *res,
1748                            struct in_device *in_dev,
1749                            __be32 daddr, __be32 saddr, u32 tos)
1750 {
1751         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1752         struct net_device *dev = nhc->nhc_dev;
1753         struct fib_nh_exception *fnhe;
1754         struct rtable *rth;
1755         int err;
1756         struct in_device *out_dev;
1757         bool do_cache;
1758         u32 itag = 0;
1759
1760         /* get a working reference to the output device */
1761         out_dev = __in_dev_get_rcu(dev);
1762         if (!out_dev) {
1763                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1764                 return -EINVAL;
1765         }
1766
1767         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1768                                   in_dev->dev, in_dev, &itag);
1769         if (err < 0) {
1770                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1771                                          saddr);
1772
1773                 goto cleanup;
1774         }
1775
1776         do_cache = res->fi && !itag;
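        /* The packet is about to go back out the interface it arrived
         * on; if the source is reachable directly on that link, flag the
         * skb so ip_forward() will send an ICMP redirect.
         */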
1777         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1778             skb->protocol == htons(ETH_P_IP)) {
1779                 __be32 gw;
1780
1781                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1782                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1783                     inet_addr_onlink(out_dev, saddr, gw))
1784                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1785         }
1786
1787         if (skb->protocol != htons(ETH_P_IP)) {
1788                 /* Not IP (e.g. ARP). Do not create a route if it is
1789                  * invalid for proxy arp. DNAT routes are always valid.
1790                  *
1791                  * The proxy arp feature has been extended to allow ARP
1792                  * replies back out the same interface, to support
1793                  * Private VLAN switch technologies. See arp.c.
1794                  */
1795                 if (out_dev == in_dev &&
1796                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1797                         err = -EINVAL;
1798                         goto cleanup;
1799                 }
1800         }
1801
1802         fnhe = find_exception(nhc, daddr);
1803         if (do_cache) {
1804                 if (fnhe)
1805                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1806                 else
1807                         rth = rcu_dereference(nhc->nhc_rth_input);
1808                 if (rt_cache_valid(rth)) {
1809                         skb_dst_set_noref(skb, &rth->dst);
1810                         goto out;
1811                 }
1812         }
1813
1814         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1815                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1816                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1817         if (!rth) {
1818                 err = -ENOBUFS;
1819                 goto cleanup;
1820         }
1821
1822         rth->rt_is_input = 1;
1823         RT_CACHE_STAT_INC(in_slow_tot);
1824
1825         rth->dst.input = ip_forward;
1826
1827         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1828                        do_cache);
1829         lwtunnel_set_redirect(&rth->dst);
1830         skb_dst_set(skb, &rth->dst);
1831 out:
1832         err = 0;
1833  cleanup:
1834         return err;
1835 }
1836
1837 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1838 /* To make ICMP packets follow the right flow, the multipath hash is
1839  * calculated from the inner IP addresses.
1840  */
1841 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1842                                  struct flow_keys *hash_keys)
1843 {
1844         const struct iphdr *outer_iph = ip_hdr(skb);
1845         const struct iphdr *key_iph = outer_iph;
1846         const struct iphdr *inner_iph;
1847         const struct icmphdr *icmph;
1848         struct iphdr _inner_iph;
1849         struct icmphdr _icmph;
1850
1851         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1852                 goto out;
1853
1854         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1855                 goto out;
1856
1857         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1858                                    &_icmph);
1859         if (!icmph)
1860                 goto out;
1861
1862         if (icmph->type != ICMP_DEST_UNREACH &&
1863             icmph->type != ICMP_REDIRECT &&
1864             icmph->type != ICMP_TIME_EXCEEDED &&
1865             icmph->type != ICMP_PARAMETERPROB)
1866                 goto out;
1867
1868         inner_iph = skb_header_pointer(skb,
1869                                        outer_iph->ihl * 4 + sizeof(_icmph),
1870                                        sizeof(_inner_iph), &_inner_iph);
1871         if (!inner_iph)
1872                 goto out;
1873
1874         key_iph = inner_iph;
1875 out:
1876         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1877         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1878 }
1879
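/* Hash a flow for multipath selection according to the
 * net.ipv4.fib_multipath_hash_policy sysctl: 0 hashes on L3 (the
 * source/destination pair, looking inside ICMP errors via
 * ip_multipath_l3_keys()), 1 hashes on the L4 five-tuple.  The 32-bit
 * flow hash is shifted right by one bit before being returned, keeping
 * it within the positive range of a signed int.
 */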
1880 /* if skb is set, it will be used and fl4 can be NULL */
1881 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1882                        const struct sk_buff *skb, struct flow_keys *flkeys)
1883 {
1884         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1885         struct flow_keys hash_keys;
1886         u32 mhash;
1887
1888         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1889         case 0:
1890                 memset(&hash_keys, 0, sizeof(hash_keys));
1891                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1892                 if (skb) {
1893                         ip_multipath_l3_keys(skb, &hash_keys);
1894                 } else {
1895                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1896                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1897                 }
1898                 break;
1899         case 1:
1900                 /* skb is currently provided only when forwarding */
1901                 if (skb) {
1902                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1903                         struct flow_keys keys;
1904
1905                         /* short-circuit if we already have L4 hash present */
1906                         if (skb->l4_hash)
1907                                 return skb_get_hash_raw(skb) >> 1;
1908
1909                         memset(&hash_keys, 0, sizeof(hash_keys));
1910
1911                         if (!flkeys) {
1912                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1913                                 flkeys = &keys;
1914                         }
1915
1916                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1917                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1918                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1919                         hash_keys.ports.src = flkeys->ports.src;
1920                         hash_keys.ports.dst = flkeys->ports.dst;
1921                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1922                 } else {
1923                         memset(&hash_keys, 0, sizeof(hash_keys));
1924                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1925                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1926                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1927                         hash_keys.ports.src = fl4->fl4_sport;
1928                         hash_keys.ports.dst = fl4->fl4_dport;
1929                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1930                 }
1931                 break;
1932         }
1933         mhash = flow_hash_from_keys(&hash_keys);
1934
1935         if (multipath_hash)
1936                 mhash = jhash_2words(mhash, multipath_hash, 0);
1937
1938         return mhash >> 1;
1939 }
1940 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1941
1942 static int ip_mkroute_input(struct sk_buff *skb,
1943                             struct fib_result *res,
1944                             struct in_device *in_dev,
1945                             __be32 daddr, __be32 saddr, u32 tos,
1946                             struct flow_keys *hkeys)
1947 {
1948 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1949         if (res->fi && res->fi->fib_nhs > 1) {
1950                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1951
1952                 fib_select_multipath(res, h);
1953         }
1954 #endif
1955
1956         /* create a routing cache entry */
1957         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1958 }
1959
1960 /*
1961  *      NOTE. We drop all packets that have a local source address,
1962  *      because every properly looped-back packet must already have
1963  *      the correct destination attached by the output routine.
1964  *
1965  *      This approach solves two big problems:
1966  *      1. Non-simplex devices are handled properly.
1967  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1968  *      Called with rcu_read_lock().
1969  */
1970
1971 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1972                                u8 tos, struct net_device *dev,
1973                                struct fib_result *res)
1974 {
1975         struct in_device *in_dev = __in_dev_get_rcu(dev);
1976         struct flow_keys *flkeys = NULL, _flkeys;
1977         struct net    *net = dev_net(dev);
1978         struct ip_tunnel_info *tun_info;
1979         int             err = -EINVAL;
1980         unsigned int    flags = 0;
1981         u32             itag = 0;
1982         struct rtable   *rth;
1983         struct flowi4   fl4;
1984         bool do_cache = true;
1985
1986         /* IP on this device is disabled. */
1987
1988         if (!in_dev)
1989                 goto out;
1990
1991         /* Check for the most weird martians, which can be not detected
1992            by fib_lookup.
1993          */
1994
1995         tun_info = skb_tunnel_info(skb);
1996         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1997                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1998         else
1999                 fl4.flowi4_tun_key.tun_id = 0;
2000         skb_dst_drop(skb);
2001
2002         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2003                 goto martian_source;
2004
2005         res->fi = NULL;
2006         res->table = NULL;
2007         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2008                 goto brd_input;
2009
2010         /* Accept zero addresses only for limited broadcast;
2011          * I do not even know whether to fix this or not. Waiting for complaints :-)
2012          */
2013         if (ipv4_is_zeronet(saddr))
2014                 goto martian_source;
2015
2016         if (ipv4_is_zeronet(daddr))
2017                 goto martian_destination;
2018
2019         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2020          * calling it at most once when daddr and/or saddr is a loopback address.
2021          */
2022         if (ipv4_is_loopback(daddr)) {
2023                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2024                         goto martian_destination;
2025         } else if (ipv4_is_loopback(saddr)) {
2026                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2027                         goto martian_source;
2028         }
2029
2030         /*
2031          *      Now we are ready to route packet.
2032          */
2033         fl4.flowi4_oif = 0;
2034         fl4.flowi4_iif = dev->ifindex;
2035         fl4.flowi4_mark = skb->mark;
2036         fl4.flowi4_tos = tos;
2037         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2038         fl4.flowi4_flags = 0;
2039         fl4.daddr = daddr;
2040         fl4.saddr = saddr;
2041         fl4.flowi4_uid = sock_net_uid(net, NULL);
2042
2043         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2044                 flkeys = &_flkeys;
2045         } else {
2046                 fl4.flowi4_proto = 0;
2047                 fl4.fl4_sport = 0;
2048                 fl4.fl4_dport = 0;
2049         }
2050
2051         err = fib_lookup(net, &fl4, res, 0);
2052         if (err != 0) {
2053                 if (!IN_DEV_FORWARD(in_dev))
2054                         err = -EHOSTUNREACH;
2055                 goto no_route;
2056         }
2057
2058         if (res->type == RTN_BROADCAST) {
2059                 if (IN_DEV_BFORWARD(in_dev))
2060                         goto make_route;
2061                 /* do not cache if bc_forwarding is enabled */
2062                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2063                         do_cache = false;
2064                 goto brd_input;
2065         }
2066
2067         if (res->type == RTN_LOCAL) {
2068                 err = fib_validate_source(skb, saddr, daddr, tos,
2069                                           0, dev, in_dev, &itag);
2070                 if (err < 0)
2071                         goto martian_source;
2072                 goto local_input;
2073         }
2074
2075         if (!IN_DEV_FORWARD(in_dev)) {
2076                 err = -EHOSTUNREACH;
2077                 goto no_route;
2078         }
2079         if (res->type != RTN_UNICAST)
2080                 goto martian_destination;
2081
2082 make_route:
2083         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2084 out:    return err;
2085
2086 brd_input:
2087         if (skb->protocol != htons(ETH_P_IP))
2088                 goto e_inval;
2089
2090         if (!ipv4_is_zeronet(saddr)) {
2091                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2092                                           in_dev, &itag);
2093                 if (err < 0)
2094                         goto martian_source;
2095         }
2096         flags |= RTCF_BROADCAST;
2097         res->type = RTN_BROADCAST;
2098         RT_CACHE_STAT_INC(in_brd);
2099
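/* Local delivery: reuse a cached input route when one is valid,
 * otherwise allocate one bound to the loopback (or l3mdev master)
 * device.  RTN_UNREACHABLE lookups also land here so the error gets
 * cached and reported by ip_error().
 */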
2100 local_input:
2101         do_cache &= res->fi && !itag;
2102         if (do_cache) {
2103                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2104
2105                 rth = rcu_dereference(nhc->nhc_rth_input);
2106                 if (rt_cache_valid(rth)) {
2107                         skb_dst_set_noref(skb, &rth->dst);
2108                         err = 0;
2109                         goto out;
2110                 }
2111         }
2112
2113         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2114                            flags | RTCF_LOCAL, res->type,
2115                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2116         if (!rth)
2117                 goto e_nobufs;
2118
2119         rth->dst.output = ip_rt_bug;
2120 #ifdef CONFIG_IP_ROUTE_CLASSID
2121         rth->dst.tclassid = itag;
2122 #endif
2123         rth->rt_is_input = 1;
2124
2125         RT_CACHE_STAT_INC(in_slow_tot);
2126         if (res->type == RTN_UNREACHABLE) {
2127                 rth->dst.input = ip_error;
2128                 rth->dst.error = -err;
2129                 rth->rt_flags &= ~RTCF_LOCAL;
2130         }
2131
2132         if (do_cache) {
2133                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2134
2135                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2136                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2137                         WARN_ON(rth->dst.input == lwtunnel_input);
2138                         rth->dst.lwtstate->orig_input = rth->dst.input;
2139                         rth->dst.input = lwtunnel_input;
2140                 }
2141
2142                 if (unlikely(!rt_cache_route(nhc, rth)))
2143                         rt_add_uncached_list(rth);
2144         }
2145         skb_dst_set(skb, &rth->dst);
2146         err = 0;
2147         goto out;
2148
2149 no_route:
2150         RT_CACHE_STAT_INC(in_no_route);
2151         res->type = RTN_UNREACHABLE;
2152         res->fi = NULL;
2153         res->table = NULL;
2154         goto local_input;
2155
2156         /*
2157          *      Do not cache martian addresses: they should be logged (RFC1812)
2158          */
2159 martian_destination:
2160         RT_CACHE_STAT_INC(in_martian_dst);
2161 #ifdef CONFIG_IP_ROUTE_VERBOSE
2162         if (IN_DEV_LOG_MARTIANS(in_dev))
2163                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2164                                      &daddr, &saddr, dev->name);
2165 #endif
2166
2167 e_inval:
2168         err = -EINVAL;
2169         goto out;
2170
2171 e_nobufs:
2172         err = -ENOBUFS;
2173         goto out;
2174
2175 martian_source:
2176         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2177         goto out;
2178 }
2179
2180 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2181                          u8 tos, struct net_device *dev)
2182 {
2183         struct fib_result res;
2184         int err;
2185
2186         tos &= IPTOS_RT_MASK;
2187         rcu_read_lock();
2188         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2189         rcu_read_unlock();
2190
2191         return err;
2192 }
2193 EXPORT_SYMBOL(ip_route_input_noref);
2194
2195 /* called with rcu_read_lock held */
2196 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2197                        u8 tos, struct net_device *dev, struct fib_result *res)
2198 {
2199         /* Multicast recognition logic was moved from the route cache to here.
2200            The problem was that too many Ethernet cards have broken/missing
2201            hardware multicast filters :-( As a result, a host on a multicast
2202            network acquires a lot of useless route cache entries, e.g. for
2203            SDR messages from all over the world. Now we try to get rid of them.
2204            Really, provided the software IP multicast filter is organized
2205            reasonably (at least, hashed), this does not result in a slowdown
2206            compared with route cache reject entries.
2207            Note that multicast routers are not affected, because a route
2208            cache entry is created for them eventually.
2209          */
2210         if (ipv4_is_multicast(daddr)) {
2211                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2212                 int our = 0;
2213                 int err = -EINVAL;
2214
2215                 if (!in_dev)
2216                         return err;
2217                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2218                                       ip_hdr(skb)->protocol);
2219
2220                 /* check l3 master if no match yet */
2221                 if (!our && netif_is_l3_slave(dev)) {
2222                         struct in_device *l3_in_dev;
2223
2224                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2225                         if (l3_in_dev)
2226                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2227                                                       ip_hdr(skb)->protocol);
2228                 }
2229
2230                 if (our
2231 #ifdef CONFIG_IP_MROUTE
2232                         ||
2233                     (!ipv4_is_local_multicast(daddr) &&
2234                      IN_DEV_MFORWARD(in_dev))
2235 #endif
2236                    ) {
2237                         err = ip_route_input_mc(skb, daddr, saddr,
2238                                                 tos, dev, our);
2239                 }
2240                 return err;
2241         }
2242
2243         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2244 }
2245
2246 /* called with rcu_read_lock() */
2247 static struct rtable *__mkroute_output(const struct fib_result *res,
2248                                        const struct flowi4 *fl4, int orig_oif,
2249                                        struct net_device *dev_out,
2250                                        unsigned int flags)
2251 {
2252         struct fib_info *fi = res->fi;
2253         struct fib_nh_exception *fnhe;
2254         struct in_device *in_dev;
2255         u16 type = res->type;
2256         struct rtable *rth;
2257         bool do_cache;
2258
2259         in_dev = __in_dev_get_rcu(dev_out);
2260         if (!in_dev)
2261                 return ERR_PTR(-EINVAL);
2262
2263         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2264                 if (ipv4_is_loopback(fl4->saddr) &&
2265                     !(dev_out->flags & IFF_LOOPBACK) &&
2266                     !netif_is_l3_master(dev_out))
2267                         return ERR_PTR(-EINVAL);
2268
2269         if (ipv4_is_lbcast(fl4->daddr))
2270                 type = RTN_BROADCAST;
2271         else if (ipv4_is_multicast(fl4->daddr))
2272                 type = RTN_MULTICAST;
2273         else if (ipv4_is_zeronet(fl4->daddr))
2274                 return ERR_PTR(-EINVAL);
2275
2276         if (dev_out->flags & IFF_LOOPBACK)
2277                 flags |= RTCF_LOCAL;
2278
2279         do_cache = true;
2280         if (type == RTN_BROADCAST) {
2281                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2282                 fi = NULL;
2283         } else if (type == RTN_MULTICAST) {
2284                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2285                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2286                                      fl4->flowi4_proto))
2287                         flags &= ~RTCF_LOCAL;
2288                 else
2289                         do_cache = false;
2290                 /* If a multicast route does not exist, use the
2291                  * default one, but do not gateway in this case.
2292                  * Yes, it is a hack.
2293                  */
2294                 if (fi && res->prefixlen < 4)
2295                         fi = NULL;
2296         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2297                    (orig_oif != dev_out->ifindex)) {
2298                 /* For local routes that require a particular output interface
2299                  * we do not want to cache the result.  Caching the result
2300                  * causes incorrect behaviour when there are multiple source
2301                  * addresses on the interface, the end result being that if the
2302                  * intended recipient is waiting on that interface for the
2303                  * packet he won't receive it because it will be delivered on
2304                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2305                  * be set to the loopback interface as well.
2306                  */
2307                 do_cache = false;
2308         }
2309
2310         fnhe = NULL;
2311         do_cache &= fi != NULL;
2312         if (fi) {
2313                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2314                 struct rtable __rcu **prth;
2315
2316                 fnhe = find_exception(nhc, fl4->daddr);
2317                 if (!do_cache)
2318                         goto add;
2319                 if (fnhe) {
2320                         prth = &fnhe->fnhe_rth_output;
2321                 } else {
2322                         if (unlikely(fl4->flowi4_flags &
2323                                      FLOWI_FLAG_KNOWN_NH &&
2324                                      !(nhc->nhc_gw_family &&
2325                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2326                                 do_cache = false;
2327                                 goto add;
2328                         }
2329                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2330                 }
2331                 rth = rcu_dereference(*prth);
2332                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2333                         return rth;
2334         }
2335
2336 add:
2337         rth = rt_dst_alloc(dev_out, flags, type,
2338                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2339                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2340                            do_cache);
2341         if (!rth)
2342                 return ERR_PTR(-ENOBUFS);
2343
2344         rth->rt_iif = orig_oif;
2345
2346         RT_CACHE_STAT_INC(out_slow_tot);
2347
2348         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2349                 if (flags & RTCF_LOCAL &&
2350                     !(dev_out->flags & IFF_LOOPBACK)) {
2351                         rth->dst.output = ip_mc_output;
2352                         RT_CACHE_STAT_INC(out_slow_mc);
2353                 }
2354 #ifdef CONFIG_IP_MROUTE
2355                 if (type == RTN_MULTICAST) {
2356                         if (IN_DEV_MFORWARD(in_dev) &&
2357                             !ipv4_is_local_multicast(fl4->daddr)) {
2358                                 rth->dst.input = ip_mr_input;
2359                                 rth->dst.output = ip_mc_output;
2360                         }
2361                 }
2362 #endif
2363         }
2364
2365         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2366         lwtunnel_set_redirect(&rth->dst);
2367
2368         return rth;
2369 }
2370
2371 /*
2372  * Major route resolver routine.
2373  */
2374
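/* Normalize the flow before the lookup: force the loopback iif, mask the
 * tos down to the bits routing actually uses, and honour RTO_ONLINK by
 * restricting the scope to RT_SCOPE_LINK (i.e. no gateway may be used).
 */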
2375 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2376                                         const struct sk_buff *skb)
2377 {
2378         __u8 tos = RT_FL_TOS(fl4);
2379         struct fib_result res = {
2380                 .type           = RTN_UNSPEC,
2381                 .fi             = NULL,
2382                 .table          = NULL,
2383                 .tclassid       = 0,
2384         };
2385         struct rtable *rth;
2386
2387         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2388         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2389         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2390                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2391
2392         rcu_read_lock();
2393         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2394         rcu_read_unlock();
2395
2396         return rth;
2397 }
2398 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2399
2400 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2401                                             struct fib_result *res,
2402                                             const struct sk_buff *skb)
2403 {
2404         struct net_device *dev_out = NULL;
2405         int orig_oif = fl4->flowi4_oif;
2406         unsigned int flags = 0;
2407         struct rtable *rth;
2408         int err = -ENETUNREACH;
2409
2410         if (fl4->saddr) {
2411                 rth = ERR_PTR(-EINVAL);
2412                 if (ipv4_is_multicast(fl4->saddr) ||
2413                     ipv4_is_lbcast(fl4->saddr) ||
2414                     ipv4_is_zeronet(fl4->saddr))
2415                         goto out;
2416
2417                 /* I removed the check for oif == dev_out->oif here.
2418                    It was wrong for two reasons:
2419                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2420                       is assigned to multiple interfaces.
2421                    2. Moreover, we are allowed to send packets with the saddr
2422                       of another iface. --ANK
2423                  */
2424
2425                 if (fl4->flowi4_oif == 0 &&
2426                     (ipv4_is_multicast(fl4->daddr) ||
2427                      ipv4_is_lbcast(fl4->daddr))) {
2428                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2429                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2430                         if (!dev_out)
2431                                 goto out;
2432
2433                         /* Special hack: the user can direct multicasts
2434                            and limited broadcast via the necessary interface
2435                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2436                            This hack is not just for fun, it allows
2437                            vic, vat and friends to work.
2438                            They bind the socket to loopback, set ttl to zero
2439                            and expect that it will work.
2440                            From the viewpoint of the routing cache they are broken,
2441                            because we are not allowed to build a multicast path
2442                            with a loopback source addr (look, the routing cache
2443                            cannot know that ttl is zero, so the packet
2444                            will not leave this host and the route is valid).
2445                            Luckily, this hack is a good workaround.
2446                          */
2447
2448                         fl4->flowi4_oif = dev_out->ifindex;
2449                         goto make_route;
2450                 }
2451
2452                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2453                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2454                         if (!__ip_dev_find(net, fl4->saddr, false))
2455                                 goto out;
2456                 }
2457         }
2458
2459
2460         if (fl4->flowi4_oif) {
2461                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2462                 rth = ERR_PTR(-ENODEV);
2463                 if (!dev_out)
2464                         goto out;
2465
2466                 /* RACE: Check return value of inet_select_addr instead. */
2467                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2468                         rth = ERR_PTR(-ENETUNREACH);
2469                         goto out;
2470                 }
2471                 if (ipv4_is_local_multicast(fl4->daddr) ||
2472                     ipv4_is_lbcast(fl4->daddr) ||
2473                     fl4->flowi4_proto == IPPROTO_IGMP) {
2474                         if (!fl4->saddr)
2475                                 fl4->saddr = inet_select_addr(dev_out, 0,
2476                                                               RT_SCOPE_LINK);
2477                         goto make_route;
2478                 }
2479                 if (!fl4->saddr) {
2480                         if (ipv4_is_multicast(fl4->daddr))
2481                                 fl4->saddr = inet_select_addr(dev_out, 0,
2482                                                               fl4->flowi4_scope);
2483                         else if (!fl4->daddr)
2484                                 fl4->saddr = inet_select_addr(dev_out, 0,
2485                                                               RT_SCOPE_HOST);
2486                 }
2487         }
2488
2489         if (!fl4->daddr) {
2490                 fl4->daddr = fl4->saddr;
2491                 if (!fl4->daddr)
2492                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2493                 dev_out = net->loopback_dev;
2494                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2495                 res->type = RTN_LOCAL;
2496                 flags |= RTCF_LOCAL;
2497                 goto make_route;
2498         }
2499
2500         err = fib_lookup(net, fl4, res, 0);
2501         if (err) {
2502                 res->fi = NULL;
2503                 res->table = NULL;
2504                 if (fl4->flowi4_oif &&
2505                     (ipv4_is_multicast(fl4->daddr) ||
2506                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2507                         /* Apparently, the routing tables are wrong. Assume
2508                            that the destination is on-link.
2509
2510                            WHY? DW.
2511                            Because we are allowed to send to an iface
2512                            even if it has NO routes and NO assigned
2513                            addresses. When oif is specified, the routing
2514                            tables are looked up with only one purpose:
2515                            to catch whether the destination is gatewayed,
2516                            rather than direct. Moreover, if MSG_DONTROUTE
2517                            is set, we send the packet, ignoring both the
2518                            routing tables and the ifaddr state. --ANK
2519
2520
2521                            We could do this even if oif is unknown,
2522                            as IPv6 likely does, but we do not.
2523                          */
2524
2525                         if (fl4->saddr == 0)
2526                                 fl4->saddr = inet_select_addr(dev_out, 0,
2527                                                               RT_SCOPE_LINK);
2528                         res->type = RTN_UNICAST;
2529                         goto make_route;
2530                 }
2531                 rth = ERR_PTR(err);
2532                 goto out;
2533         }
2534
2535         if (res->type == RTN_LOCAL) {
2536                 if (!fl4->saddr) {
2537                         if (res->fi->fib_prefsrc)
2538                                 fl4->saddr = res->fi->fib_prefsrc;
2539                         else
2540                                 fl4->saddr = fl4->daddr;
2541                 }
2542
2543                 /* L3 master device is the loopback for that domain */
2544                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2545                         net->loopback_dev;
2546
2547                 /* make sure orig_oif points to fib result device even
2548                  * though packet rx/tx happens over loopback or l3mdev
2549                  */
2550                 orig_oif = FIB_RES_OIF(*res);
2551
2552                 fl4->flowi4_oif = dev_out->ifindex;
2553                 flags |= RTCF_LOCAL;
2554                 goto make_route;
2555         }
2556
2557         fib_select_path(net, res, fl4, skb);
2558
2559         dev_out = FIB_RES_DEV(*res);
2560         fl4->flowi4_oif = dev_out->ifindex;
2561
2562
2563 make_route:
2564         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2565
2566 out:
2567         return rth;
2568 }
2569
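/* Blackhole dst_ops: every callback is a no-op or returns NULL, so the
 * resulting dst never learns PMTU and never redirects, and
 * ipv4_blackhole_route() makes it discard every packet.  xfrm_lookup(),
 * for instance, hands out such a route while an IPsec state is still
 * being resolved.
 */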
2570 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2571 {
2572         return NULL;
2573 }
2574
2575 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2576 {
2577         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2578
2579         return mtu ? : dst->dev->mtu;
2580 }
2581
2582 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2583                                           struct sk_buff *skb, u32 mtu)
2584 {
2585 }
2586
2587 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2588                                        struct sk_buff *skb)
2589 {
2590 }
2591
2592 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2593                                           unsigned long old)
2594 {
2595         return NULL;
2596 }
2597
2598 static struct dst_ops ipv4_dst_blackhole_ops = {
2599         .family                 =       AF_INET,
2600         .check                  =       ipv4_blackhole_dst_check,
2601         .mtu                    =       ipv4_blackhole_mtu,
2602         .default_advmss         =       ipv4_default_advmss,
2603         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2604         .redirect               =       ipv4_rt_blackhole_redirect,
2605         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2606         .neigh_lookup           =       ipv4_neigh_lookup,
2607 };
2608
2609 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2610 {
2611         struct rtable *ort = (struct rtable *) dst_orig;
2612         struct rtable *rt;
2613
2614         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2615         if (rt) {
2616                 struct dst_entry *new = &rt->dst;
2617
2618                 new->__use = 1;
2619                 new->input = dst_discard;
2620                 new->output = dst_discard_out;
2621
2622                 new->dev = net->loopback_dev;
2623                 if (new->dev)
2624                         dev_hold(new->dev);
2625
2626                 rt->rt_is_input = ort->rt_is_input;
2627                 rt->rt_iif = ort->rt_iif;
2628                 rt->rt_pmtu = ort->rt_pmtu;
2629                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2630
2631                 rt->rt_genid = rt_genid_ipv4(net);
2632                 rt->rt_flags = ort->rt_flags;
2633                 rt->rt_type = ort->rt_type;
2634                 rt->rt_gw_family = ort->rt_gw_family;
2635                 if (rt->rt_gw_family == AF_INET)
2636                         rt->rt_gw4 = ort->rt_gw4;
2637                 else if (rt->rt_gw_family == AF_INET6)
2638                         rt->rt_gw6 = ort->rt_gw6;
2639
2640                 INIT_LIST_HEAD(&rt->rt_uncached);
2641         }
2642
2643         dst_release(dst_orig);
2644
2645         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2646 }
2647
2648 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2649                                     const struct sock *sk)
2650 {
2651         struct rtable *rt = __ip_route_output_key(net, flp4);
2652
2653         if (IS_ERR(rt))
2654                 return rt;
2655
2656         if (flp4->flowi4_proto)
2657                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2658                                                         flowi4_to_flowi(flp4),
2659                                                         sk, 0);
2660
2661         return rt;
2662 }
2663 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2664
2665 /* called with rcu_read_lock held */
2666 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2667                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2668                         struct sk_buff *skb, u32 portid, u32 seq)
2669 {
2670         struct rtmsg *r;
2671         struct nlmsghdr *nlh;
2672         unsigned long expires = 0;
2673         u32 error;
2674         u32 metrics[RTAX_MAX];
2675
2676         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2677         if (!nlh)
2678                 return -EMSGSIZE;
2679
2680         r = nlmsg_data(nlh);
2681         r->rtm_family    = AF_INET;
2682         r->rtm_dst_len  = 32;
2683         r->rtm_src_len  = 0;
2684         r->rtm_tos      = fl4->flowi4_tos;
2685         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2686         if (nla_put_u32(skb, RTA_TABLE, table_id))
2687                 goto nla_put_failure;
2688         r->rtm_type     = rt->rt_type;
2689         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2690         r->rtm_protocol = RTPROT_UNSPEC;
2691         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2692         if (rt->rt_flags & RTCF_NOTIFY)
2693                 r->rtm_flags |= RTM_F_NOTIFY;
2694         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2695                 r->rtm_flags |= RTCF_DOREDIRECT;
2696
2697         if (nla_put_in_addr(skb, RTA_DST, dst))
2698                 goto nla_put_failure;
2699         if (src) {
2700                 r->rtm_src_len = 32;
2701                 if (nla_put_in_addr(skb, RTA_SRC, src))
2702                         goto nla_put_failure;
2703         }
2704         if (rt->dst.dev &&
2705             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2706                 goto nla_put_failure;
2707 #ifdef CONFIG_IP_ROUTE_CLASSID
2708         if (rt->dst.tclassid &&
2709             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2710                 goto nla_put_failure;
2711 #endif
2712         if (!rt_is_input_route(rt) &&
2713             fl4->saddr != src) {
2714                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2715                         goto nla_put_failure;
2716         }
2717         if (rt->rt_gw_family == AF_INET &&
2718             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2719                 goto nla_put_failure;
2720         } else if (rt->rt_gw_family == AF_INET6) {
2721                 int alen = sizeof(struct in6_addr);
2722                 struct nlattr *nla;
2723                 struct rtvia *via;
2724
2725                 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2726                 if (!nla)
2727                         goto nla_put_failure;
2728
2729                 via = nla_data(nla);
2730                 via->rtvia_family = AF_INET6;
2731                 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2732         }
2733
2734         expires = rt->dst.expires;
2735         if (expires) {
2736                 unsigned long now = jiffies;
2737
2738                 if (time_before(now, expires))
2739                         expires -= now;
2740                 else
2741                         expires = 0;
2742         }
2743
2744         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2745         if (rt->rt_pmtu && expires)
2746                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2747         if (rt->rt_mtu_locked && expires)
2748                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2749         if (rtnetlink_put_metrics(skb, metrics) < 0)
2750                 goto nla_put_failure;
2751
2752         if (fl4->flowi4_mark &&
2753             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2754                 goto nla_put_failure;
2755
2756         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2757             nla_put_u32(skb, RTA_UID,
2758                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2759                 goto nla_put_failure;
2760
2761         error = rt->dst.error;
2762
2763         if (rt_is_input_route(rt)) {
2764 #ifdef CONFIG_IP_MROUTE
2765                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2766                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2767                         int err = ipmr_get_route(net, skb,
2768                                                  fl4->saddr, fl4->daddr,
2769                                                  r, portid);
2770
2771                         if (err <= 0) {
2772                                 if (err == 0)
2773                                         return 0;
2774                                 goto nla_put_failure;
2775                         }
2776                 } else
2777 #endif
2778                         if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2779                                 goto nla_put_failure;
2780         }
2781
2782         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2783                 goto nla_put_failure;
2784
2785         nlmsg_end(skb, nlh);
2786         return 0;
2787
2788 nla_put_failure:
2789         nlmsg_cancel(skb, nlh);
2790         return -EMSGSIZE;
2791 }
2792
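/* Build a minimal dummy packet (IPv4 header plus a UDP, TCP or ICMP
 * stub) so that an RTM_GETROUTE request carrying RTA_IP_PROTO and port
 * attributes can be pushed through the real input path, exercising e.g.
 * multipath hash selection.
 */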
2793 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2794                                                    u8 ip_proto, __be16 sport,
2795                                                    __be16 dport)
2796 {
2797         struct sk_buff *skb;
2798         struct iphdr *iph;
2799
2800         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2801         if (!skb)
2802                 return NULL;
2803
2804         /* Reserve room for dummy headers; this skb can pass
2805          * through a good chunk of the routing engine.
2806          */
2807         skb_reset_mac_header(skb);
2808         skb_reset_network_header(skb);
2809         skb->protocol = htons(ETH_P_IP);
2810         iph = skb_put(skb, sizeof(struct iphdr));
2811         iph->protocol = ip_proto;
2812         iph->saddr = src;
2813         iph->daddr = dst;
2814         iph->version = 0x4;
2815         iph->frag_off = 0;
2816         iph->ihl = 0x5;
2817         skb_set_transport_header(skb, skb->len);
2818
2819         switch (iph->protocol) {
2820         case IPPROTO_UDP: {
2821                 struct udphdr *udph;
2822
2823                 udph = skb_put_zero(skb, sizeof(struct udphdr));
2824                 udph->source = sport;
2825                 udph->dest = dport;
2826                 udph->len = sizeof(struct udphdr);
2827                 udph->check = 0;
2828                 break;
2829         }
2830         case IPPROTO_TCP: {
2831                 struct tcphdr *tcph;
2832
2833                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2834                 tcph->source    = sport;
2835                 tcph->dest      = dport;
2836                 tcph->doff      = sizeof(struct tcphdr) / 4;
2837                 tcph->rst = 1;
2838                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2839                                             src, dst, 0);
2840                 break;
2841         }
2842         case IPPROTO_ICMP: {
2843                 struct icmphdr *icmph;
2844
2845                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2846                 icmph->type = ICMP_ECHO;
2847                 icmph->code = 0;
2848         }
2849         }
2850
2851         return skb;
2852 }
2853
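/* Validate an RTM_GETROUTE request.  Under strict checking, the header
 * fields must be zero (src/dst prefix lengths may also be 32), only a
 * small set of attributes is accepted, and an address attribute must be
 * accompanied by its matching non-zero prefix length.
 */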
2854 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
2855                                        const struct nlmsghdr *nlh,
2856                                        struct nlattr **tb,
2857                                        struct netlink_ext_ack *extack)
2858 {
2859         struct rtmsg *rtm;
2860         int i, err;
2861
2862         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
2863                 NL_SET_ERR_MSG(extack,
2864                                "ipv4: Invalid header for route get request");
2865                 return -EINVAL;
2866         }
2867
2868         if (!netlink_strict_get_check(skb))
2869                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
2870                                               rtm_ipv4_policy, extack);
2871
2872         rtm = nlmsg_data(nlh);
2873         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
2874             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
2875             rtm->rtm_table || rtm->rtm_protocol ||
2876             rtm->rtm_scope || rtm->rtm_type) {
2877                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
2878                 return -EINVAL;
2879         }
2880
2881         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
2882                                RTM_F_LOOKUP_TABLE |
2883                                RTM_F_FIB_MATCH)) {
2884                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
2885                 return -EINVAL;
2886         }
2887
2888         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
2889                                             rtm_ipv4_policy, extack);
2890         if (err)
2891                 return err;
2892
2893         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
2894             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
2895                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
2896                 return -EINVAL;
2897         }
2898
2899         for (i = 0; i <= RTA_MAX; i++) {
2900                 if (!tb[i])
2901                         continue;
2902
2903                 switch (i) {
2904                 case RTA_IIF:
2905                 case RTA_OIF:
2906                 case RTA_SRC:
2907                 case RTA_DST:
2908                 case RTA_IP_PROTO:
2909                 case RTA_SPORT:
2910                 case RTA_DPORT:
2911                 case RTA_MARK:
2912                 case RTA_UID:
2913                         break;
2914                 default:
2915                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
2916                         return -EINVAL;
2917                 }
2918         }
2919
2920         return 0;
2921 }
2922
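/* RTM_GETROUTE handler.  A query such as
 *
 *      ip route get 10.1.2.3 from 10.0.0.1 iif eth0 ipproto udp dport 53
 *
 * lands here: when an iif is given, the dummy packet is run through the
 * input path via ip_route_input_rcu(); otherwise an output route is
 * resolved with ip_route_output_key_hash_rcu().
 */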
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        u32 table_id = RT_TABLE_MAIN;
        __be16 sport = 0, dport = 0;
        struct fib_result res = {};
        u8 ip_proto = IPPROTO_UDP;
        struct rtable *rt = NULL;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi4 fl4 = {};
        __be32 dst = 0;
        __be32 src = 0;
        kuid_t uid;
        u32 iif;
        int err;
        int mark;

        err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        rtm = nlmsg_data(nlh);
        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
        if (tb[RTA_UID])
                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
        else
                uid = (iif ? INVALID_UID : current_uid());

        if (tb[RTA_IP_PROTO]) {
                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
                                                  &ip_proto, AF_INET, extack);
                if (err)
                        return err;
        }

        if (tb[RTA_SPORT])
                sport = nla_get_be16(tb[RTA_SPORT]);

        if (tb[RTA_DPORT])
                dport = nla_get_be16(tb[RTA_DPORT]);

        skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
        if (!skb)
                return -ENOBUFS;

        fl4.daddr = dst;
        fl4.saddr = src;
        fl4.flowi4_tos = rtm->rtm_tos;
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;
        fl4.flowi4_uid = uid;
        if (sport)
                fl4.fl4_sport = sport;
        if (dport)
                fl4.fl4_dport = dport;
        fl4.flowi4_proto = ip_proto;

        rcu_read_lock();

        if (iif) {
                struct net_device *dev;

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        err = -ENODEV;
                        goto errout_rcu;
                }

                fl4.flowi4_iif = iif; /* for rt_fill_info */
                skb->dev        = dev;
                skb->mark       = mark;
                err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
                                         dev, &res);

                rt = skb_rtable(skb);
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                fl4.flowi4_iif = LOOPBACK_IFINDEX;
                skb->dev = net->loopback_dev;
                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
                err = 0;
                if (IS_ERR(rt))
                        err = PTR_ERR(rt);
                else
                        skb_dst_set(skb, &rt->dst);
        }

        if (err)
                goto errout_rcu;

        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
                table_id = res.table ? res.table->tb_id : 0;

        /* reset skb for netlink reply msg */
        skb_trim(skb, 0);
        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb_reset_mac_header(skb);

        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
                if (!res.fi) {
                        err = fib_props[res.type].error;
                        if (!err)
                                err = -EHOSTUNREACH;
                        goto errout_rcu;
                }
                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
                                    rt->rt_type, res.prefix, res.prefixlen,
                                    fl4.flowi4_tos, res.fi, 0);
        } else {
                err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
                                   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
        }
        if (err < 0)
                goto errout_rcu;

        rcu_read_unlock();

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
        return err;
errout_rcu:
        rcu_read_unlock();
        kfree_skb(skb);
        goto errout_free;
}

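/* Called when a device's multicast configuration changes; flush the
 * routing cache for that netns so stale multicast routes are dropped.
 */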
void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;

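/* Handler behind the write-only /proc/sys/net/ipv4/route/flush file:
 * any write flushes the routing cache and bumps the fnhe genid so
 * cached exceptions (redirects, learned PMTU) are invalidated too;
 * reads fail with -EINVAL.  Typical usage from userspace would be:
 *
 *      echo 1 > /proc/sys/net/ipv4/route/flush
 */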
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
                                        void __user *buffer,
                                        size_t *lenp, loff_t *ppos)
{
        struct net *net = (struct net *)__ctl->extra1;

        if (write) {
                rt_cache_flush(net);
                fnhe_genid_bump(net);
                return 0;
        }

        return -EINVAL;
}

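/* Global (not per-netns) tuning knobs under /proc/sys/net/ipv4/route/.
 * These are registered once, for init_net, from ip_static_sysctl_init()
 * at the bottom of this file.
 */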
static struct ctl_table ipv4_route_table[] = {
        {
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /*  Deprecated. Use gc_min_interval_ms */

                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &ip_min_valid_pmtu,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};

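/* Kept separate from ipv4_route_table above because this entry is
 * duplicated per network namespace in sysctl_route_net_init() below.
 */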
static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { },
};

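/* Register the flush entry for each netns.  Namespaces other than
 * init_net get their own copy of the table so that ->extra1 can point
 * back at the owning netns; for unprivileged user namespaces the entry
 * is hidden by clearing its procname.
 */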
static __net_init int sysctl_route_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = ipv4_route_flush_table;
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (!tbl)
                        goto err_dup;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        tbl[0].procname = NULL;
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
        if (!net->ipv4.route_hdr)
                goto err_reg;
        return 0;

err_reg:
        if (tbl != ipv4_route_flush_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif

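/* Seed the per-netns generation counters.  Bumping rt_genid or
 * fnhe_genid later (from rt_cache_flush() and fnhe_genid_bump()) is
 * what actually invalidates cached dst entries and fib_nh exceptions.
 */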
static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
        return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};

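/* Per-netns inet_peer base.  Peer entries hang off this base and are
 * used, among other things, for ICMP rate limiting state.
 */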
static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

        if (!bp)
                return -ENOMEM;
        inet_peer_base_init(bp);
        net->ipv4.peers = bp;
        return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv4.peers;

        net->ipv4.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

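/* Main IPv4 routing init: allocate the IP ID arrays, per-cpu uncached
 * rtable lists and the dst kmem cache, then bring up devinet, the FIB,
 * /proc files, xfrm (if configured), the RTM_GETROUTE handler and the
 * pernet subsystems defined above.
 */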
int __init ip_rt_init(void)
{
        int cpu;

        ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
                                  GFP_KERNEL);
        if (!ip_idents)
                panic("IP: failed to allocate ip_idents\n");

        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
        if (!ip_tstamps)
                panic("IP: failed to allocate ip_tstamps\n");

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif