Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Result codes from rt6_check_neigh().  Negative values are distinct
 * failure modes consumed by rt6_score_route()/find_match(); positive
 * means the nexthop neighbour looks usable.
 */
74 enum rt6_nud_state {
75         RT6_NUD_FAIL_HARD = -3,
76         RT6_NUD_FAIL_PROBE = -2,
77         RT6_NUD_FAIL_DO_RR = -1,
78         RT6_NUD_SUCCEED = 1
79 };
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/* Per-CPU list of "uncached" routes (not owned by the FIB tree);
 * @lock protects @head against concurrent add/del/flush.
 */
124 struct uncached_list {
125         spinlock_t              lock;
126         struct list_head        head;
127 };
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
/* Register @rt on this CPU's uncached list so device teardown
 * (rt6_uncached_list_flush_dev) can find it and repoint its device
 * references.  The list the route landed on is remembered in
 * rt6i_uncached_list so deletion can happen from any CPU.
 * NOTE(review): the matching fib_rt_uncache counter increment appears
 * to be done by the callers, not here — verify against call sites.
 */
131 void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
/* Unlink @rt from the per-cpu uncached list it was added to and drop
 * the fib_rt_uncache statistic.  Safe to call for routes that were
 * never added (list head is empty then).
 */
142 void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
/* @dev is going away: walk every CPU's uncached list and migrate any
 * route still referencing @dev (either via its inet6_dev or its
 * dst.dev) over to the netns loopback device, so the refcount on
 * @dev can reach zero and unregistration can complete.
 */
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157         struct net_device *loopback_dev = net->loopback_dev;
158         int cpu;
159
        /* loopback is never unregistered while the netns is alive */
160         if (dev == loopback_dev)
161                 return;
162
163         for_each_possible_cpu(cpu) {
164                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165                 struct rt6_info *rt;
166
167                 spin_lock_bh(&ul->lock);
168                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169                         struct inet6_dev *rt_idev = rt->rt6i_idev;
170                         struct net_device *rt_dev = rt->dst.dev;
171
172                         if (rt_idev->dev == dev) {
173                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
174                                 in6_dev_put(rt_idev);
175                         }
176
                        /* take the new ref before dropping the old one */
177                         if (rt_dev == dev) {
178                                 rt->dst.dev = loopback_dev;
179                                 dev_hold(rt->dst.dev);
180                                 dev_put(rt_dev);
181                         }
182                 }
183                 spin_unlock_bh(&ul->lock);
184         }
185 }
186
/* Per-cpu route clones write metrics through their parent ("from")
 * route rather than keeping a private copy.
 */
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(&rt->from->dst);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
/* dst_ops.neigh_lookup handler: find (or create in nd_tbl) the
 * neighbour entry for the route's nexthop address as chosen by
 * choose_neigh_daddr().
 */
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
/* dst_ops.confirm_neigh handler: mark the nexthop neighbour entry as
 * recently confirmed.  Skipped when there is no resolvable address,
 * on NOARP/loopback devices, and for multicast targets, since no
 * per-neighbour NUD state applies there.
 */
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
/* dst_ops template for ordinary IPv6 routes; each netns gets its own
 * copy (net->ipv6.ip6_dst_ops) initialized from this.
 */
246 static struct dst_ops ip6_dst_ops_template = {
247         .family                 =       AF_INET6,
248         .gc                     =       ip6_dst_gc,
249         .gc_thresh              =       1024,
250         .check                  =       ip6_dst_check,
251         .default_advmss         =       ip6_default_advmss,
252         .mtu                    =       ip6_mtu,
253         .cow_metrics            =       ipv6_cow_metrics,
254         .destroy                =       ip6_dst_destroy,
255         .ifdown                 =       ip6_dst_ifdown,
256         .negative_advice        =       ip6_negative_advice,
257         .link_failure           =       ip6_link_failure,
258         .update_pmtu            =       ip6_rt_update_pmtu,
259         .redirect               =       rt6_do_redirect,
260         .local_out              =       __ip6_local_out,
261         .neigh_lookup           =       ip6_neigh_lookup,
262         .confirm_neigh          =       ip6_confirm_neigh,
263 };
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
/* Intentionally a no-op: blackholed dsts ignore PMTU updates. */
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273                                          struct sk_buff *skb, u32 mtu)
274 {
275 }
276
/* Intentionally a no-op: blackholed dsts ignore redirects. */
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
278                                       struct sk_buff *skb)
279 {
280 }
281
/* dst_ops for blackholed dsts: lookups and metrics still work, but
 * PMTU updates and redirects are silently dropped (see the no-op
 * handlers above).
 */
282 static struct dst_ops ip6_dst_blackhole_ops = {
283         .family                 =       AF_INET6,
284         .destroy                =       ip6_dst_destroy,
285         .check                  =       ip6_dst_check,
286         .mtu                    =       ip6_blackhole_mtu,
287         .default_advmss         =       ip6_default_advmss,
288         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
289         .redirect               =       ip6_rt_blackhole_redirect,
290         .cow_metrics            =       dst_cow_metrics_generic,
291         .neigh_lookup           =       ip6_neigh_lookup,
292 };
293
/* Metrics used by the reject/blackhole templates below; hop limit 0
 * presumably means "use the default" — verify against dst metric
 * handling.
 */
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295         [RTAX_HOPLIMIT - 1] = 0,
296 };
297
/* Template for the per-netns null route: rejects all traffic with
 * -ENETUNREACH.  Permanently referenced and given the worst possible
 * metric so any real route wins.
 */
298 static const struct rt6_info ip6_null_entry_template = {
299         .dst = {
300                 .__refcnt       = ATOMIC_INIT(1),
301                 .__use          = 1,
302                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
303                 .error          = -ENETUNREACH,
304                 .input          = ip6_pkt_discard,
305                 .output         = ip6_pkt_discard_out,
306         },
307         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
308         .rt6i_protocol  = RTPROT_KERNEL,
309         .rt6i_metric    = ~(u32) 0,
310         .rt6i_ref       = ATOMIC_INIT(1),
311 };
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/* Template for the "prohibit" route (policy routing): rejects
 * traffic with -EACCES and emits administratively-prohibited errors
 * via ip6_pkt_prohibit*().
 */
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325         .rt6i_protocol  = RTPROT_KERNEL,
326         .rt6i_metric    = ~(u32) 0,
327         .rt6i_ref       = ATOMIC_INIT(1),
328 };
329
/* Template for the "blackhole" route (policy routing): silently
 * discards traffic (-EINVAL, no ICMP error generated).
 */
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331         .dst = {
332                 .__refcnt       = ATOMIC_INIT(1),
333                 .__use          = 1,
334                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
335                 .error          = -EINVAL,
336                 .input          = dst_discard,
337                 .output         = dst_discard_out,
338         },
339         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
340         .rt6i_protocol  = RTPROT_KERNEL,
341         .rt6i_metric    = ~(u32) 0,
342         .rt6i_ref       = ATOMIC_INIT(1),
343 };
344
345 #endif
346
/* Reset the rt6_info-specific part of a freshly allocated route:
 * zero everything *after* the embedded dst_entry (which dst_alloc
 * already set up — hence the "dst + 1" start address) and
 * re-initialize the list heads that must never be all-zero.
 */
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349         struct dst_entry *dst = &rt->dst;
350
351         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352         INIT_LIST_HEAD(&rt->rt6i_siblings);
353         INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355
356 /* allocate dst with ip6_dst_ops */
/* Allocate and initialize a rt6_info from the netns dst pool and
 * account it in the fib_rt_alloc statistic.  Returns NULL on
 * allocation failure.
 */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
/* Like __ip6_dst_alloc() but also allocates the per-cpu clone array
 * (rt6i_pcpu).  If that fails the half-built dst is released
 * immediately and NULL is returned.
 */
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
/* dst_ops.destroy handler: tear down everything a rt6_info owns -
 * generic metrics, the per-cpu clone array, uncached-list membership,
 * the inet6_dev reference, any PMTU/redirect exception bucket, and
 * finally the reference held on the parent ("from") route.
 */
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct rt6_exception_bucket *bucket;
394         struct rt6_info *from = rt->from;
395         struct inet6_dev *idev;
396
397         dst_destroy_metrics_generic(dst);
398         free_percpu(rt->rt6i_pcpu);
399         rt6_uncached_list_del(rt);
400
401         idev = rt->rt6i_idev;
402         if (idev) {
403                 rt->rt6i_idev = NULL;
404                 in6_dev_put(idev);
405         }
        /* last reference: no concurrent readers, plain deref is fine */
406         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407         if (bucket) {
408                 rt->rt6i_exception_bucket = NULL;
409                 kfree(bucket);
410         }
411
        /* NOTE(review): &from->dst with from == NULL looks like it relies
         * on dst being the first member of rt6_info and dst_release()
         * tolerating NULL — verify against struct layout.
         */
412         rt->from = NULL;
413         dst_release(&from->dst);
414 }
415
/* dst_ops.ifdown handler: when @dev goes down, move the route's
 * inet6_dev reference over to the loopback device of the same netns
 * so the device can be released.
 */
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
/* Like __rt6_check_expired() but also walks the parent chain: a
 * clone without its own expiry is considered stale when its parent
 * route was obsoleted or has itself expired (checked recursively).
 */
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                         rt6_check_expired(rt->from);
449         }
450         return false;
451 }
452
/* ECMP sibling selection: hash the flow and pick the first route
 * (the @match itself or one of its siblings) whose nh_upper_bound
 * covers the hash and which scores as usable; falls back to @match
 * when no sibling qualifies.
 */
453 static struct rt6_info *rt6_multipath_select(const struct net *net,
454                                              struct rt6_info *match,
455                                              struct flowi6 *fl6, int oif,
456                                              const struct sk_buff *skb,
457                                              int strict)
458 {
459         struct rt6_info *sibling, *next_sibling;
460
461         /* We might have already computed the hash for ICMPv6 errors. In such
462          * case it will always be non-zero. Otherwise now is the time to do it.
463          */
464         if (!fl6->mp_hash)
465                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
466
467         if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
468                 return match;
469
470         list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
471                                  rt6i_siblings) {
472                 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
473                         continue;
474                 if (rt6_score_route(sibling, oif, strict) < 0)
475                         break;
476                 match = sibling;
477                 break;
478         }
479
480         return match;
481 }
482
483 /*
484  *      Route lookup. rcu_read_lock() should be held.
485  */
486
/* Choose among the routes on one fib6 node by output device: with
 * @oif set, prefer an exact ifindex match, remember a loopback route
 * whose idev matches as a fallback; with @oif unset, match by source
 * address ownership.  Dead nexthops are skipped throughout; under
 * RT6_LOOKUP_F_IFACE a device mismatch yields the null entry.
 */
487 static inline struct rt6_info *rt6_device_match(struct net *net,
488                                                     struct rt6_info *rt,
489                                                     const struct in6_addr *saddr,
490                                                     int oif,
491                                                     int flags)
492 {
493         struct rt6_info *local = NULL;
494         struct rt6_info *sprt;
495
        /* nothing to constrain on: first live route wins */
496         if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
497                 return rt;
498
499         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
500                 struct net_device *dev = sprt->dst.dev;
501
502                 if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
503                         continue;
504
505                 if (oif) {
506                         if (dev->ifindex == oif)
507                                 return sprt;
508                         if (dev->flags & IFF_LOOPBACK) {
509                                 if (!sprt->rt6i_idev ||
510                                     sprt->rt6i_idev->dev->ifindex != oif) {
511                                         if (flags & RT6_LOOKUP_F_IFACE)
512                                                 continue;
                                        /* keep an already-matching local */
513                                         if (local &&
514                                             local->rt6i_idev->dev->ifindex == oif)
515                                                 continue;
516                                 }
517                                 local = sprt;
518                         }
519                 } else {
520                         if (ipv6_chk_addr(net, saddr, dev,
521                                           flags & RT6_LOOKUP_F_IFACE))
522                                 return sprt;
523                 }
524         }
525
526         if (oif) {
527                 if (local)
528                         return local;
529
530                 if (flags & RT6_LOOKUP_F_IFACE)
531                         return net->ipv6.ip6_null_entry;
532         }
533
534         return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
535 }
536
537 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation request queued by rt6_probe();
 * holds a reference on @dev until rt6_probe_deferred() runs.
 */
538 struct __rt6_probe_work {
539         struct work_struct work;
540         struct in6_addr target;
541         struct net_device *dev;
542 };
543
/* Workqueue handler: send a neighbour solicitation to the probe
 * target's solicited-node multicast address, then drop the device
 * reference taken by rt6_probe() and free the request.
 */
544 static void rt6_probe_deferred(struct work_struct *w)
545 {
546         struct in6_addr mcaddr;
547         struct __rt6_probe_work *work =
548                 container_of(w, struct __rt6_probe_work, work);
549
550         addrconf_addr_solict_mult(&work->target, &mcaddr);
551         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
552         dev_put(work->dev);
553         kfree(work);
554 }
555
/* Router Reachability Probing: if the route's gateway neighbour is
 * not known-valid, queue a deferred neighbour solicitation for it.
 * Rate-limited via neigh->updated against rtr_probe_interval; the
 * NS itself is sent from a workqueue since we hold rcu_read_lock_bh.
 */
556 static void rt6_probe(struct rt6_info *rt)
557 {
558         struct __rt6_probe_work *work;
559         struct neighbour *neigh;
560         /*
561          * Okay, this does not seem to be appropriate
562          * for now, however, we need to check if it
563          * is really so; aka Router Reachability Probing.
564          *
565          * Router Reachability Probe MUST be rate-limited
566          * to no more than one per minute.
567          */
568         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
569                 return;
570         rcu_read_lock_bh();
571         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
572         if (neigh) {
573                 if (neigh->nud_state & NUD_VALID)
574                         goto out;
575
576                 work = NULL;
577                 write_lock(&neigh->lock);
                /* re-check under the lock; only probe once per interval */
578                 if (!(neigh->nud_state & NUD_VALID) &&
579                     time_after(jiffies,
580                                neigh->updated +
581                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
582                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
583                         if (work)
584                                 __neigh_set_probe_once(neigh);
585                 }
586                 write_unlock(&neigh->lock);
587         } else {
                /* no neighbour entry yet: always worth probing */
588                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
589         }
590
591         if (work) {
592                 INIT_WORK(&work->work, rt6_probe_deferred);
593                 work->target = rt->rt6i_gateway;
594                 dev_hold(rt->dst.dev);
595                 work->dev = rt->dst.dev;
596                 schedule_work(&work->work);
597         }
598
599 out:
600         rcu_read_unlock_bh();
601 }
602 #else
/* CONFIG_IPV6_ROUTER_PREF disabled: reachability probing is a no-op. */
603 static inline void rt6_probe(struct rt6_info *rt)
604 {
605 }
606 #endif
607
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
611 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
612 {
613         struct net_device *dev = rt->dst.dev;
614         if (!oif || dev->ifindex == oif)
615                 return 2;
616         if ((dev->flags & IFF_LOOPBACK) &&
617             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
618                 return 1;
619         return 0;
620 }
621
/* Judge nexthop usability from neighbour cache state.  Routes with
 * no gateway (or RTF_NONEXTHOP) always succeed.  With router-pref
 * support a non-FAILED entry still succeeds and FAILED asks for a
 * probe; a missing entry succeeds (probing will create it) or, in
 * non-router-pref builds, requests round-robin fallback.
 */
622 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
623 {
624         struct neighbour *neigh;
625         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
626
627         if (rt->rt6i_flags & RTF_NONEXTHOP ||
628             !(rt->rt6i_flags & RTF_GATEWAY))
629                 return RT6_NUD_SUCCEED;
630
631         rcu_read_lock_bh();
632         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
633         if (neigh) {
634                 read_lock(&neigh->lock);
635                 if (neigh->nud_state & NUD_VALID)
636                         ret = RT6_NUD_SUCCEED;
637 #ifdef CONFIG_IPV6_ROUTER_PREF
638                 else if (!(neigh->nud_state & NUD_FAILED))
639                         ret = RT6_NUD_SUCCEED;
640                 else
641                         ret = RT6_NUD_FAIL_PROBE;
642 #endif
643                 read_unlock(&neigh->lock);
644         } else {
645                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
646                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
647         }
648         rcu_read_unlock_bh();
649
650         return ret;
651 }
652
/* Compute a comparable score for @rt: device match (rt6_check_dev)
 * in the low bits, decoded RA router preference shifted above it,
 * gated by neighbour reachability when RT6_LOOKUP_F_REACHABLE is
 * set.  Negative returns are rt6_nud_state failure codes.
 */
653 static int rt6_score_route(struct rt6_info *rt, int oif,
654                            int strict)
655 {
656         int m;
657
658         m = rt6_check_dev(rt, oif);
659         if (!m && (strict & RT6_LOOKUP_F_IFACE))
660                 return RT6_NUD_FAIL_HARD;
661 #ifdef CONFIG_IPV6_ROUTER_PREF
662         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
663 #endif
664         if (strict & RT6_LOOKUP_F_REACHABLE) {
665                 int n = rt6_check_neigh(rt);
666                 if (n < 0)
667                         return n;
668         }
669         return m;
670 }
671
/* Consider @rt as a candidate: skip dead, link-down (unless link
 * state is ignored) and expired routes, score the rest, and replace
 * the running best (@match/@mpri) when it scores higher.  A
 * FAIL_DO_RR score participates with the lowest valid score and
 * flags round-robin; probing is triggered as a side effect under
 * RT6_LOOKUP_F_REACHABLE.
 */
672 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
673                                    int *mpri, struct rt6_info *match,
674                                    bool *do_rr)
675 {
676         int m;
677         bool match_do_rr = false;
678         struct inet6_dev *idev = rt->rt6i_idev;
679
680         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
681                 goto out;
682
683         if (idev->cnf.ignore_routes_with_linkdown &&
684             rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
685             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
686                 goto out;
687
688         if (rt6_check_expired(rt))
689                 goto out;
690
691         m = rt6_score_route(rt, oif, strict);
692         if (m == RT6_NUD_FAIL_DO_RR) {
693                 match_do_rr = true;
694                 m = 0; /* lowest valid score */
695         } else if (m == RT6_NUD_FAIL_HARD) {
696                 goto out;
697         }
698
699         if (strict & RT6_LOOKUP_F_REACHABLE)
700                 rt6_probe(rt);
701
702         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
703         if (m > *mpri) {
704                 *do_rr = match_do_rr;
705                 *mpri = m;
706                 match = rt;
707         }
708 out:
709         return match;
710 }
711
/* Find the best route of the given @metric, scanning from the
 * round-robin head @rr_head to the end of the list and then wrapping
 * from @leaf back to @rr_head.  If nothing matched and routes with a
 * different metric exist (@cont), fall through and scan those too.
 */
712 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
713                                      struct rt6_info *leaf,
714                                      struct rt6_info *rr_head,
715                                      u32 metric, int oif, int strict,
716                                      bool *do_rr)
717 {
718         struct rt6_info *rt, *match, *cont;
719         int mpri = -1;
720
721         match = NULL;
722         cont = NULL;
723         for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
724                 if (rt->rt6i_metric != metric) {
725                         cont = rt;
726                         break;
727                 }
728
729                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
730         }
731
        /* wrap around: scan the part of the list before rr_head */
732         for (rt = leaf; rt && rt != rr_head;
733              rt = rcu_dereference(rt->rt6_next)) {
734                 if (rt->rt6i_metric != metric) {
735                         cont = rt;
736                         break;
737                 }
738
739                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
740         }
741
742         if (match || !cont)
743                 return match;
744
745         for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
746                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
747
748         return match;
749 }
750
/* Select the route to use from fib6 node @fn.  Runs under RCU; when
 * find_rr_leaf() requests round-robin, the node's rr_ptr is advanced
 * under tb6_lock.  Returns the null entry when the node has no
 * usable routes.
 */
751 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
752                                    int oif, int strict)
753 {
754         struct rt6_info *leaf = rcu_dereference(fn->leaf);
755         struct rt6_info *match, *rt0;
756         bool do_rr = false;
757         int key_plen;
758
759         if (!leaf || leaf == net->ipv6.ip6_null_entry)
760                 return net->ipv6.ip6_null_entry;
761
762         rt0 = rcu_dereference(fn->rr_ptr);
763         if (!rt0)
764                 rt0 = leaf;
765
766         /* Double check to make sure fn is not an intermediate node
767          * and fn->leaf does not points to its child's leaf
768          * (This might happen if all routes under fn are deleted from
769          * the tree and fib6_repair_tree() is called on the node.)
770          */
771         key_plen = rt0->rt6i_dst.plen;
772 #ifdef CONFIG_IPV6_SUBTREES
773         if (rt0->rt6i_src.plen)
774                 key_plen = rt0->rt6i_src.plen;
775 #endif
776         if (fn->fn_bit != key_plen)
777                 return net->ipv6.ip6_null_entry;
778
779         match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
780                              &do_rr);
781
782         if (do_rr) {
783                 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
784
785                 /* no entries matched; do round-robin */
786                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
787                         next = leaf;
788
789                 if (next != rt0) {
790                         spin_lock_bh(&leaf->rt6i_table->tb6_lock);
791                         /* make sure next is not being deleted from the tree */
792                         if (next->rt6i_node)
793                                 rcu_assign_pointer(fn->rr_ptr, next);
794                         spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
795                 }
796         }
797
798         return match ? match : net->ipv6.ip6_null_entry;
799 }
800
801 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
802 {
803         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
804 }
805
806 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option received in a Router
 * Advertisement (RFC 4191): validate option length vs prefix length,
 * then add, refresh, or (on zero lifetime) delete the corresponding
 * RTF_ROUTEINFO route for gateway @gwaddr.  Returns 0 on success or
 * -EINVAL for a malformed option.
 */
807 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
808                   const struct in6_addr *gwaddr)
809 {
810         struct net *net = dev_net(dev);
811         struct route_info *rinfo = (struct route_info *) opt;
812         struct in6_addr prefix_buf, *prefix;
813         unsigned int pref;
814         unsigned long lifetime;
815         struct rt6_info *rt;
816
817         if (len < sizeof(struct route_info)) {
818                 return -EINVAL;
819         }
820
821         /* Sanity check for prefix_len and length */
822         if (rinfo->length > 3) {
823                 return -EINVAL;
824         } else if (rinfo->prefix_len > 128) {
825                 return -EINVAL;
826         } else if (rinfo->prefix_len > 64) {
827                 if (rinfo->length < 2) {
828                         return -EINVAL;
829                 }
830         } else if (rinfo->prefix_len > 0) {
831                 if (rinfo->length < 1) {
832                         return -EINVAL;
833                 }
834         }
835
836         pref = rinfo->route_pref;
837         if (pref == ICMPV6_ROUTER_PREF_INVALID)
838                 return -EINVAL;
839
840         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
841
        /* length 3 carries the full 128-bit prefix; shorter options
         * need the advertised bits padded out to a full address
         */
842         if (rinfo->length == 3)
843                 prefix = (struct in6_addr *)rinfo->prefix;
844         else {
845                 /* this function is safe */
846                 ipv6_addr_prefix(&prefix_buf,
847                                  (struct in6_addr *)rinfo->prefix,
848                                  rinfo->prefix_len);
849                 prefix = &prefix_buf;
850         }
851
852         if (rinfo->prefix_len == 0)
853                 rt = rt6_get_dflt_router(gwaddr, dev);
854         else
855                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
856                                         gwaddr, dev);
857
        /* zero lifetime withdraws an existing route */
858         if (rt && !lifetime) {
859                 ip6_del_rt(rt);
860                 rt = NULL;
861         }
862
863         if (!rt && lifetime)
864                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
865                                         dev, pref);
866         else if (rt)
867                 rt->rt6i_flags = RTF_ROUTEINFO |
868                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
869
870         if (rt) {
871                 if (!addrconf_finite_timeout(lifetime))
872                         rt6_clean_expires(rt);
873                 else
874                         rt6_set_expires(rt, jiffies + HZ * lifetime);
875
876                 ip6_rt_put(rt);
877         }
878         return 0;
879 }
880 #endif
881
/* Walk back up the fib6 tree from @fn to the next node carrying route
 * info (RTN_RTINFO).  At each step the parent is fetched under RCU; if
 * the parent owns a source-address subtree other than the node we came
 * from, that subtree is searched with @saddr first.
 * Returns NULL once the tree root (RTN_TL_ROOT) is reached.
 * Caller must be in an RCU read-side critical section (rcu_dereference).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			/* descend into the parent's saddr-keyed subtree */
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
899
900 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
901                           bool null_fallback)
902 {
903         struct rt6_info *rt = *prt;
904
905         if (dst_hold_safe(&rt->dst))
906                 return true;
907         if (null_fallback) {
908                 rt = net->ipv6.ip6_null_entry;
909                 dst_hold(&rt->dst);
910         } else {
911                 rt = NULL;
912         }
913         *prt = rt;
914         return false;
915 }
916
/* Plain table lookup (no RTF_CACHE cloning, no router selection
 * policy): find the fib6 node for daddr/saddr, pick a device-matching
 * route (and a multipath sibling when no oif is given), backtracking
 * up the tree while only the null entry matches.  A matching entry in
 * the exception (RTF_CACHE) table, if any, takes precedence.
 * Returns a held route; falls back to net->ipv6.ip6_null_entry.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
						  skb, flags);
	}
	/* nothing usable at this node: retry from the next node up */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	/* only touch dst bookkeeping if we actually got a reference */
	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;

}
962
963 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
964                                    const struct sk_buff *skb, int flags)
965 {
966         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
967 }
968 EXPORT_SYMBOL_GPL(ip6_route_lookup);
969
970 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
971                             const struct in6_addr *saddr, int oif,
972                             const struct sk_buff *skb, int strict)
973 {
974         struct flowi6 fl6 = {
975                 .flowi6_oif = oif,
976                 .daddr = *daddr,
977         };
978         struct dst_entry *dst;
979         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
980
981         if (saddr) {
982                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
983                 flags |= RT6_LOOKUP_F_HAS_SADDR;
984         }
985
986         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
987         if (dst->error == 0)
988                 return (struct rt6_info *) dst;
989
990         dst_release(dst);
991
992         return NULL;
993 }
994 EXPORT_SYMBOL(rt6_lookup);
995
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * The caller must hold a dst reference before calling it.
 */
1001
1002 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1003                         struct mx6_config *mxc,
1004                         struct netlink_ext_ack *extack)
1005 {
1006         int err;
1007         struct fib6_table *table;
1008
1009         table = rt->rt6i_table;
1010         spin_lock_bh(&table->tb6_lock);
1011         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1012         spin_unlock_bh(&table->tb6_lock);
1013
1014         return err;
1015 }
1016
1017 int ip6_ins_rt(struct rt6_info *rt)
1018 {
1019         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1020         struct mx6_config mxc = { .mx = NULL, };
1021
1022         /* Hold dst to account for the reference from the fib6 tree */
1023         dst_hold(&rt->dst);
1024         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1025 }
1026
1027 /* called with rcu_lock held */
1028 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1029 {
1030         struct net_device *dev = rt->dst.dev;
1031
1032         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1033                 /* for copies of local routes, dst->dev needs to be the
1034                  * device if it is a master device, the master device if
1035                  * device is enslaved, and the loopback as the default
1036                  */
1037                 if (netif_is_l3_slave(dev) &&
1038                     !rt6_need_strict(&rt->rt6i_dst.addr))
1039                         dev = l3mdev_master_dev_rcu(dev);
1040                 else if (!netif_is_l3_master(dev))
1041                         dev = dev_net(dev)->loopback_dev;
1042                 /* last case is netif_is_l3_master(dev) is true in which
1043                  * case we want dev returned to be dev
1044                  */
1045         }
1046
1047         return dev;
1048 }
1049
/* Allocate an RTF_CACHE clone of @ort as a host route for @daddr
 * (and, with subtrees, @saddr).  If @ort is itself a cache or pcpu
 * copy, the clone is based on its origin (ort->from) instead.
 * Returns the new route or NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* always clone from the origin route, never from a copy */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* the clone is a /128 host route for daddr */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		/* pin the source side too when ort sits in a subtree */
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1092
/* Allocate a per-cpu copy of @rt, marked RTF_PCPU.  The device is
 * resolved under RCU via ip6_rt_get_dev_rcu().  Returns NULL on
 * allocation failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1109
/* It should be called with rcu_read_lock() acquired.
 * Returns this cpu's cached copy of @rt with a reference taken, or
 * NULL when no copy exists yet or its refcount already hit zero
 * (ip6_hold_safe() with null_fallback == false clears the pointer).
 */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* only use the copy if a reference could actually be taken */
	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}
1123
/* Create this cpu's pcpu copy of @rt and publish it in rt6i_pcpu.
 * Returns the held copy, or a held ip6_null_entry when allocation
 * fails.  The cmpxchg() must find the slot empty (the caller path is
 * expected to only get here when rt6_get_pcpu_route() found no copy);
 * a non-NULL prev would indicate a logic error, hence the BUG_ON.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* hold for the caller; the slot itself keeps the alloc ref */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1144
1145 /* exception hash table implementation
1146  */
1147 static DEFINE_SPINLOCK(rt6_exception_lock);
1148
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 * The entry is unlinked with hlist_del_rcu() and freed via kfree_rcu()
 * so concurrent RCU readers stay safe; the route's tree linkage is
 * cleared and its reference dropped through rt6_release().  Bucket
 * depth and the per-netns fib_rt_cache counter are updated.
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1169
1170 /* Remove oldest rt6_ex in bucket and free the memory
1171  * Caller must hold rt6_exception_lock
1172  */
1173 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1174 {
1175         struct rt6_exception *rt6_ex, *oldest = NULL;
1176
1177         if (!bucket)
1178                 return;
1179
1180         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1181                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1182                         oldest = rt6_ex;
1183         }
1184         rt6_remove_exception(bucket, oldest);
1185 }
1186
1187 static u32 rt6_exception_hash(const struct in6_addr *dst,
1188                               const struct in6_addr *src)
1189 {
1190         static u32 seed __read_mostly;
1191         u32 val;
1192
1193         net_get_random_once(&seed, sizeof(seed));
1194         val = jhash(dst, sizeof(*dst), seed);
1195
1196 #ifdef CONFIG_IPV6_SUBTREES
1197         if (src)
1198                 val = jhash(src, sizeof(*src), val);
1199 #endif
1200         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1201 }
1202
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 * Note: *bucket is advanced by the hash offset even when no entry
 * matches, so callers can insert into the right bucket afterwards.
 * @saddr only participates in matching with CONFIG_IPV6_SUBTREES.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* point the caller's bucket at the hashed slot */
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1235
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 * RCU-reader counterpart of __rt6_find_exception_spinlock(): same
 * contract (advances *bucket by the hash offset), but walks the chain
 * with hlist_for_each_entry_rcu() instead of relying on the lock.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* point the caller's bucket at the hashed slot */
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1270
/* Insert the RTF_CACHE route @nrt into the exception table of its
 * origin route @ort (resolved through ort->from if @ort is itself a
 * cache/pcpu copy).  Allocates the bucket array lazily on first use.
 * Returns 0 on success; -EINVAL when @ort's exceptions were already
 * flushed or @nrt's pmtu is not below @ort's mtu; -ENOMEM on
 * allocation failure.  On success, bumps the fib6 node's sernum so
 * cached dsts get revalidated and kicks the fib6 garbage collector.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	/* racing with rt6_flush_exceptions(): don't recreate the table */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any pre-existing exception for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* keep the bucket bounded by evicting its oldest entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->rt6i_table->tb6_lock);
		fib6_update_sernum(ort);
		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1362
/* Remove every exception route cached under @rt.  Sets
 * exception_bucket_flushed under rt6_exception_lock so a concurrent
 * rt6_insert_exception() cannot recreate the bucket list afterwards.
 * Note: the bucket array itself is not freed here, only emptied.
 */
void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		/* _safe variant: entries are unlinked while iterating */
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1389
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns the matching, non-expired RTF_CACHE route (no reference is
 * taken here), or NULL when there is no usable match.
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* expired entries are skipped; gc will reap them later */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1421
/* Remove the passed in cached rt from the hash table that contains it
 * @rt must be an RTF_CACHE route with a valid origin (rt->from).
 * Returns 0 on success, -EINVAL when @rt is not a cached route, or
 * -ENOENT when no matching exception is present.
 */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1464
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 * Refreshing the stamp protects the entry from aging out in
 * rt6_age_examine_exception().  A lockless read-side lookup is used;
 * silently does nothing when @rt is not a cached route or no matching
 * exception exists.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
1500
1501 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1502 {
1503         struct rt6_exception_bucket *bucket;
1504         struct rt6_exception *rt6_ex;
1505         int i;
1506
1507         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1508                                         lockdep_is_held(&rt6_exception_lock));
1509
1510         if (bucket) {
1511                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1512                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1513                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1514                         }
1515                         bucket++;
1516                 }
1517         }
1518 }
1519
1520 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1521                                          struct rt6_info *rt, int mtu)
1522 {
1523         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1524          * lowest MTU in the path: always allow updating the route PMTU to
1525          * reflect PMTU decreases.
1526          *
1527          * If the new MTU is higher, and the route PMTU is equal to the local
1528          * MTU, this means the old MTU is the lowest in the path, so allow
1529          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1530          * handle this.
1531          */
1532
1533         if (dst_mtu(&rt->dst) >= mtu)
1534                 return true;
1535
1536         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1537                 return true;
1538
1539         return false;
1540 }
1541
/* Propagate a device MTU change to all exception routes cached under
 * @rt, subject to rt6_mtu_change_route_allowed().  Caller must hold
 * rt6_exception_lock (asserted via the lockdep check below).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->dst.from have already
			 * been updated.
			 */
			if (entry->rt6i_pmtu &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				entry->rt6i_pmtu = mtu;
		}
		bucket++;
	}
}
1570
1571 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1572
/* Drop every cached gateway exception of @rt whose gateway address
 * equals @gateway (entries matching both RTF_GATEWAY and RTF_CACHE).
 * Takes rt6_exception_lock; the cheap lockless rcu_access_pointer()
 * check up front avoids taking it when there is no table at all.
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: entries may be removed mid-walk */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1607
/* Examine one exception entry during garbage collection and remove it
 * when it has aged out, expired, or points at a gateway whose
 * neighbour entry is no longer flagged as a router.  Entries that are
 * kept bump gc_args->more so the gc knows work remains.
 * Caller holds rt6_exception_lock; the neighbour lookup is the noref
 * variant, relying on the caller's RCU(-bh) read-side section.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	/* entry survives this pass: tell gc there is still work */
	gc_args->more++;
}
1651
/* Garbage-collect the exception table of @rt: walk every bucket and
 * let rt6_age_examine_exception() decide each entry's fate.  Holds
 * rt6_exception_lock inside an rcu_read_lock_bh() section (the bh
 * RCU section covers the noref neighbour lookup done per entry).
 */
void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check: nothing to do without a table */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: entries may be removed mid-walk */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1682
/* Core policy-routing lookup shared by the input and output paths.
 *
 * Walks @table under RCU for the best match on fl6->daddr/saddr,
 * consults the per-route exception (cached clone) table, and returns a
 * route with a reference taken for the caller.  Depending on the match
 * this is one of:
 *   - ip6_null_entry (lookup failed),
 *   - an existing RTF_CACHE clone,
 *   - a fresh uncached clone (FLOWI_FLAG_KNOWN_NH on a non-gateway
 *     route, where the skb daddr may differ from fl6->daddr), or
 *   - a per-cpu copy of the matched fib entry.
 * Never returns NULL.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* hosts (forwarding off) prefer (probably) reachable routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	/* remember the deepest node so the REACHABLE-relaxed retry can
	 * restart from it without re-walking the tree
	 */
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			/* NOTE(review): on failure ip6_hold_safe() appears
			 * to substitute a held fallback route into rt —
			 * confirm before relying on this path.
			 */
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (!pcpu_rt) {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}
		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1805
1806 static struct rt6_info *ip6_pol_route_input(struct net *net,
1807                                             struct fib6_table *table,
1808                                             struct flowi6 *fl6,
1809                                             const struct sk_buff *skb,
1810                                             int flags)
1811 {
1812         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1813 }
1814
1815 struct dst_entry *ip6_route_input_lookup(struct net *net,
1816                                          struct net_device *dev,
1817                                          struct flowi6 *fl6,
1818                                          const struct sk_buff *skb,
1819                                          int flags)
1820 {
1821         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1822                 flags |= RT6_LOOKUP_F_IFACE;
1823
1824         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1825 }
1826 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1827
/* Fill @keys with the L3 fields used for multipath hashing.
 *
 * Normally the pre-dissected @flkeys are copied when available.  For
 * ICMPv6 error messages (dest unreach, packet too big, time exceeded,
 * parameter problem) the hash is taken from the *embedded* offending
 * packet's header instead, so the error travels the same path as the
 * flow that triggered it; @flkeys describe the outer packet and are
 * therefore ignored in that case.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	/* only ICMPv6 *error* types carry the offending packet */
	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	/* embedded header may be in a fragment; copy via _inner_iph */
	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1870
/* Compute the multipath hash for a flow.
 * If @skb is set it will be used and @fl6 can be NULL.
 *
 * Hash policy (ip6_multipath_hash_policy()):
 *   0 - L3 only: addresses, flow label and next header.
 *   1 - L4 five-tuple: addresses, ports and protocol.
 *
 * The result is shifted right by one, so the top bit is never set
 * (NOTE(review): presumably to keep the value non-negative for
 * consumers — confirm against rt6_multipath_select()).
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* dissect now if the caller did not pre-dissect */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
1927
/* Route an incoming packet: build a flowi6 from the IPv6 header and
 * attach the looked-up dst to @skb (replacing any existing one).
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* packets received over a metadata tunnel carry a tunnel id;
	 * preserve it in the flow key for the lookup
	 */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* pre-compute the multipath hash for ICMPv6 so errors are
	 * hashed on the inner (offending) packet's flow
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
1957
1958 static struct rt6_info *ip6_pol_route_output(struct net *net,
1959                                              struct fib6_table *table,
1960                                              struct flowi6 *fl6,
1961                                              const struct sk_buff *skb,
1962                                              int flags)
1963 {
1964         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
1965 }
1966
1967 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1968                                          struct flowi6 *fl6, int flags)
1969 {
1970         bool any_src;
1971
1972         if (rt6_need_strict(&fl6->daddr)) {
1973                 struct dst_entry *dst;
1974
1975                 dst = l3mdev_link_scope_lookup(net, fl6);
1976                 if (dst)
1977                         return dst;
1978         }
1979
1980         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1981
1982         any_src = ipv6_addr_any(&fl6->saddr);
1983         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1984             (fl6->flowi6_oif && any_src))
1985                 flags |= RT6_LOOKUP_F_IFACE;
1986
1987         if (!any_src)
1988                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1989         else if (sk)
1990                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1991
1992         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1993 }
1994 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1995
/* Replace @dst_orig with a "blackhole" clone: a standalone dst bound
 * to the loopback device whose input/output handlers silently discard
 * packets.  Metrics, gateway, flags (minus RTF_PCPU) and the
 * destination/source keys are copied from the original.
 *
 * @dst_orig is always released.  Returns the new dst, or
 * ERR_PTR(-ENOMEM) if allocation failed.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	/* DST_OBSOLETE_DEAD: the clone is never revalidated */
	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		/* grabs a reference on the loopback inet6_dev */
		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2029
2030 /*
2031  *      Destination cache support functions
2032  */
2033
2034 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2035 {
2036         if (rt->from &&
2037             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2038                 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
2039 }
2040
2041 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2042 {
2043         u32 rt_cookie = 0;
2044
2045         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
2046                 return NULL;
2047
2048         if (rt6_check_expired(rt))
2049                 return NULL;
2050
2051         return &rt->dst;
2052 }
2053
2054 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2055 {
2056         if (!__rt6_check_expired(rt) &&
2057             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2058             rt6_check(rt->from, cookie))
2059                 return &rt->dst;
2060         else
2061                 return NULL;
2062 }
2063
2064 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2065 {
2066         struct rt6_info *rt;
2067
2068         rt = (struct rt6_info *) dst;
2069
2070         /* All IPV6 dsts are created with ->obsolete set to the value
2071          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2072          * into this function always.
2073          */
2074
2075         rt6_dst_from_metrics_check(rt);
2076
2077         if (rt->rt6i_flags & RTF_PCPU ||
2078             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2079                 return rt6_dst_from_check(rt, cookie);
2080         else
2081                 return rt6_check(rt, cookie);
2082 }
2083
2084 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2085 {
2086         struct rt6_info *rt = (struct rt6_info *) dst;
2087
2088         if (rt) {
2089                 if (rt->rt6i_flags & RTF_CACHE) {
2090                         if (rt6_check_expired(rt)) {
2091                                 ip6_del_rt(rt);
2092                                 dst = NULL;
2093                         }
2094                 } else {
2095                         dst_release(dst);
2096                         dst = NULL;
2097                 }
2098         }
2099         return dst;
2100 }
2101
/* dst_ops ->link_failure() callback: tell the sender the address is
 * unreachable and make sure the failing route is not reused.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* cached clones can simply be deleted */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			/* NOTE(review): setting fn_sernum to -1 appears to
			 * force a cookie mismatch in rt6_get_cookie_safe()
			 * so cached dsts get revalidated — confirm.
			 */
			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
2124
2125 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2126 {
2127         struct net *net = dev_net(rt->dst.dev);
2128
2129         rt->rt6i_flags |= RTF_MODIFIED;
2130         rt->rt6i_pmtu = mtu;
2131         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2132 }
2133
2134 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2135 {
2136         return !(rt->rt6i_flags & RTF_CACHE) &&
2137                 (rt->rt6i_flags & RTF_PCPU ||
2138                  rcu_access_pointer(rt->rt6i_node));
2139 }
2140
/* Core path-MTU update.
 *
 * Addresses for neighbour confirmation and exception keying come from
 * @iph when present, else from @sk, else none.  The new MTU is clamped
 * to at least IPV6_MIN_MTU and only ever *lowers* the current dst MTU.
 * Routes that may not be modified in place (per-cpu copies or entries
 * still owned by the fib tree — see rt6_cache_allowed_for_pmtu()) get
 * a RTF_CACHE exception clone carrying the new MTU instead; cached
 * clones are updated directly and their exception timestamp refreshed.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	/* administratively locked MTU: ignore PMTU signals */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* insertion may fail (e.g. duplicate); drop the
			 * clone in that case
			 */
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
2184
2185 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2186                                struct sk_buff *skb, u32 mtu)
2187 {
2188         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2189 }
2190
2191 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2192                      int oif, u32 mark, kuid_t uid)
2193 {
2194         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2195         struct dst_entry *dst;
2196         struct flowi6 fl6;
2197
2198         memset(&fl6, 0, sizeof(fl6));
2199         fl6.flowi6_oif = oif;
2200         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2201         fl6.daddr = iph->daddr;
2202         fl6.saddr = iph->saddr;
2203         fl6.flowlabel = ip6_flowinfo(iph);
2204         fl6.flowi6_uid = uid;
2205
2206         dst = ip6_route_output(net, NULL, &fl6);
2207         if (!dst->error)
2208                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2209         dst_release(dst);
2210 }
2211 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2212
/* Apply a PMTU update for @sk's flow and, if that invalidated the
 * socket's cached route, refresh it.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* nothing to do if the socket has no cached dst, or the dst
	 * still validates against the socket's cookie
	 */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	/* only re-route here if the socket is not in use; v4-mapped
	 * destinations are not handled by the IPv6 dst update
	 */
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2231
/* Handle redirects */
struct ip6rd_flowi {
	/* must stay first: __ip6_route_redirect() receives a plain
	 * flowi6 pointer and casts it back to struct ip6rd_flowi
	 */
	struct flowi6 fl6;
	struct in6_addr gateway;	/* router that sent the redirect */
};
2237
/* Table lookup callback used when processing an ICMPv6 redirect:
 * find the route whose nexthop gateway matches the router that sent
 * the redirect (rdfl->gateway).  Returns a held route; ip6_null_entry
 * when no acceptable route exists.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;
		if (rt6_check_expired(rt))
			continue;
		/* error routes (e.g. reject) terminate the scan */
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				rt = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	/* rt is NULL when the node's route list was exhausted */
	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* no match at this node: back up the tree and rescan */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	ip6_hold_safe(net, &rt, true);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return rt;
};
2313
2314 static struct dst_entry *ip6_route_redirect(struct net *net,
2315                                             const struct flowi6 *fl6,
2316                                             const struct sk_buff *skb,
2317                                             const struct in6_addr *gateway)
2318 {
2319         int flags = RT6_LOOKUP_F_HAS_SADDR;
2320         struct ip6rd_flowi rdfl;
2321
2322         rdfl.fl6 = *fl6;
2323         rdfl.gateway = *gateway;
2324
2325         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2326                                 flags, __ip6_route_redirect);
2327 }
2328
2329 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2330                   kuid_t uid)
2331 {
2332         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2333         struct dst_entry *dst;
2334         struct flowi6 fl6;
2335
2336         memset(&fl6, 0, sizeof(fl6));
2337         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2338         fl6.flowi6_oif = oif;
2339         fl6.flowi6_mark = mark;
2340         fl6.daddr = iph->daddr;
2341         fl6.saddr = iph->saddr;
2342         fl6.flowlabel = ip6_flowinfo(iph);
2343         fl6.flowi6_uid = uid;
2344
2345         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2346         rt6_do_redirect(dst, NULL, skb);
2347         dst_release(dst);
2348 }
2349 EXPORT_SYMBOL_GPL(ip6_redirect);
2350
2351 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2352                             u32 mark)
2353 {
2354         const struct ipv6hdr *iph = ipv6_hdr(skb);
2355         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2356         struct dst_entry *dst;
2357         struct flowi6 fl6;
2358
2359         memset(&fl6, 0, sizeof(fl6));
2360         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2361         fl6.flowi6_oif = oif;
2362         fl6.flowi6_mark = mark;
2363         fl6.daddr = msg->dest;
2364         fl6.saddr = iph->daddr;
2365         fl6.flowi6_uid = sock_net_uid(net, NULL);
2366
2367         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2368         rt6_do_redirect(dst, NULL, skb);
2369         dst_release(dst);
2370 }
2371
2372 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2373 {
2374         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2375                      sk->sk_uid);
2376 }
2377 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2378
2379 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2380 {
2381         struct net_device *dev = dst->dev;
2382         unsigned int mtu = dst_mtu(dst);
2383         struct net *net = dev_net(dev);
2384
2385         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2386
2387         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2388                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2389
2390         /*
2391          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2392          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2393          * IPV6_MAXPLEN is also valid and means: "any MSS,
2394          * rely only on pmtu discovery"
2395          */
2396         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2397                 mtu = IPV6_MAXPLEN;
2398         return mtu;
2399 }
2400
2401 static unsigned int ip6_mtu(const struct dst_entry *dst)
2402 {
2403         const struct rt6_info *rt = (const struct rt6_info *)dst;
2404         unsigned int mtu = rt->rt6i_pmtu;
2405         struct inet6_dev *idev;
2406
2407         if (mtu)
2408                 goto out;
2409
2410         mtu = dst_metric_raw(dst, RTAX_MTU);
2411         if (mtu)
2412                 goto out;
2413
2414         mtu = IPV6_MIN_MTU;
2415
2416         rcu_read_lock();
2417         idev = __in6_dev_get(dst->dev);
2418         if (idev)
2419                 mtu = idev->cnf.mtu6;
2420         rcu_read_unlock();
2421
2422 out:
2423         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2424
2425         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2426 }
2427
/* Allocate a standalone (never fib-owned) dst for an outgoing ICMPv6
 * packet addressed per @fl6, sent via @dev.
 *
 * The dst is placed on the uncached list so rt6_disable_ip() can do a
 * proper release of the net_device when it goes away.  Returns an
 * ERR_PTR on failure, otherwise the result of running the new dst
 * through xfrm_lookup().
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* drop the reference in6_dev_get() took above */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	/* host route straight to the destination, no gateway */
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2466
/* dst_ops ->gc() callback, invoked under dst-entry pressure.
 *
 * Rate-limits full fib6 GC: the run is skipped while the previous one
 * is younger than ip6_rt_gc_min_interval and the entry count is within
 * ip6_rt_max_size.  The effective expiry (ip6_rt_gc_expire) grows by
 * one on each pressure-triggered run, is reset to half the configured
 * timeout once the count drops below gc_thresh, and decays by
 * 1/2^elasticity on every call.  Returns non-zero while the entry
 * count still exceeds rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	/* use the accurate (slow) count after a GC run */
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2491
2492 static int ip6_convert_metrics(struct mx6_config *mxc,
2493                                const struct fib6_config *cfg)
2494 {
2495         struct net *net = cfg->fc_nlinfo.nl_net;
2496         bool ecn_ca = false;
2497         struct nlattr *nla;
2498         int remaining;
2499         u32 *mp;
2500
2501         if (!cfg->fc_mx)
2502                 return 0;
2503
2504         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2505         if (unlikely(!mp))
2506                 return -ENOMEM;
2507
2508         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2509                 int type = nla_type(nla);
2510                 u32 val;
2511
2512                 if (!type)
2513                         continue;
2514                 if (unlikely(type > RTAX_MAX))
2515                         goto err;
2516
2517                 if (type == RTAX_CC_ALGO) {
2518                         char tmp[TCP_CA_NAME_MAX];
2519
2520                         nla_strlcpy(tmp, nla, sizeof(tmp));
2521                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2522                         if (val == TCP_CA_UNSPEC)
2523                                 goto err;
2524                 } else {
2525                         val = nla_get_u32(nla);
2526                 }
2527                 if (type == RTAX_HOPLIMIT && val > 255)
2528                         val = 255;
2529                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2530                         goto err;
2531
2532                 mp[type - 1] = val;
2533                 __set_bit(type - 1, mxc->mx_valid);
2534         }
2535
2536         if (ecn_ca) {
2537                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2538                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2539         }
2540
2541         mxc->mx = mp;
2542         return 0;
2543  err:
2544         kfree(mp);
2545         return -EINVAL;
2546 }
2547
2548 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2549                                             struct fib6_config *cfg,
2550                                             const struct in6_addr *gw_addr,
2551                                             u32 tbid, int flags)
2552 {
2553         struct flowi6 fl6 = {
2554                 .flowi6_oif = cfg->fc_ifindex,
2555                 .daddr = *gw_addr,
2556                 .saddr = cfg->fc_prefsrc,
2557         };
2558         struct fib6_table *table;
2559         struct rt6_info *rt;
2560
2561         table = fib6_get_table(net, tbid);
2562         if (!table)
2563                 return NULL;
2564
2565         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2566                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2567
2568         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2569         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2570
2571         /* if table lookup failed, fall back to full lookup */
2572         if (rt == net->ipv6.ip6_null_entry) {
2573                 ip6_rt_put(rt);
2574                 rt = NULL;
2575         }
2576
2577         return rt;
2578 }
2579
2580 static int ip6_route_check_nh_onlink(struct net *net,
2581                                      struct fib6_config *cfg,
2582                                      const struct net_device *dev,
2583                                      struct netlink_ext_ack *extack)
2584 {
2585         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2586         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2587         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2588         struct rt6_info *grt;
2589         int err;
2590
2591         err = 0;
2592         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2593         if (grt) {
2594                 if (!grt->dst.error &&
2595                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2596                         NL_SET_ERR_MSG(extack,
2597                                        "Nexthop has invalid gateway or device mismatch");
2598                         err = -EINVAL;
2599                 }
2600
2601                 ip6_rt_put(grt);
2602         }
2603
2604         return err;
2605 }
2606
/* Resolve the nexthop gateway of @cfg via a route lookup and, when the
 * caller did not pin an egress device, derive *_dev and *idev from the
 * route that was found.
 *
 * When the device/idev are adopted from the lookup, references are
 * taken via dev_hold()/in6_dev_hold() and ownership passes to the
 * caller.  Returns 0 when the gateway resolves to an on-link route
 * (no recursive gateway), -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		/* first try the table the new route is destined for */
		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* discard hits that recurse through another
			 * gateway or egress a different device than
			 * the one the caller specified
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	/* fall back to a full (all tables) lookup */
	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt device and idev from the resolved route;
		 * references are handed to the caller
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2657
/* Validate the gateway of a route being added and resolve its egress
 * device when not already given.
 *
 * Ensures the gateway is not a local address, that non-link-local
 * gateways are unicast or IPv4-mapped, and that the final egress
 * device exists and is not loopback.  May update *_dev / *idev via
 * ip6_route_check_nh() (which takes the references).
 *
 * Returns 0 on success or a negative errno with extack populated.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* for link-local gateways the local-address check must consider
	 * the specific device; otherwise any device match counts
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2730
/* Build (but do not insert) a new rt6_info from a fib6_config.
 *
 * Validates the config, resolves the egress device/idev, allocates the
 * dst, sets up input/output handlers, expiry, lwtunnel state and flag
 * bits.  Reject-type routes (and true routes via loopback, which are
 * promoted to rejects) short-circuit to install_route.
 *
 * Returns the new route with dev/idev/table references attached, or an
 * ERR_PTR on failure (all partially-taken references released).
 * Insertion into the FIB is the caller's job (see ip6_route_add()).
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		/* references taken here are released on the out path */
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* onlink nexthops demand an explicit, up device */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			/* tolerate the missing flag but warn */
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* select the input handler by destination class */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;
	rt->rt6i_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* choose discard handlers and error code per route type */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may replace dev/idev with ones derived from the gateway */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->rt6i_gateway = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		/* the preferred source must be configured on the device */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* carrier-less devices get LINKDOWN unless route is local/anycast */
	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	/* release everything acquired above before reporting failure */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_release_immediate(&rt->dst);

	return ERR_PTR(err);
}
2971
2972 int ip6_route_add(struct fib6_config *cfg,
2973                   struct netlink_ext_ack *extack)
2974 {
2975         struct mx6_config mxc = { .mx = NULL, };
2976         struct rt6_info *rt;
2977         int err;
2978
2979         rt = ip6_route_info_create(cfg, extack);
2980         if (IS_ERR(rt)) {
2981                 err = PTR_ERR(rt);
2982                 rt = NULL;
2983                 goto out;
2984         }
2985
2986         err = ip6_convert_metrics(&mxc, cfg);
2987         if (err)
2988                 goto out;
2989
2990         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2991
2992         kfree(mxc.mx);
2993
2994         return err;
2995 out:
2996         if (rt)
2997                 dst_release_immediate(&rt->dst);
2998
2999         return err;
3000 }
3001
3002 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
3003 {
3004         int err;
3005         struct fib6_table *table;
3006         struct net *net = dev_net(rt->dst.dev);
3007
3008         if (rt == net->ipv6.ip6_null_entry) {
3009                 err = -ENOENT;
3010                 goto out;
3011         }
3012
3013         table = rt->rt6i_table;
3014         spin_lock_bh(&table->tb6_lock);
3015         err = fib6_del(rt, info);
3016         spin_unlock_bh(&table->tb6_lock);
3017
3018 out:
3019         ip6_rt_put(rt);
3020         return err;
3021 }
3022
3023 int ip6_del_rt(struct rt6_info *rt)
3024 {
3025         struct nl_info info = {
3026                 .nl_net = dev_net(rt->dst.dev),
3027         };
3028         return __ip6_del_rt(rt, &info);
3029 }
3030
/* Delete a multipath route: @rt plus all of its sibling nexthops when
 * cfg->fc_delete_all_nh is set.
 *
 * Attempts to emit a single RTM_DELROUTE notification covering every
 * hop (suppressing the per-hop notifications via info->skip_notify);
 * falls back to per-hop notifications if the skb cannot be built.
 * Consumes the caller's reference on @rt.  Returns 0 or a negative
 * errno from fib6_del().
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe variant: fib6_del unlinks each entry as we go */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* notify only after the lock is dropped */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3082
/* Delete route(s) matching a fib6_config from its table.
 *
 * Walks the FIB node for the destination prefix under RCU, matching on
 * device, gateway, metric and protocol when those are specified.  With
 * RTF_CACHE set, matches against the exception (cached-route) table
 * instead.  On a match the route's refcount is taken before leaving
 * the RCU section so it can be safely deleted.
 *
 * Returns 0 on success, -ESRCH when nothing matched, or an errno from
 * the deletion path.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				/* redirect match to the cached clone */
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* pin the entry before dropping RCU; bail if it
			 * is already being released
			 */
			if (!dst_hold_safe(&rt->dst))
				break;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3139
/* Process an ICMPv6 Redirect received for @dst.
 *
 * Validates the redirect message (length, multicast destination,
 * link-local target, interface acceptance policy and ND options),
 * updates the neighbour cache for the new first hop, and installs a
 * cached clone of @dst pointing at the new gateway into the exception
 * table.  All failures return silently (rate-limited debug only).
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers and interfaces with accept_redirects off ignore these */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* clone the affected route and point it at the new gateway */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	/* let interested parties (e.g. offload drivers) know */
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3257
3258 /*
3259  *      Misc support functions
3260  */
3261
/* Link a (cached) route @rt to its parent @from: hold a reference on
 * the parent and share its metrics read-only.  @from must itself be a
 * FIB entry, not another clone (hence the BUG_ON on a chained parent).
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	/* hold the parent before publishing the pointer */
	dst_hold(&from->dst);
	rt->from = from;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
3271
/* Initialize a freshly-allocated route @rt as a copy of @ort.
 *
 * Copies handlers, addresses, flags and table linkage; takes a
 * reference on the idev and the lwtunnel state, and links @rt to @ort
 * via rt6_set_from() so metrics are shared with the parent.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	/* holds a ref on ort and shares its metrics */
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3293
3294 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA route-information route (RTF_ROUTEINFO|RTF_GATEWAY)
 * for @prefix/@prefixlen via @gwaddr on @dev.
 *
 * Walks the FIB node under RCU; on a match a reference is taken via
 * ip6_hold_safe() before returning.  Returns NULL when the table or
 * no matching route exists.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* the iterator macro binds `rt' to each entry in turn */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* pin the match before leaving the RCU section */
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3329
3330 static struct rt6_info *rt6_add_route_info(struct net *net,
3331                                            const struct in6_addr *prefix, int prefixlen,
3332                                            const struct in6_addr *gwaddr,
3333                                            struct net_device *dev,
3334                                            unsigned int pref)
3335 {
3336         struct fib6_config cfg = {
3337                 .fc_metric      = IP6_RT_PRIO_USER,
3338                 .fc_ifindex     = dev->ifindex,
3339                 .fc_dst_len     = prefixlen,
3340                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3341                                   RTF_UP | RTF_PREF(pref),
3342                 .fc_protocol = RTPROT_RA,
3343                 .fc_nlinfo.portid = 0,
3344                 .fc_nlinfo.nlh = NULL,
3345                 .fc_nlinfo.nl_net = net,
3346         };
3347
3348         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3349         cfg.fc_dst = *prefix;
3350         cfg.fc_gateway = *gwaddr;
3351
3352         /* We should treat it as a default route if prefix length is 0. */
3353         if (!prefixlen)
3354                 cfg.fc_flags |= RTF_DEFAULT;
3355
3356         ip6_route_add(&cfg, NULL);
3357
3358         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3359 }
3360 #endif
3361
/* Find the RA-learned default route via gateway @addr on @dev.
 * Returns the route with a reference held, or NULL.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* Scan the table root's route list; rt is the iteration cursor and
	 * is NULL if the loop completes without a break.
	 */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}
3384
/* Install a default router learned from a Router Advertisement.
 * @pref is the RFC 4191 router preference.  On success the owning table
 * is flagged RT6_TABLE_HAS_DFLT_ROUTER so purge can skip other tables.
 * Returns the installed route with a reference held (via
 * rt6_get_dflt_router), or NULL if the add failed.
 */
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		/* RTF_EXPIRES: default routers are subject to RA lifetime */
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = dev_net(dev),
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, NULL)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(gwaddr, dev);
}
3413
/* Remove all RA-learned (RTF_DEFAULT|RTF_ADDRCONF) routes from @table,
 * except on interfaces with accept_ra == 2 (always accept RAs, even when
 * forwarding).  ip6_del_rt() cannot run under rcu_read_lock, so the RCU
 * section is dropped for each deletion and the scan restarts from the
 * top of the list.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* dst_hold_safe() fails when the route is already
			 * being released; either way drop RCU and rescan.
			 */
			if (dst_hold_safe(&rt->dst)) {
				rcu_read_unlock();
				ip6_del_rt(rt);
			} else {
				rcu_read_unlock();
			}
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3436
/* Purge RA-learned default routers from every fib6 table in @net that is
 * marked as holding one (RT6_TABLE_HAS_DFLT_ROUTER).
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	/* Walk every hash bucket of the per-netns fib table hash */
	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(table);
		}
	}

	rcu_read_unlock();
}
3455
/* Translate a legacy ioctl struct in6_rtmsg into the internal
 * struct fib6_config used by ip6_route_add()/ip6_route_del().
 */
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

	/* If the ifindex belongs to an l3mdev (VRF) use its table,
	 * otherwise the main table.
	 */
	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN;
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;

	cfg->fc_nlinfo.nl_net = net;

	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
}
3477
/* Handle the legacy SIOCADDRT/SIOCDELRT route ioctls.
 * Requires CAP_NET_ADMIN in the netns' user namespace.
 * Returns 0 on success or a negative errno.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		/* copy_from_user() returns the number of bytes NOT copied,
		 * so any non-zero result is a fault.
		 */
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		/* Route modification is serialized under the RTNL lock */
		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}
3514
3515 /*
3516  *      Drop the packet on the floor
3517  */
3518
/* Drop @skb, bump the appropriate SNMP counter and send an ICMPv6
 * Destination Unreachable with @code back to the source.
 * @ipstats_mib_noroutes selects between the input and output
 * no-route MIB counters.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		/* Unspecified destination counts as an address error,
		 * not a routing failure.
		 */
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3541
/* dst input handler for blackhole/unreachable routes on the RX path */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3546
/* dst output handler for blackhole/unreachable routes on the TX path */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* point skb->dev at the route's device so the ICMP error and
	 * stats are attributed correctly
	 */
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3552
/* dst input handler for prohibit routes: drop with "administratively
 * prohibited" ICMP code on the RX path
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3557
/* dst output handler for prohibit routes on the TX path */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3563
3564 /*
3565  *      Allocate a dst for local (unicast / anycast) address.
3566  */
3567
/* Allocate a host (/128) route for a local unicast or anycast address
 * owned by @idev.  The route is NOT inserted into the FIB here; the
 * caller does that.  Returns the new rt6_info or ERR_PTR(-ENOMEM).
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	/* DST_NOCOUNT: address routes don't count against the dst gc limit */
	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	/* the route holds a reference on the inet6_dev */
	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	/* gateway is set to the address itself for local routes */
	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	/* local routes live in the l3mdev table if the device is enslaved */
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
3603
3604 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc(), passed through
 * fib6_clean_all() as an opaque pointer.
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device being cleaned; NULL matches any */
	struct net *net;	/* owning network namespace */
	struct in6_addr *addr;	/* preferred source address being removed */
};
3610
3611 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3612 {
3613         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3614         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3615         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3616
3617         if (((void *)rt->dst.dev == dev || !dev) &&
3618             rt != net->ipv6.ip6_null_entry &&
3619             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3620                 spin_lock_bh(&rt6_exception_lock);
3621                 /* remove prefsrc entry */
3622                 rt->rt6i_prefsrc.plen = 0;
3623                 /* need to update cache as well */
3624                 rt6_exceptions_remove_prefsrc(rt);
3625                 spin_unlock_bh(&rt6_exception_lock);
3626         }
3627         return 0;
3628 }
3629
/* Called when address @ifp is deleted: walk all fib6 tables and drop any
 * prefsrc entries that reference it.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3640
3641 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3642
3643 /* Remove routers and update dst entries when gateway turn into host. */
/* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all() callback: returning -1 asks the walker to delete the
 * route (an RA-learned router route via @gateway); returning 0 keeps it
 * after scrubbing matching cached exceptions.
 */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3661
/* Drop all RA router routes via @gateway across every table in @net;
 * used when a former router address becomes a plain host address.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3666
/* Argument bundle for the fib6_ifup/fib6_ifdown walkers.  The union
 * carries either the nexthop flags to clear (sync-up) or the netdev
 * notifier event (sync-down), depending on the caller.
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3674
/* Return the first sibling of @rt's ECMP group in fib6-node order, i.e.
 * the first route on the node's leaf list with the same metric that
 * qualifies for ECMP, or NULL if none is found.  Must be called with the
 * table lock held (asserted via lockdep in the rcu_dereference calls).
 */
static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
{
	struct rt6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->rt6i_node,
			lockdep_is_held(&rt->rt6i_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->rt6i_table->tb6_lock));
	while (iter) {
		if (iter->rt6i_metric == rt->rt6i_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->rt6_next,
				lockdep_is_held(&rt->rt6i_table->tb6_lock));
	}

	return NULL;
}
3694
3695 static bool rt6_is_dead(const struct rt6_info *rt)
3696 {
3697         if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3698             (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3699              rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3700                 return true;
3701
3702         return false;
3703 }
3704
3705 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3706 {
3707         struct rt6_info *iter;
3708         int total = 0;
3709
3710         if (!rt6_is_dead(rt))
3711                 total += rt->rt6i_nh_weight;
3712
3713         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3714                 if (!rt6_is_dead(iter))
3715                         total += iter->rt6i_nh_weight;
3716         }
3717
3718         return total;
3719 }
3720
/* Assign @rt's hash-threshold upper bound.  @weight accumulates the
 * running weight across the ECMP group; the bound is the cumulative
 * fraction of @total scaled into 31-bit fixed point, minus one so a
 * lookup hash <= bound selects this nexthop.  Dead nexthops get -1 so
 * they are never selected.
 */
static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->rt6i_nh_weight;
		/* (cumulative_weight / total) << 31, rounded to nearest */
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	/* atomic: read locklessly by the datapath during selection */
	atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
}
3732
3733 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3734 {
3735         struct rt6_info *iter;
3736         int weight = 0;
3737
3738         rt6_upper_bound_set(rt, &weight, total);
3739
3740         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3741                 rt6_upper_bound_set(iter, &weight, total);
3742 }
3743
/* Recompute hash-threshold bounds for @rt's ECMP group after a nexthop
 * changed state.  No-op for non-multipath routes and for groups that are
 * about to be flushed entirely.
 */
void rt6_multipath_rebalance(struct rt6_info *rt)
{
	struct rt6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->rt6i_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
3767
/* fib6_clean_all() callback for rt6_sync_up(): clear the given nexthop
 * flags on routes using the device that came up, bump the tree serial
 * number so cached lookups revalidate, and rebalance the ECMP group.
 * Always returns 0 (never deletes).
 */
static int fib6_ifup(struct rt6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
		rt->rt6i_nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}
3781
/* Clear @nh_flags from all routes' nexthops that use @dev (device came
 * up / carrier returned).
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	/* When clearing DEAD and the carrier is up, LINKDOWN can be
	 * cleared as well.
	 */
	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3796
3797 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3798                                    const struct net_device *dev)
3799 {
3800         struct rt6_info *iter;
3801
3802         if (rt->dst.dev == dev)
3803                 return true;
3804         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3805                 if (iter->dst.dev == dev)
3806                         return true;
3807
3808         return false;
3809 }
3810
3811 static void rt6_multipath_flush(struct rt6_info *rt)
3812 {
3813         struct rt6_info *iter;
3814
3815         rt->should_flush = 1;
3816         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3817                 iter->should_flush = 1;
3818 }
3819
3820 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3821                                              const struct net_device *down_dev)
3822 {
3823         struct rt6_info *iter;
3824         unsigned int dead = 0;
3825
3826         if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3827                 dead++;
3828         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3829                 if (iter->dst.dev == down_dev ||
3830                     iter->rt6i_nh_flags & RTNH_F_DEAD)
3831                         dead++;
3832
3833         return dead;
3834 }
3835
3836 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3837                                        const struct net_device *dev,
3838                                        unsigned int nh_flags)
3839 {
3840         struct rt6_info *iter;
3841
3842         if (rt->dst.dev == dev)
3843                 rt->rt6i_nh_flags |= nh_flags;
3844         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3845                 if (iter->dst.dev == dev)
3846                         iter->rt6i_nh_flags |= nh_flags;
3847 }
3848
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for rt6_sync_down_dev().  Return protocol:
 *   -1 : delete this route
 *   -2 : delete this route and skip notifying userspace
 *    0 : keep this route
 */
static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	const struct net *net = dev_net(dev);

	/* the null entry is shared and must never be touched */
	if (rt == net->ipv6.ip6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device going away: remove every route using it */
		return rt->dst.dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* non-multipath: simply delete if it uses the device */
		if (!rt->rt6i_nsiblings)
			return rt->dst.dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			/* if every nexthop in the group is now dead, flush
			 * the whole group; otherwise just mark the affected
			 * nexthops and rebalance
			 */
			count = rt6_multipath_dead_count(rt, dev);
			if (rt->rt6i_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: flag link-down on plain routes only */
		if (rt->dst.dev != dev ||
		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
3892
/* Propagate a netdev notifier @event (UNREGISTER/DOWN/CHANGE) for @dev
 * to every fib6 table via the fib6_ifdown walker.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
3904
/* Full IPv6 teardown for @dev: sync routes for @event, flush uncached
 * (pcpu/exception) routes referencing the device, and drop its
 * neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
3911
/* Argument bundle for the rt6_mtu_change_route() walker. */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* new MTU value */
};
3916
/* fib6_clean_all() callback: propagate a device MTU change to matching
 * routes and their cached exception entries.  Always returns 0.
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->dst.dev == arg->dev &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		/* the exception lock also guards the metric update so the
		 * route and its cached entries change together
		 */
		spin_lock_bh(&rt6_exception_lock);
		if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
		    rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3948
/* Walk all fib6 tables and update route MTUs after @dev's MTU changed
 * to @mtu.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
3958
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE.
 * Attributes not listed (e.g. RTA_DST/RTA_SRC) are length-checked
 * manually in rtm_to_fib6_config().
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
	[RTA_ENCAP]             = { .type = NLA_NESTED },
	[RTA_EXPIRES]           = { .type = NLA_U32 },
	[RTA_UID]               = { .type = NLA_U32 },
	[RTA_MARK]              = { .type = NLA_U32 },
};
3973
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * struct fib6_config.  Returns 0 on success or a negative errno
 * (validation failures report through @extack where the helpers
 * support it).
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	/* fixed header fields first; RTA_TABLE below may override fc_table */
	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* map reject-style route types to RTF_REJECT */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	/* record the requester for notifications */
	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	/* RTA_DST/RTA_SRC carry only the prefix bytes implied by the
	 * header's prefix lengths; check the attribute is long enough.
	 */
	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	/* metrics and multipath payloads are kept as raw pointers into
	 * the message; validated later by their consumers
	 */
	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		/* unknown preference values fall back to medium (RFC 4191) */
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* an infinite timeout means the route simply never expires */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4099
/* Per-nexthop staging entry used while parsing an RTA_MULTIPATH route:
 * each parsed nexthop is built into an rt6_info plus its config and
 * metrics before the whole group is inserted.
 */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop config snapshot */
	struct mx6_config mxc;		/* converted metrics */
	struct list_head next;		/* link in rt6_nh_list */
};
4106
4107 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4108 {
4109         struct rt6_nh *nh;
4110
4111         list_for_each_entry(nh, rt6_nh_list, next) {
4112                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4113                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4114                         nh->r_cfg.fc_ifindex);
4115         }
4116 }
4117
/* Append a parsed nexthop (@rt with config @r_cfg) to @rt6_nh_list.
 * Returns 0 on success, -EEXIST if an equivalent nexthop is already
 * queued, or -ENOMEM/metric-conversion errors.  On success the list
 * entry takes over @rt; on failure the caller still owns it.
 */
static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->rt6_info = rt;
	err = ip6_convert_metrics(&nh->mxc, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
4144
/* Send the RTM_NEWROUTE notification for a multipath add/append.
 * @rt is the first route inserted, @rt_last the last.
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
4165
/* Add a multipath route from an RTM_NEWROUTE message carrying RTA_MULTIPATH.
 *
 * The rtnexthop entries in cfg->fc_mp are first parsed into a private list
 * of rt6_info structs (rt6_nh_list); the routes are then inserted one by one.
 * On any insertion failure the routes already inserted are deleted again so
 * the operation is all-or-nothing.  Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;		/* number of nexthops successfully inserted */
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	/* flags forwarded to the userspace notification */
	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* per-nexthop config starts as a copy of the route config,
		 * then ifindex/gateway/encap are overridden from the entry
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		/* rtnh_hops stores weight - 1 on the wire */
		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			/* remember where insertion stopped so the rollback
			 * below only deletes routes actually added
			 */
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any routes never inserted, plus the list bookkeeping */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4299
4300 static int ip6_route_multipath_del(struct fib6_config *cfg,
4301                                    struct netlink_ext_ack *extack)
4302 {
4303         struct fib6_config r_cfg;
4304         struct rtnexthop *rtnh;
4305         int remaining;
4306         int attrlen;
4307         int err = 1, last_err = 0;
4308
4309         remaining = cfg->fc_mp_len;
4310         rtnh = (struct rtnexthop *)cfg->fc_mp;
4311
4312         /* Parse a Multipath Entry */
4313         while (rtnh_ok(rtnh, remaining)) {
4314                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4315                 if (rtnh->rtnh_ifindex)
4316                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4317
4318                 attrlen = rtnh_attrlen(rtnh);
4319                 if (attrlen > 0) {
4320                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4321
4322                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4323                         if (nla) {
4324                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4325                                 r_cfg.fc_flags |= RTF_GATEWAY;
4326                         }
4327                 }
4328                 err = ip6_route_del(&r_cfg, extack);
4329                 if (err)
4330                         last_err = err;
4331
4332                 rtnh = rtnh_next(rtnh, &remaining);
4333         }
4334
4335         return last_err;
4336 }
4337
4338 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4339                               struct netlink_ext_ack *extack)
4340 {
4341         struct fib6_config cfg;
4342         int err;
4343
4344         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4345         if (err < 0)
4346                 return err;
4347
4348         if (cfg.fc_mp)
4349                 return ip6_route_multipath_del(&cfg, extack);
4350         else {
4351                 cfg.fc_delete_all_nh = 1;
4352                 return ip6_route_del(&cfg, extack);
4353         }
4354 }
4355
4356 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4357                               struct netlink_ext_ack *extack)
4358 {
4359         struct fib6_config cfg;
4360         int err;
4361
4362         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4363         if (err < 0)
4364                 return err;
4365
4366         if (cfg.fc_mp)
4367                 return ip6_route_multipath_add(&cfg, extack);
4368         else
4369                 return ip6_route_add(&cfg, extack);
4370 }
4371
4372 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4373 {
4374         int nexthop_len = 0;
4375
4376         if (rt->rt6i_nsiblings) {
4377                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4378                             + NLA_ALIGN(sizeof(struct rtnexthop))
4379                             + nla_total_size(16) /* RTA_GATEWAY */
4380                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
4381
4382                 nexthop_len *= rt->rt6i_nsiblings;
4383         }
4384
4385         return NLMSG_ALIGN(sizeof(struct rtmsg))
4386                + nla_total_size(16) /* RTA_SRC */
4387                + nla_total_size(16) /* RTA_DST */
4388                + nla_total_size(16) /* RTA_GATEWAY */
4389                + nla_total_size(16) /* RTA_PREFSRC */
4390                + nla_total_size(4) /* RTA_TABLE */
4391                + nla_total_size(4) /* RTA_IIF */
4392                + nla_total_size(4) /* RTA_OIF */
4393                + nla_total_size(4) /* RTA_PRIORITY */
4394                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4395                + nla_total_size(sizeof(struct rta_cacheinfo))
4396                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4397                + nla_total_size(1) /* RTA_PREF */
4398                + lwtunnel_get_encap_size(rt->dst.lwtstate)
4399                + nexthop_len;
4400 }
4401
4402 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4403                             unsigned int *flags, bool skip_oif)
4404 {
4405         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4406                 *flags |= RTNH_F_DEAD;
4407
4408         if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4409                 *flags |= RTNH_F_LINKDOWN;
4410                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4411                         *flags |= RTNH_F_DEAD;
4412         }
4413
4414         if (rt->rt6i_flags & RTF_GATEWAY) {
4415                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4416                         goto nla_put_failure;
4417         }
4418
4419         *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4420         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4421                 *flags |= RTNH_F_OFFLOAD;
4422
4423         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4424         if (!skip_oif && rt->dst.dev &&
4425             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4426                 goto nla_put_failure;
4427
4428         if (rt->dst.lwtstate &&
4429             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4430                 goto nla_put_failure;
4431
4432         return 0;
4433
4434 nla_put_failure:
4435         return -EMSGSIZE;
4436 }
4437
4438 /* add multipath next hop */
4439 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4440 {
4441         struct rtnexthop *rtnh;
4442         unsigned int flags = 0;
4443
4444         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4445         if (!rtnh)
4446                 goto nla_put_failure;
4447
4448         rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4449         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4450
4451         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4452                 goto nla_put_failure;
4453
4454         rtnh->rtnh_flags = flags;
4455
4456         /* length of rtnetlink header + attributes */
4457         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4458
4459         return 0;
4460
4461 nla_put_failure:
4462         return -EMSGSIZE;
4463 }
4464
/* Build one RTM_*ROUTE message for @rt into @skb.
 *
 * @dst/@src: when non-NULL, the message reports a specific /128
 *	destination/source address instead of the route's own prefix
 *	(used for route-get replies on cloned lookups).
 * @iif: incoming interface index to report (0 for none).
 * @type/@portid/@seq/@flags: netlink header fields.
 *
 * Returns 0 on success or -EMSGSIZE; on failure the partially written
 * header is cancelled so the skb is left consistent.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* map the dst error of reject routes to the rtnetlink route type */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	/* cached (cloned) routes are flagged so dumps can filter them */
	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		/* caller supplied an exact destination: report it as /128 */
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved via the mroute table;
		 * ip6mr_get_route() fills the message itself on success
		 */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* report metrics, overriding MTU with the cached path MTU if set */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	/* remaining lifetime in jiffies; 0 means the route never expires */
	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4618
4619 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4620 {
4621         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4622         struct net *net = arg->net;
4623
4624         if (rt == net->ipv6.ip6_null_entry)
4625                 return 0;
4626
4627         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4628                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4629
4630                 /* user wants prefix routes only */
4631                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4632                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4633                         /* success since this is not a prefix route */
4634                         return 1;
4635                 }
4636         }
4637
4638         return rt6_fill_node(net,
4639                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4640                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4641                      NLM_F_MULTI);
4642 }
4643
/* RTM_GETROUTE handler: perform a route lookup described by the netlink
 * request and unicast the resulting route back to the requester.
 *
 * With RTM_F_FIB_MATCH set, the FIB entry the lookup resolved from is
 * reported instead of the (possibly cloned) dst entry.
 * Returns 0 on success or a negative errno.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	/* reject truncated addresses rather than reading past the attr */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		/* input-side lookup: resolve as if the packet arrived on
		 * the given interface
		 */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		/* output-side lookup */
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* report the originating FIB entry rather than the clone */
	if (fibmatch && rt->from) {
		struct rt6_info *ort = rt->from;

		dst_hold(&ort->dst);
		ip6_rt_put(rt);
		rt = ort;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb now owns the route reference; released when skb is freed */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4771
4772 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4773                      unsigned int nlm_flags)
4774 {
4775         struct sk_buff *skb;
4776         struct net *net = info->nl_net;
4777         u32 seq;
4778         int err;
4779
4780         err = -ENOBUFS;
4781         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4782
4783         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4784         if (!skb)
4785                 goto errout;
4786
4787         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4788                                 event, info->portid, seq, nlm_flags);
4789         if (err < 0) {
4790                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4791                 WARN_ON(err == -EMSGSIZE);
4792                 kfree_skb(skb);
4793                 goto errout;
4794         }
4795         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4796                     info->nlh, gfp_any());
4797         return;
4798 errout:
4799         if (err < 0)
4800                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4801 }
4802
/* Netdevice notifier: bind/unbind the per-netns special routes (null,
 * and with multiple tables also prohibit/blackhole) to the loopback
 * device as it registers and unregisters.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the loopback device hosts the template routes */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		/* attach the special routes; in6_dev_get() takes a
		 * reference that is dropped again on unregister
		 */
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4835
4836 /*
4837  *      /proc
4838  */
4839
4840 #ifdef CONFIG_PROC_FS
4841
/* File operations for the IPv6 route table seq file; ipv6_route_open is
 * defined earlier in this file, release is per-net aware.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4848
/* Emit the per-netns FIB statistics as one line of seven hex fields
 * (nodes, route nodes, allocations, entries, cache entries, dst entries,
 * discarded routes) — presumably /proc/net/rt6_stats; registration is
 * elsewhere, confirm against the caller.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
4863
/* open() hook: single-record, per-net seq file showing rt6 statistics */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4868
/* File operations for the rt6 statistics single-shot seq file */
static const struct file_operations rt6_stats_seq_fops = {
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
4875 #endif  /* CONFIG_PROC_FS */
4876
4877 #ifdef CONFIG_SYSCTL
4878
4879 static
4880 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4881                               void __user *buffer, size_t *lenp, loff_t *ppos)
4882 {
4883         struct net *net;
4884         int delay;
4885         if (!write)
4886                 return -EINVAL;
4887
4888         net = (struct net *)ctl->extra1;
4889         delay = net->ipv6.sysctl.flush_delay;
4890         proc_dointvec(ctl, write, buffer, lenp, ppos);
4891         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4892         return 0;
4893 }
4894
/* Template for the per-netns net.ipv6.route.* sysctl table.  Entry order
 * matters: ipv6_route_sysctl_init() rewrites .data by array index, so any
 * addition or reordering here must be mirrored there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* write-only trigger, see ipv6_sysctl_rtcache_flush() */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, exposed in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
4968
/* Clone the sysctl template for a new network namespace and repoint each
 * entry's .data at that namespace's own storage.  The numeric indices must
 * stay in sync with the order of ipv6_route_table_template[].
 * Returns the table (to be freed by the caller on teardown) or NULL on
 * allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* handler needs the owning netns */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
4997 #endif
4998
/* Per-netns setup for the IPv6 routing engine: copies the template
 * dst_ops, clones the special route entries so their dst.ops can point
 * at the per-netns dst_ops, and seeds the sysctl knobs with defaults.
 * Returns 0 on success or -ENOMEM, unwinding completed steps on error.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* Each namespace gets its own copy of the "null" (unreachable)
	 * route entry; metrics are shared, read-only template metrics.
	 */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	/* Prohibit/blackhole entries only exist with policy routing. */
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Defaults for the per-netns routing sysctls; runtime-tunable
	 * via the table built by ipv6_route_sysctl_init().
	 */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	/* minimum MTU minus 40 bytes IPv6 header and 20 bytes TCP header */
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind, in reverse order of allocation. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5065
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and release the dst entry counters last.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5075
5076 static int __net_init ip6_route_net_init_late(struct net *net)
5077 {
5078 #ifdef CONFIG_PROC_FS
5079         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5080         proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5081 #endif
5082         return 0;
5083 }
5084
/* Late per-netns teardown: remove the /proc/net entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5092
/* Core per-netns routing state; registered early in ip6_route_init(). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5097
5098 static int __net_init ipv6_inetpeer_init(struct net *net)
5099 {
5100         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5101
5102         if (!bp)
5103                 return -ENOMEM;
5104         inet_peer_base_init(bp);
5105         net->ipv6.peers = bp;
5106         return 0;
5107 }
5108
5109 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5110 {
5111         struct inet_peer_base *bp = net->ipv6.peers;
5112
5113         net->ipv6.peers = NULL;
5114         inetpeer_invalidate_tree(bp);
5115         kfree(bp);
5116 }
5117
/* Per-netns inetpeer base; see ipv6_inetpeer_init()/ipv6_inetpeer_exit(). */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5122
/* /proc entries, registered after fib6/xfrm6/rules in ip6_route_init(). */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5127
/* Netdevice event hook.  NOTE(review): the priority is expressed
 * relative to ADDRCONF_NOTIFY_PRIORITY, presumably to order this
 * callback against the addrconf notifier — confirm against addrconf.c.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5132
/* Attach the loopback device to init_net's special route entries.
 * Called separately from ip6_route_init() because the entries are
 * cloned before the loopback device registration completes.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5147
/* Subsystem init for IPv6 routing.  Sets up the rt6_info slab cache,
 * the pernet subsystems, fib6, xfrm6, policy rules, the rtnetlink
 * handlers and the netdevice notifier, in dependency order.  On
 * failure, every step already completed is unwound by the goto chain
 * at the bottom.  Returns 0 or a negative errno.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same slab as regular rt6_info. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* The three failures below share one unwind target because
	 * rtnl_unregister_all() removes whichever handlers registered.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Initialise the per-cpu uncached-route lists. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind, in reverse order of setup.  NOTE(review): the
	 * "xfrm6_init" and "fib6_rules_init" labels are named after the
	 * step that FAILED, not the one being undone.
	 */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5239
/* Subsystem teardown: undo ip6_route_init() in reverse order. */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}