f980f904d6ea5a04586e09c184431b16d269151b
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Outcome of a nexthop reachability check (see rt6_check_neigh()).
 * Negative values reject the route with increasing severity;
 * RT6_NUD_FAIL_DO_RR additionally asks the caller to round-robin to a
 * sibling route (see find_match()).
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* reject the route outright */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour is NUD_FAILED; probe */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neigh entry; rotate to next route */
	RT6_NUD_SUCCEED = 1		/* nexthop (probably) reachable */
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/* Per-CPU list of "uncached" rt6_info entries, i.e. routes that are
 * not linked into the FIB tree.  Each CPU owns one list so additions
 * stay CPU-local; the spinlock still serialises against cross-CPU
 * walkers such as rt6_uncached_list_flush_dev().
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
/* Link @rt onto the current CPU's uncached list, remembering which
 * list was used so rt6_uncached_list_del() can unlink it even when
 * the route is destroyed on a different CPU.
 */
static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
141
/* Unlink @rt from the uncached list it was queued on (if any) and
 * drop the namespace's uncached-route counter.  A route that was
 * never added has an empty list node and is left untouched.
 * NOTE(review): the matching fib_rt_uncache increment is done by the
 * callers that add routes, not by rt6_uncached_list_add() itself.
 */
static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
154
/* When @dev is going away, migrate every uncached route that still
 * references it over to the namespace's loopback device so the device
 * refcount can drop to zero.  Both the inet6_dev reference and the
 * dst.dev reference are moved; the route itself stays on its per-CPU
 * list.  Nothing to do when the loopback device itself is the one
 * being removed (namespace teardown handles that case).
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* take the new reference before dropping the old one */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
186
/* A per-cpu clone (RTF_PCPU) does not own metrics; writes go to the
 * metric block of its parent ("from") route.
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(&rt->from->dst);
}
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
/* dst_ops->neigh_lookup hook: resolve the neighbour entry for this
 * route, keyed by the gateway (or destination, see
 * choose_neigh_daddr()).  Creates the entry if it does not exist.
 */
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}
230
/* dst_ops->confirm_neigh hook: mark the nexthop neighbour as recently
 * confirmed.  Skipped when no key address can be derived, on devices
 * that do no neighbour discovery (IFF_NOARP/IFF_LOOPBACK), and for
 * multicast destinations.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
245
/* dst_ops template for regular IPv6 routes; copied into each
 * namespace's ipv6.ip6_dst_ops at init time.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
/* Blackhole dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
276
/* Blackhole dsts deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
281
/* dst_ops for blackhole dsts (e.g. created by ip6_dst_blackhole());
 * PMTU updates and redirects are no-ops by design.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
293
/* Metrics used by the special route templates below; the hop limit is
 * explicitly left at 0 (unset).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
297
/* Template for the per-namespace "null" route: matches nothing useful
 * and rejects packets with -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/* Template for "prohibit" policy-routing entries: packets are
 * rejected with -EACCES (administratively prohibited).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
329
/* Template for "blackhole" policy-routing entries: packets are
 * silently discarded (dst_discard), error code -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
344
345 #endif
346
/* Zero all rt6_info fields that follow the embedded dst_entry (dst
 * itself was already set up by dst_alloc()), then reset the list
 * heads that must start out as valid empty lists.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* "dst + 1" is the first byte after the embedded dst_entry */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
355
/* Allocate a dst with ip6_dst_ops, zero-init the rt6_info portion and
 * account it in the per-namespace fib_rt_alloc statistic.  Returns
 * NULL on allocation failure.
 */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
371
/* Public route allocator: like __ip6_dst_alloc() but also allocates
 * the per-cpu clone array.  On per-cpu allocation failure the dst is
 * released again and NULL is returned.
 */
struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (!rt->rt6i_pcpu) {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
389
/* dst_ops->destroy hook: release everything a rt6_info owns -
 * metrics, the per-cpu clone array, uncached-list membership, the
 * inet6_dev reference, the exception bucket, and finally the
 * reference held on the parent ("from") route.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* no concurrent readers can remain at destroy time, hence "1" */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	rt->from = NULL;
	dst_release(&from->dst);
}
415
/* dst_ops->ifdown hook: when @dev goes down, repoint the route's
 * inet6_dev reference at the namespace's loopback device so the
 * original device can be released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
/* Like __rt6_check_expired(), but a clone without its own expiry is
 * also treated as expired when it has become obsolete or when its
 * parent ("from") route has itself expired (checked recursively).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			rt6_check_expired(rt->from);
	}
	return false;
}
452
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454                                              struct flowi6 *fl6, int oif,
455                                              int strict)
456 {
457         struct rt6_info *sibling, *next_sibling;
458         int route_choosen;
459
460         /* We might have already computed the hash for ICMPv6 errors. In such
461          * case it will always be non-zero. Otherwise now is the time to do it.
462          */
463         if (!fl6->mp_hash)
464                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465
466         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467         /* Don't change the route, if route_choosen == 0
468          * (siblings does not include ourself)
469          */
470         if (route_choosen)
471                 list_for_each_entry_safe(sibling, next_sibling,
472                                 &match->rt6i_siblings, rt6i_siblings) {
473                         route_choosen--;
474                         if (route_choosen == 0) {
475                                 struct inet6_dev *idev = sibling->rt6i_idev;
476
477                                 if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN &&
478                                     idev->cnf.ignore_routes_with_linkdown)
479                                         break;
480                                 if (rt6_score_route(sibling, oif, strict) < 0)
481                                         break;
482                                 match = sibling;
483                                 break;
484                         }
485                 }
486         return match;
487 }
488
489 /*
490  *      Route lookup. rcu_read_lock() should be held.
491  */
492
/* Walk the route list starting at @rt and pick the entry that best
 * matches the requested output interface (@oif) or, when no interface
 * was given, the source address (@saddr).  Loopback routes whose idev
 * matches @oif are remembered as a fallback.  With
 * RT6_LOOKUP_F_IFACE set a strict interface match is required and
 * failure yields the namespace's null entry.  Caller must hold
 * rcu_read_lock().
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* nothing to match against: keep the head route */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
539
540 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router-probe context: which address to solicit and on
 * which device (a device reference is held until
 * rt6_probe_deferred() runs).
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
546
/* Workqueue callback for rt6_probe(): send the deferred neighbour
 * solicitation to the target's solicited-node multicast address,
 * then drop the device reference taken when the work was queued and
 * free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
558
/* Kick off a Router Reachability Probe towards the route's gateway if
 * its neighbour entry is missing or has not been valid within
 * rtr_probe_interval.  The NS transmission itself is deferred to a
 * workqueue because this runs under rcu_read_lock_bh().
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* re-check state under the neigh lock; rate-limit via
		 * __neigh_set_probe_once() only when we will really probe
		 */
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);	/* dropped in rt6_probe_deferred() */
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
605 #else
/* Router probing requires CONFIG_IPV6_ROUTER_PREF; no-op otherwise. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
609 #endif
610
611 /*
612  * Default Router Selection (RFC 2461 6.3.6)
613  */
614 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
615 {
616         struct net_device *dev = rt->dst.dev;
617         if (!oif || dev->ifindex == oif)
618                 return 2;
619         if ((dev->flags & IFF_LOOPBACK) &&
620             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
621                 return 1;
622         return 0;
623 }
624
/* Judge nexthop reachability for route selection.  Routes without a
 * gateway (or flagged RTF_NONEXTHOP) always succeed.  Otherwise the
 * neighbour's NUD state decides: with CONFIG_IPV6_ROUTER_PREF only a
 * NUD_FAILED neighbour is penalised (RT6_NUD_FAIL_PROBE); without it,
 * a missing neighbour entry requests round-robin
 * (RT6_NUD_FAIL_DO_RR).
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
655
/* Compute a comparable score for @rt (default router selection, RFC
 * 2461 6.3.6).  The low bits hold the interface match from
 * rt6_check_dev(); router preference (when compiled in) is shifted in
 * above them.  Returns a negative rt6_nud_state when the route must
 * be rejected: wrong interface under RT6_LOOKUP_F_IFACE, or an
 * unreachable nexthop under RT6_LOOKUP_F_REACHABLE.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
674
/* Score @rt and, when it beats the best score seen so far (*mpri),
 * make it the new @match.  Expired routes and link-down routes (when
 * the idev asks to ignore them) never match.  RT6_NUD_FAIL_DO_RR
 * competes with the lowest valid score but sets *do_rr so the caller
 * rotates the round-robin pointer.  Probing is triggered whenever a
 * reachable nexthop is required.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (idev->cnf.ignore_routes_with_linkdown &&
	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
711
/* Scan all routes of @fn that share @metric for the best match.  The
 * scan starts at @rr_head (the node's round-robin pointer) and wraps
 * around via @leaf so every same-metric route is considered exactly
 * once.  A route with a different metric terminates each pass and is
 * remembered in @cont; if nothing matched at @metric, the scan
 * continues from @cont so lower-priority metrics still get a chance.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
750
/* Select the best route from fib6_node @fn for the given @oif/@strict
 * flags, rotating the node's round-robin pointer (fn->rr_ptr) when
 * find_rr_leaf() requests it.  Returns the namespace's null entry
 * when the node has no usable leaf or when @fn turns out to be an
 * intermediate node whose leaf belongs to a child (plen/fn_bit double
 * check below).  Caller must hold rcu_read_lock().
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.ip6_null_entry)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
800
801 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
802 {
803         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
804 }
805
806 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option received in a Router
 * Advertisement (RFC 4191): validate the option's length, prefix
 * length and preference, then add, refresh or remove the matching
 * RTF_ROUTEINFO route.  A zero lifetime deletes an existing route; an
 * infinite lifetime clears the route's expiry.  Returns 0 on success
 * or -EINVAL on malformed input.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix_len 0 means "default router" rather than a prefix route */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
880 #endif
881
/* Walk back up the fib6 tree from @fn after a failed lookup.
 *
 * At each parent, if a source-address subtree exists and @fn did not
 * come from it, descend into the subtree via fib6_lookup(); otherwise
 * continue with the parent itself.  Returns the first node that carries
 * route info (RTN_RTINFO), or NULL once the tree root is reached.
 *
 * Caller must hold rcu_read_lock().
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
899
900 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
901                           bool null_fallback)
902 {
903         struct rt6_info *rt = *prt;
904
905         if (dst_hold_safe(&rt->dst))
906                 return true;
907         if (null_fallback) {
908                 rt = net->ipv6.ip6_null_entry;
909                 dst_hold(&rt->dst);
910         } else {
911                 rt = NULL;
912         }
913         *prt = rt;
914         return false;
915 }
916
/* Simple (non-policy-routed) lookup within a single fib6 table.
 *
 * Walks the tree for (daddr, saddr), filters by device/flags, considers
 * multipath siblings when no output interface was specified, backtracks
 * on a miss, and finally prefers a matching cached exception route.
 *
 * Returns a held rt6_info; this is the null entry when nothing matched.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		/* no match at this node: retry from further up the tree */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	/* take a reference; on failure rt becomes the held null entry */
	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;

}
957
958 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
959                                     int flags)
960 {
961         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
962 }
963 EXPORT_SYMBOL_GPL(ip6_route_lookup);
964
965 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
966                             const struct in6_addr *saddr, int oif, int strict)
967 {
968         struct flowi6 fl6 = {
969                 .flowi6_oif = oif,
970                 .daddr = *daddr,
971         };
972         struct dst_entry *dst;
973         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
974
975         if (saddr) {
976                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
977                 flags |= RT6_LOOKUP_F_HAS_SADDR;
978         }
979
980         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
981         if (dst->error == 0)
982                 return (struct rt6_info *) dst;
983
984         dst_release(dst);
985
986         return NULL;
987 }
988 EXPORT_SYMBOL(rt6_lookup);
989
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */
995
996 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
997                         struct mx6_config *mxc,
998                         struct netlink_ext_ack *extack)
999 {
1000         int err;
1001         struct fib6_table *table;
1002
1003         table = rt->rt6i_table;
1004         spin_lock_bh(&table->tb6_lock);
1005         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1006         spin_unlock_bh(&table->tb6_lock);
1007
1008         return err;
1009 }
1010
1011 int ip6_ins_rt(struct rt6_info *rt)
1012 {
1013         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1014         struct mx6_config mxc = { .mx = NULL, };
1015
1016         /* Hold dst to account for the reference from the fib6 tree */
1017         dst_hold(&rt->dst);
1018         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1019 }
1020
/* Resolve which net_device a copy of @rt should use as its dst->dev.
 * called with rcu_lock held
 */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
1043
/* Allocate an RTF_CACHE clone of @ort keyed by (@daddr[, @saddr]).
 *
 * If @ort is itself a cache or per-cpu copy, the clone is taken from
 * its parent route instead.  The clone is a /128 host route; the dst
 * reference taken by the allocation is handed to the caller.
 * Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1086
1087 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1088 {
1089         struct net_device *dev;
1090         struct rt6_info *pcpu_rt;
1091
1092         rcu_read_lock();
1093         dev = ip6_rt_get_dev_rcu(rt);
1094         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1095         rcu_read_unlock();
1096         if (!pcpu_rt)
1097                 return NULL;
1098         ip6_rt_copy_init(pcpu_rt, rt);
1099         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1100         pcpu_rt->rt6i_flags |= RTF_PCPU;
1101         return pcpu_rt;
1102 }
1103
/* Return this cpu's cached copy of @rt, taking a reference on it.
 * Returns NULL when the slot is empty or the reference could not be
 * taken (ip6_hold_safe() clears the pointer in that case).
 * It should be called with rcu_read_lock() acquired
 */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* only hand the copy back if we managed to take a reference */
	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}
1117
/* Create and install the per-cpu cached copy of @rt.
 * On allocation failure the (held) null entry is returned instead.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* extra hold for the reference kept in the per-cpu slot */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	/* NOTE(review): assumes the slot is still empty at install time;
	 * the BUG_ON enforces that invariant.
	 */
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1138
/* exception hash table implementation
 * rt6_exception_lock serializes all modifications of every route's
 * rt6i_exception_bucket; lookups on the read side use RCU instead.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1142
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	/* detach the cached route from its fib6 node before unlinking */
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the reference taken when the exception was inserted */
	rt6_release(rt6_ex->rt6i);
	/* container freed after a grace period; readers may still see it */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1163
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	/* pick the entry with the earliest stamp */
	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	/* oldest may be NULL for an empty chain; the callee tolerates it */
	rt6_remove_exception(bucket, oldest);
}
1180
/* Hash (dst[, src]) into one of the FIB6_EXCEPTION_BUCKET_SIZE buckets.
 * A per-boot random seed makes bucket placement unpredictable.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	/* fold the source address in only for subtree (src-keyed) routes */
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1196
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* advance the caller's pointer to the selected bucket */
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1229
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * RCU-side counterpart of __rt6_find_exception_spinlock()
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* advance the caller's pointer to the selected bucket */
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1264
/* Insert cached route @nrt as an exception hanging off @ort.
 *
 * Allocates the per-route bucket array on first use, replaces any
 * existing exception with the same (dst[, src]) key, and evicts the
 * oldest entry when a bucket grows past FIB6_MAX_DEPTH.  On success
 * the fib6 node serial number is bumped to invalidate cached dsts.
 *
 * Returns 0 on success or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	/* a concurrent rt6_flush_exceptions() wins; do not recreate */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	/* the hash table keeps its own reference on nrt */
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		fib6_update_sernum(ort);
		fib6_force_start_gc(net);
	}

	return err;
}
1354
/* Remove every exception route hanging off @rt and mark the bucket
 * list as flushed so rt6_insert_exception() cannot recreate it.
 */
void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* every chain must be empty once its entries are removed */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1381
/* Find cached rt in the hash table inside passed in rt
 * Returns NULL when there is no entry or the entry has expired.
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1413
/* Remove the passed in cached rt from the hash table that contains it
 * Returns 0 on success, -EINVAL if @rt is not a cache route, or
 * -ENOENT when no matching exception is found.
 */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	/* only RTF_CACHE routes with a parent can live in a hash table */
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1456
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp (so GC treats it as recently used)
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	/* nothing to refresh unless @rt is a cache route with a parent */
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
1492
/* Clear the preferred source address of every exception route of @rt.
 * Caller must hold rt6_exception_lock (enforced via the
 * rcu_dereference_protected() below).
 */
static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}
1511
/* Lower the cached pmtu of @rt's exception routes to at most @mtu.
 * Caller must hold rt6_exception_lock (enforced via the
 * rcu_dereference_protected() below).
 */
static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;
				/* For RTF_CACHE with rt6i_pmtu == 0
				 * (i.e. a redirected route),
				 * the metrics of its rt->dst.from has already
				 * been updated.
				 */
				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
					entry->rt6i_pmtu = mtu;
			}
			bucket++;
		}
	}
}
1537
/* a cached route that resolves through a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove every cached gateway exception of @rt whose gateway address
 * equals @gateway.
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the write lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1574
/* Decide whether a single exception entry should be garbage-collected
 * and remove it if so; otherwise bump gc_args->more so GC keeps running.
 * Called with rt6_exception_lock held (see rt6_age_exceptions()).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES) &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		/* drop gateway exceptions whose neighbour is not a router */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (__rt6_check_expired(rt)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}
	/* entry survives this pass */
	gc_args->more++;
}
1615
/* Garbage-collect aged-out exception routes hanging off @rt.
 * Walks every bucket under rt6_exception_lock and lets
 * rt6_age_examine_exception() decide each entry's fate.
 */
void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the write lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}
1644
/* Core policy-routing lookup shared by the input and output paths.
 *
 * Walks @table for the flow in @fl6 and returns exactly one of:
 *  - the null entry (no route found),
 *  - a cached exception route (RTF_CACHE),
 *  - a freshly created uncached clone (FLOWI_FLAG_KNOWN_NH case), or
 *  - a per-cpu copy of the matched fib entry.
 * A reference is taken on the returned route in every case; the caller
 * must release it.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* With forwarding globally off, restrict selection to routers
	 * that look reachable; relaxed again below if nothing matches.
	 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	/* Remember the deepest node so the REACHABLE-relaxed retry can
	 * restart there instead of redoing the tree walk.
	 */
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		/* Nothing at this node: backtrack toward the root; as a
		 * last resort drop the REACHABLE restriction and retry.
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/*Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		/* take a reference on the null entry before handing it out */
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			/* NOTE(review): on failure ip6_hold_safe() appears to
			 * leave a safely-held fallback in rt — confirm.
			 */
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (!pcpu_rt) {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}
		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1766
1767 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1768                                             struct flowi6 *fl6, int flags)
1769 {
1770         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1771 }
1772
1773 struct dst_entry *ip6_route_input_lookup(struct net *net,
1774                                          struct net_device *dev,
1775                                          struct flowi6 *fl6, int flags)
1776 {
1777         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1778                 flags |= RT6_LOOKUP_F_IFACE;
1779
1780         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1781 }
1782 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1783
1784 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1785                                   struct flow_keys *keys)
1786 {
1787         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1788         const struct ipv6hdr *key_iph = outer_iph;
1789         const struct ipv6hdr *inner_iph;
1790         const struct icmp6hdr *icmph;
1791         struct ipv6hdr _inner_iph;
1792
1793         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1794                 goto out;
1795
1796         icmph = icmp6_hdr(skb);
1797         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1798             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1799             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1800             icmph->icmp6_type != ICMPV6_PARAMPROB)
1801                 goto out;
1802
1803         inner_iph = skb_header_pointer(skb,
1804                                        skb_transport_offset(skb) + sizeof(*icmph),
1805                                        sizeof(_inner_iph), &_inner_iph);
1806         if (!inner_iph)
1807                 goto out;
1808
1809         key_iph = inner_iph;
1810 out:
1811         memset(keys, 0, sizeof(*keys));
1812         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1813         keys->addrs.v6addrs.src = key_iph->saddr;
1814         keys->addrs.v6addrs.dst = key_iph->daddr;
1815         keys->tags.flow_label = ip6_flowinfo(key_iph);
1816         keys->basic.ip_proto = key_iph->nexthdr;
1817 }
1818
1819 /* if skb is set it will be used and fl6 can be NULL */
1820 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1821 {
1822         struct flow_keys hash_keys;
1823
1824         if (skb) {
1825                 ip6_multipath_l3_keys(skb, &hash_keys);
1826                 return flow_hash_from_keys(&hash_keys);
1827         }
1828
1829         return get_hash_from_flowi6(fl6);
1830 }
1831
1832 void ip6_route_input(struct sk_buff *skb)
1833 {
1834         const struct ipv6hdr *iph = ipv6_hdr(skb);
1835         struct net *net = dev_net(skb->dev);
1836         int flags = RT6_LOOKUP_F_HAS_SADDR;
1837         struct ip_tunnel_info *tun_info;
1838         struct flowi6 fl6 = {
1839                 .flowi6_iif = skb->dev->ifindex,
1840                 .daddr = iph->daddr,
1841                 .saddr = iph->saddr,
1842                 .flowlabel = ip6_flowinfo(iph),
1843                 .flowi6_mark = skb->mark,
1844                 .flowi6_proto = iph->nexthdr,
1845         };
1846
1847         tun_info = skb_tunnel_info(skb);
1848         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1849                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1850         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1851                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1852         skb_dst_drop(skb);
1853         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1854 }
1855
1856 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1857                                              struct flowi6 *fl6, int flags)
1858 {
1859         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1860 }
1861
1862 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1863                                          struct flowi6 *fl6, int flags)
1864 {
1865         bool any_src;
1866
1867         if (rt6_need_strict(&fl6->daddr)) {
1868                 struct dst_entry *dst;
1869
1870                 dst = l3mdev_link_scope_lookup(net, fl6);
1871                 if (dst)
1872                         return dst;
1873         }
1874
1875         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1876
1877         any_src = ipv6_addr_any(&fl6->saddr);
1878         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1879             (fl6->flowi6_oif && any_src))
1880                 flags |= RT6_LOOKUP_F_IFACE;
1881
1882         if (!any_src)
1883                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1884         else if (sk)
1885                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1886
1887         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1888 }
1889 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1890
/* Convert @dst_orig into a standalone blackhole dst that silently drops
 * all input and output.  Metrics, gateway, flags and prefix information
 * are copied from the original so it still identifies the same route.
 * The reference on @dst_orig is always consumed; returns the new dst or
 * ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* discard traffic in both directions */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* this copy is not a per-cpu clone, so strip RTF_PCPU */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1924
1925 /*
1926  *      Destination cache support functions
1927  */
1928
1929 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1930 {
1931         if (rt->from &&
1932             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
1933                 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
1934 }
1935
1936 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1937 {
1938         u32 rt_cookie = 0;
1939
1940         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1941                 return NULL;
1942
1943         if (rt6_check_expired(rt))
1944                 return NULL;
1945
1946         return &rt->dst;
1947 }
1948
1949 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1950 {
1951         if (!__rt6_check_expired(rt) &&
1952             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1953             rt6_check(rt->from, cookie))
1954                 return &rt->dst;
1955         else
1956                 return NULL;
1957 }
1958
1959 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1960 {
1961         struct rt6_info *rt;
1962
1963         rt = (struct rt6_info *) dst;
1964
1965         /* All IPV6 dsts are created with ->obsolete set to the value
1966          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1967          * into this function always.
1968          */
1969
1970         rt6_dst_from_metrics_check(rt);
1971
1972         if (rt->rt6i_flags & RTF_PCPU ||
1973             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
1974                 return rt6_dst_from_check(rt, cookie);
1975         else
1976                 return rt6_check(rt, cookie);
1977 }
1978
1979 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1980 {
1981         struct rt6_info *rt = (struct rt6_info *) dst;
1982
1983         if (rt) {
1984                 if (rt->rt6i_flags & RTF_CACHE) {
1985                         if (rt6_check_expired(rt)) {
1986                                 ip6_del_rt(rt);
1987                                 dst = NULL;
1988                         }
1989                 } else {
1990                         dst_release(dst);
1991                         dst = NULL;
1992                 }
1993         }
1994         return dst;
1995 }
1996
/* dst_ops->link_failure callback: report the destination unreachable to
 * the sender and drop or invalidate the route that failed.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* delete the cached clone, but only if we can still
			 * take a reference on it
			 */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			/* For default routes, poison the node's serial number
			 * so cached dsts revalidate; presumably -1 never
			 * matches a live cookie — TODO confirm vs rt6_check().
			 */
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
2019
2020 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2021 {
2022         struct net *net = dev_net(rt->dst.dev);
2023
2024         rt->rt6i_flags |= RTF_MODIFIED;
2025         rt->rt6i_pmtu = mtu;
2026         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2027 }
2028
2029 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2030 {
2031         return !(rt->rt6i_flags & RTF_CACHE) &&
2032                 (rt->rt6i_flags & RTF_PCPU ||
2033                  rcu_access_pointer(rt->rt6i_node));
2034 }
2035
/* Core PMTU update for the flow using @dst: record @mtu (clamped up to
 * IPV6_MIN_MTU).  Cache-capable routes are modified in place; otherwise
 * a RTF_CACHE exception clone carrying the new pmtu is inserted.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	/* respect administratively locked MTU metrics */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Addresses come from the packet header when available, else from
	 * the (connected) socket; with neither, no exception can be keyed.
	 */
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	/* PMTU discovery only ever shrinks the path MTU */
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* release the clone if insertion did not take it */
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
2079
2080 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2081                                struct sk_buff *skb, u32 mtu)
2082 {
2083         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2084 }
2085
2086 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2087                      int oif, u32 mark, kuid_t uid)
2088 {
2089         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2090         struct dst_entry *dst;
2091         struct flowi6 fl6;
2092
2093         memset(&fl6, 0, sizeof(fl6));
2094         fl6.flowi6_oif = oif;
2095         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2096         fl6.daddr = iph->daddr;
2097         fl6.saddr = iph->saddr;
2098         fl6.flowlabel = ip6_flowinfo(iph);
2099         fl6.flowi6_uid = uid;
2100
2101         dst = ip6_route_output(net, NULL, &fl6);
2102         if (!dst->error)
2103                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2104         dst_release(dst);
2105 }
2106 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2107
/* Apply a PMTU update for @sk's flow and, if that invalidated the
 * socket's cached dst, re-route the socket.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* Done if there is no cached dst, it is not obsolete, or it still
	 * validates against the socket's stored cookie.
	 */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	/* only refresh when the socket is not busy in user context and the
	 * peer is a real IPv6 address (not v4-mapped)
	 */
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2126
/* Handle redirects */
/* Carries the redirect's advertised gateway alongside the flow keys.
 * fl6 must remain the first member: the struct is passed through
 * fib6_rule_lookup() as a plain flowi6 and cast back by
 * __ip6_route_redirect().
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2132
/* Lookup used while processing an ICMPv6 redirect: find the route
 * currently used toward fl6->daddr and verify the redirect came from
 * that route's gateway.  Returns the matched route (or the null entry)
 * with a reference held.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	/* fl6 is the embedded first member of a struct ip6rd_flowi, which
	 * also carries the advertising router's address.
	 */
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	/* the iterator macro assigns each entry to the local 'rt' above */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				rt = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		/* nothing acceptable at this node: walk back up the tree */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	ip6_hold_safe(net, &rt, true);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return rt;
};
2205
2206 static struct dst_entry *ip6_route_redirect(struct net *net,
2207                                         const struct flowi6 *fl6,
2208                                         const struct in6_addr *gateway)
2209 {
2210         int flags = RT6_LOOKUP_F_HAS_SADDR;
2211         struct ip6rd_flowi rdfl;
2212
2213         rdfl.fl6 = *fl6;
2214         rdfl.gateway = *gateway;
2215
2216         return fib6_rule_lookup(net, &rdfl.fl6,
2217                                 flags, __ip6_route_redirect);
2218 }
2219
2220 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2221                   kuid_t uid)
2222 {
2223         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2224         struct dst_entry *dst;
2225         struct flowi6 fl6;
2226
2227         memset(&fl6, 0, sizeof(fl6));
2228         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2229         fl6.flowi6_oif = oif;
2230         fl6.flowi6_mark = mark;
2231         fl6.daddr = iph->daddr;
2232         fl6.saddr = iph->saddr;
2233         fl6.flowlabel = ip6_flowinfo(iph);
2234         fl6.flowi6_uid = uid;
2235
2236         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2237         rt6_do_redirect(dst, NULL, skb);
2238         dst_release(dst);
2239 }
2240 EXPORT_SYMBOL_GPL(ip6_redirect);
2241
/* Process an ICMPv6 redirect whose redirected-header option is not
 * usable: the target flow is reconstructed from the rd_msg and the
 * outer IPv6 header instead.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	/* NOTE(review): the flow's saddr is taken from the outer header's
	 * *daddr* (the address the redirect was sent to, i.e. ours), not
	 * its saddr — intentional-looking, but confirm.
	 */
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	/* iph->saddr is the router that issued the redirect */
	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2262
2263 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2264 {
2265         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2266                      sk->sk_uid);
2267 }
2268 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2269
2270 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2271 {
2272         struct net_device *dev = dst->dev;
2273         unsigned int mtu = dst_mtu(dst);
2274         struct net *net = dev_net(dev);
2275
2276         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2277
2278         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2279                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2280
2281         /*
2282          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2283          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2284          * IPV6_MAXPLEN is also valid and means: "any MSS,
2285          * rely only on pmtu discovery"
2286          */
2287         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2288                 mtu = IPV6_MAXPLEN;
2289         return mtu;
2290 }
2291
2292 static unsigned int ip6_mtu(const struct dst_entry *dst)
2293 {
2294         const struct rt6_info *rt = (const struct rt6_info *)dst;
2295         unsigned int mtu = rt->rt6i_pmtu;
2296         struct inet6_dev *idev;
2297
2298         if (mtu)
2299                 goto out;
2300
2301         mtu = dst_metric_raw(dst, RTAX_MTU);
2302         if (mtu)
2303                 goto out;
2304
2305         mtu = IPV6_MIN_MTU;
2306
2307         rcu_read_lock();
2308         idev = __in6_dev_get(dst->dev);
2309         if (idev)
2310                 mtu = idev->cnf.mtu6;
2311         rcu_read_unlock();
2312
2313 out:
2314         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2315
2316         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2317 }
2318
/* Allocate a transient route for sending an ICMPv6 packet out of @dev
 * toward fl6->daddr.  The dst is never inserted in the fib tree; it is
 * linked on the uncached list so device teardown can still find it.
 * Returns the (possibly xfrm-wrapped) dst, or an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	/* host route to the ICMP destination */
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2357
/* dst_ops->gc callback for the IPv6 routing cache.  Returns nonzero
 * when the entry count still exceeds ip6_rt_max_size after the scan,
 * i.e. allocation pressure remains.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* skip the scan if we ran recently and are under the size limit */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* ip6_rt_gc_expire grows on every pass (more aggressive GC under
	 * sustained pressure) and is halved once the table shrinks below
	 * the threshold; it decays by 1/2^elasticity each call.
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2382
2383 static int ip6_convert_metrics(struct mx6_config *mxc,
2384                                const struct fib6_config *cfg)
2385 {
2386         struct net *net = cfg->fc_nlinfo.nl_net;
2387         bool ecn_ca = false;
2388         struct nlattr *nla;
2389         int remaining;
2390         u32 *mp;
2391
2392         if (!cfg->fc_mx)
2393                 return 0;
2394
2395         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2396         if (unlikely(!mp))
2397                 return -ENOMEM;
2398
2399         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2400                 int type = nla_type(nla);
2401                 u32 val;
2402
2403                 if (!type)
2404                         continue;
2405                 if (unlikely(type > RTAX_MAX))
2406                         goto err;
2407
2408                 if (type == RTAX_CC_ALGO) {
2409                         char tmp[TCP_CA_NAME_MAX];
2410
2411                         nla_strlcpy(tmp, nla, sizeof(tmp));
2412                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2413                         if (val == TCP_CA_UNSPEC)
2414                                 goto err;
2415                 } else {
2416                         val = nla_get_u32(nla);
2417                 }
2418                 if (type == RTAX_HOPLIMIT && val > 255)
2419                         val = 255;
2420                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2421                         goto err;
2422
2423                 mp[type - 1] = val;
2424                 __set_bit(type - 1, mxc->mx_valid);
2425         }
2426
2427         if (ecn_ca) {
2428                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2429                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2430         }
2431
2432         mxc->mx = mp;
2433         return 0;
2434  err:
2435         kfree(mp);
2436         return -EINVAL;
2437 }
2438
2439 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2440                                             struct fib6_config *cfg,
2441                                             const struct in6_addr *gw_addr)
2442 {
2443         struct flowi6 fl6 = {
2444                 .flowi6_oif = cfg->fc_ifindex,
2445                 .daddr = *gw_addr,
2446                 .saddr = cfg->fc_prefsrc,
2447         };
2448         struct fib6_table *table;
2449         struct rt6_info *rt;
2450         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2451
2452         table = fib6_get_table(net, cfg->fc_table);
2453         if (!table)
2454                 return NULL;
2455
2456         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2457                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2458
2459         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2460
2461         /* if table lookup failed, fall back to full lookup */
2462         if (rt == net->ipv6.ip6_null_entry) {
2463                 ip6_rt_put(rt);
2464                 rt = NULL;
2465         }
2466
2467         return rt;
2468 }
2469
2470 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2471                                               struct netlink_ext_ack *extack)
2472 {
2473         struct net *net = cfg->fc_nlinfo.nl_net;
2474         struct rt6_info *rt = NULL;
2475         struct net_device *dev = NULL;
2476         struct inet6_dev *idev = NULL;
2477         struct fib6_table *table;
2478         int addr_type;
2479         int err = -EINVAL;
2480
2481         /* RTF_PCPU is an internal flag; can not be set by userspace */
2482         if (cfg->fc_flags & RTF_PCPU) {
2483                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2484                 goto out;
2485         }
2486
2487         /* RTF_CACHE is an internal flag; can not be set by userspace */
2488         if (cfg->fc_flags & RTF_CACHE) {
2489                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2490                 goto out;
2491         }
2492
2493         if (cfg->fc_dst_len > 128) {
2494                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2495                 goto out;
2496         }
2497         if (cfg->fc_src_len > 128) {
2498                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2499                 goto out;
2500         }
2501 #ifndef CONFIG_IPV6_SUBTREES
2502         if (cfg->fc_src_len) {
2503                 NL_SET_ERR_MSG(extack,
2504                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2505                 goto out;
2506         }
2507 #endif
2508         if (cfg->fc_ifindex) {
2509                 err = -ENODEV;
2510                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2511                 if (!dev)
2512                         goto out;
2513                 idev = in6_dev_get(dev);
2514                 if (!idev)
2515                         goto out;
2516         }
2517
2518         if (cfg->fc_metric == 0)
2519                 cfg->fc_metric = IP6_RT_PRIO_USER;
2520
2521         err = -ENOBUFS;
2522         if (cfg->fc_nlinfo.nlh &&
2523             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2524                 table = fib6_get_table(net, cfg->fc_table);
2525                 if (!table) {
2526                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2527                         table = fib6_new_table(net, cfg->fc_table);
2528                 }
2529         } else {
2530                 table = fib6_new_table(net, cfg->fc_table);
2531         }
2532
2533         if (!table)
2534                 goto out;
2535
2536         rt = ip6_dst_alloc(net, NULL,
2537                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2538
2539         if (!rt) {
2540                 err = -ENOMEM;
2541                 goto out;
2542         }
2543
2544         if (cfg->fc_flags & RTF_EXPIRES)
2545                 rt6_set_expires(rt, jiffies +
2546                                 clock_t_to_jiffies(cfg->fc_expires));
2547         else
2548                 rt6_clean_expires(rt);
2549
2550         if (cfg->fc_protocol == RTPROT_UNSPEC)
2551                 cfg->fc_protocol = RTPROT_BOOT;
2552         rt->rt6i_protocol = cfg->fc_protocol;
2553
2554         addr_type = ipv6_addr_type(&cfg->fc_dst);
2555
2556         if (addr_type & IPV6_ADDR_MULTICAST)
2557                 rt->dst.input = ip6_mc_input;
2558         else if (cfg->fc_flags & RTF_LOCAL)
2559                 rt->dst.input = ip6_input;
2560         else
2561                 rt->dst.input = ip6_forward;
2562
2563         rt->dst.output = ip6_output;
2564
2565         if (cfg->fc_encap) {
2566                 struct lwtunnel_state *lwtstate;
2567
2568                 err = lwtunnel_build_state(cfg->fc_encap_type,
2569                                            cfg->fc_encap, AF_INET6, cfg,
2570                                            &lwtstate, extack);
2571                 if (err)
2572                         goto out;
2573                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2574                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2575                         rt->dst.lwtstate->orig_output = rt->dst.output;
2576                         rt->dst.output = lwtunnel_output;
2577                 }
2578                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2579                         rt->dst.lwtstate->orig_input = rt->dst.input;
2580                         rt->dst.input = lwtunnel_input;
2581                 }
2582         }
2583
2584         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2585         rt->rt6i_dst.plen = cfg->fc_dst_len;
2586         if (rt->rt6i_dst.plen == 128)
2587                 rt->dst.flags |= DST_HOST;
2588
2589 #ifdef CONFIG_IPV6_SUBTREES
2590         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2591         rt->rt6i_src.plen = cfg->fc_src_len;
2592 #endif
2593
2594         rt->rt6i_metric = cfg->fc_metric;
2595
2596         /* We cannot add true routes via loopback here,
2597            they would result in kernel looping; promote them to reject routes
2598          */
2599         if ((cfg->fc_flags & RTF_REJECT) ||
2600             (dev && (dev->flags & IFF_LOOPBACK) &&
2601              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2602              !(cfg->fc_flags & RTF_LOCAL))) {
2603                 /* hold loopback dev/idev if we haven't done so. */
2604                 if (dev != net->loopback_dev) {
2605                         if (dev) {
2606                                 dev_put(dev);
2607                                 in6_dev_put(idev);
2608                         }
2609                         dev = net->loopback_dev;
2610                         dev_hold(dev);
2611                         idev = in6_dev_get(dev);
2612                         if (!idev) {
2613                                 err = -ENODEV;
2614                                 goto out;
2615                         }
2616                 }
2617                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2618                 switch (cfg->fc_type) {
2619                 case RTN_BLACKHOLE:
2620                         rt->dst.error = -EINVAL;
2621                         rt->dst.output = dst_discard_out;
2622                         rt->dst.input = dst_discard;
2623                         break;
2624                 case RTN_PROHIBIT:
2625                         rt->dst.error = -EACCES;
2626                         rt->dst.output = ip6_pkt_prohibit_out;
2627                         rt->dst.input = ip6_pkt_prohibit;
2628                         break;
2629                 case RTN_THROW:
2630                 case RTN_UNREACHABLE:
2631                 default:
2632                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2633                                         : (cfg->fc_type == RTN_UNREACHABLE)
2634                                         ? -EHOSTUNREACH : -ENETUNREACH;
2635                         rt->dst.output = ip6_pkt_discard_out;
2636                         rt->dst.input = ip6_pkt_discard;
2637                         break;
2638                 }
2639                 goto install_route;
2640         }
2641
2642         if (cfg->fc_flags & RTF_GATEWAY) {
2643                 const struct in6_addr *gw_addr;
2644                 int gwa_type;
2645
2646                 gw_addr = &cfg->fc_gateway;
2647                 gwa_type = ipv6_addr_type(gw_addr);
2648
2649                 /* if gw_addr is local we will fail to detect this in case
2650                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
2651                  * will return already-added prefix route via interface that
2652                  * prefix route was assigned to, which might be non-loopback.
2653                  */
2654                 err = -EINVAL;
2655                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2656                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2657                                             dev : NULL, 0, 0)) {
2658                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2659                         goto out;
2660                 }
2661                 rt->rt6i_gateway = *gw_addr;
2662
2663                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2664                         struct rt6_info *grt = NULL;
2665
2666                         /* IPv6 strictly inhibits using not link-local
2667                            addresses as nexthop address.
2668                            Otherwise, router will not able to send redirects.
2669                            It is very good, but in some (rare!) circumstances
2670                            (SIT, PtP, NBMA NOARP links) it is handy to allow
2671                            some exceptions. --ANK
2672                            We allow IPv4-mapped nexthops to support RFC4798-type
2673                            addressing
2674                          */
2675                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2676                                           IPV6_ADDR_MAPPED))) {
2677                                 NL_SET_ERR_MSG(extack,
2678                                                "Invalid gateway address");
2679                                 goto out;
2680                         }
2681
2682                         if (cfg->fc_table) {
2683                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2684
2685                                 if (grt) {
2686                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2687                                             (dev && dev != grt->dst.dev)) {
2688                                                 ip6_rt_put(grt);
2689                                                 grt = NULL;
2690                                         }
2691                                 }
2692                         }
2693
2694                         if (!grt)
2695                                 grt = rt6_lookup(net, gw_addr, NULL,
2696                                                  cfg->fc_ifindex, 1);
2697
2698                         err = -EHOSTUNREACH;
2699                         if (!grt)
2700                                 goto out;
2701                         if (dev) {
2702                                 if (dev != grt->dst.dev) {
2703                                         ip6_rt_put(grt);
2704                                         goto out;
2705                                 }
2706                         } else {
2707                                 dev = grt->dst.dev;
2708                                 idev = grt->rt6i_idev;
2709                                 dev_hold(dev);
2710                                 in6_dev_hold(grt->rt6i_idev);
2711                         }
2712                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2713                                 err = 0;
2714                         ip6_rt_put(grt);
2715
2716                         if (err)
2717                                 goto out;
2718                 }
2719                 err = -EINVAL;
2720                 if (!dev) {
2721                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2722                         goto out;
2723                 } else if (dev->flags & IFF_LOOPBACK) {
2724                         NL_SET_ERR_MSG(extack,
2725                                        "Egress device can not be loopback device for this route");
2726                         goto out;
2727                 }
2728         }
2729
2730         err = -ENODEV;
2731         if (!dev)
2732                 goto out;
2733
2734         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2735                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2736                         NL_SET_ERR_MSG(extack, "Invalid source address");
2737                         err = -EINVAL;
2738                         goto out;
2739                 }
2740                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2741                 rt->rt6i_prefsrc.plen = 128;
2742         } else
2743                 rt->rt6i_prefsrc.plen = 0;
2744
2745         rt->rt6i_flags = cfg->fc_flags;
2746
2747 install_route:
2748         if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2749             !netif_carrier_ok(dev))
2750                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2751         rt->dst.dev = dev;
2752         rt->rt6i_idev = idev;
2753         rt->rt6i_table = table;
2754
2755         cfg->fc_nlinfo.nl_net = dev_net(dev);
2756
2757         return rt;
2758 out:
2759         if (dev)
2760                 dev_put(dev);
2761         if (idev)
2762                 in6_dev_put(idev);
2763         if (rt)
2764                 dst_release_immediate(&rt->dst);
2765
2766         return ERR_PTR(err);
2767 }
2768
/* Create a route described by @cfg and insert it into the FIB.
 *
 * Returns 0 on success (ownership of the new route passes to the FIB
 * tree) or a negative errno; on failure any half-constructed route is
 * released here.
 */
int ip6_route_add(struct fib6_config *cfg,
		  struct netlink_ext_ack *extack)
{
	struct mx6_config mxc = { .mx = NULL, };
	struct rt6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, extack);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* clear rt so the error path below does not release
		 * the ERR_PTR value as if it were a real route
		 */
		rt = NULL;
		goto out;
	}

	/* convert netlink metric attributes into the kernel form */
	err = ip6_convert_metrics(&mxc, cfg);
	if (err)
		goto out;

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);

	/* the insert path copied what it needed; free our temporary */
	kfree(mxc.mx);

	return err;
out:
	if (rt)
		dst_release_immediate(&rt->dst);

	return err;
}
2798
/* Remove @rt from its FIB table under the table spinlock.
 *
 * Consumes the caller's reference on @rt: ip6_rt_put() is called on
 * both the success and the error path.  Deleting the per-namespace
 * null entry is refused with -ENOENT.
 */
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;
	struct net *net = dev_net(rt->dst.dev);

	if (rt == net->ipv6.ip6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	/* drop the reference the caller passed in */
	ip6_rt_put(rt);
	return err;
}
2819
2820 int ip6_del_rt(struct rt6_info *rt)
2821 {
2822         struct nl_info info = {
2823                 .nl_net = dev_net(rt->dst.dev),
2824         };
2825         return __ip6_del_rt(rt, &info);
2826 }
2827
/* Delete @rt and, when requested via @cfg->fc_delete_all_nh, all of
 * its ECMP siblings from the FIB table.
 *
 * Consumes the caller's reference on @rt.  When siblings are deleted
 * a single combined RTM_DELROUTE notification is preferred: the skb
 * is built before deletion and sent after the table lock is dropped,
 * with per-route notifications suppressed via info->skip_notify.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				/* fall back to per-route notifications */
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* delete every sibling before the route itself */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2879
/* Delete the route matching @cfg from its FIB table.
 *
 * Walks the fib6 node located by destination/source prefix under RCU
 * and filters candidates by ifindex, gateway, metric and protocol
 * when those are set in @cfg.  With RTF_CACHE the match is looked up
 * in the route's exception (cache) table instead.
 *
 * Returns 0 on success or a negative errno (-ESRCH when no route
 * matched).
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				/* deleting a cached clone, not the
				 * FIB entry itself
				 */
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* route already being freed: give up */
			if (!dst_hold_safe(&rt->dst))
				break;
			/* we hold a reference now; safe to leave RCU */
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
2936
/* Process an ICMPv6 Redirect received for the path described by @dst.
 *
 * Validates the redirect message (length, non-multicast destination,
 * link-local unicast target unless it is an on-link redirect, ND
 * options), updates the neighbour cache with the target's link-layer
 * address, and installs a cached route (exception entry) steering
 * msg->dest via the new nexthop.  Invalid or unacceptable redirects
 * are silently dropped (with a ratelimited debug message).
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* length of the ND options following the fixed redirect header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means the destination itself is on-link */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers and interfaces configured to ignore redirects drop it */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* clone the current route as a cached entry for msg->dest */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	/* tell interested subsystems (e.g. offload drivers) about it */
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3054
3055 /*
3056  *      Misc support functions
3057  */
3058
/* Link cached route @rt to its parent @from and share the parent's
 * metrics.  @from must not itself be derived from another route.
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->from);

	/* a cached clone's lifetime follows its parent, not RTF_EXPIRES */
	rt->rt6i_flags &= ~RTF_EXPIRES;
	/* take a reference on the parent for the rt->from pointer */
	dst_hold(&from->dst);
	rt->from = from;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
3068
/* Initialize @rt as a copy of @ort: duplicate its handlers, addresses,
 * flags and table linkage, taking references on the shared idev and
 * lwtunnel state, and link @rt back to @ort via rt6_set_from().
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	/* must come after the flags copy: rt6_set_from() clears
	 * RTF_EXPIRES in rt->rt6i_flags
	 */
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3090
3091 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA-learned (RTF_ROUTEINFO) gateway route for
 * @prefix/@prefixlen via @gwaddr on @dev.
 *
 * Returns the matching route with a reference held, or NULL if no
 * such route (or no table) exists.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* prefer the l3mdev (VRF) table when the device is enslaved */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* take a reference before leaving the RCU section */
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3126
3127 static struct rt6_info *rt6_add_route_info(struct net *net,
3128                                            const struct in6_addr *prefix, int prefixlen,
3129                                            const struct in6_addr *gwaddr,
3130                                            struct net_device *dev,
3131                                            unsigned int pref)
3132 {
3133         struct fib6_config cfg = {
3134                 .fc_metric      = IP6_RT_PRIO_USER,
3135                 .fc_ifindex     = dev->ifindex,
3136                 .fc_dst_len     = prefixlen,
3137                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3138                                   RTF_UP | RTF_PREF(pref),
3139                 .fc_protocol = RTPROT_RA,
3140                 .fc_nlinfo.portid = 0,
3141                 .fc_nlinfo.nlh = NULL,
3142                 .fc_nlinfo.nl_net = net,
3143         };
3144
3145         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3146         cfg.fc_dst = *prefix;
3147         cfg.fc_gateway = *gwaddr;
3148
3149         /* We should treat it as a default route if prefix length is 0. */
3150         if (!prefixlen)
3151                 cfg.fc_flags |= RTF_DEFAULT;
3152
3153         ip6_route_add(&cfg, NULL);
3154
3155         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3156 }
3157 #endif
3158
/* Look up an RA-learned (RTF_ADDRCONF) default route via gateway
 * @addr on @dev.
 *
 * Returns the route with a reference held, or NULL if no such route
 * (or no table) exists.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	/* prefer the l3mdev (VRF) table when the device is enslaved */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	/* rt is NULL when the walk ran off the end without a match */
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}
3181
3182 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3183                                      struct net_device *dev,
3184                                      unsigned int pref)
3185 {
3186         struct fib6_config cfg = {
3187                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3188                 .fc_metric      = IP6_RT_PRIO_USER,
3189                 .fc_ifindex     = dev->ifindex,
3190                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3191                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3192                 .fc_protocol = RTPROT_RA,
3193                 .fc_nlinfo.portid = 0,
3194                 .fc_nlinfo.nlh = NULL,
3195                 .fc_nlinfo.nl_net = dev_net(dev),
3196         };
3197
3198         cfg.fc_gateway = *gwaddr;
3199
3200         if (!ip6_route_add(&cfg, NULL)) {
3201                 struct fib6_table *table;
3202
3203                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3204                 if (table)
3205                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3206         }
3207
3208         return rt6_get_dflt_router(gwaddr, dev);
3209 }
3210
/* Remove all RA-learned (RTF_DEFAULT/RTF_ADDRCONF) routes from
 * @table, except on interfaces with accept_ra == 2 which keep their
 * RA routes even while forwarding.
 *
 * Deleting requires dropping the RCU read lock, so the walk restarts
 * from the table root after every removal.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* take a reference; skip routes already dying */
			if (dst_hold_safe(&rt->dst)) {
				rcu_read_unlock();
				ip6_del_rt(rt);
			} else {
				rcu_read_unlock();
			}
			/* the tree may have changed: start over */
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3233
3234 void rt6_purge_dflt_routers(struct net *net)
3235 {
3236         struct fib6_table *table;
3237         struct hlist_head *head;
3238         unsigned int h;
3239
3240         rcu_read_lock();
3241
3242         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3243                 head = &net->ipv6.fib_table_hash[h];
3244                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3245                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3246                                 __rt6_purge_dflt_routers(table);
3247                 }
3248         }
3249
3250         rcu_read_unlock();
3251 }
3252
3253 static void rtmsg_to_fib6_config(struct net *net,
3254                                  struct in6_rtmsg *rtmsg,
3255                                  struct fib6_config *cfg)
3256 {
3257         memset(cfg, 0, sizeof(*cfg));
3258
3259         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3260                          : RT6_TABLE_MAIN;
3261         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3262         cfg->fc_metric = rtmsg->rtmsg_metric;
3263         cfg->fc_expires = rtmsg->rtmsg_info;
3264         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3265         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3266         cfg->fc_flags = rtmsg->rtmsg_flags;
3267
3268         cfg->fc_nlinfo.nl_net = net;
3269
3270         cfg->fc_dst = rtmsg->rtmsg_dst;
3271         cfg->fc_src = rtmsg->rtmsg_src;
3272         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3273 }
3274
3275 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3276 {
3277         struct fib6_config cfg;
3278         struct in6_rtmsg rtmsg;
3279         int err;
3280
3281         switch (cmd) {
3282         case SIOCADDRT:         /* Add a route */
3283         case SIOCDELRT:         /* Delete a route */
3284                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3285                         return -EPERM;
3286                 err = copy_from_user(&rtmsg, arg,
3287                                      sizeof(struct in6_rtmsg));
3288                 if (err)
3289                         return -EFAULT;
3290
3291                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3292
3293                 rtnl_lock();
3294                 switch (cmd) {
3295                 case SIOCADDRT:
3296                         err = ip6_route_add(&cfg, NULL);
3297                         break;
3298                 case SIOCDELRT:
3299                         err = ip6_route_del(&cfg, NULL);
3300                         break;
3301                 default:
3302                         err = -EINVAL;
3303                 }
3304                 rtnl_unlock();
3305
3306                 return err;
3307         }
3308
3309         return -EINVAL;
3310 }
3311
3312 /*
3313  *      Drop the packet on the floor
3314  */
3315
/* Drop @skb for lack of a route: bump the appropriate SNMP counter
 * (INADDRERRORS for an unspecified destination on input, otherwise
 * @ipstats_mib_noroutes), send an ICMPv6 destination-unreachable with
 * @code, and free the skb.  Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3338
/* dst input handler for reject routes: drop with "no route" on the input path */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3343
/* dst output handler for reject routes: drop with "no route" on the output path */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* icmpv6_send() needs skb->dev set for the reply route lookup */
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3349
/* dst input handler for prohibit routes: drop as administratively prohibited */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3354
/* dst output handler for prohibit routes: drop as administratively prohibited */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* icmpv6_send() needs skb->dev set for the reply route lookup */
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3360
/*
 *	Allocate a dst for a local (unicast / anycast) address.
 *
 *	The route is not yet inserted into any FIB table; the caller owns
 *	the returned reference.  Returns ERR_PTR(-ENOMEM) on allocation
 *	failure.
 */

struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	/* DST_NOCOUNT: address routes are not charged to route accounting */
	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	/* the route holds a reference on the inet6_dev via rt6i_idev */
	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	/* an l3mdev slave (e.g. VRF) uses the device's table rather than
	 * the global local table
	 */
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
3400
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL matches any */
	struct net *net;	/* namespace being cleaned */
	struct in6_addr *addr;	/* the address that was removed */
};
3407
3408 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3409 {
3410         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3411         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3412         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3413
3414         if (((void *)rt->dst.dev == dev || !dev) &&
3415             rt != net->ipv6.ip6_null_entry &&
3416             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3417                 spin_lock_bh(&rt6_exception_lock);
3418                 /* remove prefsrc entry */
3419                 rt->rt6i_prefsrc.plen = 0;
3420                 /* need to update cache as well */
3421                 rt6_exceptions_remove_prefsrc(rt);
3422                 spin_unlock_bh(&rt6_exception_lock);
3423         }
3424         return 0;
3425 }
3426
3427 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3428 {
3429         struct net *net = dev_net(ifp->idev->dev);
3430         struct arg_dev_net_ip adni = {
3431                 .dev = ifp->idev->dev,
3432                 .net = net,
3433                 .addr = &ifp->addr,
3434         };
3435         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3436 }
3437
3438 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3439
3440 /* Remove routers and update dst entries when gateway turn into host. */
3441 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3442 {
3443         struct in6_addr *gateway = (struct in6_addr *)arg;
3444
3445         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3446             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3447                 return -1;
3448         }
3449
3450         /* Further clean up cached routes in exception table.
3451          * This is needed because cached route may have a different
3452          * gateway than its 'parent' in the case of an ip redirect.
3453          */
3454         rt6_exceptions_clean_tohost(rt, gateway);
3455
3456         return 0;
3457 }
3458
/* Drop RA-learned routes via @gateway and clean matching cached
 * exception routes (see fib6_clean_tohost).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3463
/* Argument for the fib6_ifup/fib6_ifdown walkers: the device the event
 * concerns, plus either the nexthop flags to clear (sync-up) or the
 * netdev event code (sync-down) — only one member is used per walk.
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3471
3472 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3473 {
3474         const struct arg_netdev_event *arg = p_arg;
3475         const struct net *net = dev_net(arg->dev);
3476
3477         if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev)
3478                 rt->rt6i_nh_flags &= ~arg->nh_flags;
3479
3480         return 0;
3481 }
3482
/* Clear @nh_flags on every route using @dev, e.g. when the device or
 * its carrier comes back.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		.nh_flags = nh_flags,
	};

	/* device coming up with carrier present: clear LINKDOWN along
	 * with DEAD in the same walk
	 */
	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3495
/* called with write lock held for table with rt */
/* fib6_clean_all callback for device-down events; a -1 return asks the
 * fib walker to delete the route, 0 keeps it.
 */
static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	const struct net *net = dev_net(dev);

	/* only routes on this device are affected; never touch the
	 * shared null entry
	 */
	if (rt->dst.dev != dev || rt == net->ipv6.ip6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device is going away entirely: delete the route */
		return -1;
	case NETDEV_DOWN:
		/* delete, unless this is one leg of a multipath route
		 * and the admin opted to keep linkdown routes; then only
		 * mark the nexthop dead and also flag it linkdown below
		 */
		if (rt->rt6i_nsiblings == 0 ||
		    !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			return -1;
		rt->rt6i_nh_flags |= RTNH_F_DEAD;
		/* fall through */
	case NETDEV_CHANGE:
		/* local/anycast routes do not depend on carrier state */
		if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
		break;
	}

	return 0;
}
3524
/* Walk all FIB tables and update or remove routes on @dev for a
 * NETDEV_DOWN / NETDEV_UNREGISTER / NETDEV_CHANGE @event.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		.event = event,
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
3534
/* Tear down IPv6 routing state for @dev: update the FIB for the event,
 * flush uncached routes referencing the device, and drop its neighbour
 * discovery entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
3541
/* Argument for rt6_mtu_change_route() */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new link MTU */
};
3546
/* fib6_clean_all callback: propagate a device MTU change (described by
 * struct rt6_mtu_change_arg) to each route using that device, and to
 * the route's cached exception entries.
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		spin_lock_bh(&rt6_exception_lock);
		if (dst_mtu(&rt->dst) >= arg->mtu ||
		    (dst_mtu(&rt->dst) < arg->mtu &&
		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
		/* cached exception routes pick up the new MTU too */
		rt6_exceptions_update_pmtu(rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
3590
/* Propagate an administrative MTU change on @dev to all FIB tables */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
3600
3601 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3602         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3603         [RTA_OIF]               = { .type = NLA_U32 },
3604         [RTA_IIF]               = { .type = NLA_U32 },
3605         [RTA_PRIORITY]          = { .type = NLA_U32 },
3606         [RTA_METRICS]           = { .type = NLA_NESTED },
3607         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
3608         [RTA_PREF]              = { .type = NLA_U8 },
3609         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
3610         [RTA_ENCAP]             = { .type = NLA_NESTED },
3611         [RTA_EXPIRES]           = { .type = NLA_U32 },
3612         [RTA_UID]               = { .type = NLA_U32 },
3613         [RTA_MARK]              = { .type = NLA_U32 },
3614 };
3615
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * struct fib6_config.  Returns 0 on success or a negative errno when
 * the message or one of its attributes is malformed.  Pointer fields
 * (fc_mx, fc_mp, fc_encap) point into the nlmsg and are only valid for
 * the duration of the request.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* all reject-style route types install as RTF_REJECT; fc_type
	 * keeps the specific variant
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* the attribute must carry at least the prefix bytes
		 * implied by rtm_dst_len
		 */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the 8-bit rtm_table field */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* router preference; anything but low/high is treated
		 * as medium
		 */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
3739
/* One pending nexthop while building a multipath route */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route for this nexthop; owned until inserted */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request config */
	struct mx6_config mxc;		/* converted metrics for the insert */
	struct list_head next;		/* linkage in the local rt6_nh_list */
};
3746
3747 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3748 {
3749         struct rt6_nh *nh;
3750
3751         list_for_each_entry(nh, rt6_nh_list, next) {
3752                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3753                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3754                         nh->r_cfg.fc_ifindex);
3755         }
3756 }
3757
3758 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3759                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3760 {
3761         struct rt6_nh *nh;
3762         int err = -EEXIST;
3763
3764         list_for_each_entry(nh, rt6_nh_list, next) {
3765                 /* check if rt6_info already exists */
3766                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3767                         return err;
3768         }
3769
3770         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3771         if (!nh)
3772                 return -ENOMEM;
3773         nh->rt6_info = rt;
3774         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3775         if (err) {
3776                 kfree(nh);
3777                 return err;
3778         }
3779         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3780         list_add_tail(&nh->next, rt6_nh_list);
3781
3782         return 0;
3783 }
3784
/* Send the single RTM_NEWROUTE notification for a freshly inserted
 * (possibly multipath) route.  @rt is the first route inserted,
 * @rt_last the last one.
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
3805
/* Insert a multipath route: parse the RTA_MULTIPATH nexthop list, build
 * one rt6_info per nexthop, insert them all, then send one RTM_NEWROUTE
 * notification covering the whole route.  If an insert fails part-way,
 * every nexthop already installed is deleted again before returning.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* each nexthop starts from the request config, then
		 * overrides ifindex/gateway/encap from its rtnexthop
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			/* append failed: the list did not take ownership */
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* free list entries; rt6_info still set means it was never
	 * handed to __ip6_ins_rt and must be released here
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
3936
/* Delete every nexthop listed in the RTA_MULTIPATH attribute.  A
 * failing delete does not stop the loop; the last error seen (or 0)
 * is returned.
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
3974
3975 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3976                               struct netlink_ext_ack *extack)
3977 {
3978         struct fib6_config cfg;
3979         int err;
3980
3981         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3982         if (err < 0)
3983                 return err;
3984
3985         if (cfg.fc_mp)
3986                 return ip6_route_multipath_del(&cfg, extack);
3987         else {
3988                 cfg.fc_delete_all_nh = 1;
3989                 return ip6_route_del(&cfg, extack);
3990         }
3991 }
3992
3993 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3994                               struct netlink_ext_ack *extack)
3995 {
3996         struct fib6_config cfg;
3997         int err;
3998
3999         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4000         if (err < 0)
4001                 return err;
4002
4003         if (cfg.fc_mp)
4004                 return ip6_route_multipath_add(&cfg, extack);
4005         else
4006                 return ip6_route_add(&cfg, extack);
4007 }
4008
/* Upper bound on the netlink message size rt6_fill_node() may need for
 * this route, including one nexthop attribute set per sibling when the
 * route is multipath.
 */
static size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	int nexthop_len = 0;

	if (rt->rt6i_nsiblings) {
		/* per-nexthop payload inside RTA_MULTIPATH */
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->dst.lwtstate);

		nexthop_len *= rt->rt6i_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
	       + nexthop_len;
}
4038
/* Emit the nexthop attributes (RTA_GATEWAY, RTA_OIF, lwtunnel encap)
 * for @rt into @skb and accumulate RTNH_F_* flags into *@flags.  With
 * @skip_oif the interface index is omitted (the multipath encoding
 * carries it in struct rtnexthop instead).  Returns 0, or -EMSGSIZE
 * when the skb ran out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;
		/* with ignore_routes_with_linkdown a linkdown nexthop is
		 * reported dead as well
		 */
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4070
/* add multipath next hop: append one struct rtnexthop (plus its nested
 * attributes) for @rt to the RTA_MULTIPATH payload being built in @skb.
 * Returns 0 or -EMSGSIZE when the skb ran out of room.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = 0;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

	/* skip_oif=true: the ifindex is carried in rtnh itself */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4097
4098 static int rt6_fill_node(struct net *net,
4099                          struct sk_buff *skb, struct rt6_info *rt,
4100                          struct in6_addr *dst, struct in6_addr *src,
4101                          int iif, int type, u32 portid, u32 seq,
4102                          unsigned int flags)
4103 {
4104         u32 metrics[RTAX_MAX];
4105         struct rtmsg *rtm;
4106         struct nlmsghdr *nlh;
4107         long expires;
4108         u32 table;
4109
4110         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4111         if (!nlh)
4112                 return -EMSGSIZE;
4113
4114         rtm = nlmsg_data(nlh);
4115         rtm->rtm_family = AF_INET6;
4116         rtm->rtm_dst_len = rt->rt6i_dst.plen;
4117         rtm->rtm_src_len = rt->rt6i_src.plen;
4118         rtm->rtm_tos = 0;
4119         if (rt->rt6i_table)
4120                 table = rt->rt6i_table->tb6_id;
4121         else
4122                 table = RT6_TABLE_UNSPEC;
4123         rtm->rtm_table = table;
4124         if (nla_put_u32(skb, RTA_TABLE, table))
4125                 goto nla_put_failure;
4126         if (rt->rt6i_flags & RTF_REJECT) {
4127                 switch (rt->dst.error) {
4128                 case -EINVAL:
4129                         rtm->rtm_type = RTN_BLACKHOLE;
4130                         break;
4131                 case -EACCES:
4132                         rtm->rtm_type = RTN_PROHIBIT;
4133                         break;
4134                 case -EAGAIN:
4135                         rtm->rtm_type = RTN_THROW;
4136                         break;
4137                 default:
4138                         rtm->rtm_type = RTN_UNREACHABLE;
4139                         break;
4140                 }
4141         }
4142         else if (rt->rt6i_flags & RTF_LOCAL)
4143                 rtm->rtm_type = RTN_LOCAL;
4144         else if (rt->rt6i_flags & RTF_ANYCAST)
4145                 rtm->rtm_type = RTN_ANYCAST;
4146         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4147                 rtm->rtm_type = RTN_LOCAL;
4148         else
4149                 rtm->rtm_type = RTN_UNICAST;
4150         rtm->rtm_flags = 0;
4151         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4152         rtm->rtm_protocol = rt->rt6i_protocol;
4153
4154         if (rt->rt6i_flags & RTF_CACHE)
4155                 rtm->rtm_flags |= RTM_F_CLONED;
4156
4157         if (dst) {
4158                 if (nla_put_in6_addr(skb, RTA_DST, dst))
4159                         goto nla_put_failure;
4160                 rtm->rtm_dst_len = 128;
4161         } else if (rtm->rtm_dst_len)
4162                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4163                         goto nla_put_failure;
4164 #ifdef CONFIG_IPV6_SUBTREES
4165         if (src) {
4166                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4167                         goto nla_put_failure;
4168                 rtm->rtm_src_len = 128;
4169         } else if (rtm->rtm_src_len &&
4170                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4171                 goto nla_put_failure;
4172 #endif
4173         if (iif) {
4174 #ifdef CONFIG_IPV6_MROUTE
4175                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4176                         int err = ip6mr_get_route(net, skb, rtm, portid);
4177
4178                         if (err == 0)
4179                                 return 0;
4180                         if (err < 0)
4181                                 goto nla_put_failure;
4182                 } else
4183 #endif
4184                         if (nla_put_u32(skb, RTA_IIF, iif))
4185                                 goto nla_put_failure;
4186         } else if (dst) {
4187                 struct in6_addr saddr_buf;
4188                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4189                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4190                         goto nla_put_failure;
4191         }
4192
4193         if (rt->rt6i_prefsrc.plen) {
4194                 struct in6_addr saddr_buf;
4195                 saddr_buf = rt->rt6i_prefsrc.addr;
4196                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4197                         goto nla_put_failure;
4198         }
4199
4200         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4201         if (rt->rt6i_pmtu)
4202                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4203         if (rtnetlink_put_metrics(skb, metrics) < 0)
4204                 goto nla_put_failure;
4205
4206         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4207                 goto nla_put_failure;
4208
4209         /* For multipath routes, walk the siblings list and add
4210          * each as a nexthop within RTA_MULTIPATH.
4211          */
4212         if (rt->rt6i_nsiblings) {
4213                 struct rt6_info *sibling, *next_sibling;
4214                 struct nlattr *mp;
4215
4216                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4217                 if (!mp)
4218                         goto nla_put_failure;
4219
4220                 if (rt6_add_nexthop(skb, rt) < 0)
4221                         goto nla_put_failure;
4222
4223                 list_for_each_entry_safe(sibling, next_sibling,
4224                                          &rt->rt6i_siblings, rt6i_siblings) {
4225                         if (rt6_add_nexthop(skb, sibling) < 0)
4226                                 goto nla_put_failure;
4227                 }
4228
4229                 nla_nest_end(skb, mp);
4230         } else {
4231                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4232                         goto nla_put_failure;
4233         }
4234
4235         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4236
4237         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4238                 goto nla_put_failure;
4239
4240         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4241                 goto nla_put_failure;
4242
4243
4244         nlmsg_end(skb, nlh);
4245         return 0;
4246
4247 nla_put_failure:
4248         nlmsg_cancel(skb, nlh);
4249         return -EMSGSIZE;
4250 }
4251
4252 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4253 {
4254         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4255         struct net *net = arg->net;
4256
4257         if (rt == net->ipv6.ip6_null_entry)
4258                 return 0;
4259
4260         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4261                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4262
4263                 /* user wants prefix routes only */
4264                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4265                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4266                         /* success since this is not a prefix route */
4267                         return 1;
4268                 }
4269         }
4270
4271         return rt6_fill_node(net,
4272                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4273                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4274                      NLM_F_MULTI);
4275 }
4276
/* RTM_GETROUTE doit handler: build a flow from the netlink attributes,
 * resolve it through the IPv6 routing code, and unicast the resulting
 * route back to the requesting socket.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	/* RTM_F_FIB_MATCH: report the FIB entry that matched rather than
	 * the (possibly cloned/cached) dst used for forwarding.
	 */
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		/* Input interface given: simulate the input (forwarding)
		 * path.  The device lookup only needs RCU protection.
		 */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, flags);

		rcu_read_unlock();
	} else {
		/* No iif: resolve as a locally originated (output) flow. */
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* The null entry carries no error; report it as unresolvable. */
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* Swap the cached clone for its originating FIB entry, moving the
	 * reference from rt to ort.
	 */
	if (fibmatch && rt->from) {
		struct rt6_info *ort = rt->from;

		dst_hold(&ort->dst);
		ip6_rt_put(rt);
		rt = ort;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* The skb now owns the route reference; freeing the skb drops it. */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4404
/* Broadcast a route change (event is RTM_NEWROUTE/RTM_DELROUTE) to the
 * RTNLGRP_IPV6_ROUTE multicast group.  On failure the error is recorded
 * on the group so listeners see ENOBUFS-style overruns.
 */
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	/* Echo the requester's sequence number when this notification was
	 * triggered by a netlink request; 0 for kernel-internal changes.
	 */
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
				event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
4435
/* Netdevice notifier: bind this netns's special route entries (null and,
 * with multiple tables, prohibit/blackhole) to the loopback device when
 * it registers, and drop their idev references when it unregisters.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* Only the loopback device anchors the special entries. */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4468
4469 /*
4470  *      /proc
4471  */
4472
4473 #ifdef CONFIG_PROC_FS
4474
/* File operations for the per-netns /proc/net/ipv6_route seq file;
 * ipv6_route_open is defined earlier in this file.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4482
4483 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4484 {
4485         struct net *net = (struct net *)seq->private;
4486         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4487                    net->ipv6.rt6_stats->fib_nodes,
4488                    net->ipv6.rt6_stats->fib_route_nodes,
4489                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4490                    net->ipv6.rt6_stats->fib_rt_entries,
4491                    net->ipv6.rt6_stats->fib_rt_cache,
4492                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4493                    net->ipv6.rt6_stats->fib_discarded_routes);
4494
4495         return 0;
4496 }
4497
/* open() hook for /proc/net/rt6_stats: single_open with per-net data. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4502
/* File operations for the per-netns /proc/net/rt6_stats file. */
static const struct file_operations rt6_stats_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
4510 #endif  /* CONFIG_PROC_FS */
4511
4512 #ifdef CONFIG_SYSCTL
4513
4514 static
4515 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4516                               void __user *buffer, size_t *lenp, loff_t *ppos)
4517 {
4518         struct net *net;
4519         int delay;
4520         if (!write)
4521                 return -EINVAL;
4522
4523         net = (struct net *)ctl->extra1;
4524         delay = net->ipv6.sysctl.flush_delay;
4525         proc_dointvec(ctl, write, buffer, lenp, ppos);
4526         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4527         return 0;
4528 }
4529
/* Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers here reference init_net and are rewritten per netns by
 * ipv6_route_sysctl_init(), which addresses entries BY INDEX — keep the
 * order here and the indices there in sync.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* Write-only trigger: flush the routing cache (mode 0200). */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Same variable as gc_min_interval, exposed in milliseconds. */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
4603
/* Duplicate the sysctl template for a new netns and repoint each entry's
 * .data at the netns-private variables.  Indices must match the order of
 * ipv6_route_table_template[].  Returns NULL on allocation failure (the
 * caller tolerates a missing table).
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		/* extra1 carries the netns for ipv6_sysctl_rtcache_flush(). */
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
4632 #endif
4633
/* Per-netns init: clone the dst ops and the special route templates
 * (null and, with multiple tables, prohibit/blackhole), then seed the
 * sysctl defaults.  Failures unwind via gotos in reverse order of
 * allocation.  Returns 0 or -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	/* Each copy must use this netns's dst ops, not the template's. */
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default tunables; exposed via the net.ipv6.route.* sysctls. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
4700
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and destroy the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
4710
4711 static int __net_init ip6_route_net_init_late(struct net *net)
4712 {
4713 #ifdef CONFIG_PROC_FS
4714         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4715         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4716 #endif
4717         return 0;
4718 }
4719
/* Late per-netns teardown: remove the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
4727
/* Core per-netns route state (special entries, dst ops, sysctl defaults). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
4732
/* Allocate and initialize this netns's IPv6 inetpeer base. */
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}
4743
/* Tear down this netns's inetpeer base; the pointer is cleared before
 * the tree is invalidated so no new lookups can find it.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
4752
/* Per-netns inetpeer base lifetime. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
4757
/* Per-netns proc entries; registered after the core route state. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
4762
/* Runs after addrconf's notifier (lower priority value) so addrconf has
 * already processed the device event.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4767
/* Attach init_net's special route entries to its loopback device.
 * Called once at boot, after loopback registration but before the
 * notifier in this file is active for init_net.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
4782
/* Module/boot-time init for the IPv6 routing subsystem.  Order matters:
 * slab cache, blackhole dst counters, pernet subsystems, FIB, xfrm,
 * policy rules, rtnetlink handlers, device notifier.  On any failure the
 * labels below unwind everything registered so far, in reverse order.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the regular rt6_info slab cache. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* rtnl_unregister_all() in the error path removes whichever of
	 * these handlers got registered, so all three share one label.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Per-cpu lists of uncached (DST_NOCACHE) routes. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
4874
/* Tear down everything ip6_route_init() set up, in reverse order of
 * registration.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}