Merge branch 'acpi-pm'
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
/* Result of next-hop reachability evaluation (see rt6_check_neigh()):
 * negative values reject the route with varying severity, positive
 * means the neighbour is considered usable.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* do not use this route */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour is in failed state */
	RT6_NUD_FAIL_DO_RR = -1,	/* unknown; caller should round-robin */
	RT6_NUD_SUCCEED = 1		/* neighbour deemed reachable */
};
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 static size_t rt6_nlmsg_size(struct rt6_info *rt);
102 static int rt6_fill_node(struct net *net,
103                          struct sk_buff *skb, struct rt6_info *rt,
104                          struct in6_addr *dst, struct in6_addr *src,
105                          int iif, int type, u32 portid, u32 seq,
106                          unsigned int flags);
107
108 #ifdef CONFIG_IPV6_ROUTE_INFO
109 static struct rt6_info *rt6_add_route_info(struct net *net,
110                                            const struct in6_addr *prefix, int prefixlen,
111                                            const struct in6_addr *gwaddr,
112                                            struct net_device *dev,
113                                            unsigned int pref);
114 static struct rt6_info *rt6_get_route_info(struct net *net,
115                                            const struct in6_addr *prefix, int prefixlen,
116                                            const struct in6_addr *gwaddr,
117                                            struct net_device *dev);
118 #endif
119
/* Per-cpu list of rt6_info entries registered via rt6_uncached_list_add();
 * walked by rt6_uncached_list_flush_dev() when a device goes away.
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
126
127 static void rt6_uncached_list_add(struct rt6_info *rt)
128 {
129         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
130
131         rt->rt6i_uncached_list = ul;
132
133         spin_lock_bh(&ul->lock);
134         list_add_tail(&rt->rt6i_uncached, &ul->head);
135         spin_unlock_bh(&ul->lock);
136 }
137
138 static void rt6_uncached_list_del(struct rt6_info *rt)
139 {
140         if (!list_empty(&rt->rt6i_uncached)) {
141                 struct uncached_list *ul = rt->rt6i_uncached_list;
142
143                 spin_lock_bh(&ul->lock);
144                 list_del(&rt->rt6i_uncached);
145                 spin_unlock_bh(&ul->lock);
146         }
147 }
148
/* @dev is going away: repoint every uncached route that still references
 * it at the netns loopback device, transferring the device and
 * inet6_dev references accordingly.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* The loopback device is the fallback target; nothing to move. */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				/* Swap the inet6_dev reference to loopback. */
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				/* Hold the new device before putting the old
				 * one, keeping refcounts balanced throughout.
				 */
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
180
/* Per-cpu route clones write through the metrics of the route they were
 * copied from (dst.from).
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}
185
186 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
187 {
188         struct rt6_info *rt = (struct rt6_info *)dst;
189
190         if (rt->rt6i_flags & RTF_PCPU)
191                 return rt6_pcpu_cow_metrics(rt);
192         else if (rt->rt6i_flags & RTF_CACHE)
193                 return NULL;
194         else
195                 return dst_cow_metrics_generic(dst, old);
196 }
197
198 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
199                                              struct sk_buff *skb,
200                                              const void *daddr)
201 {
202         struct in6_addr *p = &rt->rt6i_gateway;
203
204         if (!ipv6_addr_any(p))
205                 return (const void *) p;
206         else if (skb)
207                 return &ipv6_hdr(skb)->daddr;
208         return daddr;
209 }
210
211 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
212                                           struct sk_buff *skb,
213                                           const void *daddr)
214 {
215         struct rt6_info *rt = (struct rt6_info *) dst;
216         struct neighbour *n;
217
218         daddr = choose_neigh_daddr(rt, skb, daddr);
219         n = __ipv6_neigh_lookup(dst->dev, daddr);
220         if (n)
221                 return n;
222         return neigh_create(&nd_tbl, daddr, dst->dev);
223 }
224
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227         struct net_device *dev = dst->dev;
228         struct rt6_info *rt = (struct rt6_info *)dst;
229
230         daddr = choose_neigh_daddr(rt, NULL, daddr);
231         if (!daddr)
232                 return;
233         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234                 return;
235         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236                 return;
237         __ipv6_confirm_neigh(dev, daddr);
238 }
239
/* dst_ops template for regular IPv6 routes; callbacks defined above/below
 * in this file.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
/* PMTU updates are deliberately ignored on blackhole dsts. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* Redirects are likewise deliberately ignored on blackhole dsts. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
275
/* dst_ops for blackhole routes: PMTU/redirect events are no-ops. */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
287
/* Template metrics: only RTAX_HOPLIMIT is set, and to 0 (default). */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

/* Null route: rejects packets with -ENETUNREACH. */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
306
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
308
/* Prohibit route: rejects packets with -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/* Blackhole route: silently discards packets, error -EINVAL. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
338
339 #endif
340
/* Zero the rt6_info fields that follow the embedded dst_entry (which
 * dst_alloc() already initialised) and set up the list heads.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst + 1 points just past the dst_entry inside *rt. */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
349
350 /* allocate dst with ip6_dst_ops */
351 static struct rt6_info *__ip6_dst_alloc(struct net *net,
352                                         struct net_device *dev,
353                                         int flags)
354 {
355         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356                                         1, DST_OBSOLETE_FORCE_CHK, flags);
357
358         if (rt)
359                 rt6_info_init(rt);
360
361         return rt;
362 }
363
364 struct rt6_info *ip6_dst_alloc(struct net *net,
365                                struct net_device *dev,
366                                int flags)
367 {
368         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
369
370         if (rt) {
371                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
372                 if (rt->rt6i_pcpu) {
373                         int cpu;
374
375                         for_each_possible_cpu(cpu) {
376                                 struct rt6_info **p;
377
378                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
379                                 /* no one shares rt */
380                                 *p =  NULL;
381                         }
382                 } else {
383                         dst_release_immediate(&rt->dst);
384                         return NULL;
385                 }
386         }
387
388         return rt;
389 }
390 EXPORT_SYMBOL(ip6_dst_alloc);
391
/* dst_ops->destroy: release everything a rt6_info holds — generic
 * metrics, the per-cpu clone array, uncached-list linkage, the
 * inet6_dev reference, and the parent dst (dst->from) of clones.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* Clear the pointer before dropping the reference. */
	dst->from = NULL;
	dst_release(from);
}
411
/* dst_ops->ifdown: @dev is going down — migrate the route's inet6_dev
 * reference over to the netns loopback device.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			/* Swap references: take loopback, drop the old one. */
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
428
429 static bool __rt6_check_expired(const struct rt6_info *rt)
430 {
431         if (rt->rt6i_flags & RTF_EXPIRES)
432                 return time_after(jiffies, rt->dst.expires);
433         else
434                 return false;
435 }
436
437 static bool rt6_check_expired(const struct rt6_info *rt)
438 {
439         if (rt->rt6i_flags & RTF_EXPIRES) {
440                 if (time_after(jiffies, rt->dst.expires))
441                         return true;
442         } else if (rt->dst.from) {
443                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
444                        rt6_check_expired((struct rt6_info *)rt->dst.from);
445         }
446         return false;
447 }
448
449 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
450                                              struct flowi6 *fl6, int oif,
451                                              int strict)
452 {
453         struct rt6_info *sibling, *next_sibling;
454         int route_choosen;
455
456         /* We might have already computed the hash for ICMPv6 errors. In such
457          * case it will always be non-zero. Otherwise now is the time to do it.
458          */
459         if (!fl6->mp_hash)
460                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
461
462         route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
463         /* Don't change the route, if route_choosen == 0
464          * (siblings does not include ourself)
465          */
466         if (route_choosen)
467                 list_for_each_entry_safe(sibling, next_sibling,
468                                 &match->rt6i_siblings, rt6i_siblings) {
469                         route_choosen--;
470                         if (route_choosen == 0) {
471                                 if (rt6_score_route(sibling, oif, strict) < 0)
472                                         break;
473                                 match = sibling;
474                                 break;
475                         }
476                 }
477         return match;
478 }
479
480 /*
481  *      Route lookup. Any table->tb6_lock is implied.
482  */
483
/* Walk the route list starting at @rt and pick the entry matching the
 * output interface @oif and/or source address @saddr.  Returns @rt
 * unchanged when there is nothing to match on, a loopback fallback
 * when one was recorded, or ip6_null_entry when a strict interface
 * match (RT6_LOOKUP_F_IFACE) fails.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* No interface and no source constraint: keep the head route. */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				/* Remember as a loopback fallback. */
				local = sprt;
			}
		} else {
			/* No oif: match on the source address instead. */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
530
531 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router-probe request queued to the system workqueue. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* gateway address to solicit */
	struct net_device *dev;		/* held reference, put in worker */
};
537
538 static void rt6_probe_deferred(struct work_struct *w)
539 {
540         struct in6_addr mcaddr;
541         struct __rt6_probe_work *work =
542                 container_of(w, struct __rt6_probe_work, work);
543
544         addrconf_addr_solict_mult(&work->target, &mcaddr);
545         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
546         dev_put(work->dev);
547         kfree(work);
548 }
549
/* Queue a deferred reachability probe of @rt's gateway when its
 * neighbour entry is missing or stale enough.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* Rate-limit: only queue work when the neighbour was not
		 * updated within rtr_probe_interval.
		 */
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry at all: always probe. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		/* Device reference is dropped by rt6_probe_deferred(). */
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
596 #else
/* No router reachability probing without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
600 #endif
601
602 /*
603  * Default Router Selection (RFC 2461 6.3.6)
604  */
605 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
606 {
607         struct net_device *dev = rt->dst.dev;
608         if (!oif || dev->ifindex == oif)
609                 return 2;
610         if ((dev->flags & IFF_LOOPBACK) &&
611             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
612                 return 1;
613         return 0;
614 }
615
/* Classify the reachability of @rt's next hop (see enum rt6_nud_state). */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	/* Routes without a gateway have no next hop to be unreachable. */
	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* With router preference, anything short of FAILED still
		 * counts as usable; FAILED requests a probe.
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* No entry: succeed when probing is available, otherwise
		 * let the caller round-robin to another router.
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
646
/* Score @rt for router selection: device match in the low two bits,
 * decoded route preference shifted above them.  Returns a negative
 * rt6_nud_state value when the route must not be used.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
665
/* Compare @rt against the current best @match and return whichever
 * scores higher.  *mpri tracks the best score seen so far; *do_rr is
 * set when the winning route asked for round-robin rotation.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	/* Optionally skip routes whose device lost carrier. */
	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
703
/* Find the best route among those of @fn sharing @metric, starting the
 * round-robin scan at @rr_head and wrapping around via fn->leaf.
 * Routes with a different metric ("cont") are only considered when no
 * same-metric route matched at all.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* First pass: from rr_head to the end of the same-metric run. */
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Second pass: wrap from the leaf head back up to rr_head. */
	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* Fall back to the routes with a different metric. */
	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
740
/* Router selection entry point: pick the best route of @fn, advancing
 * fn->rr_ptr to the next same-metric route when round-robin was
 * requested.  Falls back to ip6_null_entry when nothing matched.
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
768
769 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
770 {
771         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
772 }
773
774 #ifdef CONFIG_IPV6_ROUTE_INFO
775 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
776                   const struct in6_addr *gwaddr)
777 {
778         struct net *net = dev_net(dev);
779         struct route_info *rinfo = (struct route_info *) opt;
780         struct in6_addr prefix_buf, *prefix;
781         unsigned int pref;
782         unsigned long lifetime;
783         struct rt6_info *rt;
784
785         if (len < sizeof(struct route_info)) {
786                 return -EINVAL;
787         }
788
789         /* Sanity check for prefix_len and length */
790         if (rinfo->length > 3) {
791                 return -EINVAL;
792         } else if (rinfo->prefix_len > 128) {
793                 return -EINVAL;
794         } else if (rinfo->prefix_len > 64) {
795                 if (rinfo->length < 2) {
796                         return -EINVAL;
797                 }
798         } else if (rinfo->prefix_len > 0) {
799                 if (rinfo->length < 1) {
800                         return -EINVAL;
801                 }
802         }
803
804         pref = rinfo->route_pref;
805         if (pref == ICMPV6_ROUTER_PREF_INVALID)
806                 return -EINVAL;
807
808         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
809
810         if (rinfo->length == 3)
811                 prefix = (struct in6_addr *)rinfo->prefix;
812         else {
813                 /* this function is safe */
814                 ipv6_addr_prefix(&prefix_buf,
815                                  (struct in6_addr *)rinfo->prefix,
816                                  rinfo->prefix_len);
817                 prefix = &prefix_buf;
818         }
819
820         if (rinfo->prefix_len == 0)
821                 rt = rt6_get_dflt_router(gwaddr, dev);
822         else
823                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
824                                         gwaddr, dev);
825
826         if (rt && !lifetime) {
827                 ip6_del_rt(rt);
828                 rt = NULL;
829         }
830
831         if (!rt && lifetime)
832                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
833                                         dev, pref);
834         else if (rt)
835                 rt->rt6i_flags = RTF_ROUTEINFO |
836                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
837
838         if (rt) {
839                 if (!addrconf_finite_timeout(lifetime))
840                         rt6_clean_expires(rt);
841                 else
842                         rt6_set_expires(rt, jiffies + HZ * lifetime);
843
844                 ip6_rt_put(rt);
845         }
846         return 0;
847 }
848 #endif
849
850 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
851                                         struct in6_addr *saddr)
852 {
853         struct fib6_node *pn;
854         while (1) {
855                 if (fn->fn_flags & RTN_TL_ROOT)
856                         return NULL;
857                 pn = fn->parent;
858                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
859                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
860                 else
861                         fn = pn;
862                 if (fn->fn_flags & RTN_RTINFO)
863                         return fn;
864         }
865 }
866
/* Plain (non-caching) table lookup, called via fib6_rule_lookup().
 * Finds the best matching route for @fl6 in @table and returns it with
 * a reference held; may return net->ipv6.ip6_null_entry on a miss.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	/* ECMP: pick a sibling by flow hash when no output device is forced */
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		/* nothing usable at this node: backtrack up the tree */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* take the reference while still under tb6_lock so the route
	 * cannot be freed between lookup and hold
	 */
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}
894
895 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
896                                     int flags)
897 {
898         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
899 }
900 EXPORT_SYMBOL_GPL(ip6_route_lookup);
901
902 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
903                             const struct in6_addr *saddr, int oif, int strict)
904 {
905         struct flowi6 fl6 = {
906                 .flowi6_oif = oif,
907                 .daddr = *daddr,
908         };
909         struct dst_entry *dst;
910         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
911
912         if (saddr) {
913                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
914                 flags |= RT6_LOOKUP_F_HAS_SADDR;
915         }
916
917         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
918         if (dst->error == 0)
919                 return (struct rt6_info *) dst;
920
921         dst_release(dst);
922
923         return NULL;
924 }
925 EXPORT_SYMBOL(rt6_lookup);
926
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */
932
/* Insert @rt into its table under tb6_lock.
 * @info:   netlink notification context
 * @mxc:    route metrics to attach
 * @extack: extended ack for error reporting (may be NULL)
 * Returns 0 on success or a negative errno from fib6_add().
 */
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	write_unlock_bh(&table->tb6_lock);

	return err;
}
947
948 int ip6_ins_rt(struct rt6_info *rt)
949 {
950         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
951         struct mx6_config mxc = { .mx = NULL, };
952
953         /* Hold dst to account for the reference from the fib6 tree */
954         dst_hold(&rt->dst);
955         return __ip6_ins_rt(rt, &info, &mxc, NULL);
956 }
957
/* called with rcu_lock held */
/* Pick the net_device a copy of @rt should reference.
 * For non-local routes, the route's own device is used unchanged.
 */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & RTF_LOCAL) {
		/* for copies of local routes, dst->dev needs to be:
		 * - the master device when dev is enslaved to an l3mdev
		 *   (unless the address needs strict/link-local handling),
		 * - the loopback device otherwise,
		 * - dev itself when it already is an l3 master.
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
980
/* Allocate an RTF_CACHE clone of @ort for destination @daddr (and
 * optionally source @saddr, with subtrees).  The clone is a /128
 * host route not owned by the fib6 tree.  Returns the new route with
 * a reference held, or NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* if @ort is itself a clone, copy from its origin instead */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	/* device resolution may follow l3mdev pointers: needs RCU */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* cloning a prefix route to the queried host address
		 * makes it an anycast entry
		 */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1023
1024 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1025 {
1026         struct net_device *dev;
1027         struct rt6_info *pcpu_rt;
1028
1029         rcu_read_lock();
1030         dev = ip6_rt_get_dev_rcu(rt);
1031         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1032         rcu_read_unlock();
1033         if (!pcpu_rt)
1034                 return NULL;
1035         ip6_rt_copy_init(pcpu_rt, rt);
1036         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1037         pcpu_rt->rt6i_flags |= RTF_PCPU;
1038         return pcpu_rt;
1039 }
1040
1041 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1042 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1043 {
1044         struct rt6_info *pcpu_rt, **p;
1045
1046         p = this_cpu_ptr(rt->rt6i_pcpu);
1047         pcpu_rt = *p;
1048
1049         if (pcpu_rt) {
1050                 dst_hold(&pcpu_rt->dst);
1051                 rt6_dst_from_metrics_check(pcpu_rt);
1052         }
1053         return pcpu_rt;
1054 }
1055
/* Create and install a per-cpu copy of @rt for this cpu.
 * Returns a held route: the new pcpu copy, a copy installed by a
 * concurrent caller, @rt itself if it was unlinked from the tree,
 * or ip6_null_entry on allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		/* cmpxchg resolves the race with another cpu installing
		 * its copy in the same slot
		 */
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_release_immediate(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't bother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_release_immediate(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}
1093
/* Main policy-routing lookup used by the input and output paths.
 * Selects the best route for @fl6 in @table, preferring reachable
 * routers when forwarding is off, and returns a held dst: either the
 * route itself (null/cache entries), an uncached RTF_CACHE clone
 * (FLOWI_FLAG_KNOWN_NH without a gateway), or a per-cpu copy.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* hosts (not forwarding) prefer routers known to be reachable */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		/* backtrack first; if that fails, retry from the saved
		 * node without the reachability requirement
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}


	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		/* reference must be taken under tb6_lock */
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;

	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1196
1197 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1198                                             struct flowi6 *fl6, int flags)
1199 {
1200         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1201 }
1202
1203 struct dst_entry *ip6_route_input_lookup(struct net *net,
1204                                          struct net_device *dev,
1205                                          struct flowi6 *fl6, int flags)
1206 {
1207         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1208                 flags |= RT6_LOOKUP_F_IFACE;
1209
1210         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1211 }
1212 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1213
/* Fill @keys with the L3 fields used for multipath hashing of @skb.
 * For ICMPv6 error messages, hash on the embedded (offending) inner
 * header so errors follow the same path as the original flow.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	/* only error messages carry an embedded IPv6 header */
	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	/* inner header may be paged: copy it out via header pointer */
	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
}
1248
1249 /* if skb is set it will be used and fl6 can be NULL */
1250 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1251 {
1252         struct flow_keys hash_keys;
1253
1254         if (skb) {
1255                 ip6_multipath_l3_keys(skb, &hash_keys);
1256                 return flow_hash_from_keys(&hash_keys);
1257         }
1258
1259         return get_hash_from_flowi6(fl6);
1260 }
1261
1262 void ip6_route_input(struct sk_buff *skb)
1263 {
1264         const struct ipv6hdr *iph = ipv6_hdr(skb);
1265         struct net *net = dev_net(skb->dev);
1266         int flags = RT6_LOOKUP_F_HAS_SADDR;
1267         struct ip_tunnel_info *tun_info;
1268         struct flowi6 fl6 = {
1269                 .flowi6_iif = skb->dev->ifindex,
1270                 .daddr = iph->daddr,
1271                 .saddr = iph->saddr,
1272                 .flowlabel = ip6_flowinfo(iph),
1273                 .flowi6_mark = skb->mark,
1274                 .flowi6_proto = iph->nexthdr,
1275         };
1276
1277         tun_info = skb_tunnel_info(skb);
1278         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1279                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1280         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1281                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1282         skb_dst_drop(skb);
1283         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1284 }
1285
1286 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1287                                              struct flowi6 *fl6, int flags)
1288 {
1289         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1290 }
1291
1292 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1293                                          struct flowi6 *fl6, int flags)
1294 {
1295         bool any_src;
1296
1297         if (rt6_need_strict(&fl6->daddr)) {
1298                 struct dst_entry *dst;
1299
1300                 dst = l3mdev_link_scope_lookup(net, fl6);
1301                 if (dst)
1302                         return dst;
1303         }
1304
1305         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1306
1307         any_src = ipv6_addr_any(&fl6->saddr);
1308         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1309             (fl6->flowi6_oif && any_src))
1310                 flags |= RT6_LOOKUP_F_IFACE;
1311
1312         if (!any_src)
1313                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1314         else if (sk)
1315                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1316
1317         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1318 }
1319 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1320
/* Convert @dst_orig into a blackhole dst that silently discards all
 * traffic while preserving the original's metrics and addressing.
 * Consumes the reference on @dst_orig; returns the new dst or
 * ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		/* both directions drop silently */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* the blackhole copy has no per-cpu cache */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1353
1354 /*
1355  *      Destination cache support functions
1356  */
1357
1358 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1359 {
1360         if (rt->dst.from &&
1361             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1362                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1363 }
1364
1365 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1366 {
1367         u32 rt_cookie = 0;
1368
1369         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1370                 return NULL;
1371
1372         if (rt6_check_expired(rt))
1373                 return NULL;
1374
1375         return &rt->dst;
1376 }
1377
1378 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1379 {
1380         if (!__rt6_check_expired(rt) &&
1381             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1382             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1383                 return &rt->dst;
1384         else
1385                 return NULL;
1386 }
1387
/* dst_ops->check callback: return @dst if it is still valid for use,
 * NULL to force the caller to re-do the route lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	/* clones (pcpu copies and uncached entries with an origin)
	 * are validated through the route they were copied from
	 */
	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}
1407
1408 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1409 {
1410         struct rt6_info *rt = (struct rt6_info *) dst;
1411
1412         if (rt) {
1413                 if (rt->rt6i_flags & RTF_CACHE) {
1414                         if (rt6_check_expired(rt)) {
1415                                 ip6_del_rt(rt);
1416                                 dst = NULL;
1417                         }
1418                 } else {
1419                         dst_release(dst);
1420                         dst = NULL;
1421                 }
1422         }
1423         return dst;
1424 }
1425
/* dst_ops->link_failure callback: neighbor resolution failed for
 * @skb's route.  Report unreachability to the sender and invalidate
 * the failing route so future lookups avoid it.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* delete the stale clone; dst_hold_safe() guards
			 * against a concurrent final release
			 */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			/* invalidate the node's sernum so cached dsts
			 * referencing a default route are re-looked-up
			 */
			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
1448
1449 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1450 {
1451         struct net *net = dev_net(rt->dst.dev);
1452
1453         rt->rt6i_flags |= RTF_MODIFIED;
1454         rt->rt6i_pmtu = mtu;
1455         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1456 }
1457
1458 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1459 {
1460         return !(rt->rt6i_flags & RTF_CACHE) &&
1461                 (rt->rt6i_flags & RTF_PCPU ||
1462                  rcu_access_pointer(rt->rt6i_node));
1463 }
1464
/* Apply a path-MTU update to @dst.  The addresses are taken from
 * @iph if present, else from @sk, else the update is address-less.
 * Updates the route in place when safe, otherwise installs an
 * RTF_CACHE clone carrying the new MTU.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	/* locked metric means the admin pinned the MTU */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	/* never go below the IPv6 minimum MTU of 1280 */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
			/* Release the reference taken in
			 * ip6_rt_cache_alloc()
			 */
			dst_release(&nrt6->dst);
		}
	}
}
1514
1515 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1516                                struct sk_buff *skb, u32 mtu)
1517 {
1518         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1519 }
1520
1521 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1522                      int oif, u32 mark, kuid_t uid)
1523 {
1524         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1525         struct dst_entry *dst;
1526         struct flowi6 fl6;
1527
1528         memset(&fl6, 0, sizeof(fl6));
1529         fl6.flowi6_oif = oif;
1530         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1531         fl6.daddr = iph->daddr;
1532         fl6.saddr = iph->saddr;
1533         fl6.flowlabel = ip6_flowinfo(iph);
1534         fl6.flowi6_uid = uid;
1535
1536         dst = ip6_route_output(net, NULL, &fl6);
1537         if (!dst->error)
1538                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1539         dst_release(dst);
1540 }
1541 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1542
1543 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1544 {
1545         struct dst_entry *dst;
1546
1547         ip6_update_pmtu(skb, sock_net(sk), mtu,
1548                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
1549
1550         dst = __sk_dst_get(sk);
1551         if (!dst || !dst->obsolete ||
1552             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1553                 return;
1554
1555         bh_lock_sock(sk);
1556         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1557                 ip6_datagram_dst_update(sk, false);
1558         bh_unlock_sock(sk);
1559 }
1560 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1561
/* Handle redirects */
/* Flow descriptor for redirect processing: carries the advertised
 * gateway alongside the flow.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must stay first: cast from struct flowi6 * */
	struct in6_addr gateway;	/* router that sent the redirect */
};
1567
/* Find the route a received redirect applies to: the gateway route
 * over the redirect's interface whose nexthop is the advertising
 * router.  Returns a held route (ip6_null_entry when none matches).
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		/* the redirect must come from our current nexthop */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* no match at this node: backtrack and rescan */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
};
1625
1626 static struct dst_entry *ip6_route_redirect(struct net *net,
1627                                         const struct flowi6 *fl6,
1628                                         const struct in6_addr *gateway)
1629 {
1630         int flags = RT6_LOOKUP_F_HAS_SADDR;
1631         struct ip6rd_flowi rdfl;
1632
1633         rdfl.fl6 = *fl6;
1634         rdfl.gateway = *gateway;
1635
1636         return fib6_rule_lookup(net, &rdfl.fl6,
1637                                 flags, __ip6_route_redirect);
1638 }
1639
1640 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1641                   kuid_t uid)
1642 {
1643         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1644         struct dst_entry *dst;
1645         struct flowi6 fl6;
1646
1647         memset(&fl6, 0, sizeof(fl6));
1648         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1649         fl6.flowi6_oif = oif;
1650         fl6.flowi6_mark = mark;
1651         fl6.daddr = iph->daddr;
1652         fl6.saddr = iph->saddr;
1653         fl6.flowlabel = ip6_flowinfo(iph);
1654         fl6.flowi6_uid = uid;
1655
1656         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1657         rt6_do_redirect(dst, NULL, skb);
1658         dst_release(dst);
1659 }
1660 EXPORT_SYMBOL_GPL(ip6_redirect);
1661
1662 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1663                             u32 mark)
1664 {
1665         const struct ipv6hdr *iph = ipv6_hdr(skb);
1666         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1667         struct dst_entry *dst;
1668         struct flowi6 fl6;
1669
1670         memset(&fl6, 0, sizeof(fl6));
1671         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1672         fl6.flowi6_oif = oif;
1673         fl6.flowi6_mark = mark;
1674         fl6.daddr = msg->dest;
1675         fl6.saddr = iph->daddr;
1676         fl6.flowi6_uid = sock_net_uid(net, NULL);
1677
1678         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1679         rt6_do_redirect(dst, NULL, skb);
1680         dst_release(dst);
1681 }
1682
1683 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1684 {
1685         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1686                      sk->sk_uid);
1687 }
1688 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1689
1690 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1691 {
1692         struct net_device *dev = dst->dev;
1693         unsigned int mtu = dst_mtu(dst);
1694         struct net *net = dev_net(dev);
1695
1696         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1697
1698         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1699                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1700
1701         /*
1702          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1703          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1704          * IPV6_MAXPLEN is also valid and means: "any MSS,
1705          * rely only on pmtu discovery"
1706          */
1707         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1708                 mtu = IPV6_MAXPLEN;
1709         return mtu;
1710 }
1711
1712 static unsigned int ip6_mtu(const struct dst_entry *dst)
1713 {
1714         const struct rt6_info *rt = (const struct rt6_info *)dst;
1715         unsigned int mtu = rt->rt6i_pmtu;
1716         struct inet6_dev *idev;
1717
1718         if (mtu)
1719                 goto out;
1720
1721         mtu = dst_metric_raw(dst, RTAX_MTU);
1722         if (mtu)
1723                 goto out;
1724
1725         mtu = IPV6_MIN_MTU;
1726
1727         rcu_read_lock();
1728         idev = __in6_dev_get(dst->dev);
1729         if (idev)
1730                 mtu = idev->cnf.mtu6;
1731         rcu_read_unlock();
1732
1733 out:
1734         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1735
1736         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1737 }
1738
/* Allocate a host route used for sending an ICMPv6 packet along @fl6
 * via @dev.  The route is never inserted into a FIB table; instead it
 * is placed on the uncached list so device teardown can still find it.
 * Returns the (possibly xfrm-transformed) dst, or an ERR_PTR on
 * failure; the caller owns the returned reference.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* drop the idev reference taken above before bailing out */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* host route to the flow's destination; rt6i_idev takes over the
	 * idev reference acquired by in6_dev_get() above
	 */
	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
1775
/* dst_ops garbage-collection callback for the IPv6 routing cache.
 *
 * Skips the expensive fib6 walk when the last run was recent and the
 * entry count is within ip6_rt_max_size; otherwise runs fib6_run_gc()
 * with a per-netns adaptive expiry (ip6_rt_gc_expire) that is bumped on
 * every forced run, reset toward rt_gc_timeout/2 once pressure drops
 * below gc_thresh, and exponentially decayed by the elasticity sysctl.
 *
 * Returns nonzero (allocation should fail) while the table is still
 * over ip6_rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* exponential decay: larger elasticity -> slower decay */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
1800
1801 static int ip6_convert_metrics(struct mx6_config *mxc,
1802                                const struct fib6_config *cfg)
1803 {
1804         bool ecn_ca = false;
1805         struct nlattr *nla;
1806         int remaining;
1807         u32 *mp;
1808
1809         if (!cfg->fc_mx)
1810                 return 0;
1811
1812         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1813         if (unlikely(!mp))
1814                 return -ENOMEM;
1815
1816         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1817                 int type = nla_type(nla);
1818                 u32 val;
1819
1820                 if (!type)
1821                         continue;
1822                 if (unlikely(type > RTAX_MAX))
1823                         goto err;
1824
1825                 if (type == RTAX_CC_ALGO) {
1826                         char tmp[TCP_CA_NAME_MAX];
1827
1828                         nla_strlcpy(tmp, nla, sizeof(tmp));
1829                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1830                         if (val == TCP_CA_UNSPEC)
1831                                 goto err;
1832                 } else {
1833                         val = nla_get_u32(nla);
1834                 }
1835                 if (type == RTAX_HOPLIMIT && val > 255)
1836                         val = 255;
1837                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1838                         goto err;
1839
1840                 mp[type - 1] = val;
1841                 __set_bit(type - 1, mxc->mx_valid);
1842         }
1843
1844         if (ecn_ca) {
1845                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1846                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1847         }
1848
1849         mxc->mx = mp;
1850         return 0;
1851  err:
1852         kfree(mp);
1853         return -EINVAL;
1854 }
1855
1856 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1857                                             struct fib6_config *cfg,
1858                                             const struct in6_addr *gw_addr)
1859 {
1860         struct flowi6 fl6 = {
1861                 .flowi6_oif = cfg->fc_ifindex,
1862                 .daddr = *gw_addr,
1863                 .saddr = cfg->fc_prefsrc,
1864         };
1865         struct fib6_table *table;
1866         struct rt6_info *rt;
1867         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1868
1869         table = fib6_get_table(net, cfg->fc_table);
1870         if (!table)
1871                 return NULL;
1872
1873         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1874                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1875
1876         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1877
1878         /* if table lookup failed, fall back to full lookup */
1879         if (rt == net->ipv6.ip6_null_entry) {
1880                 ip6_rt_put(rt);
1881                 rt = NULL;
1882         }
1883
1884         return rt;
1885 }
1886
/* Build (but do not insert) an rt6_info from a route-add request.
 *
 * Validates @cfg, resolves the egress device and inet6_dev (directly by
 * ifindex, via the loopback promotion for reject-type routes, or through
 * a gateway nexthop lookup), and fills in all route fields.  On success
 * the returned route holds references on dev, idev and the chosen table;
 * the caller (ip6_route_add()) is responsible for insertion and release.
 * On failure every acquired reference is dropped and an ERR_PTR(-errno)
 * is returned, with a human-readable reason in @extack where available.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* without NLM_F_CREATE, prefer an existing table but still fall
	 * back to creating one (with a warning) for compatibility
	 */
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* pick the input handler from the destination type */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	/* lightweight tunnel encapsulation may redirect input/output */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt = NULL;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support RFC4798-type
			   addressing
			 */
			if (!(gwa_type & (IPV6_ADDR_UNICAST |
					  IPV6_ADDR_MAPPED))) {
				NL_SET_ERR_MSG(extack,
					       "Invalid gateway address");
				goto out;
			}

			/* try the table named in the request first, then a
			 * full lookup, to resolve the gateway's device
			 */
			if (cfg->fc_table) {
				grt = ip6_nh_lookup_table(net, cfg, gw_addr);

				if (grt) {
					if (grt->rt6i_flags & RTF_GATEWAY ||
					    (dev && dev != grt->dst.dev)) {
						ip6_rt_put(grt);
						grt = NULL;
					}
				}
			}

			if (!grt)
				grt = rt6_lookup(net, gw_addr, NULL,
						 cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (!grt)
				goto out;
			if (dev) {
				if (dev != grt->dst.dev) {
					ip6_rt_put(grt);
					goto out;
				}
			} else {
				/* adopt the nexthop route's device/idev and
				 * take our own references on them
				 */
				dev = grt->dst.dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			if (!(grt->rt6i_flags & RTF_GATEWAY))
				err = 0;
			ip6_rt_put(grt);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev) {
			NL_SET_ERR_MSG(extack, "Egress device not specified");
			goto out;
		} else if (dev->flags & IFF_LOOPBACK) {
			NL_SET_ERR_MSG(extack,
				       "Egress device can not be loopback device for this route");
			goto out;
		}
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_release_immediate(&rt->dst);

	return ERR_PTR(err);
}
2176
2177 int ip6_route_add(struct fib6_config *cfg,
2178                   struct netlink_ext_ack *extack)
2179 {
2180         struct mx6_config mxc = { .mx = NULL, };
2181         struct rt6_info *rt;
2182         int err;
2183
2184         rt = ip6_route_info_create(cfg, extack);
2185         if (IS_ERR(rt)) {
2186                 err = PTR_ERR(rt);
2187                 rt = NULL;
2188                 goto out;
2189         }
2190
2191         err = ip6_convert_metrics(&mxc, cfg);
2192         if (err)
2193                 goto out;
2194
2195         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2196
2197         kfree(mxc.mx);
2198
2199         return err;
2200 out:
2201         if (rt)
2202                 dst_release_immediate(&rt->dst);
2203
2204         return err;
2205 }
2206
2207 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2208 {
2209         int err;
2210         struct fib6_table *table;
2211         struct net *net = dev_net(rt->dst.dev);
2212
2213         if (rt == net->ipv6.ip6_null_entry) {
2214                 err = -ENOENT;
2215                 goto out;
2216         }
2217
2218         table = rt->rt6i_table;
2219         write_lock_bh(&table->tb6_lock);
2220         err = fib6_del(rt, info);
2221         write_unlock_bh(&table->tb6_lock);
2222
2223 out:
2224         ip6_rt_put(rt);
2225         return err;
2226 }
2227
/* Public single-route delete: wraps __ip6_del_rt() with default netlink
 * info for the route's namespace.  Consumes the caller's reference.
 */
int ip6_del_rt(struct rt6_info *rt)
{
	struct nl_info info = {
		.nl_net = dev_net(rt->dst.dev),
	};
	return __ip6_del_rt(rt, &info);
}
2235
/* Delete a multipath route: @rt and, when fc_delete_all_nh is set, all
 * of its sibling nexthops, under a single table lock.
 *
 * A combined RTM_DELROUTE notification covering every hop is built
 * before deletion (deletion unlinks the siblings); if that succeeds,
 * per-hop notifications are suppressed via info->skip_notify and the
 * combined message is sent after the lock is dropped.  Consumes the
 * caller's reference on @rt.  Returns 0 or a negative errno.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	write_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* notify only after the table lock is released */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2287
/* Delete the route(s) matching @cfg (destination/source prefix, and any
 * of ifindex, gateway, metric, protocol that are specified).
 *
 * Walks the leaf chain of the located fib6 node under the read lock;
 * on a match it takes a reference, drops the lock, and hands off to
 * __ip6_del_rt() (single hop, when a gateway was given) or
 * __ip6_del_rt_siblings() (whole multipath route) — note the early
 * returns inside the loop happen after the lock is already released.
 * Returns -ESRCH when nothing matches.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct fib6_table *table;
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			/* cached clones are only deleted when explicitly
			 * requested via RTF_CACHE
			 */
			if ((rt->rt6i_flags & RTF_CACHE) &&
			    !(cfg->fc_flags & RTF_CACHE))
				continue;
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
2338
/* Process a validated ICMPv6 Redirect for the route @dst resolves to.
 *
 * Sanity-checks the redirect message (length, multicast destination,
 * link-local target unless on-link), honours the accept_redirects /
 * forwarding sysctls, parses ND options for a target link-layer
 * address, updates the neighbour cache, and installs a cached
 * RTF_GATEWAY clone pointing at the new nexthop.  An existing RTF_CACHE
 * entry for the old path is removed.  @sk is unused here.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means the destination itself is on-link */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers must not honour redirects */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* clone the old route into a cache entry pointing at the new
	 * nexthop
	 */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	if (ip6_ins_rt(nrt))
		goto out_release;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	/* drop the superseded cached route for the old path */
	if (rt->rt6i_flags & RTF_CACHE) {
		rt = (struct rt6_info *) dst_clone(&rt->dst);
		ip6_del_rt(rt);
	}

out_release:
	/* Release the reference taken in
	 * ip6_rt_cache_alloc()
	 */
	dst_release(&nrt->dst);

out:
	neigh_release(neigh);
}
2461
2462 /*
2463  *      Misc support functions
2464  */
2465
/* Link @rt to its parent route @from: take a reference on the parent's
 * dst, share its metrics (read-only), and clear RTF_EXPIRES so @rt
 * follows the parent's lifetime.  @from must not itself be derived
 * from another route (hence the BUG_ON).
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
2475
/* Initialise @rt as a copy of @ort, taking references on the shared
 * idev and lwtstate and linking back to @ort via rt6_set_from().
 * Note: rt6i_flags must be copied before rt6_set_from(), which clears
 * RTF_EXPIRES on the copy.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
2497
2498 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RA-learned (RTF_ROUTEINFO) route for @prefix via
 * @gwaddr on @dev in the device's l3mdev table (or RT6_TABLE_INFO).
 * Returns the route with a reference held, or NULL if not present.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
	if (!fn)
		goto out;

	/* match on device, RTF_ROUTEINFO|RTF_GATEWAY flags and gateway */
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		dst_hold(&rt->dst);
		break;
	}
out:
	read_unlock_bh(&table->tb6_lock);
	return rt;
}
2533
2534 static struct rt6_info *rt6_add_route_info(struct net *net,
2535                                            const struct in6_addr *prefix, int prefixlen,
2536                                            const struct in6_addr *gwaddr,
2537                                            struct net_device *dev,
2538                                            unsigned int pref)
2539 {
2540         struct fib6_config cfg = {
2541                 .fc_metric      = IP6_RT_PRIO_USER,
2542                 .fc_ifindex     = dev->ifindex,
2543                 .fc_dst_len     = prefixlen,
2544                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2545                                   RTF_UP | RTF_PREF(pref),
2546                 .fc_protocol = RTPROT_RA,
2547                 .fc_nlinfo.portid = 0,
2548                 .fc_nlinfo.nlh = NULL,
2549                 .fc_nlinfo.nl_net = net,
2550         };
2551
2552         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2553         cfg.fc_dst = *prefix;
2554         cfg.fc_gateway = *gwaddr;
2555
2556         /* We should treat it as a default route if prefix length is 0. */
2557         if (!prefixlen)
2558                 cfg.fc_flags |= RTF_DEFAULT;
2559
2560         ip6_route_add(&cfg, NULL);
2561
2562         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2563 }
2564 #endif
2565
/*
 * Find the RA-learned default router entry via @addr on @dev.
 *
 * Returns the route with a dst reference held (caller releases), or
 * NULL when the table does not exist or no matching entry is found.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	/* VRF slaves keep their routes in the l3mdev table */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	/* Default routes all hang off the root node's leaf chain */
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
		dst_hold(&rt->dst);	/* reference for the caller */
	read_unlock_bh(&table->tb6_lock);
	return rt;
}
2588
/*
 * Install an RA-learned default route via @gwaddr on @dev with router
 * preference @pref, then return it (with a dst reference) through
 * rt6_get_dflt_router().
 */
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = dev_net(dev),
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, NULL)) {
		struct fib6_table *table;

		/* Mark the table as holding RA default routers so that
		 * rt6_purge_dflt_routers() can skip untouched tables.
		 */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(gwaddr, dev);
}
2617
/*
 * Delete every RA-learned (RTF_DEFAULT/RTF_ADDRCONF) router entry in
 * @table, except entries whose interface has accept_ra == 2 (accept
 * RAs even with forwarding enabled), then clear the table's
 * HAS_DFLT_ROUTER flag.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* ip6_del_rt() needs the lock dropped; hold a
			 * reference across the unlock and rescan the
			 * chain from the top afterwards.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
2637
/*
 * Purge RA default routers from every FIB table in @net that is
 * flagged as containing one (flag set by rt6_add_dflt_router()).
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(table);
		}
	}

	rcu_read_unlock();
}
2656
2657 static void rtmsg_to_fib6_config(struct net *net,
2658                                  struct in6_rtmsg *rtmsg,
2659                                  struct fib6_config *cfg)
2660 {
2661         memset(cfg, 0, sizeof(*cfg));
2662
2663         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2664                          : RT6_TABLE_MAIN;
2665         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2666         cfg->fc_metric = rtmsg->rtmsg_metric;
2667         cfg->fc_expires = rtmsg->rtmsg_info;
2668         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2669         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2670         cfg->fc_flags = rtmsg->rtmsg_flags;
2671
2672         cfg->fc_nlinfo.nl_net = net;
2673
2674         cfg->fc_dst = rtmsg->rtmsg_dst;
2675         cfg->fc_src = rtmsg->rtmsg_src;
2676         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2677 }
2678
2679 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2680 {
2681         struct fib6_config cfg;
2682         struct in6_rtmsg rtmsg;
2683         int err;
2684
2685         switch (cmd) {
2686         case SIOCADDRT:         /* Add a route */
2687         case SIOCDELRT:         /* Delete a route */
2688                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2689                         return -EPERM;
2690                 err = copy_from_user(&rtmsg, arg,
2691                                      sizeof(struct in6_rtmsg));
2692                 if (err)
2693                         return -EFAULT;
2694
2695                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2696
2697                 rtnl_lock();
2698                 switch (cmd) {
2699                 case SIOCADDRT:
2700                         err = ip6_route_add(&cfg, NULL);
2701                         break;
2702                 case SIOCDELRT:
2703                         err = ip6_route_del(&cfg, NULL);
2704                         break;
2705                 default:
2706                         err = -EINVAL;
2707                 }
2708                 rtnl_unlock();
2709
2710                 return err;
2711         }
2712
2713         return -EINVAL;
2714 }
2715
/*
 *	Drop the packet on the floor
 */

/*
 * Common handler for packets hitting a reject/blackhole route: bump
 * the appropriate SNMP counter, send an ICMPv6 destination-unreachable
 * with @code, and free the skb.  @ipstats_mib_noroutes selects the
 * in/out "no route" counter.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* Unspecified destination counts as an address
			 * error rather than a routing failure.
			 */
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
2742
/* dst input handler for blackhole/unreachable routes (inbound path) */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
2747
/* dst output handler for blackhole/unreachable routes (outbound path) */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;	/* icmpv6_send() needs skb->dev set */
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
2753
/* dst input handler for RTN_PROHIBIT routes (inbound path) */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
2758
/* dst output handler for RTN_PROHIBIT routes (outbound path) */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;	/* icmpv6_send() needs skb->dev set */
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
2764
/*
 *	Allocate a dst for local (unicast / anycast) address.
 *	The route is initialised but not inserted into any FIB table;
 *	that is left to the caller.
 */

struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	/* DST_NOCOUNT: local routes do not count against dst GC limits */
	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);	/* reference kept in rt->rt6i_idev */

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	/* /128 host route: the address is both destination and gateway */
	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	/* Local routes of VRF slaves live in the l3mdev table */
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}
2804
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* limit match to this device; NULL = any */
	struct net *net;	/* namespace being cleaned */
	struct in6_addr *addr;	/* address that is going away */
};
2811
2812 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2813 {
2814         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2815         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2816         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2817
2818         if (((void *)rt->dst.dev == dev || !dev) &&
2819             rt != net->ipv6.ip6_null_entry &&
2820             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2821                 /* remove prefsrc entry */
2822                 rt->rt6i_prefsrc.plen = 0;
2823         }
2824         return 0;
2825 }
2826
/*
 * Called when address @ifp is removed: walk all FIB tables and clear
 * any route preferred-source entry that still points at it.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
2837
/* An RA-learned default router entry */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
/* A cloned (cache) entry that goes through a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* Returning -1 asks fib6_clean_all() to delete this route */
	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}
	return 0;
}
2853
/*
 * Drop default-router and gateway cache entries that still point at
 * @gateway after it stopped acting as a router (see fib6_clean_tohost).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
2858
struct arg_dev_net {
	struct net_device *dev;	/* device going away; NULL matches all */
	struct net *net;	/* its network namespace */
};

/* called with write lock held for table with rt */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;

	/* A -1 return asks fib6_clean_all() to delete the route.
	 * The null entry is never deleted; multipath routes (with
	 * siblings) are kept when the device is not being unregistered
	 * and the interface honours ignore_routes_with_linkdown.
	 */
	if ((rt->dst.dev == dev || !dev) &&
	    rt != adn->net->ipv6.ip6_null_entry &&
	    (rt->rt6i_nsiblings == 0 ||
	     (dev && netdev_unregistering(dev)) ||
	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
		return -1;

	return 0;
}
2879
/*
 * Remove routes using @dev (all devices when @dev is NULL) from every
 * FIB table, then flush the uncached route list for the device.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, &adn);
	if (dev)
		rt6_uncached_list_flush_dev(net, dev);
}
2891
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* its new MTU */
};

/*
 * fib6_clean_all() callback: propagate a device MTU change into routes
 * (and cached PMTU values) that use that device.  Always returns 0 so
 * the walk covers the whole tree.
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
				rt->rt6i_pmtu = arg->mtu;
		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
			   (dst_mtu(&rt->dst) < arg->mtu &&
			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
	}
	return 0;
}
2945
/* Notify the IPv6 FIB that @dev's MTU has changed to @mtu */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
2955
/* Netlink attribute policy for RTM_NEWROUTE/RTM_DELROUTE requests */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
};
2970
/*
 * Translate an RTM_{NEW,DEL}ROUTE netlink request into a fib6_config.
 *
 * Returns 0 on success with @cfg fully initialised (attributes absent
 * from the request are left zeroed), or a negative errno for malformed
 * input.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;	/* default for the goto errout paths below */
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* All reject-style route types map onto RTF_REJECT; fc_type
	 * keeps track of which flavour was requested.
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* The attribute only needs to carry the bytes covered
		 * by the prefix length, not a full 16-byte address.
		 */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the rtm_table value copied above */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* Unknown preference values fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* Infinite timeouts simply leave RTF_EXPIRES unset */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
3094
/* One pending nexthop while building/inserting a multipath route */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route to insert; NULL once consumed */
	struct fib6_config r_cfg;	/* per-nexthop config (kept for rollback) */
	struct mx6_config mxc;		/* converted metrics for this nexthop */
	struct list_head next;		/* link in the local rt6_nh_list */
};
3101
3102 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3103 {
3104         struct rt6_nh *nh;
3105
3106         list_for_each_entry(nh, rt6_nh_list, next) {
3107                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3108                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3109                         nh->r_cfg.fc_ifindex);
3110         }
3111 }
3112
/*
 * Append @rt together with its per-nexthop config @r_cfg to
 * @rt6_nh_list, rejecting duplicate nexthops.
 *
 * Returns 0 on success, -EEXIST for a duplicate, -ENOMEM on allocation
 * failure, or an error from ip6_convert_metrics().  On failure @rt is
 * NOT released; the caller owns it.
 */
static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->rt6_info = rt;
	err = ip6_convert_metrics(&nh->mxc, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
3139
/*
 * Send the single RTM_NEWROUTE notification for a multipath add,
 * replace or append operation.  @rt may be NULL when nothing was
 * inserted, in which case no notification is sent.
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
3160
/*
 * Insert a route carrying an RTA_MULTIPATH attribute: build one
 * rt6_info per nexthop, insert them one at a time, and emit a single
 * notification for the whole route.  If an insertion fails part-way,
 * the nexthops already installed are deleted again so the FIB is left
 * unchanged.  Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* Start from the shared config, then override with the
		 * per-nexthop ifindex/gateway/encap attributes.
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			/* list did not take ownership; free the route */
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* Free list entries and any routes that were never inserted */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
3291
/*
 * Delete each nexthop listed in the request's RTA_MULTIPATH attribute
 * individually.  Returns the last error encountered, or 0 when every
 * deletion succeeded.
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		/* Shared config plus this entry's ifindex/gateway */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		/* Keep going after a failure; remember the last error */
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
3329
3330 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3331                               struct netlink_ext_ack *extack)
3332 {
3333         struct fib6_config cfg;
3334         int err;
3335
3336         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3337         if (err < 0)
3338                 return err;
3339
3340         if (cfg.fc_mp)
3341                 return ip6_route_multipath_del(&cfg, extack);
3342         else {
3343                 cfg.fc_delete_all_nh = 1;
3344                 return ip6_route_del(&cfg, extack);
3345         }
3346 }
3347
3348 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3349                               struct netlink_ext_ack *extack)
3350 {
3351         struct fib6_config cfg;
3352         int err;
3353
3354         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3355         if (err < 0)
3356                 return err;
3357
3358         if (cfg.fc_mp)
3359                 return ip6_route_multipath_add(&cfg, extack);
3360         else
3361                 return ip6_route_add(&cfg, extack);
3362 }
3363
/* Upper bound on the netlink message size rt6_fill_node() may emit for
 * @rt; used to size the notification skb in inet6_rt_notify().  An
 * undersized estimate here shows up as -EMSGSIZE (and a WARN) there.
 */
static size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	int nexthop_len = 0;

	if (rt->rt6i_nsiblings) {
		/* Per-nexthop payload inside RTA_MULTIPATH; the nest
		 * header cost is folded in once per sibling as slack.
		 */
		nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->dst.lwtstate);

		nexthop_len *= rt->rt6i_nsiblings;
	}

	/* Fixed attributes every route dump may carry. */
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
	       + nexthop_len;
}
3393
/* Emit the nexthop attributes (gateway, oif, encap) for @rt and
 * accumulate RTNH_F_* state into *@flags.  @skip_oif suppresses
 * RTA_OIF for multipath encoding, where the ifindex already lives in
 * the rtnexthop header.  Returns 0 or -EMSGSIZE on attribute overflow.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	/* Report link state; a down link is also DEAD when the device
	 * is configured to ignore routes over down links.
	 */
	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
		*flags |= RTNH_F_LINKDOWN;
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
3425
/* Add one multipath nexthop for @rt inside an open RTA_MULTIPATH nest:
 * reserve an rtnexthop header, append the nexthop attributes, then
 * back-patch rtnh_len from the skb tail position.  Returns -EMSGSIZE
 * if the skb runs out of tailroom.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	/* Header only; attributes follow it and are counted below. */
	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = 0;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

	/* skip_oif=true: ifindex already carried in rtnh_ifindex. */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
3452
/* Serialize @rt as one rtnetlink message (type @type) into @skb.
 *
 * @dst/@src: when non-NULL (RTM_GETROUTE replies), report the queried
 *	address as a /128 instead of the route's own prefix.
 * @iif: input interface of the query, 0 for dumps/notifications.
 * @portid/@seq/@flags: netlink header fields for the message.
 *
 * Returns 0 on success or -EMSGSIZE, in which case the partially
 * built message is cancelled.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* Map the reject route's dst.error back onto the RTN_* type the
	 * route was installed with.
	 */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* RTA_DST: the queried address (as /128) when given, else the
	 * route's own prefix (omitted for the ::/0 default).
	 */
	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* Multicast: ip6mr fills the rest of the message itself;
		 * err == 0 means the message is complete, return early.
		 */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* Metrics, with a cached path MTU overriding the stored one. */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
3606
3607 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3608 {
3609         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3610         struct net *net = arg->net;
3611
3612         if (rt == net->ipv6.ip6_null_entry)
3613                 return 0;
3614
3615         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3616                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3617
3618                 /* user wants prefix routes only */
3619                 if (rtm->rtm_flags & RTM_F_PREFIX &&
3620                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3621                         /* success since this is not a prefix route */
3622                         return 1;
3623                 }
3624         }
3625
3626         return rt6_fill_node(net,
3627                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3628                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3629                      NLM_F_MULTI);
3630 }
3631
/* RTM_GETROUTE doit handler: perform a route lookup described by the
 * request attributes and unicast the result back to the caller.
 * RTM_F_FIB_MATCH asks for the matching FIB entry rather than the
 * (possibly cloned/cached) dst the forwarding path would use.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	/* Default error for the attribute-validation bailouts below. */
	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		/* Simulate reception on @iif: resolve the device under
		 * RCU and use the input-path lookup.
		 */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		if (!fibmatch)
			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
		else
			dst = ip6_route_lookup(net, &fl6, 0);

		rcu_read_unlock();
	} else {
		/* Output-path lookup (optionally constrained to @oif). */
		fl6.flowi6_oif = oif;

		if (!fibmatch)
			dst = ip6_route_output(net, NULL, &fl6);
		else
			dst = ip6_route_lookup(net, &fl6, 0);
	}


	/* Lookups always return a dst; failures are encoded in
	 * dst.error or as the null entry.  Drop our reference on every
	 * error path.
	 */
	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* The route reference is now owned by the reply skb's dst. */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
3757
/* Broadcast an RTM_NEWROUTE/RTM_DELROUTE notification for @rt to the
 * RTNLGRP_IPV6_ROUTE multicast group.  Best-effort: on failure the
 * error is only recorded on the group via rtnl_set_sk_err().
 */
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	/* gfp_any(): callers may be in softirq context. */
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
				event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
3788
/* Netdevice notifier: bind the per-netns template routes (null entry
 * and, with multiple tables, prohibit/blackhole) to the loopback
 * device when it registers, and release those references when it
 * unregisters.  Only the loopback device is of interest here.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
3821
3822 /*
3823  *      /proc
3824  */
3825
3826 #ifdef CONFIG_PROC_FS
3827
/* /proc/net/ipv6_route: seq_file dump of the IPv6 routing table. */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
3835
3836 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3837 {
3838         struct net *net = (struct net *)seq->private;
3839         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3840                    net->ipv6.rt6_stats->fib_nodes,
3841                    net->ipv6.rt6_stats->fib_route_nodes,
3842                    net->ipv6.rt6_stats->fib_rt_alloc,
3843                    net->ipv6.rt6_stats->fib_rt_entries,
3844                    net->ipv6.rt6_stats->fib_rt_cache,
3845                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3846                    net->ipv6.rt6_stats->fib_discarded_routes);
3847
3848         return 0;
3849 }
3850
/* open() for /proc/net/rt6_stats: single-shot, netns-aware seq file. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
3855
/* File operations backing /proc/net/rt6_stats. */
static const struct file_operations rt6_stats_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
3863 #endif  /* CONFIG_PROC_FS */
3864
3865 #ifdef CONFIG_SYSCTL
3866
3867 static
3868 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3869                               void __user *buffer, size_t *lenp, loff_t *ppos)
3870 {
3871         struct net *net;
3872         int delay;
3873         if (!write)
3874                 return -EINVAL;
3875
3876         net = (struct net *)ctl->extra1;
3877         delay = net->ipv6.sysctl.flush_delay;
3878         proc_dointvec(ctl, write, buffer, lenp, ppos);
3879         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3880         return 0;
3881 }
3882
/* Template for the per-netns net.ipv6.route.* sysctl table.  It is
 * kmemdup'ed and each entry's .data re-pointed at per-namespace
 * storage in ipv6_route_sysctl_init() — the index of every entry here
 * must stay in sync with the table[N].data assignments there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
3956
/* Clone the route sysctl template for @net and point each entry at
 * the namespace's own storage.  The indices below must match the
 * entry order of ipv6_route_table_template.  Returns NULL on
 * allocation failure; the caller owns (and later kfree()s) the table.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* flush handler needs the netns */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
3985 #endif
3986
/* Per-netns setup: clone the dst ops and the template routes (null
 * entry, plus prohibit/blackhole with multiple tables) and seed the
 * routing sysctl defaults.  Unwinds via the goto chain on failure.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	/* Each template route is its own path endpoint. */
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default values for the net.ipv6.route.* sysctls. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
4059
/* Per-netns teardown: free the template routes allocated by
 * ip6_route_net_init() and drop the dst entry accounting.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
4069
/* Late per-netns setup: create the /proc/net route and stats files.
 * NOTE(review): proc_create() failures are ignored here — presumably
 * acceptable since the proc files are purely informational.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}
4078
/* Late per-netns teardown: remove the /proc/net files created above. */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
4086
/* Main per-netns lifecycle hooks for the IPv6 routing core. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
4091
4092 static int __net_init ipv6_inetpeer_init(struct net *net)
4093 {
4094         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4095
4096         if (!bp)
4097                 return -ENOMEM;
4098         inet_peer_base_init(bp);
4099         net->ipv6.peers = bp;
4100         return 0;
4101 }
4102
/* Per-netns teardown of the inetpeer base.  The pointer is cleared
 * before the tree is invalidated and freed.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
4111
/* Per-netns hooks for the IPv6 inetpeer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

/* Per-netns hooks for the /proc files (registered after fib/rules). */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

/* Runs after addrconf's notifier so device state is settled first. */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
4126
/* Bind init_net's template routes to the loopback device.  Normally
 * ip6_route_dev_notify() does this on NETDEV_REGISTER, but loopback
 * registers before that notifier exists, so init_net is done by hand.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
4141
/* Boot-time initialization of the IPv6 routing subsystem: slab cache,
 * pernet subsystems, fib6/xfrm6/rules, rtnetlink handlers, device
 * notifier and the per-CPU uncached-route lists.  On failure each
 * already-initialized stage is unwound in reverse via the goto chain.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the same slab as regular routes. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* RTM_GETROUTE runs without the rtnl lock. */
	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
4223
/* Tear down the IPv6 routing subsystem.
 *
 * Releases everything acquired by ip6_route_init() in strictly reverse
 * order: the netdevice notifier first (so no more events come in), then
 * the late pernet ops, policy rules, xfrm hooks, FIB garbage collector,
 * the pernet ops, dst accounting, and finally the rt6_info slab cache.
 * Do not reorder these calls.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}