Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/*
 * Result of the neighbour reachability check used while scoring a route.
 * Every negative value is a failure; find_match() handles each one
 * differently.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,	/* route is skipped by find_match() */
	RT6_NUD_FAIL_PROBE,	/* -2: neighbour entry is in NUD_FAILED */
	RT6_NUD_FAIL_DO_RR,	/* -1: no neighbour entry, request round-robin */
	RT6_NUD_SUCCEED = 1
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118                                            const struct in6_addr *prefix, int prefixlen,
119                                            const struct in6_addr *gwaddr,
120                                            struct net_device *dev,
121                                            unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123                                            const struct in6_addr *prefix, int prefixlen,
124                                            const struct in6_addr *gwaddr,
125                                            struct net_device *dev);
126 #endif
127
128 struct uncached_list {
129         spinlock_t              lock;
130         struct list_head        head;
131 };
132
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148         if (!list_empty(&rt->rt6i_uncached)) {
149                 struct uncached_list *ul = rt->rt6i_uncached_list;
150                 struct net *net = dev_net(rt->dst.dev);
151
152                 spin_lock_bh(&ul->lock);
153                 list_del(&rt->rt6i_uncached);
154                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155                 spin_unlock_bh(&ul->lock);
156         }
157 }
158
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161         struct net_device *loopback_dev = net->loopback_dev;
162         int cpu;
163
164         if (dev == loopback_dev)
165                 return;
166
167         for_each_possible_cpu(cpu) {
168                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169                 struct rt6_info *rt;
170
171                 spin_lock_bh(&ul->lock);
172                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173                         struct inet6_dev *rt_idev = rt->rt6i_idev;
174                         struct net_device *rt_dev = rt->dst.dev;
175
176                         if (rt_idev->dev == dev) {
177                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
178                                 in6_dev_put(rt_idev);
179                         }
180
181                         if (rt_dev == dev) {
182                                 rt->dst.dev = loopback_dev;
183                                 dev_hold(rt->dst.dev);
184                                 dev_put(rt_dev);
185                         }
186                 }
187                 spin_unlock_bh(&ul->lock);
188         }
189 }
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213         return neigh_create(&nd_tbl, daddr, dev);
214 }
215
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
217                                               struct sk_buff *skb,
218                                               const void *daddr)
219 {
220         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
221
222         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
223 }
224
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227         struct net_device *dev = dst->dev;
228         struct rt6_info *rt = (struct rt6_info *)dst;
229
230         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231         if (!daddr)
232                 return;
233         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234                 return;
235         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236                 return;
237         __ipv6_confirm_neigh(dev, daddr);
238 }
239
240 static struct dst_ops ip6_dst_ops_template = {
241         .family                 =       AF_INET6,
242         .gc                     =       ip6_dst_gc,
243         .gc_thresh              =       1024,
244         .check                  =       ip6_dst_check,
245         .default_advmss         =       ip6_default_advmss,
246         .mtu                    =       ip6_mtu,
247         .cow_metrics            =       dst_cow_metrics_generic,
248         .destroy                =       ip6_dst_destroy,
249         .ifdown                 =       ip6_dst_ifdown,
250         .negative_advice        =       ip6_negative_advice,
251         .link_failure           =       ip6_link_failure,
252         .update_pmtu            =       ip6_rt_update_pmtu,
253         .redirect               =       rt6_do_redirect,
254         .local_out              =       __ip6_local_out,
255         .neigh_lookup           =       ip6_dst_neigh_lookup,
256         .confirm_neigh          =       ip6_confirm_neigh,
257 };
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267                                          struct sk_buff *skb, u32 mtu)
268 {
269 }
270
/* Blackhole dsts ignore redirects: intentionally empty. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
	/* nothing to do */
}
275
276 static struct dst_ops ip6_dst_blackhole_ops = {
277         .family                 =       AF_INET6,
278         .destroy                =       ip6_dst_destroy,
279         .check                  =       ip6_dst_check,
280         .mtu                    =       ip6_blackhole_mtu,
281         .default_advmss         =       ip6_default_advmss,
282         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
283         .redirect               =       ip6_rt_blackhole_redirect,
284         .cow_metrics            =       dst_cow_metrics_generic,
285         .neigh_lookup           =       ip6_dst_neigh_lookup,
286 };
287
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289         [RTAX_HOPLIMIT - 1] = 0,
290 };
291
292 static const struct fib6_info fib6_null_entry_template = {
293         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
294         .fib6_protocol  = RTPROT_KERNEL,
295         .fib6_metric    = ~(u32)0,
296         .fib6_ref       = ATOMIC_INIT(1),
297         .fib6_type      = RTN_UNREACHABLE,
298         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
299 };
300
301 static const struct rt6_info ip6_null_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -ENETUNREACH,
307                 .input          = ip6_pkt_discard,
308                 .output         = ip6_pkt_discard_out,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311 };
312
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 * Template for "prohibit" routes: packets go to the ip6_pkt_prohibit{,_out}
 * handlers and the dst carries -EACCES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
		.error		= -EACCES,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
	},
};

/*
 * Template for "blackhole" routes: packets go to the generic
 * dst_discard{,_out} handlers and the dst carries -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.input		= dst_discard,
		.output		= dst_discard_out,
		.error		= -EINVAL,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
	},
};

#endif
340
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343         struct dst_entry *dst = &rt->dst;
344
345         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346         INIT_LIST_HEAD(&rt->rt6i_uncached);
347 }
348
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
351                                int flags)
352 {
353         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354                                         1, DST_OBSOLETE_FORCE_CHK, flags);
355
356         if (rt) {
357                 rt6_info_init(rt);
358                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
359         }
360
361         return rt;
362 }
363 EXPORT_SYMBOL(ip6_dst_alloc);
364
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367         struct rt6_info *rt = (struct rt6_info *)dst;
368         struct fib6_info *from;
369         struct inet6_dev *idev;
370
371         ip_dst_metrics_put(dst);
372         rt6_uncached_list_del(rt);
373
374         idev = rt->rt6i_idev;
375         if (idev) {
376                 rt->rt6i_idev = NULL;
377                 in6_dev_put(idev);
378         }
379
380         rcu_read_lock();
381         from = rcu_dereference(rt->from);
382         rcu_assign_pointer(rt->from, NULL);
383         fib6_info_release(from);
384         rcu_read_unlock();
385 }
386
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388                            int how)
389 {
390         struct rt6_info *rt = (struct rt6_info *)dst;
391         struct inet6_dev *idev = rt->rt6i_idev;
392         struct net_device *loopback_dev =
393                 dev_net(dev)->loopback_dev;
394
395         if (idev && idev->dev != loopback_dev) {
396                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397                 if (loopback_idev) {
398                         rt->rt6i_idev = loopback_idev;
399                         in6_dev_put(idev);
400                 }
401         }
402 }
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414         struct fib6_info *from;
415
416         from = rcu_dereference(rt->from);
417
418         if (rt->rt6i_flags & RTF_EXPIRES) {
419                 if (time_after(jiffies, rt->dst.expires))
420                         return true;
421         } else if (from) {
422                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423                         fib6_check_expired(from);
424         }
425         return false;
426 }
427
428 struct fib6_info *fib6_multipath_select(const struct net *net,
429                                         struct fib6_info *match,
430                                         struct flowi6 *fl6, int oif,
431                                         const struct sk_buff *skb,
432                                         int strict)
433 {
434         struct fib6_info *sibling, *next_sibling;
435
436         /* We might have already computed the hash for ICMPv6 errors. In such
437          * case it will always be non-zero. Otherwise now is the time to do it.
438          */
439         if (!fl6->mp_hash)
440                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
441
442         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
443                 return match;
444
445         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
446                                  fib6_siblings) {
447                 int nh_upper_bound;
448
449                 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
450                 if (fl6->mp_hash > nh_upper_bound)
451                         continue;
452                 if (rt6_score_route(sibling, oif, strict) < 0)
453                         break;
454                 match = sibling;
455                 break;
456         }
457
458         return match;
459 }
460
461 /*
462  *      Route lookup. rcu_read_lock() should be held.
463  */
464
465 static inline struct fib6_info *rt6_device_match(struct net *net,
466                                                  struct fib6_info *rt,
467                                                     const struct in6_addr *saddr,
468                                                     int oif,
469                                                     int flags)
470 {
471         struct fib6_info *sprt;
472
473         if (!oif && ipv6_addr_any(saddr) &&
474             !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
475                 return rt;
476
477         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
478                 const struct net_device *dev = sprt->fib6_nh.nh_dev;
479
480                 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
481                         continue;
482
483                 if (oif) {
484                         if (dev->ifindex == oif)
485                                 return sprt;
486                 } else {
487                         if (ipv6_chk_addr(net, saddr, dev,
488                                           flags & RT6_LOOKUP_F_IFACE))
489                                 return sprt;
490                 }
491         }
492
493         if (oif && flags & RT6_LOOKUP_F_IFACE)
494                 return net->ipv6.fib6_null_entry;
495
496         return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
497 }
498
499 #ifdef CONFIG_IPV6_ROUTER_PREF
500 struct __rt6_probe_work {
501         struct work_struct work;
502         struct in6_addr target;
503         struct net_device *dev;
504 };
505
506 static void rt6_probe_deferred(struct work_struct *w)
507 {
508         struct in6_addr mcaddr;
509         struct __rt6_probe_work *work =
510                 container_of(w, struct __rt6_probe_work, work);
511
512         addrconf_addr_solict_mult(&work->target, &mcaddr);
513         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
514         dev_put(work->dev);
515         kfree(work);
516 }
517
518 static void rt6_probe(struct fib6_info *rt)
519 {
520         struct __rt6_probe_work *work = NULL;
521         const struct in6_addr *nh_gw;
522         struct neighbour *neigh;
523         struct net_device *dev;
524         struct inet6_dev *idev;
525
526         /*
527          * Okay, this does not seem to be appropriate
528          * for now, however, we need to check if it
529          * is really so; aka Router Reachability Probing.
530          *
531          * Router Reachability Probe MUST be rate-limited
532          * to no more than one per minute.
533          */
534         if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
535                 return;
536
537         nh_gw = &rt->fib6_nh.nh_gw;
538         dev = rt->fib6_nh.nh_dev;
539         rcu_read_lock_bh();
540         idev = __in6_dev_get(dev);
541         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
542         if (neigh) {
543                 if (neigh->nud_state & NUD_VALID)
544                         goto out;
545
546                 write_lock(&neigh->lock);
547                 if (!(neigh->nud_state & NUD_VALID) &&
548                     time_after(jiffies,
549                                neigh->updated + idev->cnf.rtr_probe_interval)) {
550                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
551                         if (work)
552                                 __neigh_set_probe_once(neigh);
553                 }
554                 write_unlock(&neigh->lock);
555         } else if (time_after(jiffies, rt->last_probe +
556                                        idev->cnf.rtr_probe_interval)) {
557                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
558         }
559
560         if (work) {
561                 rt->last_probe = jiffies;
562                 INIT_WORK(&work->work, rt6_probe_deferred);
563                 work->target = *nh_gw;
564                 dev_hold(dev);
565                 work->dev = dev;
566                 schedule_work(&work->work);
567         }
568
569 out:
570         rcu_read_unlock_bh();
571 }
572 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_info *rt)
{
	/* nothing to do */
}
576 #endif
577
578 /*
579  * Default Router Selection (RFC 2461 6.3.6)
580  */
581 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
582 {
583         const struct net_device *dev = rt->fib6_nh.nh_dev;
584
585         if (!oif || dev->ifindex == oif)
586                 return 2;
587         return 0;
588 }
589
590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
591 {
592         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
593         struct neighbour *neigh;
594
595         if (rt->fib6_flags & RTF_NONEXTHOP ||
596             !(rt->fib6_flags & RTF_GATEWAY))
597                 return RT6_NUD_SUCCEED;
598
599         rcu_read_lock_bh();
600         neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
601                                           &rt->fib6_nh.nh_gw);
602         if (neigh) {
603                 read_lock(&neigh->lock);
604                 if (neigh->nud_state & NUD_VALID)
605                         ret = RT6_NUD_SUCCEED;
606 #ifdef CONFIG_IPV6_ROUTER_PREF
607                 else if (!(neigh->nud_state & NUD_FAILED))
608                         ret = RT6_NUD_SUCCEED;
609                 else
610                         ret = RT6_NUD_FAIL_PROBE;
611 #endif
612                 read_unlock(&neigh->lock);
613         } else {
614                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
615                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
616         }
617         rcu_read_unlock_bh();
618
619         return ret;
620 }
621
622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
623 {
624         int m;
625
626         m = rt6_check_dev(rt, oif);
627         if (!m && (strict & RT6_LOOKUP_F_IFACE))
628                 return RT6_NUD_FAIL_HARD;
629 #ifdef CONFIG_IPV6_ROUTER_PREF
630         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
631 #endif
632         if (strict & RT6_LOOKUP_F_REACHABLE) {
633                 int n = rt6_check_neigh(rt);
634                 if (n < 0)
635                         return n;
636         }
637         return m;
638 }
639
640 /* called with rc_read_lock held */
641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
642 {
643         const struct net_device *dev = fib6_info_nh_dev(f6i);
644         bool rc = false;
645
646         if (dev) {
647                 const struct inet6_dev *idev = __in6_dev_get(dev);
648
649                 rc = !!idev->cnf.ignore_routes_with_linkdown;
650         }
651
652         return rc;
653 }
654
655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
656                                    int *mpri, struct fib6_info *match,
657                                    bool *do_rr)
658 {
659         int m;
660         bool match_do_rr = false;
661
662         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
663                 goto out;
664
665         if (fib6_ignore_linkdown(rt) &&
666             rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
667             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
668                 goto out;
669
670         if (fib6_check_expired(rt))
671                 goto out;
672
673         m = rt6_score_route(rt, oif, strict);
674         if (m == RT6_NUD_FAIL_DO_RR) {
675                 match_do_rr = true;
676                 m = 0; /* lowest valid score */
677         } else if (m == RT6_NUD_FAIL_HARD) {
678                 goto out;
679         }
680
681         if (strict & RT6_LOOKUP_F_REACHABLE)
682                 rt6_probe(rt);
683
684         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
685         if (m > *mpri) {
686                 *do_rr = match_do_rr;
687                 *mpri = m;
688                 match = rt;
689         }
690 out:
691         return match;
692 }
693
694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
695                                      struct fib6_info *leaf,
696                                      struct fib6_info *rr_head,
697                                      u32 metric, int oif, int strict,
698                                      bool *do_rr)
699 {
700         struct fib6_info *rt, *match, *cont;
701         int mpri = -1;
702
703         match = NULL;
704         cont = NULL;
705         for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
706                 if (rt->fib6_metric != metric) {
707                         cont = rt;
708                         break;
709                 }
710
711                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
712         }
713
714         for (rt = leaf; rt && rt != rr_head;
715              rt = rcu_dereference(rt->fib6_next)) {
716                 if (rt->fib6_metric != metric) {
717                         cont = rt;
718                         break;
719                 }
720
721                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
722         }
723
724         if (match || !cont)
725                 return match;
726
727         for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
728                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
729
730         return match;
731 }
732
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
734                                    int oif, int strict)
735 {
736         struct fib6_info *leaf = rcu_dereference(fn->leaf);
737         struct fib6_info *match, *rt0;
738         bool do_rr = false;
739         int key_plen;
740
741         if (!leaf || leaf == net->ipv6.fib6_null_entry)
742                 return net->ipv6.fib6_null_entry;
743
744         rt0 = rcu_dereference(fn->rr_ptr);
745         if (!rt0)
746                 rt0 = leaf;
747
748         /* Double check to make sure fn is not an intermediate node
749          * and fn->leaf does not points to its child's leaf
750          * (This might happen if all routes under fn are deleted from
751          * the tree and fib6_repair_tree() is called on the node.)
752          */
753         key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755         if (rt0->fib6_src.plen)
756                 key_plen = rt0->fib6_src.plen;
757 #endif
758         if (fn->fn_bit != key_plen)
759                 return net->ipv6.fib6_null_entry;
760
761         match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
762                              &do_rr);
763
764         if (do_rr) {
765                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
766
767                 /* no entries matched; do round-robin */
768                 if (!next || next->fib6_metric != rt0->fib6_metric)
769                         next = leaf;
770
771                 if (next != rt0) {
772                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
773                         /* make sure next is not being deleted from the tree */
774                         if (next->fib6_node)
775                                 rcu_assign_pointer(fn->rr_ptr, next);
776                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
777                 }
778         }
779
780         return match ? match : net->ipv6.fib6_null_entry;
781 }
782
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
786 }
787
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790                   const struct in6_addr *gwaddr)
791 {
792         struct net *net = dev_net(dev);
793         struct route_info *rinfo = (struct route_info *) opt;
794         struct in6_addr prefix_buf, *prefix;
795         unsigned int pref;
796         unsigned long lifetime;
797         struct fib6_info *rt;
798
799         if (len < sizeof(struct route_info)) {
800                 return -EINVAL;
801         }
802
803         /* Sanity check for prefix_len and length */
804         if (rinfo->length > 3) {
805                 return -EINVAL;
806         } else if (rinfo->prefix_len > 128) {
807                 return -EINVAL;
808         } else if (rinfo->prefix_len > 64) {
809                 if (rinfo->length < 2) {
810                         return -EINVAL;
811                 }
812         } else if (rinfo->prefix_len > 0) {
813                 if (rinfo->length < 1) {
814                         return -EINVAL;
815                 }
816         }
817
818         pref = rinfo->route_pref;
819         if (pref == ICMPV6_ROUTER_PREF_INVALID)
820                 return -EINVAL;
821
822         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
823
824         if (rinfo->length == 3)
825                 prefix = (struct in6_addr *)rinfo->prefix;
826         else {
827                 /* this function is safe */
828                 ipv6_addr_prefix(&prefix_buf,
829                                  (struct in6_addr *)rinfo->prefix,
830                                  rinfo->prefix_len);
831                 prefix = &prefix_buf;
832         }
833
834         if (rinfo->prefix_len == 0)
835                 rt = rt6_get_dflt_router(net, gwaddr, dev);
836         else
837                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
838                                         gwaddr, dev);
839
840         if (rt && !lifetime) {
841                 ip6_del_rt(net, rt);
842                 rt = NULL;
843         }
844
845         if (!rt && lifetime)
846                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
847                                         dev, pref);
848         else if (rt)
849                 rt->fib6_flags = RTF_ROUTEINFO |
850                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
851
852         if (rt) {
853                 if (!addrconf_finite_timeout(lifetime))
854                         fib6_clean_expires(rt);
855                 else
856                         fib6_set_expires(rt, jiffies + HZ * lifetime);
857
858                 fib6_info_release(rt);
859         }
860         return 0;
861 }
862 #endif
863
864 /*
865  *      Misc support functions
866  */
867
868 /* called with rcu_lock held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871         struct net_device *dev = rt->fib6_nh.nh_dev;
872
873         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874                 /* for copies of local routes, dst->dev needs to be the
875                  * device if it is a master device, the master device if
876                  * device is enslaved, and the loopback as the default
877                  */
878                 if (netif_is_l3_slave(dev) &&
879                     !rt6_need_strict(&rt->fib6_dst.addr))
880                         dev = l3mdev_master_dev_rcu(dev);
881                 else if (!netif_is_l3_master(dev))
882                         dev = dev_net(dev)->loopback_dev;
883                 /* last case is netif_is_l3_master(dev) is true in which
884                  * case we want dev returned to be dev
885                  */
886         }
887
888         return dev;
889 }
890
891 static const int fib6_prop[RTN_MAX + 1] = {
892         [RTN_UNSPEC]    = 0,
893         [RTN_UNICAST]   = 0,
894         [RTN_LOCAL]     = 0,
895         [RTN_BROADCAST] = 0,
896         [RTN_ANYCAST]   = 0,
897         [RTN_MULTICAST] = 0,
898         [RTN_BLACKHOLE] = -EINVAL,
899         [RTN_UNREACHABLE] = -EHOSTUNREACH,
900         [RTN_PROHIBIT]  = -EACCES,
901         [RTN_THROW]     = -EAGAIN,
902         [RTN_NAT]       = -EINVAL,
903         [RTN_XRESOLVE]  = -EINVAL,
904 };
905
906 static int ip6_rt_type_to_error(u8 fib6_type)
907 {
908         return fib6_prop[fib6_type];
909 }
910
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913         unsigned short flags = 0;
914
915         if (rt->dst_nocount)
916                 flags |= DST_NOCOUNT;
917         if (rt->dst_nopolicy)
918                 flags |= DST_NOPOLICY;
919         if (rt->dst_host)
920                 flags |= DST_HOST;
921
922         return flags;
923 }
924
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926 {
927         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928
929         switch (ort->fib6_type) {
930         case RTN_BLACKHOLE:
931                 rt->dst.output = dst_discard_out;
932                 rt->dst.input = dst_discard;
933                 break;
934         case RTN_PROHIBIT:
935                 rt->dst.output = ip6_pkt_prohibit_out;
936                 rt->dst.input = ip6_pkt_prohibit;
937                 break;
938         case RTN_THROW:
939         case RTN_UNREACHABLE:
940         default:
941                 rt->dst.output = ip6_pkt_discard_out;
942                 rt->dst.input = ip6_pkt_discard;
943                 break;
944         }
945 }
946
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948 {
949         if (ort->fib6_flags & RTF_REJECT) {
950                 ip6_rt_init_dst_reject(rt, ort);
951                 return;
952         }
953
954         rt->dst.error = 0;
955         rt->dst.output = ip6_output;
956
957         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
958                 rt->dst.input = ip6_input;
959         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
960                 rt->dst.input = ip6_mc_input;
961         } else {
962                 rt->dst.input = ip6_forward;
963         }
964
965         if (ort->fib6_nh.nh_lwtstate) {
966                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
967                 lwtunnel_set_redirect(&rt->dst);
968         }
969
970         rt->dst.lastuse = jiffies;
971 }
972
973 /* Caller must already hold reference to @from */
974 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
975 {
976         rt->rt6i_flags &= ~RTF_EXPIRES;
977         rcu_assign_pointer(rt->from, from);
978         ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
979 }
980
981 /* Caller must already hold reference to @ort */
982 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
983 {
984         struct net_device *dev = fib6_info_nh_dev(ort);
985
986         ip6_rt_init_dst(rt, ort);
987
988         rt->rt6i_dst = ort->fib6_dst;
989         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
990         rt->rt6i_gateway = ort->fib6_nh.nh_gw;
991         rt->rt6i_flags = ort->fib6_flags;
992         rt6_set_from(rt, ort);
993 #ifdef CONFIG_IPV6_SUBTREES
994         rt->rt6i_src = ort->fib6_src;
995 #endif
996 }
997
998 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
999                                         struct in6_addr *saddr)
1000 {
1001         struct fib6_node *pn, *sn;
1002         while (1) {
1003                 if (fn->fn_flags & RTN_TL_ROOT)
1004                         return NULL;
1005                 pn = rcu_dereference(fn->parent);
1006                 sn = FIB6_SUBTREE(pn);
1007                 if (sn && sn != fn)
1008                         fn = fib6_node_lookup(sn, NULL, saddr);
1009                 else
1010                         fn = pn;
1011                 if (fn->fn_flags & RTN_RTINFO)
1012                         return fn;
1013         }
1014 }
1015
1016 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1017                           bool null_fallback)
1018 {
1019         struct rt6_info *rt = *prt;
1020
1021         if (dst_hold_safe(&rt->dst))
1022                 return true;
1023         if (null_fallback) {
1024                 rt = net->ipv6.ip6_null_entry;
1025                 dst_hold(&rt->dst);
1026         } else {
1027                 rt = NULL;
1028         }
1029         *prt = rt;
1030         return false;
1031 }
1032
1033 /* called with rcu_lock held */
1034 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1035 {
1036         unsigned short flags = fib6_info_dst_flags(rt);
1037         struct net_device *dev = rt->fib6_nh.nh_dev;
1038         struct rt6_info *nrt;
1039
1040         if (!fib6_info_hold_safe(rt))
1041                 return NULL;
1042
1043         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1044         if (nrt)
1045                 ip6_rt_copy_init(nrt, rt);
1046         else
1047                 fib6_info_release(rt);
1048
1049         return nrt;
1050 }
1051
1052 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1053                                              struct fib6_table *table,
1054                                              struct flowi6 *fl6,
1055                                              const struct sk_buff *skb,
1056                                              int flags)
1057 {
1058         struct fib6_info *f6i;
1059         struct fib6_node *fn;
1060         struct rt6_info *rt;
1061
1062         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1063                 flags &= ~RT6_LOOKUP_F_IFACE;
1064
1065         rcu_read_lock();
1066         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1067 restart:
1068         f6i = rcu_dereference(fn->leaf);
1069         if (!f6i) {
1070                 f6i = net->ipv6.fib6_null_entry;
1071         } else {
1072                 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1073                                       fl6->flowi6_oif, flags);
1074                 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1075                         f6i = fib6_multipath_select(net, f6i, fl6,
1076                                                     fl6->flowi6_oif, skb,
1077                                                     flags);
1078         }
1079         if (f6i == net->ipv6.fib6_null_entry) {
1080                 fn = fib6_backtrack(fn, &fl6->saddr);
1081                 if (fn)
1082                         goto restart;
1083         }
1084
1085         trace_fib6_table_lookup(net, f6i, table, fl6);
1086
1087         /* Search through exception table */
1088         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1089         if (rt) {
1090                 if (ip6_hold_safe(net, &rt, true))
1091                         dst_use_noref(&rt->dst, jiffies);
1092         } else if (f6i == net->ipv6.fib6_null_entry) {
1093                 rt = net->ipv6.ip6_null_entry;
1094                 dst_hold(&rt->dst);
1095         } else {
1096                 rt = ip6_create_rt_rcu(f6i);
1097                 if (!rt) {
1098                         rt = net->ipv6.ip6_null_entry;
1099                         dst_hold(&rt->dst);
1100                 }
1101         }
1102
1103         rcu_read_unlock();
1104
1105         return rt;
1106 }
1107
1108 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1109                                    const struct sk_buff *skb, int flags)
1110 {
1111         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1112 }
1113 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1114
1115 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1116                             const struct in6_addr *saddr, int oif,
1117                             const struct sk_buff *skb, int strict)
1118 {
1119         struct flowi6 fl6 = {
1120                 .flowi6_oif = oif,
1121                 .daddr = *daddr,
1122         };
1123         struct dst_entry *dst;
1124         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1125
1126         if (saddr) {
1127                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1128                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1129         }
1130
1131         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1132         if (dst->error == 0)
1133                 return (struct rt6_info *) dst;
1134
1135         dst_release(dst);
1136
1137         return NULL;
1138 }
1139 EXPORT_SYMBOL(rt6_lookup);
1140
1141 /* ip6_ins_rt is called with FREE table->tb6_lock.
1142  * It takes new route entry, the addition fails by any reason the
1143  * route is released.
1144  * Caller must hold dst before calling it.
1145  */
1146
1147 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1148                         struct netlink_ext_ack *extack)
1149 {
1150         int err;
1151         struct fib6_table *table;
1152
1153         table = rt->fib6_table;
1154         spin_lock_bh(&table->tb6_lock);
1155         err = fib6_add(&table->tb6_root, rt, info, extack);
1156         spin_unlock_bh(&table->tb6_lock);
1157
1158         return err;
1159 }
1160
1161 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1162 {
1163         struct nl_info info = { .nl_net = net, };
1164
1165         return __ip6_ins_rt(rt, &info, NULL);
1166 }
1167
1168 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1169                                            const struct in6_addr *daddr,
1170                                            const struct in6_addr *saddr)
1171 {
1172         struct net_device *dev;
1173         struct rt6_info *rt;
1174
1175         /*
1176          *      Clone the route.
1177          */
1178
1179         if (!fib6_info_hold_safe(ort))
1180                 return NULL;
1181
1182         dev = ip6_rt_get_dev_rcu(ort);
1183         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1184         if (!rt) {
1185                 fib6_info_release(ort);
1186                 return NULL;
1187         }
1188
1189         ip6_rt_copy_init(rt, ort);
1190         rt->rt6i_flags |= RTF_CACHE;
1191         rt->dst.flags |= DST_HOST;
1192         rt->rt6i_dst.addr = *daddr;
1193         rt->rt6i_dst.plen = 128;
1194
1195         if (!rt6_is_gw_or_nonexthop(ort)) {
1196                 if (ort->fib6_dst.plen != 128 &&
1197                     ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1198                         rt->rt6i_flags |= RTF_ANYCAST;
1199 #ifdef CONFIG_IPV6_SUBTREES
1200                 if (rt->rt6i_src.plen && saddr) {
1201                         rt->rt6i_src.addr = *saddr;
1202                         rt->rt6i_src.plen = 128;
1203                 }
1204 #endif
1205         }
1206
1207         return rt;
1208 }
1209
1210 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1211 {
1212         unsigned short flags = fib6_info_dst_flags(rt);
1213         struct net_device *dev;
1214         struct rt6_info *pcpu_rt;
1215
1216         if (!fib6_info_hold_safe(rt))
1217                 return NULL;
1218
1219         rcu_read_lock();
1220         dev = ip6_rt_get_dev_rcu(rt);
1221         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1222         rcu_read_unlock();
1223         if (!pcpu_rt) {
1224                 fib6_info_release(rt);
1225                 return NULL;
1226         }
1227         ip6_rt_copy_init(pcpu_rt, rt);
1228         pcpu_rt->rt6i_flags |= RTF_PCPU;
1229         return pcpu_rt;
1230 }
1231
1232 /* It should be called with rcu_read_lock() acquired */
1233 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1234 {
1235         struct rt6_info *pcpu_rt, **p;
1236
1237         p = this_cpu_ptr(rt->rt6i_pcpu);
1238         pcpu_rt = *p;
1239
1240         if (pcpu_rt)
1241                 ip6_hold_safe(NULL, &pcpu_rt, false);
1242
1243         return pcpu_rt;
1244 }
1245
1246 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1247                                             struct fib6_info *rt)
1248 {
1249         struct rt6_info *pcpu_rt, *prev, **p;
1250
1251         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1252         if (!pcpu_rt) {
1253                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1254                 return net->ipv6.ip6_null_entry;
1255         }
1256
1257         dst_hold(&pcpu_rt->dst);
1258         p = this_cpu_ptr(rt->rt6i_pcpu);
1259         prev = cmpxchg(p, NULL, pcpu_rt);
1260         BUG_ON(prev);
1261
1262         return pcpu_rt;
1263 }
1264
1265 /* Exception hash table implementation.
 *
 * rt6_exception_lock is a single file-wide spinlock.  The helpers below
 * that add to, remove from or walk the per-route exception buckets for
 * modification either take it themselves or document that the caller
 * must hold it.
 */
1267 static DEFINE_SPINLOCK(rt6_exception_lock);
1268
1269 /* Remove rt6_ex from hash table and free the memory
1270  * Caller must hold rt6_exception_lock
1271  */
1272 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1273                                  struct rt6_exception *rt6_ex)
1274 {
1275         struct net *net;
1276
1277         if (!bucket || !rt6_ex)
1278                 return;
1279
1280         net = dev_net(rt6_ex->rt6i->dst.dev);
1281         hlist_del_rcu(&rt6_ex->hlist);
1282         dst_release(&rt6_ex->rt6i->dst);
1283         kfree_rcu(rt6_ex, rcu);
1284         WARN_ON_ONCE(!bucket->depth);
1285         bucket->depth--;
1286         net->ipv6.rt6_stats->fib_rt_cache--;
1287 }
1288
1289 /* Remove oldest rt6_ex in bucket and free the memory
1290  * Caller must hold rt6_exception_lock
1291  */
1292 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1293 {
1294         struct rt6_exception *rt6_ex, *oldest = NULL;
1295
1296         if (!bucket)
1297                 return;
1298
1299         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1300                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1301                         oldest = rt6_ex;
1302         }
1303         rt6_remove_exception(bucket, oldest);
1304 }
1305
1306 static u32 rt6_exception_hash(const struct in6_addr *dst,
1307                               const struct in6_addr *src)
1308 {
1309         static u32 seed __read_mostly;
1310         u32 val;
1311
1312         net_get_random_once(&seed, sizeof(seed));
1313         val = jhash(dst, sizeof(*dst), seed);
1314
1315 #ifdef CONFIG_IPV6_SUBTREES
1316         if (src)
1317                 val = jhash(src, sizeof(*src), val);
1318 #endif
1319         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1320 }
1321
1322 /* Helper function to find the cached rt in the hash table
1323  * and update bucket pointer to point to the bucket for this
1324  * (daddr, saddr) pair
1325  * Caller must hold rt6_exception_lock
1326  */
1327 static struct rt6_exception *
1328 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1329                               const struct in6_addr *daddr,
1330                               const struct in6_addr *saddr)
1331 {
1332         struct rt6_exception *rt6_ex;
1333         u32 hval;
1334
1335         if (!(*bucket) || !daddr)
1336                 return NULL;
1337
1338         hval = rt6_exception_hash(daddr, saddr);
1339         *bucket += hval;
1340
1341         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1342                 struct rt6_info *rt6 = rt6_ex->rt6i;
1343                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1344
1345 #ifdef CONFIG_IPV6_SUBTREES
1346                 if (matched && saddr)
1347                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1348 #endif
1349                 if (matched)
1350                         return rt6_ex;
1351         }
1352         return NULL;
1353 }
1354
1355 /* Helper function to find the cached rt in the hash table
1356  * and update bucket pointer to point to the bucket for this
1357  * (daddr, saddr) pair
1358  * Caller must hold rcu_read_lock()
1359  */
1360 static struct rt6_exception *
1361 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1362                          const struct in6_addr *daddr,
1363                          const struct in6_addr *saddr)
1364 {
1365         struct rt6_exception *rt6_ex;
1366         u32 hval;
1367
1368         WARN_ON_ONCE(!rcu_read_lock_held());
1369
1370         if (!(*bucket) || !daddr)
1371                 return NULL;
1372
1373         hval = rt6_exception_hash(daddr, saddr);
1374         *bucket += hval;
1375
1376         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1377                 struct rt6_info *rt6 = rt6_ex->rt6i;
1378                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1379
1380 #ifdef CONFIG_IPV6_SUBTREES
1381                 if (matched && saddr)
1382                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1383 #endif
1384                 if (matched)
1385                         return rt6_ex;
1386         }
1387         return NULL;
1388 }
1389
1390 static unsigned int fib6_mtu(const struct fib6_info *rt)
1391 {
1392         unsigned int mtu;
1393
1394         if (rt->fib6_pmtu) {
1395                 mtu = rt->fib6_pmtu;
1396         } else {
1397                 struct net_device *dev = fib6_info_nh_dev(rt);
1398                 struct inet6_dev *idev;
1399
1400                 rcu_read_lock();
1401                 idev = __in6_dev_get(dev);
1402                 mtu = idev->cnf.mtu6;
1403                 rcu_read_unlock();
1404         }
1405
1406         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1407
1408         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1409 }
1410
1411 static int rt6_insert_exception(struct rt6_info *nrt,
1412                                 struct fib6_info *ort)
1413 {
1414         struct net *net = dev_net(nrt->dst.dev);
1415         struct rt6_exception_bucket *bucket;
1416         struct in6_addr *src_key = NULL;
1417         struct rt6_exception *rt6_ex;
1418         int err = 0;
1419
1420         spin_lock_bh(&rt6_exception_lock);
1421
1422         if (ort->exception_bucket_flushed) {
1423                 err = -EINVAL;
1424                 goto out;
1425         }
1426
1427         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1428                                         lockdep_is_held(&rt6_exception_lock));
1429         if (!bucket) {
1430                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1431                                  GFP_ATOMIC);
1432                 if (!bucket) {
1433                         err = -ENOMEM;
1434                         goto out;
1435                 }
1436                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1437         }
1438
1439 #ifdef CONFIG_IPV6_SUBTREES
1440         /* rt6i_src.plen != 0 indicates ort is in subtree
1441          * and exception table is indexed by a hash of
1442          * both rt6i_dst and rt6i_src.
1443          * Otherwise, the exception table is indexed by
1444          * a hash of only rt6i_dst.
1445          */
1446         if (ort->fib6_src.plen)
1447                 src_key = &nrt->rt6i_src.addr;
1448 #endif
1449         /* rt6_mtu_change() might lower mtu on ort.
1450          * Only insert this exception route if its mtu
1451          * is less than ort's mtu value.
1452          */
1453         if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1454                 err = -EINVAL;
1455                 goto out;
1456         }
1457
1458         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1459                                                src_key);
1460         if (rt6_ex)
1461                 rt6_remove_exception(bucket, rt6_ex);
1462
1463         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1464         if (!rt6_ex) {
1465                 err = -ENOMEM;
1466                 goto out;
1467         }
1468         rt6_ex->rt6i = nrt;
1469         rt6_ex->stamp = jiffies;
1470         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1471         bucket->depth++;
1472         net->ipv6.rt6_stats->fib_rt_cache++;
1473
1474         if (bucket->depth > FIB6_MAX_DEPTH)
1475                 rt6_exception_remove_oldest(bucket);
1476
1477 out:
1478         spin_unlock_bh(&rt6_exception_lock);
1479
1480         /* Update fn->fn_sernum to invalidate all cached dst */
1481         if (!err) {
1482                 spin_lock_bh(&ort->fib6_table->tb6_lock);
1483                 fib6_update_sernum(net, ort);
1484                 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1485                 fib6_force_start_gc(net);
1486         }
1487
1488         return err;
1489 }
1490
1491 void rt6_flush_exceptions(struct fib6_info *rt)
1492 {
1493         struct rt6_exception_bucket *bucket;
1494         struct rt6_exception *rt6_ex;
1495         struct hlist_node *tmp;
1496         int i;
1497
1498         spin_lock_bh(&rt6_exception_lock);
1499         /* Prevent rt6_insert_exception() to recreate the bucket list */
1500         rt->exception_bucket_flushed = 1;
1501
1502         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1503                                     lockdep_is_held(&rt6_exception_lock));
1504         if (!bucket)
1505                 goto out;
1506
1507         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1508                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1509                         rt6_remove_exception(bucket, rt6_ex);
1510                 WARN_ON_ONCE(bucket->depth);
1511                 bucket++;
1512         }
1513
1514 out:
1515         spin_unlock_bh(&rt6_exception_lock);
1516 }
1517
1518 /* Find cached rt in the hash table inside passed in rt
1519  * Caller has to hold rcu_read_lock()
1520  */
1521 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1522                                            struct in6_addr *daddr,
1523                                            struct in6_addr *saddr)
1524 {
1525         struct rt6_exception_bucket *bucket;
1526         struct in6_addr *src_key = NULL;
1527         struct rt6_exception *rt6_ex;
1528         struct rt6_info *res = NULL;
1529
1530         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1531
1532 #ifdef CONFIG_IPV6_SUBTREES
1533         /* rt6i_src.plen != 0 indicates rt is in subtree
1534          * and exception table is indexed by a hash of
1535          * both rt6i_dst and rt6i_src.
1536          * Otherwise, the exception table is indexed by
1537          * a hash of only rt6i_dst.
1538          */
1539         if (rt->fib6_src.plen)
1540                 src_key = saddr;
1541 #endif
1542         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1543
1544         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1545                 res = rt6_ex->rt6i;
1546
1547         return res;
1548 }
1549
1550 /* Remove the passed in cached rt from the hash table that contains it */
1551 static int rt6_remove_exception_rt(struct rt6_info *rt)
1552 {
1553         struct rt6_exception_bucket *bucket;
1554         struct in6_addr *src_key = NULL;
1555         struct rt6_exception *rt6_ex;
1556         struct fib6_info *from;
1557         int err;
1558
1559         from = rcu_dereference(rt->from);
1560         if (!from ||
1561             !(rt->rt6i_flags & RTF_CACHE))
1562                 return -EINVAL;
1563
1564         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1565                 return -ENOENT;
1566
1567         spin_lock_bh(&rt6_exception_lock);
1568         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1569                                     lockdep_is_held(&rt6_exception_lock));
1570 #ifdef CONFIG_IPV6_SUBTREES
1571         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1572          * and exception table is indexed by a hash of
1573          * both rt6i_dst and rt6i_src.
1574          * Otherwise, the exception table is indexed by
1575          * a hash of only rt6i_dst.
1576          */
1577         if (from->fib6_src.plen)
1578                 src_key = &rt->rt6i_src.addr;
1579 #endif
1580         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1581                                                &rt->rt6i_dst.addr,
1582                                                src_key);
1583         if (rt6_ex) {
1584                 rt6_remove_exception(bucket, rt6_ex);
1585                 err = 0;
1586         } else {
1587                 err = -ENOENT;
1588         }
1589
1590         spin_unlock_bh(&rt6_exception_lock);
1591         return err;
1592 }
1593
1594 /* Find rt6_ex which contains the passed in rt cache and
1595  * refresh its stamp
1596  */
1597 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1598 {
1599         struct rt6_exception_bucket *bucket;
1600         struct fib6_info *from = rt->from;
1601         struct in6_addr *src_key = NULL;
1602         struct rt6_exception *rt6_ex;
1603
1604         if (!from ||
1605             !(rt->rt6i_flags & RTF_CACHE))
1606                 return;
1607
1608         rcu_read_lock();
1609         bucket = rcu_dereference(from->rt6i_exception_bucket);
1610
1611 #ifdef CONFIG_IPV6_SUBTREES
1612         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1613          * and exception table is indexed by a hash of
1614          * both rt6i_dst and rt6i_src.
1615          * Otherwise, the exception table is indexed by
1616          * a hash of only rt6i_dst.
1617          */
1618         if (from->fib6_src.plen)
1619                 src_key = &rt->rt6i_src.addr;
1620 #endif
1621         rt6_ex = __rt6_find_exception_rcu(&bucket,
1622                                           &rt->rt6i_dst.addr,
1623                                           src_key);
1624         if (rt6_ex)
1625                 rt6_ex->stamp = jiffies;
1626
1627         rcu_read_unlock();
1628 }
1629
1630 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1631                                          struct rt6_info *rt, int mtu)
1632 {
1633         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1634          * lowest MTU in the path: always allow updating the route PMTU to
1635          * reflect PMTU decreases.
1636          *
1637          * If the new MTU is higher, and the route PMTU is equal to the local
1638          * MTU, this means the old MTU is the lowest in the path, so allow
1639          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1640          * handle this.
1641          */
1642
1643         if (dst_mtu(&rt->dst) >= mtu)
1644                 return true;
1645
1646         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1647                 return true;
1648
1649         return false;
1650 }
1651
1652 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1653                                        struct fib6_info *rt, int mtu)
1654 {
1655         struct rt6_exception_bucket *bucket;
1656         struct rt6_exception *rt6_ex;
1657         int i;
1658
1659         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1660                                         lockdep_is_held(&rt6_exception_lock));
1661
1662         if (!bucket)
1663                 return;
1664
1665         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1666                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1667                         struct rt6_info *entry = rt6_ex->rt6i;
1668
1669                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1670                          * route), the metrics of its rt->from have already
1671                          * been updated.
1672                          */
1673                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1674                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1675                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1676                 }
1677                 bucket++;
1678         }
1679 }
1680
1681 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1682
1683 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1684                                         struct in6_addr *gateway)
1685 {
1686         struct rt6_exception_bucket *bucket;
1687         struct rt6_exception *rt6_ex;
1688         struct hlist_node *tmp;
1689         int i;
1690
1691         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1692                 return;
1693
1694         spin_lock_bh(&rt6_exception_lock);
1695         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1696                                      lockdep_is_held(&rt6_exception_lock));
1697
1698         if (bucket) {
1699                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1700                         hlist_for_each_entry_safe(rt6_ex, tmp,
1701                                                   &bucket->chain, hlist) {
1702                                 struct rt6_info *entry = rt6_ex->rt6i;
1703
1704                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1705                                     RTF_CACHE_GATEWAY &&
1706                                     ipv6_addr_equal(gateway,
1707                                                     &entry->rt6i_gateway)) {
1708                                         rt6_remove_exception(bucket, rt6_ex);
1709                                 }
1710                         }
1711                         bucket++;
1712                 }
1713         }
1714
1715         spin_unlock_bh(&rt6_exception_lock);
1716 }
1717
1718 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1719                                       struct rt6_exception *rt6_ex,
1720                                       struct fib6_gc_args *gc_args,
1721                                       unsigned long now)
1722 {
1723         struct rt6_info *rt = rt6_ex->rt6i;
1724
1725         /* we are pruning and obsoleting aged-out and non gateway exceptions
1726          * even if others have still references to them, so that on next
1727          * dst_check() such references can be dropped.
1728          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1729          * expired, independently from their aging, as per RFC 8201 section 4
1730          */
1731         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1732                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1733                         RT6_TRACE("aging clone %p\n", rt);
1734                         rt6_remove_exception(bucket, rt6_ex);
1735                         return;
1736                 }
1737         } else if (time_after(jiffies, rt->dst.expires)) {
1738                 RT6_TRACE("purging expired route %p\n", rt);
1739                 rt6_remove_exception(bucket, rt6_ex);
1740                 return;
1741         }
1742
1743         if (rt->rt6i_flags & RTF_GATEWAY) {
1744                 struct neighbour *neigh;
1745                 __u8 neigh_flags = 0;
1746
1747                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1748                 if (neigh)
1749                         neigh_flags = neigh->flags;
1750
1751                 if (!(neigh_flags & NTF_ROUTER)) {
1752                         RT6_TRACE("purging route %p via non-router but gateway\n",
1753                                   rt);
1754                         rt6_remove_exception(bucket, rt6_ex);
1755                         return;
1756                 }
1757         }
1758
1759         gc_args->more++;
1760 }
1761
1762 void rt6_age_exceptions(struct fib6_info *rt,
1763                         struct fib6_gc_args *gc_args,
1764                         unsigned long now)
1765 {
1766         struct rt6_exception_bucket *bucket;
1767         struct rt6_exception *rt6_ex;
1768         struct hlist_node *tmp;
1769         int i;
1770
1771         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1772                 return;
1773
1774         rcu_read_lock_bh();
1775         spin_lock(&rt6_exception_lock);
1776         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1777                                     lockdep_is_held(&rt6_exception_lock));
1778
1779         if (bucket) {
1780                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1781                         hlist_for_each_entry_safe(rt6_ex, tmp,
1782                                                   &bucket->chain, hlist) {
1783                                 rt6_age_examine_exception(bucket, rt6_ex,
1784                                                           gc_args, now);
1785                         }
1786                         bucket++;
1787                 }
1788         }
1789         spin_unlock(&rt6_exception_lock);
1790         rcu_read_unlock_bh();
1791 }
1792
1793 /* must be called with rcu lock held */
1794 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1795                                     int oif, struct flowi6 *fl6, int strict)
1796 {
1797         struct fib6_node *fn, *saved_fn;
1798         struct fib6_info *f6i;
1799
1800         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1801         saved_fn = fn;
1802
1803         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1804                 oif = 0;
1805
1806 redo_rt6_select:
1807         f6i = rt6_select(net, fn, oif, strict);
1808         if (f6i == net->ipv6.fib6_null_entry) {
1809                 fn = fib6_backtrack(fn, &fl6->saddr);
1810                 if (fn)
1811                         goto redo_rt6_select;
1812                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1813                         /* also consider unreachable route */
1814                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1815                         fn = saved_fn;
1816                         goto redo_rt6_select;
1817                 }
1818         }
1819
1820         trace_fib6_table_lookup(net, f6i, table, fl6);
1821
1822         return f6i;
1823 }
1824
1825 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1826                                int oif, struct flowi6 *fl6,
1827                                const struct sk_buff *skb, int flags)
1828 {
1829         struct fib6_info *f6i;
1830         struct rt6_info *rt;
1831         int strict = 0;
1832
1833         strict |= flags & RT6_LOOKUP_F_IFACE;
1834         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1835         if (net->ipv6.devconf_all->forwarding == 0)
1836                 strict |= RT6_LOOKUP_F_REACHABLE;
1837
1838         rcu_read_lock();
1839
1840         f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1841         if (f6i->fib6_nsiblings)
1842                 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1843
1844         if (f6i == net->ipv6.fib6_null_entry) {
1845                 rt = net->ipv6.ip6_null_entry;
1846                 rcu_read_unlock();
1847                 dst_hold(&rt->dst);
1848                 return rt;
1849         }
1850
1851         /*Search through exception table */
1852         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1853         if (rt) {
1854                 if (ip6_hold_safe(net, &rt, true))
1855                         dst_use_noref(&rt->dst, jiffies);
1856
1857                 rcu_read_unlock();
1858                 return rt;
1859         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1860                             !(f6i->fib6_flags & RTF_GATEWAY))) {
1861                 /* Create a RTF_CACHE clone which will not be
1862                  * owned by the fib6 tree.  It is for the special case where
1863                  * the daddr in the skb during the neighbor look-up is different
1864                  * from the fl6->daddr used to look-up route here.
1865                  */
1866                 struct rt6_info *uncached_rt;
1867
1868                 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1869
1870                 rcu_read_unlock();
1871
1872                 if (uncached_rt) {
1873                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1874                          * No need for another dst_hold()
1875                          */
1876                         rt6_uncached_list_add(uncached_rt);
1877                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1878                 } else {
1879                         uncached_rt = net->ipv6.ip6_null_entry;
1880                         dst_hold(&uncached_rt->dst);
1881                 }
1882
1883                 return uncached_rt;
1884         } else {
1885                 /* Get a percpu copy */
1886
1887                 struct rt6_info *pcpu_rt;
1888
1889                 local_bh_disable();
1890                 pcpu_rt = rt6_get_pcpu_route(f6i);
1891
1892                 if (!pcpu_rt)
1893                         pcpu_rt = rt6_make_pcpu_route(net, f6i);
1894
1895                 local_bh_enable();
1896                 rcu_read_unlock();
1897
1898                 return pcpu_rt;
1899         }
1900 }
1901 EXPORT_SYMBOL_GPL(ip6_pol_route);
1902
1903 static struct rt6_info *ip6_pol_route_input(struct net *net,
1904                                             struct fib6_table *table,
1905                                             struct flowi6 *fl6,
1906                                             const struct sk_buff *skb,
1907                                             int flags)
1908 {
1909         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1910 }
1911
1912 struct dst_entry *ip6_route_input_lookup(struct net *net,
1913                                          struct net_device *dev,
1914                                          struct flowi6 *fl6,
1915                                          const struct sk_buff *skb,
1916                                          int flags)
1917 {
1918         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1919                 flags |= RT6_LOOKUP_F_IFACE;
1920
1921         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1922 }
1923 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1924
1925 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1926                                   struct flow_keys *keys,
1927                                   struct flow_keys *flkeys)
1928 {
1929         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1930         const struct ipv6hdr *key_iph = outer_iph;
1931         struct flow_keys *_flkeys = flkeys;
1932         const struct ipv6hdr *inner_iph;
1933         const struct icmp6hdr *icmph;
1934         struct ipv6hdr _inner_iph;
1935         struct icmp6hdr _icmph;
1936
1937         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1938                 goto out;
1939
1940         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1941                                    sizeof(_icmph), &_icmph);
1942         if (!icmph)
1943                 goto out;
1944
1945         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1946             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1947             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1948             icmph->icmp6_type != ICMPV6_PARAMPROB)
1949                 goto out;
1950
1951         inner_iph = skb_header_pointer(skb,
1952                                        skb_transport_offset(skb) + sizeof(*icmph),
1953                                        sizeof(_inner_iph), &_inner_iph);
1954         if (!inner_iph)
1955                 goto out;
1956
1957         key_iph = inner_iph;
1958         _flkeys = NULL;
1959 out:
1960         if (_flkeys) {
1961                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1962                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1963                 keys->tags.flow_label = _flkeys->tags.flow_label;
1964                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1965         } else {
1966                 keys->addrs.v6addrs.src = key_iph->saddr;
1967                 keys->addrs.v6addrs.dst = key_iph->daddr;
1968                 keys->tags.flow_label = ip6_flowlabel(key_iph);
1969                 keys->basic.ip_proto = key_iph->nexthdr;
1970         }
1971 }
1972
1973 /* if skb is set it will be used and fl6 can be NULL */
1974 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1975                        const struct sk_buff *skb, struct flow_keys *flkeys)
1976 {
1977         struct flow_keys hash_keys;
1978         u32 mhash;
1979
1980         switch (ip6_multipath_hash_policy(net)) {
1981         case 0:
1982                 memset(&hash_keys, 0, sizeof(hash_keys));
1983                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1984                 if (skb) {
1985                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1986                 } else {
1987                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1988                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1989                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1990                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1991                 }
1992                 break;
1993         case 1:
1994                 if (skb) {
1995                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1996                         struct flow_keys keys;
1997
1998                         /* short-circuit if we already have L4 hash present */
1999                         if (skb->l4_hash)
2000                                 return skb_get_hash_raw(skb) >> 1;
2001
2002                         memset(&hash_keys, 0, sizeof(hash_keys));
2003
2004                         if (!flkeys) {
2005                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2006                                 flkeys = &keys;
2007                         }
2008                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2009                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2010                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2011                         hash_keys.ports.src = flkeys->ports.src;
2012                         hash_keys.ports.dst = flkeys->ports.dst;
2013                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2014                 } else {
2015                         memset(&hash_keys, 0, sizeof(hash_keys));
2016                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2017                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2018                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2019                         hash_keys.ports.src = fl6->fl6_sport;
2020                         hash_keys.ports.dst = fl6->fl6_dport;
2021                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2022                 }
2023                 break;
2024         }
2025         mhash = flow_hash_from_keys(&hash_keys);
2026
2027         return mhash >> 1;
2028 }
2029
2030 void ip6_route_input(struct sk_buff *skb)
2031 {
2032         const struct ipv6hdr *iph = ipv6_hdr(skb);
2033         struct net *net = dev_net(skb->dev);
2034         int flags = RT6_LOOKUP_F_HAS_SADDR;
2035         struct ip_tunnel_info *tun_info;
2036         struct flowi6 fl6 = {
2037                 .flowi6_iif = skb->dev->ifindex,
2038                 .daddr = iph->daddr,
2039                 .saddr = iph->saddr,
2040                 .flowlabel = ip6_flowinfo(iph),
2041                 .flowi6_mark = skb->mark,
2042                 .flowi6_proto = iph->nexthdr,
2043         };
2044         struct flow_keys *flkeys = NULL, _flkeys;
2045
2046         tun_info = skb_tunnel_info(skb);
2047         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2048                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2049
2050         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2051                 flkeys = &_flkeys;
2052
2053         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2054                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2055         skb_dst_drop(skb);
2056         skb_dst_set(skb,
2057                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2058 }
2059
2060 static struct rt6_info *ip6_pol_route_output(struct net *net,
2061                                              struct fib6_table *table,
2062                                              struct flowi6 *fl6,
2063                                              const struct sk_buff *skb,
2064                                              int flags)
2065 {
2066         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2067 }
2068
2069 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2070                                          struct flowi6 *fl6, int flags)
2071 {
2072         bool any_src;
2073
2074         if (ipv6_addr_type(&fl6->daddr) &
2075             (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2076                 struct dst_entry *dst;
2077
2078                 dst = l3mdev_link_scope_lookup(net, fl6);
2079                 if (dst)
2080                         return dst;
2081         }
2082
2083         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2084
2085         any_src = ipv6_addr_any(&fl6->saddr);
2086         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2087             (fl6->flowi6_oif && any_src))
2088                 flags |= RT6_LOOKUP_F_IFACE;
2089
2090         if (!any_src)
2091                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2092         else if (sk)
2093                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2094
2095         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2096 }
2097 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2098
2099 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2100 {
2101         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2102         struct net_device *loopback_dev = net->loopback_dev;
2103         struct dst_entry *new = NULL;
2104
2105         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2106                        DST_OBSOLETE_DEAD, 0);
2107         if (rt) {
2108                 rt6_info_init(rt);
2109                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2110
2111                 new = &rt->dst;
2112                 new->__use = 1;
2113                 new->input = dst_discard;
2114                 new->output = dst_discard_out;
2115
2116                 dst_copy_metrics(new, &ort->dst);
2117
2118                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2119                 rt->rt6i_gateway = ort->rt6i_gateway;
2120                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2121
2122                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2123 #ifdef CONFIG_IPV6_SUBTREES
2124                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2125 #endif
2126         }
2127
2128         dst_release(dst_orig);
2129         return new ? new : ERR_PTR(-ENOMEM);
2130 }
2131
2132 /*
2133  *      Destination cache support functions
2134  */
2135
2136 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2137 {
2138         u32 rt_cookie = 0;
2139
2140         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2141                 return false;
2142
2143         if (fib6_check_expired(f6i))
2144                 return false;
2145
2146         return true;
2147 }
2148
2149 static struct dst_entry *rt6_check(struct rt6_info *rt,
2150                                    struct fib6_info *from,
2151                                    u32 cookie)
2152 {
2153         u32 rt_cookie = 0;
2154
2155         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2156             rt_cookie != cookie)
2157                 return NULL;
2158
2159         if (rt6_check_expired(rt))
2160                 return NULL;
2161
2162         return &rt->dst;
2163 }
2164
2165 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2166                                             struct fib6_info *from,
2167                                             u32 cookie)
2168 {
2169         if (!__rt6_check_expired(rt) &&
2170             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2171             fib6_check(from, cookie))
2172                 return &rt->dst;
2173         else
2174                 return NULL;
2175 }
2176
2177 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2178 {
2179         struct dst_entry *dst_ret;
2180         struct fib6_info *from;
2181         struct rt6_info *rt;
2182
2183         rt = container_of(dst, struct rt6_info, dst);
2184
2185         rcu_read_lock();
2186
2187         /* All IPV6 dsts are created with ->obsolete set to the value
2188          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2189          * into this function always.
2190          */
2191
2192         from = rcu_dereference(rt->from);
2193
2194         if (from && (rt->rt6i_flags & RTF_PCPU ||
2195             unlikely(!list_empty(&rt->rt6i_uncached))))
2196                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2197         else
2198                 dst_ret = rt6_check(rt, from, cookie);
2199
2200         rcu_read_unlock();
2201
2202         return dst_ret;
2203 }
2204
2205 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2206 {
2207         struct rt6_info *rt = (struct rt6_info *) dst;
2208
2209         if (rt) {
2210                 if (rt->rt6i_flags & RTF_CACHE) {
2211                         rcu_read_lock();
2212                         if (rt6_check_expired(rt)) {
2213                                 rt6_remove_exception_rt(rt);
2214                                 dst = NULL;
2215                         }
2216                         rcu_read_unlock();
2217                 } else {
2218                         dst_release(dst);
2219                         dst = NULL;
2220                 }
2221         }
2222         return dst;
2223 }
2224
2225 static void ip6_link_failure(struct sk_buff *skb)
2226 {
2227         struct rt6_info *rt;
2228
2229         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2230
2231         rt = (struct rt6_info *) skb_dst(skb);
2232         if (rt) {
2233                 rcu_read_lock();
2234                 if (rt->rt6i_flags & RTF_CACHE) {
2235                         rt6_remove_exception_rt(rt);
2236                 } else {
2237                         struct fib6_info *from;
2238                         struct fib6_node *fn;
2239
2240                         from = rcu_dereference(rt->from);
2241                         if (from) {
2242                                 fn = rcu_dereference(from->fib6_node);
2243                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2244                                         fn->fn_sernum = -1;
2245                         }
2246                 }
2247                 rcu_read_unlock();
2248         }
2249 }
2250
2251 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2252 {
2253         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2254                 struct fib6_info *from;
2255
2256                 rcu_read_lock();
2257                 from = rcu_dereference(rt0->from);
2258                 if (from)
2259                         rt0->dst.expires = from->expires;
2260                 rcu_read_unlock();
2261         }
2262
2263         dst_set_expires(&rt0->dst, timeout);
2264         rt0->rt6i_flags |= RTF_EXPIRES;
2265 }
2266
2267 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2268 {
2269         struct net *net = dev_net(rt->dst.dev);
2270
2271         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2272         rt->rt6i_flags |= RTF_MODIFIED;
2273         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2274 }
2275
2276 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2277 {
2278         bool from_set;
2279
2280         rcu_read_lock();
2281         from_set = !!rcu_dereference(rt->from);
2282         rcu_read_unlock();
2283
2284         return !(rt->rt6i_flags & RTF_CACHE) &&
2285                 (rt->rt6i_flags & RTF_PCPU || from_set);
2286 }
2287
2288 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2289                                  const struct ipv6hdr *iph, u32 mtu)
2290 {
2291         const struct in6_addr *daddr, *saddr;
2292         struct rt6_info *rt6 = (struct rt6_info *)dst;
2293
2294         if (dst_metric_locked(dst, RTAX_MTU))
2295                 return;
2296
2297         if (iph) {
2298                 daddr = &iph->daddr;
2299                 saddr = &iph->saddr;
2300         } else if (sk) {
2301                 daddr = &sk->sk_v6_daddr;
2302                 saddr = &inet6_sk(sk)->saddr;
2303         } else {
2304                 daddr = NULL;
2305                 saddr = NULL;
2306         }
2307         dst_confirm_neigh(dst, daddr);
2308         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2309         if (mtu >= dst_mtu(dst))
2310                 return;
2311
2312         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2313                 rt6_do_update_pmtu(rt6, mtu);
2314                 /* update rt6_ex->stamp for cache */
2315                 if (rt6->rt6i_flags & RTF_CACHE)
2316                         rt6_update_exception_stamp_rt(rt6);
2317         } else if (daddr) {
2318                 struct fib6_info *from;
2319                 struct rt6_info *nrt6;
2320
2321                 rcu_read_lock();
2322                 from = rcu_dereference(rt6->from);
2323                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2324                 if (nrt6) {
2325                         rt6_do_update_pmtu(nrt6, mtu);
2326                         if (rt6_insert_exception(nrt6, from))
2327                                 dst_release_immediate(&nrt6->dst);
2328                 }
2329                 rcu_read_unlock();
2330         }
2331 }
2332
2333 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2334                                struct sk_buff *skb, u32 mtu)
2335 {
2336         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2337 }
2338
2339 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2340                      int oif, u32 mark, kuid_t uid)
2341 {
2342         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2343         struct dst_entry *dst;
2344         struct flowi6 fl6 = {
2345                 .flowi6_oif = oif,
2346                 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2347                 .daddr = iph->daddr,
2348                 .saddr = iph->saddr,
2349                 .flowlabel = ip6_flowinfo(iph),
2350                 .flowi6_uid = uid,
2351         };
2352
2353         dst = ip6_route_output(net, NULL, &fl6);
2354         if (!dst->error)
2355                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2356         dst_release(dst);
2357 }
2358 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2359
2360 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2361 {
2362         int oif = sk->sk_bound_dev_if;
2363         struct dst_entry *dst;
2364
2365         if (!oif && skb->dev)
2366                 oif = l3mdev_master_ifindex(skb->dev);
2367
2368         ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2369
2370         dst = __sk_dst_get(sk);
2371         if (!dst || !dst->obsolete ||
2372             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2373                 return;
2374
2375         bh_lock_sock(sk);
2376         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2377                 ip6_datagram_dst_update(sk, false);
2378         bh_unlock_sock(sk);
2379 }
2380 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2381
2382 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2383                            const struct flowi6 *fl6)
2384 {
2385 #ifdef CONFIG_IPV6_SUBTREES
2386         struct ipv6_pinfo *np = inet6_sk(sk);
2387 #endif
2388
2389         ip6_dst_store(sk, dst,
2390                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2391                       &sk->sk_v6_daddr : NULL,
2392 #ifdef CONFIG_IPV6_SUBTREES
2393                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2394                       &np->saddr :
2395 #endif
2396                       NULL);
2397 }
2398
2399 /* Handle redirects */
2400 struct ip6rd_flowi {
2401         struct flowi6 fl6;
2402         struct in6_addr gateway;
2403 };
2404
2405 static struct rt6_info *__ip6_route_redirect(struct net *net,
2406                                              struct fib6_table *table,
2407                                              struct flowi6 *fl6,
2408                                              const struct sk_buff *skb,
2409                                              int flags)
2410 {
2411         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2412         struct rt6_info *ret = NULL, *rt_cache;
2413         struct fib6_info *rt;
2414         struct fib6_node *fn;
2415
2416         /* Get the "current" route for this destination and
2417          * check if the redirect has come from appropriate router.
2418          *
2419          * RFC 4861 specifies that redirects should only be
2420          * accepted if they come from the nexthop to the target.
2421          * Due to the way the routes are chosen, this notion
2422          * is a bit fuzzy and one might need to check all possible
2423          * routes.
2424          */
2425
2426         rcu_read_lock();
2427         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2428 restart:
2429         for_each_fib6_node_rt_rcu(fn) {
2430                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2431                         continue;
2432                 if (fib6_check_expired(rt))
2433                         continue;
2434                 if (rt->fib6_flags & RTF_REJECT)
2435                         break;
2436                 if (!(rt->fib6_flags & RTF_GATEWAY))
2437                         continue;
2438                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2439                         continue;
2440                 /* rt_cache's gateway might be different from its 'parent'
2441                  * in the case of an ip redirect.
2442                  * So we keep searching in the exception table if the gateway
2443                  * is different.
2444                  */
2445                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2446                         rt_cache = rt6_find_cached_rt(rt,
2447                                                       &fl6->daddr,
2448                                                       &fl6->saddr);
2449                         if (rt_cache &&
2450                             ipv6_addr_equal(&rdfl->gateway,
2451                                             &rt_cache->rt6i_gateway)) {
2452                                 ret = rt_cache;
2453                                 break;
2454                         }
2455                         continue;
2456                 }
2457                 break;
2458         }
2459
2460         if (!rt)
2461                 rt = net->ipv6.fib6_null_entry;
2462         else if (rt->fib6_flags & RTF_REJECT) {
2463                 ret = net->ipv6.ip6_null_entry;
2464                 goto out;
2465         }
2466
2467         if (rt == net->ipv6.fib6_null_entry) {
2468                 fn = fib6_backtrack(fn, &fl6->saddr);
2469                 if (fn)
2470                         goto restart;
2471         }
2472
2473 out:
2474         if (ret)
2475                 ip6_hold_safe(net, &ret, true);
2476         else
2477                 ret = ip6_create_rt_rcu(rt);
2478
2479         rcu_read_unlock();
2480
2481         trace_fib6_table_lookup(net, rt, table, fl6);
2482         return ret;
2483 };
2484
2485 static struct dst_entry *ip6_route_redirect(struct net *net,
2486                                             const struct flowi6 *fl6,
2487                                             const struct sk_buff *skb,
2488                                             const struct in6_addr *gateway)
2489 {
2490         int flags = RT6_LOOKUP_F_HAS_SADDR;
2491         struct ip6rd_flowi rdfl;
2492
2493         rdfl.fl6 = *fl6;
2494         rdfl.gateway = *gateway;
2495
2496         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2497                                 flags, __ip6_route_redirect);
2498 }
2499
2500 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2501                   kuid_t uid)
2502 {
2503         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2504         struct dst_entry *dst;
2505         struct flowi6 fl6 = {
2506                 .flowi6_iif = LOOPBACK_IFINDEX,
2507                 .flowi6_oif = oif,
2508                 .flowi6_mark = mark,
2509                 .daddr = iph->daddr,
2510                 .saddr = iph->saddr,
2511                 .flowlabel = ip6_flowinfo(iph),
2512                 .flowi6_uid = uid,
2513         };
2514
2515         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2516         rt6_do_redirect(dst, NULL, skb);
2517         dst_release(dst);
2518 }
2519 EXPORT_SYMBOL_GPL(ip6_redirect);
2520
2521 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2522 {
2523         const struct ipv6hdr *iph = ipv6_hdr(skb);
2524         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2525         struct dst_entry *dst;
2526         struct flowi6 fl6 = {
2527                 .flowi6_iif = LOOPBACK_IFINDEX,
2528                 .flowi6_oif = oif,
2529                 .daddr = msg->dest,
2530                 .saddr = iph->daddr,
2531                 .flowi6_uid = sock_net_uid(net, NULL),
2532         };
2533
2534         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2535         rt6_do_redirect(dst, NULL, skb);
2536         dst_release(dst);
2537 }
2538
2539 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2540 {
2541         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2542                      sk->sk_uid);
2543 }
2544 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2545
2546 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2547 {
2548         struct net_device *dev = dst->dev;
2549         unsigned int mtu = dst_mtu(dst);
2550         struct net *net = dev_net(dev);
2551
2552         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2553
2554         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2555                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2556
2557         /*
2558          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2559          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2560          * IPV6_MAXPLEN is also valid and means: "any MSS,
2561          * rely only on pmtu discovery"
2562          */
2563         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2564                 mtu = IPV6_MAXPLEN;
2565         return mtu;
2566 }
2567
2568 static unsigned int ip6_mtu(const struct dst_entry *dst)
2569 {
2570         struct inet6_dev *idev;
2571         unsigned int mtu;
2572
2573         mtu = dst_metric_raw(dst, RTAX_MTU);
2574         if (mtu)
2575                 goto out;
2576
2577         mtu = IPV6_MIN_MTU;
2578
2579         rcu_read_lock();
2580         idev = __in6_dev_get(dst->dev);
2581         if (idev)
2582                 mtu = idev->cnf.mtu6;
2583         rcu_read_unlock();
2584
2585 out:
2586         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2587
2588         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2589 }
2590
2591 /* MTU selection:
2592  * 1. mtu on route is locked - use it
2593  * 2. mtu from nexthop exception
2594  * 3. mtu from egress device
2595  *
2596  * based on ip6_dst_mtu_forward and exception logic of
2597  * rt6_find_cached_rt; called with rcu_read_lock
2598  */
2599 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2600                       struct in6_addr *saddr)
2601 {
2602         struct rt6_exception_bucket *bucket;
2603         struct rt6_exception *rt6_ex;
2604         struct in6_addr *src_key;
2605         struct inet6_dev *idev;
2606         u32 mtu = 0;
2607
2608         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2609                 mtu = f6i->fib6_pmtu;
2610                 if (mtu)
2611                         goto out;
2612         }
2613
2614         src_key = NULL;
2615 #ifdef CONFIG_IPV6_SUBTREES
2616         if (f6i->fib6_src.plen)
2617                 src_key = saddr;
2618 #endif
2619
2620         bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2621         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2622         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2623                 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2624
2625         if (likely(!mtu)) {
2626                 struct net_device *dev = fib6_info_nh_dev(f6i);
2627
2628                 mtu = IPV6_MIN_MTU;
2629                 idev = __in6_dev_get(dev);
2630                 if (idev && idev->cnf.mtu6 > mtu)
2631                         mtu = idev->cnf.mtu6;
2632         }
2633
2634         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2635 out:
2636         return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2637 }
2638
2639 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2640                                   struct flowi6 *fl6)
2641 {
2642         struct dst_entry *dst;
2643         struct rt6_info *rt;
2644         struct inet6_dev *idev = in6_dev_get(dev);
2645         struct net *net = dev_net(dev);
2646
2647         if (unlikely(!idev))
2648                 return ERR_PTR(-ENODEV);
2649
2650         rt = ip6_dst_alloc(net, dev, 0);
2651         if (unlikely(!rt)) {
2652                 in6_dev_put(idev);
2653                 dst = ERR_PTR(-ENOMEM);
2654                 goto out;
2655         }
2656
2657         rt->dst.flags |= DST_HOST;
2658         rt->dst.input = ip6_input;
2659         rt->dst.output  = ip6_output;
2660         rt->rt6i_gateway  = fl6->daddr;
2661         rt->rt6i_dst.addr = fl6->daddr;
2662         rt->rt6i_dst.plen = 128;
2663         rt->rt6i_idev     = idev;
2664         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2665
2666         /* Add this dst into uncached_list so that rt6_disable_ip() can
2667          * do proper release of the net_device
2668          */
2669         rt6_uncached_list_add(rt);
2670         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2671
2672         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2673
2674 out:
2675         return dst;
2676 }
2677
2678 static int ip6_dst_gc(struct dst_ops *ops)
2679 {
2680         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2681         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2682         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2683         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2684         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2685         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2686         int entries;
2687
2688         entries = dst_entries_get_fast(ops);
2689         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2690             entries <= rt_max_size)
2691                 goto out;
2692
2693         net->ipv6.ip6_rt_gc_expire++;
2694         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2695         entries = dst_entries_get_slow(ops);
2696         if (entries < ops->gc_thresh)
2697                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2698 out:
2699         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2700         return entries > rt_max_size;
2701 }
2702
2703 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2704                                             struct fib6_config *cfg,
2705                                             const struct in6_addr *gw_addr,
2706                                             u32 tbid, int flags)
2707 {
2708         struct flowi6 fl6 = {
2709                 .flowi6_oif = cfg->fc_ifindex,
2710                 .daddr = *gw_addr,
2711                 .saddr = cfg->fc_prefsrc,
2712         };
2713         struct fib6_table *table;
2714         struct rt6_info *rt;
2715
2716         table = fib6_get_table(net, tbid);
2717         if (!table)
2718                 return NULL;
2719
2720         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2721                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2722
2723         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2724         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2725
2726         /* if table lookup failed, fall back to full lookup */
2727         if (rt == net->ipv6.ip6_null_entry) {
2728                 ip6_rt_put(rt);
2729                 rt = NULL;
2730         }
2731
2732         return rt;
2733 }
2734
2735 static int ip6_route_check_nh_onlink(struct net *net,
2736                                      struct fib6_config *cfg,
2737                                      const struct net_device *dev,
2738                                      struct netlink_ext_ack *extack)
2739 {
2740         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2741         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2742         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2743         struct rt6_info *grt;
2744         int err;
2745
2746         err = 0;
2747         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2748         if (grt) {
2749                 if (!grt->dst.error &&
2750                     /* ignore match if it is the default route */
2751                     grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) &&
2752                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2753                         NL_SET_ERR_MSG(extack,
2754                                        "Nexthop has invalid gateway or device mismatch");
2755                         err = -EINVAL;
2756                 }
2757
2758                 ip6_rt_put(grt);
2759         }
2760
2761         return err;
2762 }
2763
2764 static int ip6_route_check_nh(struct net *net,
2765                               struct fib6_config *cfg,
2766                               struct net_device **_dev,
2767                               struct inet6_dev **idev)
2768 {
2769         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2770         struct net_device *dev = _dev ? *_dev : NULL;
2771         struct rt6_info *grt = NULL;
2772         int err = -EHOSTUNREACH;
2773
2774         if (cfg->fc_table) {
2775                 int flags = RT6_LOOKUP_F_IFACE;
2776
2777                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2778                                           cfg->fc_table, flags);
2779                 if (grt) {
2780                         if (grt->rt6i_flags & RTF_GATEWAY ||
2781                             (dev && dev != grt->dst.dev)) {
2782                                 ip6_rt_put(grt);
2783                                 grt = NULL;
2784                         }
2785                 }
2786         }
2787
2788         if (!grt)
2789                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2790
2791         if (!grt)
2792                 goto out;
2793
2794         if (dev) {
2795                 if (dev != grt->dst.dev) {
2796                         ip6_rt_put(grt);
2797                         goto out;
2798                 }
2799         } else {
2800                 *_dev = dev = grt->dst.dev;
2801                 *idev = grt->rt6i_idev;
2802                 dev_hold(dev);
2803                 in6_dev_hold(grt->rt6i_idev);
2804         }
2805
2806         if (!(grt->rt6i_flags & RTF_GATEWAY))
2807                 err = 0;
2808
2809         ip6_rt_put(grt);
2810
2811 out:
2812         return err;
2813 }
2814
2815 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2816                            struct net_device **_dev, struct inet6_dev **idev,
2817                            struct netlink_ext_ack *extack)
2818 {
2819         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2820         int gwa_type = ipv6_addr_type(gw_addr);
2821         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2822         const struct net_device *dev = *_dev;
2823         bool need_addr_check = !dev;
2824         int err = -EINVAL;
2825
2826         /* if gw_addr is local we will fail to detect this in case
2827          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2828          * will return already-added prefix route via interface that
2829          * prefix route was assigned to, which might be non-loopback.
2830          */
2831         if (dev &&
2832             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2833                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2834                 goto out;
2835         }
2836
2837         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2838                 /* IPv6 strictly inhibits using not link-local
2839                  * addresses as nexthop address.
2840                  * Otherwise, router will not able to send redirects.
2841                  * It is very good, but in some (rare!) circumstances
2842                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2843                  * some exceptions. --ANK
2844                  * We allow IPv4-mapped nexthops to support RFC4798-type
2845                  * addressing
2846                  */
2847                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2848                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2849                         goto out;
2850                 }
2851
2852                 if (cfg->fc_flags & RTNH_F_ONLINK)
2853                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2854                 else
2855                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2856
2857                 if (err)
2858                         goto out;
2859         }
2860
2861         /* reload in case device was changed */
2862         dev = *_dev;
2863
2864         err = -EINVAL;
2865         if (!dev) {
2866                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2867                 goto out;
2868         } else if (dev->flags & IFF_LOOPBACK) {
2869                 NL_SET_ERR_MSG(extack,
2870                                "Egress device can not be loopback device for this route");
2871                 goto out;
2872         }
2873
2874         /* if we did not check gw_addr above, do so now that the
2875          * egress device has been resolved.
2876          */
2877         if (need_addr_check &&
2878             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2879                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2880                 goto out;
2881         }
2882
2883         err = 0;
2884 out:
2885         return err;
2886 }
2887
2888 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2889                                               gfp_t gfp_flags,
2890                                               struct netlink_ext_ack *extack)
2891 {
2892         struct net *net = cfg->fc_nlinfo.nl_net;
2893         struct fib6_info *rt = NULL;
2894         struct net_device *dev = NULL;
2895         struct inet6_dev *idev = NULL;
2896         struct fib6_table *table;
2897         int addr_type;
2898         int err = -EINVAL;
2899
2900         /* RTF_PCPU is an internal flag; can not be set by userspace */
2901         if (cfg->fc_flags & RTF_PCPU) {
2902                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2903                 goto out;
2904         }
2905
2906         /* RTF_CACHE is an internal flag; can not be set by userspace */
2907         if (cfg->fc_flags & RTF_CACHE) {
2908                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2909                 goto out;
2910         }
2911
2912         if (cfg->fc_type > RTN_MAX) {
2913                 NL_SET_ERR_MSG(extack, "Invalid route type");
2914                 goto out;
2915         }
2916
2917         if (cfg->fc_dst_len > 128) {
2918                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2919                 goto out;
2920         }
2921         if (cfg->fc_src_len > 128) {
2922                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2923                 goto out;
2924         }
2925 #ifndef CONFIG_IPV6_SUBTREES
2926         if (cfg->fc_src_len) {
2927                 NL_SET_ERR_MSG(extack,
2928                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2929                 goto out;
2930         }
2931 #endif
2932         if (cfg->fc_ifindex) {
2933                 err = -ENODEV;
2934                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2935                 if (!dev)
2936                         goto out;
2937                 idev = in6_dev_get(dev);
2938                 if (!idev)
2939                         goto out;
2940         }
2941
2942         if (cfg->fc_metric == 0)
2943                 cfg->fc_metric = IP6_RT_PRIO_USER;
2944
2945         if (cfg->fc_flags & RTNH_F_ONLINK) {
2946                 if (!dev) {
2947                         NL_SET_ERR_MSG(extack,
2948                                        "Nexthop device required for onlink");
2949                         err = -ENODEV;
2950                         goto out;
2951                 }
2952
2953                 if (!(dev->flags & IFF_UP)) {
2954                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2955                         err = -ENETDOWN;
2956                         goto out;
2957                 }
2958         }
2959
2960         err = -ENOBUFS;
2961         if (cfg->fc_nlinfo.nlh &&
2962             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2963                 table = fib6_get_table(net, cfg->fc_table);
2964                 if (!table) {
2965                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2966                         table = fib6_new_table(net, cfg->fc_table);
2967                 }
2968         } else {
2969                 table = fib6_new_table(net, cfg->fc_table);
2970         }
2971
2972         if (!table)
2973                 goto out;
2974
2975         err = -ENOMEM;
2976         rt = fib6_info_alloc(gfp_flags);
2977         if (!rt)
2978                 goto out;
2979
2980         rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
2981                                                extack);
2982         if (IS_ERR(rt->fib6_metrics)) {
2983                 err = PTR_ERR(rt->fib6_metrics);
2984                 /* Do not leave garbage there. */
2985                 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
2986                 goto out;
2987         }
2988
2989         if (cfg->fc_flags & RTF_ADDRCONF)
2990                 rt->dst_nocount = true;
2991
2992         if (cfg->fc_flags & RTF_EXPIRES)
2993                 fib6_set_expires(rt, jiffies +
2994                                 clock_t_to_jiffies(cfg->fc_expires));
2995         else
2996                 fib6_clean_expires(rt);
2997
2998         if (cfg->fc_protocol == RTPROT_UNSPEC)
2999                 cfg->fc_protocol = RTPROT_BOOT;
3000         rt->fib6_protocol = cfg->fc_protocol;
3001
3002         addr_type = ipv6_addr_type(&cfg->fc_dst);
3003
3004         if (cfg->fc_encap) {
3005                 struct lwtunnel_state *lwtstate;
3006
3007                 err = lwtunnel_build_state(cfg->fc_encap_type,
3008                                            cfg->fc_encap, AF_INET6, cfg,
3009                                            &lwtstate, extack);
3010                 if (err)
3011                         goto out;
3012                 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3013         }
3014
3015         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3016         rt->fib6_dst.plen = cfg->fc_dst_len;
3017         if (rt->fib6_dst.plen == 128)
3018                 rt->dst_host = true;
3019
3020 #ifdef CONFIG_IPV6_SUBTREES
3021         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3022         rt->fib6_src.plen = cfg->fc_src_len;
3023 #endif
3024
3025         rt->fib6_metric = cfg->fc_metric;
3026         rt->fib6_nh.nh_weight = 1;
3027
3028         rt->fib6_type = cfg->fc_type;
3029
3030         /* We cannot add true routes via loopback here,
3031            they would result in kernel looping; promote them to reject routes
3032          */
3033         if ((cfg->fc_flags & RTF_REJECT) ||
3034             (dev && (dev->flags & IFF_LOOPBACK) &&
3035              !(addr_type & IPV6_ADDR_LOOPBACK) &&
3036              !(cfg->fc_flags & RTF_LOCAL))) {
3037                 /* hold loopback dev/idev if we haven't done so. */
3038                 if (dev != net->loopback_dev) {
3039                         if (dev) {
3040                                 dev_put(dev);
3041                                 in6_dev_put(idev);
3042                         }
3043                         dev = net->loopback_dev;
3044                         dev_hold(dev);
3045                         idev = in6_dev_get(dev);
3046                         if (!idev) {
3047                                 err = -ENODEV;
3048                                 goto out;
3049                         }
3050                 }
3051                 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3052                 goto install_route;
3053         }
3054
3055         if (cfg->fc_flags & RTF_GATEWAY) {
3056                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3057                 if (err)
3058                         goto out;
3059
3060                 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3061         }
3062
3063         err = -ENODEV;
3064         if (!dev)
3065                 goto out;
3066
3067         if (idev->cnf.disable_ipv6) {
3068                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3069                 err = -EACCES;
3070                 goto out;
3071         }
3072
3073         if (!(dev->flags & IFF_UP)) {
3074                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3075                 err = -ENETDOWN;
3076                 goto out;
3077         }
3078
3079         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3080                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3081                         NL_SET_ERR_MSG(extack, "Invalid source address");
3082                         err = -EINVAL;
3083                         goto out;
3084                 }
3085                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3086                 rt->fib6_prefsrc.plen = 128;
3087         } else
3088                 rt->fib6_prefsrc.plen = 0;
3089
3090         rt->fib6_flags = cfg->fc_flags;
3091
3092 install_route:
3093         if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3094             !netif_carrier_ok(dev))
3095                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3096         rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3097         rt->fib6_nh.nh_dev = dev;
3098         rt->fib6_table = table;
3099
3100         if (idev)
3101                 in6_dev_put(idev);
3102
3103         return rt;
3104 out:
3105         if (dev)
3106                 dev_put(dev);
3107         if (idev)
3108                 in6_dev_put(idev);
3109
3110         fib6_info_release(rt);
3111         return ERR_PTR(err);
3112 }
3113
3114 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3115                   struct netlink_ext_ack *extack)
3116 {
3117         struct fib6_info *rt;
3118         int err;
3119
3120         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3121         if (IS_ERR(rt))
3122                 return PTR_ERR(rt);
3123
3124         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3125         fib6_info_release(rt);
3126
3127         return err;
3128 }
3129
3130 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3131 {
3132         struct net *net = info->nl_net;
3133         struct fib6_table *table;
3134         int err;
3135
3136         if (rt == net->ipv6.fib6_null_entry) {
3137                 err = -ENOENT;
3138                 goto out;
3139         }
3140
3141         table = rt->fib6_table;
3142         spin_lock_bh(&table->tb6_lock);
3143         err = fib6_del(rt, info);
3144         spin_unlock_bh(&table->tb6_lock);
3145
3146 out:
3147         fib6_info_release(rt);
3148         return err;
3149 }
3150
3151 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3152 {
3153         struct nl_info info = { .nl_net = net };
3154
3155         return __ip6_del_rt(rt, &info);
3156 }
3157
3158 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3159 {
3160         struct nl_info *info = &cfg->fc_nlinfo;
3161         struct net *net = info->nl_net;
3162         struct sk_buff *skb = NULL;
3163         struct fib6_table *table;
3164         int err = -ENOENT;
3165
3166         if (rt == net->ipv6.fib6_null_entry)
3167                 goto out_put;
3168         table = rt->fib6_table;
3169         spin_lock_bh(&table->tb6_lock);
3170
3171         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3172                 struct fib6_info *sibling, *next_sibling;
3173
3174                 /* prefer to send a single notification with all hops */
3175                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3176                 if (skb) {
3177                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3178
3179                         if (rt6_fill_node(net, skb, rt, NULL,
3180                                           NULL, NULL, 0, RTM_DELROUTE,
3181                                           info->portid, seq, 0) < 0) {
3182                                 kfree_skb(skb);
3183                                 skb = NULL;
3184                         } else
3185                                 info->skip_notify = 1;
3186                 }
3187
3188                 list_for_each_entry_safe(sibling, next_sibling,
3189                                          &rt->fib6_siblings,
3190                                          fib6_siblings) {
3191                         err = fib6_del(sibling, info);
3192                         if (err)
3193                                 goto out_unlock;
3194                 }
3195         }
3196
3197         err = fib6_del(rt, info);
3198 out_unlock:
3199         spin_unlock_bh(&table->tb6_lock);
3200 out_put:
3201         fib6_info_release(rt);
3202
3203         if (skb) {
3204                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3205                             info->nlh, gfp_any());
3206         }
3207         return err;
3208 }
3209
3210 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3211 {
3212         int rc = -ESRCH;
3213
3214         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3215                 goto out;
3216
3217         if (cfg->fc_flags & RTF_GATEWAY &&
3218             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3219                 goto out;
3220
3221         rc = rt6_remove_exception_rt(rt);
3222 out:
3223         return rc;
3224 }
3225
3226 static int ip6_route_del(struct fib6_config *cfg,
3227                          struct netlink_ext_ack *extack)
3228 {
3229         struct rt6_info *rt_cache;
3230         struct fib6_table *table;
3231         struct fib6_info *rt;
3232         struct fib6_node *fn;
3233         int err = -ESRCH;
3234
3235         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3236         if (!table) {
3237                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3238                 return err;
3239         }
3240
3241         rcu_read_lock();
3242
3243         fn = fib6_locate(&table->tb6_root,
3244                          &cfg->fc_dst, cfg->fc_dst_len,
3245                          &cfg->fc_src, cfg->fc_src_len,
3246                          !(cfg->fc_flags & RTF_CACHE));
3247
3248         if (fn) {
3249                 for_each_fib6_node_rt_rcu(fn) {
3250                         if (cfg->fc_flags & RTF_CACHE) {
3251                                 int rc;
3252
3253                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3254                                                               &cfg->fc_src);
3255                                 if (rt_cache) {
3256                                         rc = ip6_del_cached_rt(rt_cache, cfg);
3257                                         if (rc != -ESRCH) {
3258                                                 rcu_read_unlock();
3259                                                 return rc;
3260                                         }
3261                                 }
3262                                 continue;
3263                         }
3264                         if (cfg->fc_ifindex &&
3265                             (!rt->fib6_nh.nh_dev ||
3266                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3267                                 continue;
3268                         if (cfg->fc_flags & RTF_GATEWAY &&
3269                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3270                                 continue;
3271                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3272                                 continue;
3273                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3274                                 continue;
3275                         if (!fib6_info_hold_safe(rt))
3276                                 continue;
3277                         rcu_read_unlock();
3278
3279                         /* if gateway was specified only delete the one hop */
3280                         if (cfg->fc_flags & RTF_GATEWAY)
3281                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3282
3283                         return __ip6_del_rt_siblings(rt, cfg);
3284                 }
3285         }
3286         rcu_read_unlock();
3287
3288         return err;
3289 }
3290
3291 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3292 {
3293         struct netevent_redirect netevent;
3294         struct rt6_info *rt, *nrt = NULL;
3295         struct ndisc_options ndopts;
3296         struct inet6_dev *in6_dev;
3297         struct neighbour *neigh;
3298         struct fib6_info *from;
3299         struct rd_msg *msg;
3300         int optlen, on_link;
3301         u8 *lladdr;
3302
3303         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3304         optlen -= sizeof(*msg);
3305
3306         if (optlen < 0) {
3307                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3308                 return;
3309         }
3310
3311         msg = (struct rd_msg *)icmp6_hdr(skb);
3312
3313         if (ipv6_addr_is_multicast(&msg->dest)) {
3314                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3315                 return;
3316         }
3317
3318         on_link = 0;
3319         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3320                 on_link = 1;
3321         } else if (ipv6_addr_type(&msg->target) !=
3322                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3323                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3324                 return;
3325         }
3326
3327         in6_dev = __in6_dev_get(skb->dev);
3328         if (!in6_dev)
3329                 return;
3330         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3331                 return;
3332
3333         /* RFC2461 8.1:
3334          *      The IP source address of the Redirect MUST be the same as the current
3335          *      first-hop router for the specified ICMP Destination Address.
3336          */
3337
3338         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3339                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3340                 return;
3341         }
3342
3343         lladdr = NULL;
3344         if (ndopts.nd_opts_tgt_lladdr) {
3345                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3346                                              skb->dev);
3347                 if (!lladdr) {
3348                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3349                         return;
3350                 }
3351         }
3352
3353         rt = (struct rt6_info *) dst;
3354         if (rt->rt6i_flags & RTF_REJECT) {
3355                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3356                 return;
3357         }
3358
3359         /* Redirect received -> path was valid.
3360          * Look, redirects are sent only in response to data packets,
3361          * so that this nexthop apparently is reachable. --ANK
3362          */
3363         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3364
3365         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3366         if (!neigh)
3367                 return;
3368
3369         /*
3370          *      We have finally decided to accept it.
3371          */
3372
3373         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3374                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3375                      NEIGH_UPDATE_F_OVERRIDE|
3376                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3377                                      NEIGH_UPDATE_F_ISROUTER)),
3378                      NDISC_REDIRECT, &ndopts);
3379
3380         rcu_read_lock();
3381         from = rcu_dereference(rt->from);
3382         /* This fib6_info_hold() is safe here because we hold reference to rt
3383          * and rt already holds reference to fib6_info.
3384          */
3385         fib6_info_hold(from);
3386         rcu_read_unlock();
3387
3388         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3389         if (!nrt)
3390                 goto out;
3391
3392         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3393         if (on_link)
3394                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3395
3396         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3397
3398         /* No need to remove rt from the exception table if rt is
3399          * a cached route because rt6_insert_exception() will
3400          * take care of it
3401          */
3402         if (rt6_insert_exception(nrt, from)) {
3403                 dst_release_immediate(&nrt->dst);
3404                 goto out;
3405         }
3406
3407         netevent.old = &rt->dst;
3408         netevent.new = &nrt->dst;
3409         netevent.daddr = &msg->dest;
3410         netevent.neigh = neigh;
3411         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3412
3413 out:
3414         fib6_info_release(from);
3415         neigh_release(neigh);
3416 }
3417
3418 #ifdef CONFIG_IPV6_ROUTE_INFO
3419 static struct fib6_info *rt6_get_route_info(struct net *net,
3420                                            const struct in6_addr *prefix, int prefixlen,
3421                                            const struct in6_addr *gwaddr,
3422                                            struct net_device *dev)
3423 {
3424         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3425         int ifindex = dev->ifindex;
3426         struct fib6_node *fn;
3427         struct fib6_info *rt = NULL;
3428         struct fib6_table *table;
3429
3430         table = fib6_get_table(net, tb_id);
3431         if (!table)
3432                 return NULL;
3433
3434         rcu_read_lock();
3435         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3436         if (!fn)
3437                 goto out;
3438
3439         for_each_fib6_node_rt_rcu(fn) {
3440                 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3441                         continue;
3442                 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3443                         continue;
3444                 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3445                         continue;
3446                 if (!fib6_info_hold_safe(rt))
3447                         continue;
3448                 break;
3449         }
3450 out:
3451         rcu_read_unlock();
3452         return rt;
3453 }
3454
3455 static struct fib6_info *rt6_add_route_info(struct net *net,
3456                                            const struct in6_addr *prefix, int prefixlen,
3457                                            const struct in6_addr *gwaddr,
3458                                            struct net_device *dev,
3459                                            unsigned int pref)
3460 {
3461         struct fib6_config cfg = {
3462                 .fc_metric      = IP6_RT_PRIO_USER,
3463                 .fc_ifindex     = dev->ifindex,
3464                 .fc_dst_len     = prefixlen,
3465                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3466                                   RTF_UP | RTF_PREF(pref),
3467                 .fc_protocol = RTPROT_RA,
3468                 .fc_type = RTN_UNICAST,
3469                 .fc_nlinfo.portid = 0,
3470                 .fc_nlinfo.nlh = NULL,
3471                 .fc_nlinfo.nl_net = net,
3472         };
3473
3474         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3475         cfg.fc_dst = *prefix;
3476         cfg.fc_gateway = *gwaddr;
3477
3478         /* We should treat it as a default route if prefix length is 0. */
3479         if (!prefixlen)
3480                 cfg.fc_flags |= RTF_DEFAULT;
3481
3482         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3483
3484         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3485 }
3486 #endif
3487
3488 struct fib6_info *rt6_get_dflt_router(struct net *net,
3489                                      const struct in6_addr *addr,
3490                                      struct net_device *dev)
3491 {
3492         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3493         struct fib6_info *rt;
3494         struct fib6_table *table;
3495
3496         table = fib6_get_table(net, tb_id);
3497         if (!table)
3498                 return NULL;
3499
3500         rcu_read_lock();
3501         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3502                 if (dev == rt->fib6_nh.nh_dev &&
3503                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3504                     ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3505                         break;
3506         }
3507         if (rt && !fib6_info_hold_safe(rt))
3508                 rt = NULL;
3509         rcu_read_unlock();
3510         return rt;
3511 }
3512
3513 struct fib6_info *rt6_add_dflt_router(struct net *net,
3514                                      const struct in6_addr *gwaddr,
3515                                      struct net_device *dev,
3516                                      unsigned int pref)
3517 {
3518         struct fib6_config cfg = {
3519                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3520                 .fc_metric      = IP6_RT_PRIO_USER,
3521                 .fc_ifindex     = dev->ifindex,
3522                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3523                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3524                 .fc_protocol = RTPROT_RA,
3525                 .fc_type = RTN_UNICAST,
3526                 .fc_nlinfo.portid = 0,
3527                 .fc_nlinfo.nlh = NULL,
3528                 .fc_nlinfo.nl_net = net,
3529         };
3530
3531         cfg.fc_gateway = *gwaddr;
3532
3533         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3534                 struct fib6_table *table;
3535
3536                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3537                 if (table)
3538                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3539         }
3540
3541         return rt6_get_dflt_router(net, gwaddr, dev);
3542 }
3543
3544 static void __rt6_purge_dflt_routers(struct net *net,
3545                                      struct fib6_table *table)
3546 {
3547         struct fib6_info *rt;
3548
3549 restart:
3550         rcu_read_lock();
3551         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3552                 struct net_device *dev = fib6_info_nh_dev(rt);
3553                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3554
3555                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3556                     (!idev || idev->cnf.accept_ra != 2) &&
3557                     fib6_info_hold_safe(rt)) {
3558                         rcu_read_unlock();
3559                         ip6_del_rt(net, rt);
3560                         goto restart;
3561                 }
3562         }
3563         rcu_read_unlock();
3564
3565         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3566 }
3567
3568 void rt6_purge_dflt_routers(struct net *net)
3569 {
3570         struct fib6_table *table;
3571         struct hlist_head *head;
3572         unsigned int h;
3573
3574         rcu_read_lock();
3575
3576         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3577                 head = &net->ipv6.fib_table_hash[h];
3578                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3579                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3580                                 __rt6_purge_dflt_routers(net, table);
3581                 }
3582         }
3583
3584         rcu_read_unlock();
3585 }
3586
3587 static void rtmsg_to_fib6_config(struct net *net,
3588                                  struct in6_rtmsg *rtmsg,
3589                                  struct fib6_config *cfg)
3590 {
3591         *cfg = (struct fib6_config){
3592                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3593                          : RT6_TABLE_MAIN,
3594                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3595                 .fc_metric = rtmsg->rtmsg_metric,
3596                 .fc_expires = rtmsg->rtmsg_info,
3597                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3598                 .fc_src_len = rtmsg->rtmsg_src_len,
3599                 .fc_flags = rtmsg->rtmsg_flags,
3600                 .fc_type = rtmsg->rtmsg_type,
3601
3602                 .fc_nlinfo.nl_net = net,
3603
3604                 .fc_dst = rtmsg->rtmsg_dst,
3605                 .fc_src = rtmsg->rtmsg_src,
3606                 .fc_gateway = rtmsg->rtmsg_gateway,
3607         };
3608 }
3609
3610 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3611 {
3612         struct fib6_config cfg;
3613         struct in6_rtmsg rtmsg;
3614         int err;
3615
3616         switch (cmd) {
3617         case SIOCADDRT:         /* Add a route */
3618         case SIOCDELRT:         /* Delete a route */
3619                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3620                         return -EPERM;
3621                 err = copy_from_user(&rtmsg, arg,
3622                                      sizeof(struct in6_rtmsg));
3623                 if (err)
3624                         return -EFAULT;
3625
3626                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3627
3628                 rtnl_lock();
3629                 switch (cmd) {
3630                 case SIOCADDRT:
3631                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3632                         break;
3633                 case SIOCDELRT:
3634                         err = ip6_route_del(&cfg, NULL);
3635                         break;
3636                 default:
3637                         err = -EINVAL;
3638                 }
3639                 rtnl_unlock();
3640
3641                 return err;
3642         }
3643
3644         return -EINVAL;
3645 }
3646
3647 /*
3648  *      Drop the packet on the floor
3649  */
3650
3651 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3652 {
3653         int type;
3654         struct dst_entry *dst = skb_dst(skb);
3655         switch (ipstats_mib_noroutes) {
3656         case IPSTATS_MIB_INNOROUTES:
3657                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3658                 if (type == IPV6_ADDR_ANY) {
3659                         IP6_INC_STATS(dev_net(dst->dev),
3660                                       __in6_dev_get_safely(skb->dev),
3661                                       IPSTATS_MIB_INADDRERRORS);
3662                         break;
3663                 }
3664                 /* FALLTHROUGH */
3665         case IPSTATS_MIB_OUTNOROUTES:
3666                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3667                               ipstats_mib_noroutes);
3668                 break;
3669         }
3670         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3671         kfree_skb(skb);
3672         return 0;
3673 }
3674
3675 static int ip6_pkt_discard(struct sk_buff *skb)
3676 {
3677         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3678 }
3679
3680 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3681 {
3682         skb->dev = skb_dst(skb)->dev;
3683         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3684 }
3685
3686 static int ip6_pkt_prohibit(struct sk_buff *skb)
3687 {
3688         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3689 }
3690
3691 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3692 {
3693         skb->dev = skb_dst(skb)->dev;
3694         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3695 }
3696
3697 /*
3698  *      Allocate a dst for local (unicast / anycast) address.
3699  */
3700
3701 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3702                                      struct inet6_dev *idev,
3703                                      const struct in6_addr *addr,
3704                                      bool anycast, gfp_t gfp_flags)
3705 {
3706         u32 tb_id;
3707         struct net_device *dev = idev->dev;
3708         struct fib6_info *f6i;
3709
3710         f6i = fib6_info_alloc(gfp_flags);
3711         if (!f6i)
3712                 return ERR_PTR(-ENOMEM);
3713
3714         f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3715         f6i->dst_nocount = true;
3716         f6i->dst_host = true;
3717         f6i->fib6_protocol = RTPROT_KERNEL;
3718         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3719         if (anycast) {
3720                 f6i->fib6_type = RTN_ANYCAST;
3721                 f6i->fib6_flags |= RTF_ANYCAST;
3722         } else {
3723                 f6i->fib6_type = RTN_LOCAL;
3724                 f6i->fib6_flags |= RTF_LOCAL;
3725         }
3726
3727         f6i->fib6_nh.nh_gw = *addr;
3728         dev_hold(dev);
3729         f6i->fib6_nh.nh_dev = dev;
3730         f6i->fib6_dst.addr = *addr;
3731         f6i->fib6_dst.plen = 128;
3732         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3733         f6i->fib6_table = fib6_get_table(net, tb_id);
3734
3735         return f6i;
3736 }
3737
/* Walker argument used to remove a deleted address from prefsrc entries. */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace whose null entry is skipped */
	struct in6_addr *addr;		/* address compared with fib6_prefsrc */
};
3744
3745 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3746 {
3747         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3748         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3749         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3750
3751         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3752             rt != net->ipv6.fib6_null_entry &&
3753             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3754                 spin_lock_bh(&rt6_exception_lock);
3755                 /* remove prefsrc entry */
3756                 rt->fib6_prefsrc.plen = 0;
3757                 spin_unlock_bh(&rt6_exception_lock);
3758         }
3759         return 0;
3760 }
3761
3762 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3763 {
3764         struct net *net = dev_net(ifp->idev->dev);
3765         struct arg_dev_net_ip adni = {
3766                 .dev = ifp->idev->dev,
3767                 .net = net,
3768                 .addr = &ifp->addr,
3769         };
3770         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3771 }
3772
3773 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3774
3775 /* Remove routers and update dst entries when gateway turn into host. */
3776 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3777 {
3778         struct in6_addr *gateway = (struct in6_addr *)arg;
3779
3780         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3781             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3782                 return -1;
3783         }
3784
3785         /* Further clean up cached routes in exception table.
3786          * This is needed because cached route may have a different
3787          * gateway than its 'parent' in the case of an ip redirect.
3788          */
3789         rt6_exceptions_clean_tohost(rt, gateway);
3790
3791         return 0;
3792 }
3793
3794 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3795 {
3796         fib6_clean_all(net, fib6_clean_tohost, gateway);
3797 }
3798
/* Walker argument for the device up/down handlers.  The anonymous union
 * holds the nexthop flags for fib6_ifup() or the netdev event for
 * fib6_ifdown().
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event is about */
	union {
		unsigned int nh_flags;	/* read by fib6_ifup() */
		unsigned long event;	/* read by fib6_ifdown() */
	};
};
3806
3807 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3808 {
3809         struct fib6_info *iter;
3810         struct fib6_node *fn;
3811
3812         fn = rcu_dereference_protected(rt->fib6_node,
3813                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3814         iter = rcu_dereference_protected(fn->leaf,
3815                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3816         while (iter) {
3817                 if (iter->fib6_metric == rt->fib6_metric &&
3818                     rt6_qualify_for_ecmp(iter))
3819                         return iter;
3820                 iter = rcu_dereference_protected(iter->fib6_next,
3821                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3822         }
3823
3824         return NULL;
3825 }
3826
3827 static bool rt6_is_dead(const struct fib6_info *rt)
3828 {
3829         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3830             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3831              fib6_ignore_linkdown(rt)))
3832                 return true;
3833
3834         return false;
3835 }
3836
3837 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3838 {
3839         struct fib6_info *iter;
3840         int total = 0;
3841
3842         if (!rt6_is_dead(rt))
3843                 total += rt->fib6_nh.nh_weight;
3844
3845         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3846                 if (!rt6_is_dead(iter))
3847                         total += iter->fib6_nh.nh_weight;
3848         }
3849
3850         return total;
3851 }
3852
3853 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3854 {
3855         int upper_bound = -1;
3856
3857         if (!rt6_is_dead(rt)) {
3858                 *weight += rt->fib6_nh.nh_weight;
3859                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3860                                                     total) - 1;
3861         }
3862         atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3863 }
3864
3865 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3866 {
3867         struct fib6_info *iter;
3868         int weight = 0;
3869
3870         rt6_upper_bound_set(rt, &weight, total);
3871
3872         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3873                 rt6_upper_bound_set(iter, &weight, total);
3874 }
3875
3876 void rt6_multipath_rebalance(struct fib6_info *rt)
3877 {
3878         struct fib6_info *first;
3879         int total;
3880
3881         /* In case the entire multipath route was marked for flushing,
3882          * then there is no need to rebalance upon the removal of every
3883          * sibling route.
3884          */
3885         if (!rt->fib6_nsiblings || rt->should_flush)
3886                 return;
3887
3888         /* During lookup routes are evaluated in order, so we need to
3889          * make sure upper bounds are assigned from the first sibling
3890          * onwards.
3891          */
3892         first = rt6_multipath_first_sibling(rt);
3893         if (WARN_ON_ONCE(!first))
3894                 return;
3895
3896         total = rt6_multipath_total_weight(first);
3897         rt6_multipath_upper_bound_set(first, total);
3898 }
3899
3900 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3901 {
3902         const struct arg_netdev_event *arg = p_arg;
3903         struct net *net = dev_net(arg->dev);
3904
3905         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3906                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3907                 fib6_update_sernum_upto_root(net, rt);
3908                 rt6_multipath_rebalance(rt);
3909         }
3910
3911         return 0;
3912 }
3913
3914 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3915 {
3916         struct arg_netdev_event arg = {
3917                 .dev = dev,
3918                 {
3919                         .nh_flags = nh_flags,
3920                 },
3921         };
3922
3923         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3924                 arg.nh_flags |= RTNH_F_LINKDOWN;
3925
3926         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3927 }
3928
3929 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3930                                    const struct net_device *dev)
3931 {
3932         struct fib6_info *iter;
3933
3934         if (rt->fib6_nh.nh_dev == dev)
3935                 return true;
3936         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3937                 if (iter->fib6_nh.nh_dev == dev)
3938                         return true;
3939
3940         return false;
3941 }
3942
3943 static void rt6_multipath_flush(struct fib6_info *rt)
3944 {
3945         struct fib6_info *iter;
3946
3947         rt->should_flush = 1;
3948         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3949                 iter->should_flush = 1;
3950 }
3951
3952 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3953                                              const struct net_device *down_dev)
3954 {
3955         struct fib6_info *iter;
3956         unsigned int dead = 0;
3957
3958         if (rt->fib6_nh.nh_dev == down_dev ||
3959             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3960                 dead++;
3961         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3962                 if (iter->fib6_nh.nh_dev == down_dev ||
3963                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3964                         dead++;
3965
3966         return dead;
3967 }
3968
3969 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3970                                        const struct net_device *dev,
3971                                        unsigned int nh_flags)
3972 {
3973         struct fib6_info *iter;
3974
3975         if (rt->fib6_nh.nh_dev == dev)
3976                 rt->fib6_nh.nh_flags |= nh_flags;
3977         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3978                 if (iter->fib6_nh.nh_dev == dev)
3979                         iter->fib6_nh.nh_flags |= nh_flags;
3980 }
3981
3982 /* called with write lock held for table with rt */
3983 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3984 {
3985         const struct arg_netdev_event *arg = p_arg;
3986         const struct net_device *dev = arg->dev;
3987         struct net *net = dev_net(dev);
3988
3989         if (rt == net->ipv6.fib6_null_entry)
3990                 return 0;
3991
3992         switch (arg->event) {
3993         case NETDEV_UNREGISTER:
3994                 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3995         case NETDEV_DOWN:
3996                 if (rt->should_flush)
3997                         return -1;
3998                 if (!rt->fib6_nsiblings)
3999                         return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4000                 if (rt6_multipath_uses_dev(rt, dev)) {
4001                         unsigned int count;
4002
4003                         count = rt6_multipath_dead_count(rt, dev);
4004                         if (rt->fib6_nsiblings + 1 == count) {
4005                                 rt6_multipath_flush(rt);
4006                                 return -1;
4007                         }
4008                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4009                                                    RTNH_F_LINKDOWN);
4010                         fib6_update_sernum(net, rt);
4011                         rt6_multipath_rebalance(rt);
4012                 }
4013                 return -2;
4014         case NETDEV_CHANGE:
4015                 if (rt->fib6_nh.nh_dev != dev ||
4016                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4017                         break;
4018                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4019                 rt6_multipath_rebalance(rt);
4020                 break;
4021         }
4022
4023         return 0;
4024 }
4025
4026 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4027 {
4028         struct arg_netdev_event arg = {
4029                 .dev = dev,
4030                 {
4031                         .event = event,
4032                 },
4033         };
4034         struct net *net = dev_net(dev);
4035
4036         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4037                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4038         else
4039                 fib6_clean_all(net, fib6_ifdown, &arg);
4040 }
4041
4042 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4043 {
4044         rt6_sync_down_dev(dev, event);
4045         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4046         neigh_ifdown(&nd_tbl, dev);
4047 }
4048
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
4053
4054 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4055 {
4056         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4057         struct inet6_dev *idev;
4058
4059         /* In IPv6 pmtu discovery is not optional,
4060            so that RTAX_MTU lock cannot disable it.
4061            We still use this lock to block changes
4062            caused by addrconf/ndisc.
4063         */
4064
4065         idev = __in6_dev_get(arg->dev);
4066         if (!idev)
4067                 return 0;
4068
4069         /* For administrative MTU increase, there is no way to discover
4070            IPv6 PMTU increase, so PMTU increase should be updated here.
4071            Since RFC 1981 doesn't include administrative MTU increase
4072            update PMTU increase is a MUST. (i.e. jumbo frame)
4073          */
4074         if (rt->fib6_nh.nh_dev == arg->dev &&
4075             !fib6_metric_locked(rt, RTAX_MTU)) {
4076                 u32 mtu = rt->fib6_pmtu;
4077
4078                 if (mtu >= arg->mtu ||
4079                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4080                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4081
4082                 spin_lock_bh(&rt6_exception_lock);
4083                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4084                 spin_unlock_bh(&rt6_exception_lock);
4085         }
4086         return 0;
4087 }
4088
4089 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4090 {
4091         struct rt6_mtu_change_arg arg = {
4092                 .dev = dev,
4093                 .mtu = mtu,
4094         };
4095
4096         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4097 }
4098
4099 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4100         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4101         [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
4102         [RTA_OIF]               = { .type = NLA_U32 },
4103         [RTA_IIF]               = { .type = NLA_U32 },
4104         [RTA_PRIORITY]          = { .type = NLA_U32 },
4105         [RTA_METRICS]           = { .type = NLA_NESTED },
4106         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
4107         [RTA_PREF]              = { .type = NLA_U8 },
4108         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
4109         [RTA_ENCAP]             = { .type = NLA_NESTED },
4110         [RTA_EXPIRES]           = { .type = NLA_U32 },
4111         [RTA_UID]               = { .type = NLA_U32 },
4112         [RTA_MARK]              = { .type = NLA_U32 },
4113         [RTA_TABLE]             = { .type = NLA_U32 },
4114         [RTA_IP_PROTO]          = { .type = NLA_U8 },
4115         [RTA_SPORT]             = { .type = NLA_U16 },
4116         [RTA_DPORT]             = { .type = NLA_U16 },
4117 };
4118
4119 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4120                               struct fib6_config *cfg,
4121                               struct netlink_ext_ack *extack)
4122 {
4123         struct rtmsg *rtm;
4124         struct nlattr *tb[RTA_MAX+1];
4125         unsigned int pref;
4126         int err;
4127
4128         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4129                           extack);
4130         if (err < 0)
4131                 goto errout;
4132
4133         err = -EINVAL;
4134         rtm = nlmsg_data(nlh);
4135
4136         *cfg = (struct fib6_config){
4137                 .fc_table = rtm->rtm_table,
4138                 .fc_dst_len = rtm->rtm_dst_len,
4139                 .fc_src_len = rtm->rtm_src_len,
4140                 .fc_flags = RTF_UP,
4141                 .fc_protocol = rtm->rtm_protocol,
4142                 .fc_type = rtm->rtm_type,
4143
4144                 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4145                 .fc_nlinfo.nlh = nlh,
4146                 .fc_nlinfo.nl_net = sock_net(skb->sk),
4147         };
4148
4149         if (rtm->rtm_type == RTN_UNREACHABLE ||
4150             rtm->rtm_type == RTN_BLACKHOLE ||
4151             rtm->rtm_type == RTN_PROHIBIT ||
4152             rtm->rtm_type == RTN_THROW)
4153                 cfg->fc_flags |= RTF_REJECT;
4154
4155         if (rtm->rtm_type == RTN_LOCAL)
4156                 cfg->fc_flags |= RTF_LOCAL;
4157
4158         if (rtm->rtm_flags & RTM_F_CLONED)
4159                 cfg->fc_flags |= RTF_CACHE;
4160
4161         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4162
4163         if (tb[RTA_GATEWAY]) {
4164                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4165                 cfg->fc_flags |= RTF_GATEWAY;
4166         }
4167
4168         if (tb[RTA_DST]) {
4169                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4170
4171                 if (nla_len(tb[RTA_DST]) < plen)
4172                         goto errout;
4173
4174                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4175         }
4176
4177         if (tb[RTA_SRC]) {
4178                 int plen = (rtm->rtm_src_len + 7) >> 3;
4179
4180                 if (nla_len(tb[RTA_SRC]) < plen)
4181                         goto errout;
4182
4183                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4184         }
4185
4186         if (tb[RTA_PREFSRC])
4187                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4188
4189         if (tb[RTA_OIF])
4190                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4191
4192         if (tb[RTA_PRIORITY])
4193                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4194
4195         if (tb[RTA_METRICS]) {
4196                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4197                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4198         }
4199
4200         if (tb[RTA_TABLE])
4201                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4202
4203         if (tb[RTA_MULTIPATH]) {
4204                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4205                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4206
4207                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4208                                                      cfg->fc_mp_len, extack);
4209                 if (err < 0)
4210                         goto errout;
4211         }
4212
4213         if (tb[RTA_PREF]) {
4214                 pref = nla_get_u8(tb[RTA_PREF]);
4215                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4216                     pref != ICMPV6_ROUTER_PREF_HIGH)
4217                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4218                 cfg->fc_flags |= RTF_PREF(pref);
4219         }
4220
4221         if (tb[RTA_ENCAP])
4222                 cfg->fc_encap = tb[RTA_ENCAP];
4223
4224         if (tb[RTA_ENCAP_TYPE]) {
4225                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4226
4227                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4228                 if (err < 0)
4229                         goto errout;
4230         }
4231
4232         if (tb[RTA_EXPIRES]) {
4233                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4234
4235                 if (addrconf_finite_timeout(timeout)) {
4236                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4237                         cfg->fc_flags |= RTF_EXPIRES;
4238                 }
4239         }
4240
4241         err = 0;
4242 errout:
4243         return err;
4244 }
4245
/* One staged nexthop of a multipath request. Entries are chained on a
 * local list while ip6_route_multipath_add() builds and inserts the
 * individual routes.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route created for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request config */
	struct list_head next;		/* link in the staging list */
};
4251
4252 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4253 {
4254         struct rt6_nh *nh;
4255
4256         list_for_each_entry(nh, rt6_nh_list, next) {
4257                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4258                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4259                         nh->r_cfg.fc_ifindex);
4260         }
4261 }
4262
4263 static int ip6_route_info_append(struct net *net,
4264                                  struct list_head *rt6_nh_list,
4265                                  struct fib6_info *rt,
4266                                  struct fib6_config *r_cfg)
4267 {
4268         struct rt6_nh *nh;
4269         int err = -EEXIST;
4270
4271         list_for_each_entry(nh, rt6_nh_list, next) {
4272                 /* check if fib6_info already exists */
4273                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4274                         return err;
4275         }
4276
4277         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4278         if (!nh)
4279                 return -ENOMEM;
4280         nh->fib6_info = rt;
4281         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4282         list_add_tail(&nh->next, rt6_nh_list);
4283
4284         return 0;
4285 }
4286
4287 static void ip6_route_mpath_notify(struct fib6_info *rt,
4288                                    struct fib6_info *rt_last,
4289                                    struct nl_info *info,
4290                                    __u16 nlflags)
4291 {
4292         /* if this is an APPEND route, then rt points to the first route
4293          * inserted and rt_last points to last route inserted. Userspace
4294          * wants a consistent dump of the route which starts at the first
4295          * nexthop. Since sibling routes are always added at the end of
4296          * the list, find the first sibling of the last route appended
4297          */
4298         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4299                 rt = list_first_entry(&rt_last->fib6_siblings,
4300                                       struct fib6_info,
4301                                       fib6_siblings);
4302         }
4303
4304         if (rt)
4305                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4306 }
4307
4308 static int ip6_route_multipath_add(struct fib6_config *cfg,
4309                                    struct netlink_ext_ack *extack)
4310 {
4311         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4312         struct nl_info *info = &cfg->fc_nlinfo;
4313         struct fib6_config r_cfg;
4314         struct rtnexthop *rtnh;
4315         struct fib6_info *rt;
4316         struct rt6_nh *err_nh;
4317         struct rt6_nh *nh, *nh_safe;
4318         __u16 nlflags;
4319         int remaining;
4320         int attrlen;
4321         int err = 1;
4322         int nhn = 0;
4323         int replace = (cfg->fc_nlinfo.nlh &&
4324                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4325         LIST_HEAD(rt6_nh_list);
4326
4327         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4328         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4329                 nlflags |= NLM_F_APPEND;
4330
4331         remaining = cfg->fc_mp_len;
4332         rtnh = (struct rtnexthop *)cfg->fc_mp;
4333
4334         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4335          * fib6_info structs per nexthop
4336          */
4337         while (rtnh_ok(rtnh, remaining)) {
4338                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4339                 if (rtnh->rtnh_ifindex)
4340                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4341
4342                 attrlen = rtnh_attrlen(rtnh);
4343                 if (attrlen > 0) {
4344                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4345
4346                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4347                         if (nla) {
4348                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4349                                 r_cfg.fc_flags |= RTF_GATEWAY;
4350                         }
4351                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4352                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4353                         if (nla)
4354                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4355                 }
4356
4357                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4358                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4359                 if (IS_ERR(rt)) {
4360                         err = PTR_ERR(rt);
4361                         rt = NULL;
4362                         goto cleanup;
4363                 }
4364                 if (!rt6_qualify_for_ecmp(rt)) {
4365                         err = -EINVAL;
4366                         NL_SET_ERR_MSG(extack,
4367                                        "Device only routes can not be added for IPv6 using the multipath API.");
4368                         fib6_info_release(rt);
4369                         goto cleanup;
4370                 }
4371
4372                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4373
4374                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4375                                             rt, &r_cfg);
4376                 if (err) {
4377                         fib6_info_release(rt);
4378                         goto cleanup;
4379                 }
4380
4381                 rtnh = rtnh_next(rtnh, &remaining);
4382         }
4383
4384         /* for add and replace send one notification with all nexthops.
4385          * Skip the notification in fib6_add_rt2node and send one with
4386          * the full route when done
4387          */
4388         info->skip_notify = 1;
4389
4390         err_nh = NULL;
4391         list_for_each_entry(nh, &rt6_nh_list, next) {
4392                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4393                 fib6_info_release(nh->fib6_info);
4394
4395                 if (!err) {
4396                         /* save reference to last route successfully inserted */
4397                         rt_last = nh->fib6_info;
4398
4399                         /* save reference to first route for notification */
4400                         if (!rt_notif)
4401                                 rt_notif = nh->fib6_info;
4402                 }
4403
4404                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4405                 nh->fib6_info = NULL;
4406                 if (err) {
4407                         if (replace && nhn)
4408                                 ip6_print_replace_route_err(&rt6_nh_list);
4409                         err_nh = nh;
4410                         goto add_errout;
4411                 }
4412
4413                 /* Because each route is added like a single route we remove
4414                  * these flags after the first nexthop: if there is a collision,
4415                  * we have already failed to add the first nexthop:
4416                  * fib6_add_rt2node() has rejected it; when replacing, old
4417                  * nexthops have been replaced by first new, the rest should
4418                  * be added to it.
4419                  */
4420                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4421                                                      NLM_F_REPLACE);
4422                 nhn++;
4423         }
4424
4425         /* success ... tell user about new route */
4426         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4427         goto cleanup;
4428
4429 add_errout:
4430         /* send notification for routes that were added so that
4431          * the delete notifications sent by ip6_route_del are
4432          * coherent
4433          */
4434         if (rt_notif)
4435                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4436
4437         /* Delete routes that were already added */
4438         list_for_each_entry(nh, &rt6_nh_list, next) {
4439                 if (err_nh == nh)
4440                         break;
4441                 ip6_route_del(&nh->r_cfg, extack);
4442         }
4443
4444 cleanup:
4445         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4446                 if (nh->fib6_info)
4447                         fib6_info_release(nh->fib6_info);
4448                 list_del(&nh->next);
4449                 kfree(nh);
4450         }
4451
4452         return err;
4453 }
4454
4455 static int ip6_route_multipath_del(struct fib6_config *cfg,
4456                                    struct netlink_ext_ack *extack)
4457 {
4458         struct fib6_config r_cfg;
4459         struct rtnexthop *rtnh;
4460         int remaining;
4461         int attrlen;
4462         int err = 1, last_err = 0;
4463
4464         remaining = cfg->fc_mp_len;
4465         rtnh = (struct rtnexthop *)cfg->fc_mp;
4466
4467         /* Parse a Multipath Entry */
4468         while (rtnh_ok(rtnh, remaining)) {
4469                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4470                 if (rtnh->rtnh_ifindex)
4471                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4472
4473                 attrlen = rtnh_attrlen(rtnh);
4474                 if (attrlen > 0) {
4475                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4476
4477                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4478                         if (nla) {
4479                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4480                                 r_cfg.fc_flags |= RTF_GATEWAY;
4481                         }
4482                 }
4483                 err = ip6_route_del(&r_cfg, extack);
4484                 if (err)
4485                         last_err = err;
4486
4487                 rtnh = rtnh_next(rtnh, &remaining);
4488         }
4489
4490         return last_err;
4491 }
4492
4493 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4494                               struct netlink_ext_ack *extack)
4495 {
4496         struct fib6_config cfg;
4497         int err;
4498
4499         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4500         if (err < 0)
4501                 return err;
4502
4503         if (cfg.fc_mp)
4504                 return ip6_route_multipath_del(&cfg, extack);
4505         else {
4506                 cfg.fc_delete_all_nh = 1;
4507                 return ip6_route_del(&cfg, extack);
4508         }
4509 }
4510
4511 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4512                               struct netlink_ext_ack *extack)
4513 {
4514         struct fib6_config cfg;
4515         int err;
4516
4517         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4518         if (err < 0)
4519                 return err;
4520
4521         if (cfg.fc_mp)
4522                 return ip6_route_multipath_add(&cfg, extack);
4523         else
4524                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4525 }
4526
4527 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4528 {
4529         int nexthop_len = 0;
4530
4531         if (rt->fib6_nsiblings) {
4532                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4533                             + NLA_ALIGN(sizeof(struct rtnexthop))
4534                             + nla_total_size(16) /* RTA_GATEWAY */
4535                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4536
4537                 nexthop_len *= rt->fib6_nsiblings;
4538         }
4539
4540         return NLMSG_ALIGN(sizeof(struct rtmsg))
4541                + nla_total_size(16) /* RTA_SRC */
4542                + nla_total_size(16) /* RTA_DST */
4543                + nla_total_size(16) /* RTA_GATEWAY */
4544                + nla_total_size(16) /* RTA_PREFSRC */
4545                + nla_total_size(4) /* RTA_TABLE */
4546                + nla_total_size(4) /* RTA_IIF */
4547                + nla_total_size(4) /* RTA_OIF */
4548                + nla_total_size(4) /* RTA_PRIORITY */
4549                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4550                + nla_total_size(sizeof(struct rta_cacheinfo))
4551                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4552                + nla_total_size(1) /* RTA_PREF */
4553                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4554                + nexthop_len;
4555 }
4556
4557 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4558                             unsigned int *flags, bool skip_oif)
4559 {
4560         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4561                 *flags |= RTNH_F_DEAD;
4562
4563         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4564                 *flags |= RTNH_F_LINKDOWN;
4565
4566                 rcu_read_lock();
4567                 if (fib6_ignore_linkdown(rt))
4568                         *flags |= RTNH_F_DEAD;
4569                 rcu_read_unlock();
4570         }
4571
4572         if (rt->fib6_flags & RTF_GATEWAY) {
4573                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4574                         goto nla_put_failure;
4575         }
4576
4577         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4578         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4579                 *flags |= RTNH_F_OFFLOAD;
4580
4581         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4582         if (!skip_oif && rt->fib6_nh.nh_dev &&
4583             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4584                 goto nla_put_failure;
4585
4586         if (rt->fib6_nh.nh_lwtstate &&
4587             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4588                 goto nla_put_failure;
4589
4590         return 0;
4591
4592 nla_put_failure:
4593         return -EMSGSIZE;
4594 }
4595
4596 /* add multipath next hop */
4597 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4598 {
4599         const struct net_device *dev = rt->fib6_nh.nh_dev;
4600         struct rtnexthop *rtnh;
4601         unsigned int flags = 0;
4602
4603         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4604         if (!rtnh)
4605                 goto nla_put_failure;
4606
4607         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4608         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4609
4610         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4611                 goto nla_put_failure;
4612
4613         rtnh->rtnh_flags = flags;
4614
4615         /* length of rtnetlink header + attributes */
4616         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4617
4618         return 0;
4619
4620 nla_put_failure:
4621         return -EMSGSIZE;
4622 }
4623
4624 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4625                          struct fib6_info *rt, struct dst_entry *dst,
4626                          struct in6_addr *dest, struct in6_addr *src,
4627                          int iif, int type, u32 portid, u32 seq,
4628                          unsigned int flags)
4629 {
4630         struct rt6_info *rt6 = (struct rt6_info *)dst;
4631         struct rt6key *rt6_dst, *rt6_src;
4632         u32 *pmetrics, table, rt6_flags;
4633         struct nlmsghdr *nlh;
4634         struct rtmsg *rtm;
4635         long expires = 0;
4636
4637         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4638         if (!nlh)
4639                 return -EMSGSIZE;
4640
4641         if (rt6) {
4642                 rt6_dst = &rt6->rt6i_dst;
4643                 rt6_src = &rt6->rt6i_src;
4644                 rt6_flags = rt6->rt6i_flags;
4645         } else {
4646                 rt6_dst = &rt->fib6_dst;
4647                 rt6_src = &rt->fib6_src;
4648                 rt6_flags = rt->fib6_flags;
4649         }
4650
4651         rtm = nlmsg_data(nlh);
4652         rtm->rtm_family = AF_INET6;
4653         rtm->rtm_dst_len = rt6_dst->plen;
4654         rtm->rtm_src_len = rt6_src->plen;
4655         rtm->rtm_tos = 0;
4656         if (rt->fib6_table)
4657                 table = rt->fib6_table->tb6_id;
4658         else
4659                 table = RT6_TABLE_UNSPEC;
4660         rtm->rtm_table = table;
4661         if (nla_put_u32(skb, RTA_TABLE, table))
4662                 goto nla_put_failure;
4663
4664         rtm->rtm_type = rt->fib6_type;
4665         rtm->rtm_flags = 0;
4666         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4667         rtm->rtm_protocol = rt->fib6_protocol;
4668
4669         if (rt6_flags & RTF_CACHE)
4670                 rtm->rtm_flags |= RTM_F_CLONED;
4671
4672         if (dest) {
4673                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4674                         goto nla_put_failure;
4675                 rtm->rtm_dst_len = 128;
4676         } else if (rtm->rtm_dst_len)
4677                 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4678                         goto nla_put_failure;
4679 #ifdef CONFIG_IPV6_SUBTREES
4680         if (src) {
4681                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4682                         goto nla_put_failure;
4683                 rtm->rtm_src_len = 128;
4684         } else if (rtm->rtm_src_len &&
4685                    nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4686                 goto nla_put_failure;
4687 #endif
4688         if (iif) {
4689 #ifdef CONFIG_IPV6_MROUTE
4690                 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4691                         int err = ip6mr_get_route(net, skb, rtm, portid);
4692
4693                         if (err == 0)
4694                                 return 0;
4695                         if (err < 0)
4696                                 goto nla_put_failure;
4697                 } else
4698 #endif
4699                         if (nla_put_u32(skb, RTA_IIF, iif))
4700                                 goto nla_put_failure;
4701         } else if (dest) {
4702                 struct in6_addr saddr_buf;
4703                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4704                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4705                         goto nla_put_failure;
4706         }
4707
4708         if (rt->fib6_prefsrc.plen) {
4709                 struct in6_addr saddr_buf;
4710                 saddr_buf = rt->fib6_prefsrc.addr;
4711                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4712                         goto nla_put_failure;
4713         }
4714
4715         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4716         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4717                 goto nla_put_failure;
4718
4719         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4720                 goto nla_put_failure;
4721
4722         /* For multipath routes, walk the siblings list and add
4723          * each as a nexthop within RTA_MULTIPATH.
4724          */
4725         if (rt6) {
4726                 if (rt6_flags & RTF_GATEWAY &&
4727                     nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4728                         goto nla_put_failure;
4729
4730                 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4731                         goto nla_put_failure;
4732         } else if (rt->fib6_nsiblings) {
4733                 struct fib6_info *sibling, *next_sibling;
4734                 struct nlattr *mp;
4735
4736                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4737                 if (!mp)
4738                         goto nla_put_failure;
4739
4740                 if (rt6_add_nexthop(skb, rt) < 0)
4741                         goto nla_put_failure;
4742
4743                 list_for_each_entry_safe(sibling, next_sibling,
4744                                          &rt->fib6_siblings, fib6_siblings) {
4745                         if (rt6_add_nexthop(skb, sibling) < 0)
4746                                 goto nla_put_failure;
4747                 }
4748
4749                 nla_nest_end(skb, mp);
4750         } else {
4751                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4752                         goto nla_put_failure;
4753         }
4754
4755         if (rt6_flags & RTF_EXPIRES) {
4756                 expires = dst ? dst->expires : rt->expires;
4757                 expires -= jiffies;
4758         }
4759
4760         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4761                 goto nla_put_failure;
4762
4763         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4764                 goto nla_put_failure;
4765
4766
4767         nlmsg_end(skb, nlh);
4768         return 0;
4769
4770 nla_put_failure:
4771         nlmsg_cancel(skb, nlh);
4772         return -EMSGSIZE;
4773 }
4774
4775 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4776                                const struct net_device *dev)
4777 {
4778         if (f6i->fib6_nh.nh_dev == dev)
4779                 return true;
4780
4781         if (f6i->fib6_nsiblings) {
4782                 struct fib6_info *sibling, *next_sibling;
4783
4784                 list_for_each_entry_safe(sibling, next_sibling,
4785                                          &f6i->fib6_siblings, fib6_siblings) {
4786                         if (sibling->fib6_nh.nh_dev == dev)
4787                                 return true;
4788                 }
4789         }
4790
4791         return false;
4792 }
4793
4794 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4795 {
4796         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4797         struct fib_dump_filter *filter = &arg->filter;
4798         unsigned int flags = NLM_F_MULTI;
4799         struct net *net = arg->net;
4800
4801         if (rt == net->ipv6.fib6_null_entry)
4802                 return 0;
4803
4804         if ((filter->flags & RTM_F_PREFIX) &&
4805             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4806                 /* success since this is not a prefix route */
4807                 return 1;
4808         }
4809         if (filter->filter_set) {
4810                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4811                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4812                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4813                         return 1;
4814                 }
4815                 flags |= NLM_F_DUMP_FILTERED;
4816         }
4817
4818         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4819                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4820                              arg->cb->nlh->nlmsg_seq, flags);
4821 }
4822
4823 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4824                               struct netlink_ext_ack *extack)
4825 {
4826         struct net *net = sock_net(in_skb->sk);
4827         struct nlattr *tb[RTA_MAX+1];
4828         int err, iif = 0, oif = 0;
4829         struct fib6_info *from;
4830         struct dst_entry *dst;
4831         struct rt6_info *rt;
4832         struct sk_buff *skb;
4833         struct rtmsg *rtm;
4834         struct flowi6 fl6 = {};
4835         bool fibmatch;
4836
4837         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4838                           extack);
4839         if (err < 0)
4840                 goto errout;
4841
4842         err = -EINVAL;
4843         rtm = nlmsg_data(nlh);
4844         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4845         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4846
4847         if (tb[RTA_SRC]) {
4848                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4849                         goto errout;
4850
4851                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4852         }
4853
4854         if (tb[RTA_DST]) {
4855                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4856                         goto errout;
4857
4858                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4859         }
4860
4861         if (tb[RTA_IIF])
4862                 iif = nla_get_u32(tb[RTA_IIF]);
4863
4864         if (tb[RTA_OIF])
4865                 oif = nla_get_u32(tb[RTA_OIF]);
4866
4867         if (tb[RTA_MARK])
4868                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4869
4870         if (tb[RTA_UID])
4871                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4872                                            nla_get_u32(tb[RTA_UID]));
4873         else
4874                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4875
4876         if (tb[RTA_SPORT])
4877                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4878
4879         if (tb[RTA_DPORT])
4880                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4881
4882         if (tb[RTA_IP_PROTO]) {
4883                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4884                                                   &fl6.flowi6_proto, extack);
4885                 if (err)
4886                         goto errout;
4887         }
4888
4889         if (iif) {
4890                 struct net_device *dev;
4891                 int flags = 0;
4892
4893                 rcu_read_lock();
4894
4895                 dev = dev_get_by_index_rcu(net, iif);
4896                 if (!dev) {
4897                         rcu_read_unlock();
4898                         err = -ENODEV;
4899                         goto errout;
4900                 }
4901
4902                 fl6.flowi6_iif = iif;
4903
4904                 if (!ipv6_addr_any(&fl6.saddr))
4905                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4906
4907                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4908
4909                 rcu_read_unlock();
4910         } else {
4911                 fl6.flowi6_oif = oif;
4912
4913                 dst = ip6_route_output(net, NULL, &fl6);
4914         }
4915
4916
4917         rt = container_of(dst, struct rt6_info, dst);
4918         if (rt->dst.error) {
4919                 err = rt->dst.error;
4920                 ip6_rt_put(rt);
4921                 goto errout;
4922         }
4923
4924         if (rt == net->ipv6.ip6_null_entry) {
4925                 err = rt->dst.error;
4926                 ip6_rt_put(rt);
4927                 goto errout;
4928         }
4929
4930         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4931         if (!skb) {
4932                 ip6_rt_put(rt);
4933                 err = -ENOBUFS;
4934                 goto errout;
4935         }
4936
4937         skb_dst_set(skb, &rt->dst);
4938
4939         rcu_read_lock();
4940         from = rcu_dereference(rt->from);
4941
4942         if (fibmatch)
4943                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4944                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4945                                     nlh->nlmsg_seq, 0);
4946         else
4947                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4948                                     &fl6.saddr, iif, RTM_NEWROUTE,
4949                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4950                                     0);
4951         rcu_read_unlock();
4952
4953         if (err < 0) {
4954                 kfree_skb(skb);
4955                 goto errout;
4956         }
4957
4958         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4959 errout:
4960         return err;
4961 }
4962
4963 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4964                      unsigned int nlm_flags)
4965 {
4966         struct sk_buff *skb;
4967         struct net *net = info->nl_net;
4968         u32 seq;
4969         int err;
4970
4971         err = -ENOBUFS;
4972         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4973
4974         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4975         if (!skb)
4976                 goto errout;
4977
4978         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4979                             event, info->portid, seq, nlm_flags);
4980         if (err < 0) {
4981                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4982                 WARN_ON(err == -EMSGSIZE);
4983                 kfree_skb(skb);
4984                 goto errout;
4985         }
4986         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4987                     info->nlh, gfp_any());
4988         return;
4989 errout:
4990         if (err < 0)
4991                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4992 }
4993
4994 static int ip6_route_dev_notify(struct notifier_block *this,
4995                                 unsigned long event, void *ptr)
4996 {
4997         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4998         struct net *net = dev_net(dev);
4999
5000         if (!(dev->flags & IFF_LOOPBACK))
5001                 return NOTIFY_OK;
5002
5003         if (event == NETDEV_REGISTER) {
5004                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5005                 net->ipv6.ip6_null_entry->dst.dev = dev;
5006                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5007 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5008                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5009                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5010                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5011                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5012 #endif
5013          } else if (event == NETDEV_UNREGISTER &&
5014                     dev->reg_state != NETREG_UNREGISTERED) {
5015                 /* NETDEV_UNREGISTER could be fired for multiple times by
5016                  * netdev_wait_allrefs(). Make sure we only call this once.
5017                  */
5018                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5019 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5020                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5021                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5022 #endif
5023         }
5024
5025         return NOTIFY_OK;
5026 }
5027
5028 /*
5029  *      /proc
5030  */
5031
5032 #ifdef CONFIG_PROC_FS
5033 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5034 {
5035         struct net *net = (struct net *)seq->private;
5036         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5037                    net->ipv6.rt6_stats->fib_nodes,
5038                    net->ipv6.rt6_stats->fib_route_nodes,
5039                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5040                    net->ipv6.rt6_stats->fib_rt_entries,
5041                    net->ipv6.rt6_stats->fib_rt_cache,
5042                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5043                    net->ipv6.rt6_stats->fib_discarded_routes);
5044
5045         return 0;
5046 }
5047 #endif  /* CONFIG_PROC_FS */
5048
5049 #ifdef CONFIG_SYSCTL
5050
5051 static
5052 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5053                               void __user *buffer, size_t *lenp, loff_t *ppos)
5054 {
5055         struct net *net;
5056         int delay;
5057         if (!write)
5058                 return -EINVAL;
5059
5060         net = (struct net *)ctl->extra1;
5061         delay = net->ipv6.sysctl.flush_delay;
5062         proc_dointvec(ctl, write, buffer, lenp, ppos);
5063         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5064         return 0;
5065 }
5066
5067 static int zero;
5068 static int one = 1;
5069
5070 static struct ctl_table ipv6_route_table_template[] = {
5071         {
5072                 .procname       =       "flush",
5073                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5074                 .maxlen         =       sizeof(int),
5075                 .mode           =       0200,
5076                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5077         },
5078         {
5079                 .procname       =       "gc_thresh",
5080                 .data           =       &ip6_dst_ops_template.gc_thresh,
5081                 .maxlen         =       sizeof(int),
5082                 .mode           =       0644,
5083                 .proc_handler   =       proc_dointvec,
5084         },
5085         {
5086                 .procname       =       "max_size",
5087                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5088                 .maxlen         =       sizeof(int),
5089                 .mode           =       0644,
5090                 .proc_handler   =       proc_dointvec,
5091         },
5092         {
5093                 .procname       =       "gc_min_interval",
5094                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5095                 .maxlen         =       sizeof(int),
5096                 .mode           =       0644,
5097                 .proc_handler   =       proc_dointvec_jiffies,
5098         },
5099         {
5100                 .procname       =       "gc_timeout",
5101                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5102                 .maxlen         =       sizeof(int),
5103                 .mode           =       0644,
5104                 .proc_handler   =       proc_dointvec_jiffies,
5105         },
5106         {
5107                 .procname       =       "gc_interval",
5108                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5109                 .maxlen         =       sizeof(int),
5110                 .mode           =       0644,
5111                 .proc_handler   =       proc_dointvec_jiffies,
5112         },
5113         {
5114                 .procname       =       "gc_elasticity",
5115                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5116                 .maxlen         =       sizeof(int),
5117                 .mode           =       0644,
5118                 .proc_handler   =       proc_dointvec,
5119         },
5120         {
5121                 .procname       =       "mtu_expires",
5122                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5123                 .maxlen         =       sizeof(int),
5124                 .mode           =       0644,
5125                 .proc_handler   =       proc_dointvec_jiffies,
5126         },
5127         {
5128                 .procname       =       "min_adv_mss",
5129                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5130                 .maxlen         =       sizeof(int),
5131                 .mode           =       0644,
5132                 .proc_handler   =       proc_dointvec,
5133         },
5134         {
5135                 .procname       =       "gc_min_interval_ms",
5136                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5137                 .maxlen         =       sizeof(int),
5138                 .mode           =       0644,
5139                 .proc_handler   =       proc_dointvec_ms_jiffies,
5140         },
5141         {
5142                 .procname       =       "skip_notify_on_dev_down",
5143                 .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5144                 .maxlen         =       sizeof(int),
5145                 .mode           =       0644,
5146                 .proc_handler   =       proc_dointvec,
5147                 .extra1         =       &zero,
5148                 .extra2         =       &one,
5149         },
5150         { }
5151 };
5152
5153 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5154 {
5155         struct ctl_table *table;
5156
5157         table = kmemdup(ipv6_route_table_template,
5158                         sizeof(ipv6_route_table_template),
5159                         GFP_KERNEL);
5160
5161         if (table) {
5162                 table[0].data = &net->ipv6.sysctl.flush_delay;
5163                 table[0].extra1 = net;
5164                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5165                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5166                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5167                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5168                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5169                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5170                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5171                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5172                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5173                 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5174
5175                 /* Don't export sysctls to unprivileged users */
5176                 if (net->user_ns != &init_user_ns)
5177                         table[0].procname = NULL;
5178         }
5179
5180         return table;
5181 }
5182 #endif
5183
5184 static int __net_init ip6_route_net_init(struct net *net)
5185 {
5186         int ret = -ENOMEM;
5187
5188         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5189                sizeof(net->ipv6.ip6_dst_ops));
5190
5191         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5192                 goto out_ip6_dst_ops;
5193
5194         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5195                                             sizeof(*net->ipv6.fib6_null_entry),
5196                                             GFP_KERNEL);
5197         if (!net->ipv6.fib6_null_entry)
5198                 goto out_ip6_dst_entries;
5199
5200         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5201                                            sizeof(*net->ipv6.ip6_null_entry),
5202                                            GFP_KERNEL);
5203         if (!net->ipv6.ip6_null_entry)
5204                 goto out_fib6_null_entry;
5205         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5206         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5207                          ip6_template_metrics, true);
5208
5209 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5210         net->ipv6.fib6_has_custom_rules = false;
5211         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5212                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5213                                                GFP_KERNEL);
5214         if (!net->ipv6.ip6_prohibit_entry)
5215                 goto out_ip6_null_entry;
5216         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5217         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5218                          ip6_template_metrics, true);
5219
5220         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5221                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5222                                                GFP_KERNEL);
5223         if (!net->ipv6.ip6_blk_hole_entry)
5224                 goto out_ip6_prohibit_entry;
5225         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5226         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5227                          ip6_template_metrics, true);
5228 #endif
5229
5230         net->ipv6.sysctl.flush_delay = 0;
5231         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5232         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5233         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5234         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5235         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5236         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5237         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5238         net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5239
5240         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5241
5242         ret = 0;
5243 out:
5244         return ret;
5245
5246 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5247 out_ip6_prohibit_entry:
5248         kfree(net->ipv6.ip6_prohibit_entry);
5249 out_ip6_null_entry:
5250         kfree(net->ipv6.ip6_null_entry);
5251 #endif
5252 out_fib6_null_entry:
5253         kfree(net->ipv6.fib6_null_entry);
5254 out_ip6_dst_entries:
5255         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5256 out_ip6_dst_ops:
5257         goto out;
5258 }
5259
5260 static void __net_exit ip6_route_net_exit(struct net *net)
5261 {
5262         kfree(net->ipv6.fib6_null_entry);
5263         kfree(net->ipv6.ip6_null_entry);
5264 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5265         kfree(net->ipv6.ip6_prohibit_entry);
5266         kfree(net->ipv6.ip6_blk_hole_entry);
5267 #endif
5268         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5269 }
5270
5271 static int __net_init ip6_route_net_init_late(struct net *net)
5272 {
5273 #ifdef CONFIG_PROC_FS
5274         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5275                         sizeof(struct ipv6_route_iter));
5276         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5277                         rt6_stats_seq_show, NULL);
5278 #endif
5279         return 0;
5280 }
5281
5282 static void __net_exit ip6_route_net_exit_late(struct net *net)
5283 {
5284 #ifdef CONFIG_PROC_FS
5285         remove_proc_entry("ipv6_route", net->proc_net);
5286         remove_proc_entry("rt6_stats", net->proc_net);
5287 #endif
5288 }
5289
5290 static struct pernet_operations ip6_route_net_ops = {
5291         .init = ip6_route_net_init,
5292         .exit = ip6_route_net_exit,
5293 };
5294
5295 static int __net_init ipv6_inetpeer_init(struct net *net)
5296 {
5297         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5298
5299         if (!bp)
5300                 return -ENOMEM;
5301         inet_peer_base_init(bp);
5302         net->ipv6.peers = bp;
5303         return 0;
5304 }
5305
5306 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5307 {
5308         struct inet_peer_base *bp = net->ipv6.peers;
5309
5310         net->ipv6.peers = NULL;
5311         inetpeer_invalidate_tree(bp);
5312         kfree(bp);
5313 }
5314
5315 static struct pernet_operations ipv6_inetpeer_ops = {
5316         .init   =       ipv6_inetpeer_init,
5317         .exit   =       ipv6_inetpeer_exit,
5318 };
5319
5320 static struct pernet_operations ip6_route_net_late_ops = {
5321         .init = ip6_route_net_init_late,
5322         .exit = ip6_route_net_exit_late,
5323 };
5324
5325 static struct notifier_block ip6_route_dev_notifier = {
5326         .notifier_call = ip6_route_dev_notify,
5327         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5328 };
5329
5330 void __init ip6_route_init_special_entries(void)
5331 {
5332         /* Registering of the loopback is done before this portion of code,
5333          * the loopback reference in rt6_info will not be taken, do it
5334          * manually for init_net */
5335         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5336         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5337         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5338   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5339         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5340         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5341         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5342         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5343   #endif
5344 }
5345
5346 int __init ip6_route_init(void)
5347 {
5348         int ret;
5349         int cpu;
5350
5351         ret = -ENOMEM;
5352         ip6_dst_ops_template.kmem_cachep =
5353                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5354                                   SLAB_HWCACHE_ALIGN, NULL);
5355         if (!ip6_dst_ops_template.kmem_cachep)
5356                 goto out;
5357
5358         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5359         if (ret)
5360                 goto out_kmem_cache;
5361
5362         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5363         if (ret)
5364                 goto out_dst_entries;
5365
5366         ret = register_pernet_subsys(&ip6_route_net_ops);
5367         if (ret)
5368                 goto out_register_inetpeer;
5369
5370         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5371
5372         ret = fib6_init();
5373         if (ret)
5374                 goto out_register_subsys;
5375
5376         ret = xfrm6_init();
5377         if (ret)
5378                 goto out_fib6_init;
5379
5380         ret = fib6_rules_init();
5381         if (ret)
5382                 goto xfrm6_init;
5383
5384         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5385         if (ret)
5386                 goto fib6_rules_init;
5387
5388         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5389                                    inet6_rtm_newroute, NULL, 0);
5390         if (ret < 0)
5391                 goto out_register_late_subsys;
5392
5393         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5394                                    inet6_rtm_delroute, NULL, 0);
5395         if (ret < 0)
5396                 goto out_register_late_subsys;
5397
5398         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5399                                    inet6_rtm_getroute, NULL,
5400                                    RTNL_FLAG_DOIT_UNLOCKED);
5401         if (ret < 0)
5402                 goto out_register_late_subsys;
5403
5404         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5405         if (ret)
5406                 goto out_register_late_subsys;
5407
5408         for_each_possible_cpu(cpu) {
5409                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5410
5411                 INIT_LIST_HEAD(&ul->head);
5412                 spin_lock_init(&ul->lock);
5413         }
5414
5415 out:
5416         return ret;
5417
5418 out_register_late_subsys:
5419         rtnl_unregister_all(PF_INET6);
5420         unregister_pernet_subsys(&ip6_route_net_late_ops);
5421 fib6_rules_init:
5422         fib6_rules_cleanup();
5423 xfrm6_init:
5424         xfrm6_fini();
5425 out_fib6_init:
5426         fib6_gc_cleanup();
5427 out_register_subsys:
5428         unregister_pernet_subsys(&ip6_route_net_ops);
5429 out_register_inetpeer:
5430         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5431 out_dst_entries:
5432         dst_entries_destroy(&ip6_dst_blackhole_ops);
5433 out_kmem_cache:
5434         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5435         goto out;
5436 }
5437
/*
 * Tear down the state established by ip6_route_init(): the netdevice
 * notifier, the late pernet operations, the fib6 rules, xfrm6, the fib6
 * garbage collector, the two remaining pernet subsystems, the blackhole
 * dst entry counters and finally the dst slab cache.
 *
 * NOTE(review): ipv6_inetpeer_ops is unregistered before
 * ip6_route_net_ops here, whereas the error path of ip6_route_init()
 * unregisters them in the opposite order -- confirm this is intentional.
 */
5438 void ip6_route_cleanup(void)
5439 {
5440         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5441         unregister_pernet_subsys(&ip6_route_net_late_ops);
5442         fib6_rules_cleanup();
5443         xfrm6_fini();
5444         fib6_gc_cleanup();
5445         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5446         unregister_pernet_subsys(&ip6_route_net_ops);
5447         dst_entries_destroy(&ip6_dst_blackhole_ops);
5448         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5449 }