Merge branches 'work.misc' and 'work.dcache' of git://git.kernel.org/pub/scm/linux...
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/*
 * Result of the next-hop reachability check done while scoring a route.
 * Negative values are failures and are returned as-is by rt6_score_route().
 */
enum rt6_nud_state {
	/* find_match() skips the route entirely */
	RT6_NUD_FAIL_HARD = -3,
	/* neighbour is in NUD_FAILED state (CONFIG_IPV6_ROUTER_PREF only) */
	RT6_NUD_FAIL_PROBE = -2,
	/* find_match() scores the route as 0 and requests round-robin */
	RT6_NUD_FAIL_DO_RR = -1,
	/* next hop is usable */
	RT6_NUD_SUCCEED = 1
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Helpers used by rt6_route_rcv() to add / look up Route Information routes. */
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix,
					    int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix,
					    int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
127
128 struct uncached_list {
129         spinlock_t              lock;
130         struct list_head        head;
131 };
132
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148         if (!list_empty(&rt->rt6i_uncached)) {
149                 struct uncached_list *ul = rt->rt6i_uncached_list;
150                 struct net *net = dev_net(rt->dst.dev);
151
152                 spin_lock_bh(&ul->lock);
153                 list_del(&rt->rt6i_uncached);
154                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155                 spin_unlock_bh(&ul->lock);
156         }
157 }
158
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161         struct net_device *loopback_dev = net->loopback_dev;
162         int cpu;
163
164         if (dev == loopback_dev)
165                 return;
166
167         for_each_possible_cpu(cpu) {
168                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169                 struct rt6_info *rt;
170
171                 spin_lock_bh(&ul->lock);
172                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173                         struct inet6_dev *rt_idev = rt->rt6i_idev;
174                         struct net_device *rt_dev = rt->dst.dev;
175
176                         if (rt_idev->dev == dev) {
177                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
178                                 in6_dev_put(rt_idev);
179                         }
180
181                         if (rt_dev == dev) {
182                                 rt->dst.dev = loopback_dev;
183                                 dev_hold(rt->dst.dev);
184                                 dev_put(rt_dev);
185                         }
186                 }
187                 spin_unlock_bh(&ul->lock);
188         }
189 }
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213         return neigh_create(&nd_tbl, daddr, dev);
214 }
215
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
217                                               struct sk_buff *skb,
218                                               const void *daddr)
219 {
220         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
221
222         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
223 }
224
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227         struct net_device *dev = dst->dev;
228         struct rt6_info *rt = (struct rt6_info *)dst;
229
230         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231         if (!daddr)
232                 return;
233         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234                 return;
235         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236                 return;
237         __ipv6_confirm_neigh(dev, daddr);
238 }
239
240 static struct dst_ops ip6_dst_ops_template = {
241         .family                 =       AF_INET6,
242         .gc                     =       ip6_dst_gc,
243         .gc_thresh              =       1024,
244         .check                  =       ip6_dst_check,
245         .default_advmss         =       ip6_default_advmss,
246         .mtu                    =       ip6_mtu,
247         .cow_metrics            =       dst_cow_metrics_generic,
248         .destroy                =       ip6_dst_destroy,
249         .ifdown                 =       ip6_dst_ifdown,
250         .negative_advice        =       ip6_negative_advice,
251         .link_failure           =       ip6_link_failure,
252         .update_pmtu            =       ip6_rt_update_pmtu,
253         .redirect               =       rt6_do_redirect,
254         .local_out              =       __ip6_local_out,
255         .neigh_lookup           =       ip6_dst_neigh_lookup,
256         .confirm_neigh          =       ip6_confirm_neigh,
257 };
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267                                          struct sk_buff *skb, u32 mtu)
268 {
269 }
270
/* dst_ops->redirect for blackhole dsts: intentionally does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* no-op */
}
275
276 static struct dst_ops ip6_dst_blackhole_ops = {
277         .family                 =       AF_INET6,
278         .destroy                =       ip6_dst_destroy,
279         .check                  =       ip6_dst_check,
280         .mtu                    =       ip6_blackhole_mtu,
281         .default_advmss         =       ip6_default_advmss,
282         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
283         .redirect               =       ip6_rt_blackhole_redirect,
284         .cow_metrics            =       dst_cow_metrics_generic,
285         .neigh_lookup           =       ip6_dst_neigh_lookup,
286 };
287
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289         [RTAX_HOPLIMIT - 1] = 0,
290 };
291
292 static const struct fib6_info fib6_null_entry_template = {
293         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
294         .fib6_protocol  = RTPROT_KERNEL,
295         .fib6_metric    = ~(u32)0,
296         .fib6_ref       = ATOMIC_INIT(1),
297         .fib6_type      = RTN_UNREACHABLE,
298         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
299 };
300
301 static const struct rt6_info ip6_null_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -ENETUNREACH,
307                 .input          = ip6_pkt_discard,
308                 .output         = ip6_pkt_discard_out,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311 };
312
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 * Template for "prohibit" routes: packets go to the ip6_pkt_prohibit
 * handlers and the dst reports -EACCES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
		.error		= -EACCES,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
	},
};

/*
 * Template for "blackhole" routes: packets go to the generic dst_discard
 * handlers and the dst reports -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.input		= dst_discard,
		.output		= dst_discard_out,
		.error		= -EINVAL,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
	},
};

#endif
340
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343         struct dst_entry *dst = &rt->dst;
344
345         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346         INIT_LIST_HEAD(&rt->rt6i_uncached);
347 }
348
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
351                                int flags)
352 {
353         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354                                         1, DST_OBSOLETE_FORCE_CHK, flags);
355
356         if (rt) {
357                 rt6_info_init(rt);
358                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
359         }
360
361         return rt;
362 }
363 EXPORT_SYMBOL(ip6_dst_alloc);
364
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367         struct rt6_info *rt = (struct rt6_info *)dst;
368         struct fib6_info *from;
369         struct inet6_dev *idev;
370
371         dst_destroy_metrics_generic(dst);
372         rt6_uncached_list_del(rt);
373
374         idev = rt->rt6i_idev;
375         if (idev) {
376                 rt->rt6i_idev = NULL;
377                 in6_dev_put(idev);
378         }
379
380         rcu_read_lock();
381         from = rcu_dereference(rt->from);
382         rcu_assign_pointer(rt->from, NULL);
383         fib6_info_release(from);
384         rcu_read_unlock();
385 }
386
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388                            int how)
389 {
390         struct rt6_info *rt = (struct rt6_info *)dst;
391         struct inet6_dev *idev = rt->rt6i_idev;
392         struct net_device *loopback_dev =
393                 dev_net(dev)->loopback_dev;
394
395         if (idev && idev->dev != loopback_dev) {
396                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397                 if (loopback_idev) {
398                         rt->rt6i_idev = loopback_idev;
399                         in6_dev_put(idev);
400                 }
401         }
402 }
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414         struct fib6_info *from;
415
416         from = rcu_dereference(rt->from);
417
418         if (rt->rt6i_flags & RTF_EXPIRES) {
419                 if (time_after(jiffies, rt->dst.expires))
420                         return true;
421         } else if (from) {
422                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423                         fib6_check_expired(from);
424         }
425         return false;
426 }
427
428 struct fib6_info *fib6_multipath_select(const struct net *net,
429                                         struct fib6_info *match,
430                                         struct flowi6 *fl6, int oif,
431                                         const struct sk_buff *skb,
432                                         int strict)
433 {
434         struct fib6_info *sibling, *next_sibling;
435
436         /* We might have already computed the hash for ICMPv6 errors. In such
437          * case it will always be non-zero. Otherwise now is the time to do it.
438          */
439         if (!fl6->mp_hash)
440                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
441
442         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
443                 return match;
444
445         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
446                                  fib6_siblings) {
447                 int nh_upper_bound;
448
449                 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
450                 if (fl6->mp_hash > nh_upper_bound)
451                         continue;
452                 if (rt6_score_route(sibling, oif, strict) < 0)
453                         break;
454                 match = sibling;
455                 break;
456         }
457
458         return match;
459 }
460
461 /*
462  *      Route lookup. rcu_read_lock() should be held.
463  */
464
465 static inline struct fib6_info *rt6_device_match(struct net *net,
466                                                  struct fib6_info *rt,
467                                                     const struct in6_addr *saddr,
468                                                     int oif,
469                                                     int flags)
470 {
471         struct fib6_info *sprt;
472
473         if (!oif && ipv6_addr_any(saddr) &&
474             !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
475                 return rt;
476
477         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
478                 const struct net_device *dev = sprt->fib6_nh.nh_dev;
479
480                 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
481                         continue;
482
483                 if (oif) {
484                         if (dev->ifindex == oif)
485                                 return sprt;
486                 } else {
487                         if (ipv6_chk_addr(net, saddr, dev,
488                                           flags & RT6_LOOKUP_F_IFACE))
489                                 return sprt;
490                 }
491         }
492
493         if (oif && flags & RT6_LOOKUP_F_IFACE)
494                 return net->ipv6.fib6_null_entry;
495
496         return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
497 }
498
499 #ifdef CONFIG_IPV6_ROUTER_PREF
500 struct __rt6_probe_work {
501         struct work_struct work;
502         struct in6_addr target;
503         struct net_device *dev;
504 };
505
506 static void rt6_probe_deferred(struct work_struct *w)
507 {
508         struct in6_addr mcaddr;
509         struct __rt6_probe_work *work =
510                 container_of(w, struct __rt6_probe_work, work);
511
512         addrconf_addr_solict_mult(&work->target, &mcaddr);
513         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
514         dev_put(work->dev);
515         kfree(work);
516 }
517
518 static void rt6_probe(struct fib6_info *rt)
519 {
520         struct __rt6_probe_work *work;
521         const struct in6_addr *nh_gw;
522         struct neighbour *neigh;
523         struct net_device *dev;
524
525         /*
526          * Okay, this does not seem to be appropriate
527          * for now, however, we need to check if it
528          * is really so; aka Router Reachability Probing.
529          *
530          * Router Reachability Probe MUST be rate-limited
531          * to no more than one per minute.
532          */
533         if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
534                 return;
535
536         nh_gw = &rt->fib6_nh.nh_gw;
537         dev = rt->fib6_nh.nh_dev;
538         rcu_read_lock_bh();
539         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
540         if (neigh) {
541                 struct inet6_dev *idev;
542
543                 if (neigh->nud_state & NUD_VALID)
544                         goto out;
545
546                 idev = __in6_dev_get(dev);
547                 work = NULL;
548                 write_lock(&neigh->lock);
549                 if (!(neigh->nud_state & NUD_VALID) &&
550                     time_after(jiffies,
551                                neigh->updated + idev->cnf.rtr_probe_interval)) {
552                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
553                         if (work)
554                                 __neigh_set_probe_once(neigh);
555                 }
556                 write_unlock(&neigh->lock);
557         } else {
558                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
559         }
560
561         if (work) {
562                 INIT_WORK(&work->work, rt6_probe_deferred);
563                 work->target = *nh_gw;
564                 dev_hold(dev);
565                 work->dev = dev;
566                 schedule_work(&work->work);
567         }
568
569 out:
570         rcu_read_unlock_bh();
571 }
572 #else
573 static inline void rt6_probe(struct fib6_info *rt)
574 {
575 }
576 #endif
577
578 /*
579  * Default Router Selection (RFC 2461 6.3.6)
580  */
581 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
582 {
583         const struct net_device *dev = rt->fib6_nh.nh_dev;
584
585         if (!oif || dev->ifindex == oif)
586                 return 2;
587         return 0;
588 }
589
590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
591 {
592         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
593         struct neighbour *neigh;
594
595         if (rt->fib6_flags & RTF_NONEXTHOP ||
596             !(rt->fib6_flags & RTF_GATEWAY))
597                 return RT6_NUD_SUCCEED;
598
599         rcu_read_lock_bh();
600         neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
601                                           &rt->fib6_nh.nh_gw);
602         if (neigh) {
603                 read_lock(&neigh->lock);
604                 if (neigh->nud_state & NUD_VALID)
605                         ret = RT6_NUD_SUCCEED;
606 #ifdef CONFIG_IPV6_ROUTER_PREF
607                 else if (!(neigh->nud_state & NUD_FAILED))
608                         ret = RT6_NUD_SUCCEED;
609                 else
610                         ret = RT6_NUD_FAIL_PROBE;
611 #endif
612                 read_unlock(&neigh->lock);
613         } else {
614                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
615                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
616         }
617         rcu_read_unlock_bh();
618
619         return ret;
620 }
621
622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
623 {
624         int m;
625
626         m = rt6_check_dev(rt, oif);
627         if (!m && (strict & RT6_LOOKUP_F_IFACE))
628                 return RT6_NUD_FAIL_HARD;
629 #ifdef CONFIG_IPV6_ROUTER_PREF
630         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
631 #endif
632         if (strict & RT6_LOOKUP_F_REACHABLE) {
633                 int n = rt6_check_neigh(rt);
634                 if (n < 0)
635                         return n;
636         }
637         return m;
638 }
639
640 /* called with rc_read_lock held */
641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
642 {
643         const struct net_device *dev = fib6_info_nh_dev(f6i);
644         bool rc = false;
645
646         if (dev) {
647                 const struct inet6_dev *idev = __in6_dev_get(dev);
648
649                 rc = !!idev->cnf.ignore_routes_with_linkdown;
650         }
651
652         return rc;
653 }
654
655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
656                                    int *mpri, struct fib6_info *match,
657                                    bool *do_rr)
658 {
659         int m;
660         bool match_do_rr = false;
661
662         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
663                 goto out;
664
665         if (fib6_ignore_linkdown(rt) &&
666             rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
667             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
668                 goto out;
669
670         if (fib6_check_expired(rt))
671                 goto out;
672
673         m = rt6_score_route(rt, oif, strict);
674         if (m == RT6_NUD_FAIL_DO_RR) {
675                 match_do_rr = true;
676                 m = 0; /* lowest valid score */
677         } else if (m == RT6_NUD_FAIL_HARD) {
678                 goto out;
679         }
680
681         if (strict & RT6_LOOKUP_F_REACHABLE)
682                 rt6_probe(rt);
683
684         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
685         if (m > *mpri) {
686                 *do_rr = match_do_rr;
687                 *mpri = m;
688                 match = rt;
689         }
690 out:
691         return match;
692 }
693
694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
695                                      struct fib6_info *leaf,
696                                      struct fib6_info *rr_head,
697                                      u32 metric, int oif, int strict,
698                                      bool *do_rr)
699 {
700         struct fib6_info *rt, *match, *cont;
701         int mpri = -1;
702
703         match = NULL;
704         cont = NULL;
705         for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
706                 if (rt->fib6_metric != metric) {
707                         cont = rt;
708                         break;
709                 }
710
711                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
712         }
713
714         for (rt = leaf; rt && rt != rr_head;
715              rt = rcu_dereference(rt->fib6_next)) {
716                 if (rt->fib6_metric != metric) {
717                         cont = rt;
718                         break;
719                 }
720
721                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
722         }
723
724         if (match || !cont)
725                 return match;
726
727         for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
728                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
729
730         return match;
731 }
732
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
734                                    int oif, int strict)
735 {
736         struct fib6_info *leaf = rcu_dereference(fn->leaf);
737         struct fib6_info *match, *rt0;
738         bool do_rr = false;
739         int key_plen;
740
741         if (!leaf || leaf == net->ipv6.fib6_null_entry)
742                 return net->ipv6.fib6_null_entry;
743
744         rt0 = rcu_dereference(fn->rr_ptr);
745         if (!rt0)
746                 rt0 = leaf;
747
748         /* Double check to make sure fn is not an intermediate node
749          * and fn->leaf does not points to its child's leaf
750          * (This might happen if all routes under fn are deleted from
751          * the tree and fib6_repair_tree() is called on the node.)
752          */
753         key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755         if (rt0->fib6_src.plen)
756                 key_plen = rt0->fib6_src.plen;
757 #endif
758         if (fn->fn_bit != key_plen)
759                 return net->ipv6.fib6_null_entry;
760
761         match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
762                              &do_rr);
763
764         if (do_rr) {
765                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
766
767                 /* no entries matched; do round-robin */
768                 if (!next || next->fib6_metric != rt0->fib6_metric)
769                         next = leaf;
770
771                 if (next != rt0) {
772                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
773                         /* make sure next is not being deleted from the tree */
774                         if (next->fib6_node)
775                                 rcu_assign_pointer(fn->rr_ptr, next);
776                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
777                 }
778         }
779
780         return match ? match : net->ipv6.fib6_null_entry;
781 }
782
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
786 }
787
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information option (RFC 4191) received on @dev.
 *
 * @dev:    device the option was received on
 * @opt:    start of the option (interpreted as struct route_info)
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as gateway
 *
 * A zero lifetime deletes a matching existing route.  A non-zero lifetime
 * creates the route if it does not exist, or refreshes the preference of
 * the existing one; the expiry is then set from the lifetime, or cleared
 * for an infinite lifetime.
 *
 * Returns 0 on success and -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length (RFC 4191, section 2.3).
	 * Length is in units of 8 octets and includes the option header, so
	 * the Prefix field holds 0, 8 or 16 octets for Length 1, 2 or 3:
	 *   Prefix Length > 64  requires Length 3,
	 *   Prefix Length > 0   requires Length 2 or 3.
	 * Accepting anything shorter would make the prefix handling below
	 * read prefix bits that are not part of the option.
	 */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* shorter option: build the prefix in a local buffer */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix describes a default router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
863
864 /*
865  *      Misc support functions
866  */
867
868 /* called with rcu_lock held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871         struct net_device *dev = rt->fib6_nh.nh_dev;
872
873         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874                 /* for copies of local routes, dst->dev needs to be the
875                  * device if it is a master device, the master device if
876                  * device is enslaved, and the loopback as the default
877                  */
878                 if (netif_is_l3_slave(dev) &&
879                     !rt6_need_strict(&rt->fib6_dst.addr))
880                         dev = l3mdev_master_dev_rcu(dev);
881                 else if (!netif_is_l3_master(dev))
882                         dev = dev_net(dev)->loopback_dev;
883                 /* last case is netif_is_l3_master(dev) is true in which
884                  * case we want dev returned to be dev
885                  */
886         }
887
888         return dev;
889 }
890
891 static const int fib6_prop[RTN_MAX + 1] = {
892         [RTN_UNSPEC]    = 0,
893         [RTN_UNICAST]   = 0,
894         [RTN_LOCAL]     = 0,
895         [RTN_BROADCAST] = 0,
896         [RTN_ANYCAST]   = 0,
897         [RTN_MULTICAST] = 0,
898         [RTN_BLACKHOLE] = -EINVAL,
899         [RTN_UNREACHABLE] = -EHOSTUNREACH,
900         [RTN_PROHIBIT]  = -EACCES,
901         [RTN_THROW]     = -EAGAIN,
902         [RTN_NAT]       = -EINVAL,
903         [RTN_XRESOLVE]  = -EINVAL,
904 };
905
906 static int ip6_rt_type_to_error(u8 fib6_type)
907 {
908         return fib6_prop[fib6_type];
909 }
910
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913         unsigned short flags = 0;
914
915         if (rt->dst_nocount)
916                 flags |= DST_NOCOUNT;
917         if (rt->dst_nopolicy)
918                 flags |= DST_NOPOLICY;
919         if (rt->dst_host)
920                 flags |= DST_HOST;
921
922         return flags;
923 }
924
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926 {
927         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928
929         switch (ort->fib6_type) {
930         case RTN_BLACKHOLE:
931                 rt->dst.output = dst_discard_out;
932                 rt->dst.input = dst_discard;
933                 break;
934         case RTN_PROHIBIT:
935                 rt->dst.output = ip6_pkt_prohibit_out;
936                 rt->dst.input = ip6_pkt_prohibit;
937                 break;
938         case RTN_THROW:
939         case RTN_UNREACHABLE:
940         default:
941                 rt->dst.output = ip6_pkt_discard_out;
942                 rt->dst.input = ip6_pkt_discard;
943                 break;
944         }
945 }
946
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948 {
949         rt->dst.flags |= fib6_info_dst_flags(ort);
950
951         if (ort->fib6_flags & RTF_REJECT) {
952                 ip6_rt_init_dst_reject(rt, ort);
953                 return;
954         }
955
956         rt->dst.error = 0;
957         rt->dst.output = ip6_output;
958
959         if (ort->fib6_type == RTN_LOCAL) {
960                 rt->dst.input = ip6_input;
961         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962                 rt->dst.input = ip6_mc_input;
963         } else {
964                 rt->dst.input = ip6_forward;
965         }
966
967         if (ort->fib6_nh.nh_lwtstate) {
968                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
969                 lwtunnel_set_redirect(&rt->dst);
970         }
971
972         rt->dst.lastuse = jiffies;
973 }
974
975 /* Caller must already hold reference to @from */
976 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
977 {
978         rt->rt6i_flags &= ~RTF_EXPIRES;
979         rcu_assign_pointer(rt->from, from);
980         dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
981 }
982
983 /* Caller must already hold reference to @ort */
984 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
985 {
986         struct net_device *dev = fib6_info_nh_dev(ort);
987
988         ip6_rt_init_dst(rt, ort);
989
990         rt->rt6i_dst = ort->fib6_dst;
991         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
992         rt->rt6i_gateway = ort->fib6_nh.nh_gw;
993         rt->rt6i_flags = ort->fib6_flags;
994         rt6_set_from(rt, ort);
995 #ifdef CONFIG_IPV6_SUBTREES
996         rt->rt6i_src = ort->fib6_src;
997 #endif
998         rt->rt6i_prefsrc = ort->fib6_prefsrc;
999         rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
1000 }
1001
1002 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1003                                         struct in6_addr *saddr)
1004 {
1005         struct fib6_node *pn, *sn;
1006         while (1) {
1007                 if (fn->fn_flags & RTN_TL_ROOT)
1008                         return NULL;
1009                 pn = rcu_dereference(fn->parent);
1010                 sn = FIB6_SUBTREE(pn);
1011                 if (sn && sn != fn)
1012                         fn = fib6_node_lookup(sn, NULL, saddr);
1013                 else
1014                         fn = pn;
1015                 if (fn->fn_flags & RTN_RTINFO)
1016                         return fn;
1017         }
1018 }
1019
1020 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1021                           bool null_fallback)
1022 {
1023         struct rt6_info *rt = *prt;
1024
1025         if (dst_hold_safe(&rt->dst))
1026                 return true;
1027         if (null_fallback) {
1028                 rt = net->ipv6.ip6_null_entry;
1029                 dst_hold(&rt->dst);
1030         } else {
1031                 rt = NULL;
1032         }
1033         *prt = rt;
1034         return false;
1035 }
1036
1037 /* called with rcu_lock held */
1038 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1039 {
1040         unsigned short flags = fib6_info_dst_flags(rt);
1041         struct net_device *dev = rt->fib6_nh.nh_dev;
1042         struct rt6_info *nrt;
1043
1044         if (!fib6_info_hold_safe(rt))
1045                 return NULL;
1046
1047         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1048         if (nrt)
1049                 ip6_rt_copy_init(nrt, rt);
1050         else
1051                 fib6_info_release(rt);
1052
1053         return nrt;
1054 }
1055
1056 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1057                                              struct fib6_table *table,
1058                                              struct flowi6 *fl6,
1059                                              const struct sk_buff *skb,
1060                                              int flags)
1061 {
1062         struct fib6_info *f6i;
1063         struct fib6_node *fn;
1064         struct rt6_info *rt;
1065
1066         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1067                 flags &= ~RT6_LOOKUP_F_IFACE;
1068
1069         rcu_read_lock();
1070         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1071 restart:
1072         f6i = rcu_dereference(fn->leaf);
1073         if (!f6i) {
1074                 f6i = net->ipv6.fib6_null_entry;
1075         } else {
1076                 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1077                                       fl6->flowi6_oif, flags);
1078                 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1079                         f6i = fib6_multipath_select(net, f6i, fl6,
1080                                                     fl6->flowi6_oif, skb,
1081                                                     flags);
1082         }
1083         if (f6i == net->ipv6.fib6_null_entry) {
1084                 fn = fib6_backtrack(fn, &fl6->saddr);
1085                 if (fn)
1086                         goto restart;
1087         }
1088
1089         trace_fib6_table_lookup(net, f6i, table, fl6);
1090
1091         /* Search through exception table */
1092         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1093         if (rt) {
1094                 if (ip6_hold_safe(net, &rt, true))
1095                         dst_use_noref(&rt->dst, jiffies);
1096         } else if (f6i == net->ipv6.fib6_null_entry) {
1097                 rt = net->ipv6.ip6_null_entry;
1098                 dst_hold(&rt->dst);
1099         } else {
1100                 rt = ip6_create_rt_rcu(f6i);
1101                 if (!rt) {
1102                         rt = net->ipv6.ip6_null_entry;
1103                         dst_hold(&rt->dst);
1104                 }
1105         }
1106
1107         rcu_read_unlock();
1108
1109         return rt;
1110 }
1111
1112 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1113                                    const struct sk_buff *skb, int flags)
1114 {
1115         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1116 }
1117 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1118
1119 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1120                             const struct in6_addr *saddr, int oif,
1121                             const struct sk_buff *skb, int strict)
1122 {
1123         struct flowi6 fl6 = {
1124                 .flowi6_oif = oif,
1125                 .daddr = *daddr,
1126         };
1127         struct dst_entry *dst;
1128         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1129
1130         if (saddr) {
1131                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1132                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1133         }
1134
1135         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1136         if (dst->error == 0)
1137                 return (struct rt6_info *) dst;
1138
1139         dst_release(dst);
1140
1141         return NULL;
1142 }
1143 EXPORT_SYMBOL(rt6_lookup);
1144
1145 /* ip6_ins_rt is called with FREE table->tb6_lock.
1146  * It takes new route entry, the addition fails by any reason the
1147  * route is released.
1148  * Caller must hold dst before calling it.
1149  */
1150
1151 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1152                         struct netlink_ext_ack *extack)
1153 {
1154         int err;
1155         struct fib6_table *table;
1156
1157         table = rt->fib6_table;
1158         spin_lock_bh(&table->tb6_lock);
1159         err = fib6_add(&table->tb6_root, rt, info, extack);
1160         spin_unlock_bh(&table->tb6_lock);
1161
1162         return err;
1163 }
1164
1165 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1166 {
1167         struct nl_info info = { .nl_net = net, };
1168
1169         return __ip6_ins_rt(rt, &info, NULL);
1170 }
1171
1172 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1173                                            const struct in6_addr *daddr,
1174                                            const struct in6_addr *saddr)
1175 {
1176         struct net_device *dev;
1177         struct rt6_info *rt;
1178
1179         /*
1180          *      Clone the route.
1181          */
1182
1183         if (!fib6_info_hold_safe(ort))
1184                 return NULL;
1185
1186         dev = ip6_rt_get_dev_rcu(ort);
1187         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1188         if (!rt) {
1189                 fib6_info_release(ort);
1190                 return NULL;
1191         }
1192
1193         ip6_rt_copy_init(rt, ort);
1194         rt->rt6i_flags |= RTF_CACHE;
1195         rt->dst.flags |= DST_HOST;
1196         rt->rt6i_dst.addr = *daddr;
1197         rt->rt6i_dst.plen = 128;
1198
1199         if (!rt6_is_gw_or_nonexthop(ort)) {
1200                 if (ort->fib6_dst.plen != 128 &&
1201                     ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1202                         rt->rt6i_flags |= RTF_ANYCAST;
1203 #ifdef CONFIG_IPV6_SUBTREES
1204                 if (rt->rt6i_src.plen && saddr) {
1205                         rt->rt6i_src.addr = *saddr;
1206                         rt->rt6i_src.plen = 128;
1207                 }
1208 #endif
1209         }
1210
1211         return rt;
1212 }
1213
1214 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1215 {
1216         unsigned short flags = fib6_info_dst_flags(rt);
1217         struct net_device *dev;
1218         struct rt6_info *pcpu_rt;
1219
1220         if (!fib6_info_hold_safe(rt))
1221                 return NULL;
1222
1223         rcu_read_lock();
1224         dev = ip6_rt_get_dev_rcu(rt);
1225         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1226         rcu_read_unlock();
1227         if (!pcpu_rt) {
1228                 fib6_info_release(rt);
1229                 return NULL;
1230         }
1231         ip6_rt_copy_init(pcpu_rt, rt);
1232         pcpu_rt->rt6i_flags |= RTF_PCPU;
1233         return pcpu_rt;
1234 }
1235
1236 /* It should be called with rcu_read_lock() acquired */
1237 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1238 {
1239         struct rt6_info *pcpu_rt, **p;
1240
1241         p = this_cpu_ptr(rt->rt6i_pcpu);
1242         pcpu_rt = *p;
1243
1244         if (pcpu_rt)
1245                 ip6_hold_safe(NULL, &pcpu_rt, false);
1246
1247         return pcpu_rt;
1248 }
1249
1250 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1251                                             struct fib6_info *rt)
1252 {
1253         struct rt6_info *pcpu_rt, *prev, **p;
1254
1255         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1256         if (!pcpu_rt) {
1257                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1258                 return net->ipv6.ip6_null_entry;
1259         }
1260
1261         dst_hold(&pcpu_rt->dst);
1262         p = this_cpu_ptr(rt->rt6i_pcpu);
1263         prev = cmpxchg(p, NULL, pcpu_rt);
1264         BUG_ON(prev);
1265
1266         return pcpu_rt;
1267 }
1268
1269 /* exception hash table implementation
1270  */
1271 static DEFINE_SPINLOCK(rt6_exception_lock);
1272
1273 /* Remove rt6_ex from hash table and free the memory
1274  * Caller must hold rt6_exception_lock
1275  */
1276 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1277                                  struct rt6_exception *rt6_ex)
1278 {
1279         struct net *net;
1280
1281         if (!bucket || !rt6_ex)
1282                 return;
1283
1284         net = dev_net(rt6_ex->rt6i->dst.dev);
1285         hlist_del_rcu(&rt6_ex->hlist);
1286         dst_release(&rt6_ex->rt6i->dst);
1287         kfree_rcu(rt6_ex, rcu);
1288         WARN_ON_ONCE(!bucket->depth);
1289         bucket->depth--;
1290         net->ipv6.rt6_stats->fib_rt_cache--;
1291 }
1292
1293 /* Remove oldest rt6_ex in bucket and free the memory
1294  * Caller must hold rt6_exception_lock
1295  */
1296 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1297 {
1298         struct rt6_exception *rt6_ex, *oldest = NULL;
1299
1300         if (!bucket)
1301                 return;
1302
1303         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1304                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1305                         oldest = rt6_ex;
1306         }
1307         rt6_remove_exception(bucket, oldest);
1308 }
1309
1310 static u32 rt6_exception_hash(const struct in6_addr *dst,
1311                               const struct in6_addr *src)
1312 {
1313         static u32 seed __read_mostly;
1314         u32 val;
1315
1316         net_get_random_once(&seed, sizeof(seed));
1317         val = jhash(dst, sizeof(*dst), seed);
1318
1319 #ifdef CONFIG_IPV6_SUBTREES
1320         if (src)
1321                 val = jhash(src, sizeof(*src), val);
1322 #endif
1323         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1324 }
1325
1326 /* Helper function to find the cached rt in the hash table
1327  * and update bucket pointer to point to the bucket for this
1328  * (daddr, saddr) pair
1329  * Caller must hold rt6_exception_lock
1330  */
1331 static struct rt6_exception *
1332 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1333                               const struct in6_addr *daddr,
1334                               const struct in6_addr *saddr)
1335 {
1336         struct rt6_exception *rt6_ex;
1337         u32 hval;
1338
1339         if (!(*bucket) || !daddr)
1340                 return NULL;
1341
1342         hval = rt6_exception_hash(daddr, saddr);
1343         *bucket += hval;
1344
1345         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1346                 struct rt6_info *rt6 = rt6_ex->rt6i;
1347                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1348
1349 #ifdef CONFIG_IPV6_SUBTREES
1350                 if (matched && saddr)
1351                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1352 #endif
1353                 if (matched)
1354                         return rt6_ex;
1355         }
1356         return NULL;
1357 }
1358
1359 /* Helper function to find the cached rt in the hash table
1360  * and update bucket pointer to point to the bucket for this
1361  * (daddr, saddr) pair
1362  * Caller must hold rcu_read_lock()
1363  */
1364 static struct rt6_exception *
1365 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1366                          const struct in6_addr *daddr,
1367                          const struct in6_addr *saddr)
1368 {
1369         struct rt6_exception *rt6_ex;
1370         u32 hval;
1371
1372         WARN_ON_ONCE(!rcu_read_lock_held());
1373
1374         if (!(*bucket) || !daddr)
1375                 return NULL;
1376
1377         hval = rt6_exception_hash(daddr, saddr);
1378         *bucket += hval;
1379
1380         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1381                 struct rt6_info *rt6 = rt6_ex->rt6i;
1382                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1383
1384 #ifdef CONFIG_IPV6_SUBTREES
1385                 if (matched && saddr)
1386                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1387 #endif
1388                 if (matched)
1389                         return rt6_ex;
1390         }
1391         return NULL;
1392 }
1393
1394 static unsigned int fib6_mtu(const struct fib6_info *rt)
1395 {
1396         unsigned int mtu;
1397
1398         if (rt->fib6_pmtu) {
1399                 mtu = rt->fib6_pmtu;
1400         } else {
1401                 struct net_device *dev = fib6_info_nh_dev(rt);
1402                 struct inet6_dev *idev;
1403
1404                 rcu_read_lock();
1405                 idev = __in6_dev_get(dev);
1406                 mtu = idev->cnf.mtu6;
1407                 rcu_read_unlock();
1408         }
1409
1410         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1411
1412         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1413 }
1414
1415 static int rt6_insert_exception(struct rt6_info *nrt,
1416                                 struct fib6_info *ort)
1417 {
1418         struct net *net = dev_net(nrt->dst.dev);
1419         struct rt6_exception_bucket *bucket;
1420         struct in6_addr *src_key = NULL;
1421         struct rt6_exception *rt6_ex;
1422         int err = 0;
1423
1424         spin_lock_bh(&rt6_exception_lock);
1425
1426         if (ort->exception_bucket_flushed) {
1427                 err = -EINVAL;
1428                 goto out;
1429         }
1430
1431         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1432                                         lockdep_is_held(&rt6_exception_lock));
1433         if (!bucket) {
1434                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1435                                  GFP_ATOMIC);
1436                 if (!bucket) {
1437                         err = -ENOMEM;
1438                         goto out;
1439                 }
1440                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1441         }
1442
1443 #ifdef CONFIG_IPV6_SUBTREES
1444         /* rt6i_src.plen != 0 indicates ort is in subtree
1445          * and exception table is indexed by a hash of
1446          * both rt6i_dst and rt6i_src.
1447          * Otherwise, the exception table is indexed by
1448          * a hash of only rt6i_dst.
1449          */
1450         if (ort->fib6_src.plen)
1451                 src_key = &nrt->rt6i_src.addr;
1452 #endif
1453
1454         /* Update rt6i_prefsrc as it could be changed
1455          * in rt6_remove_prefsrc()
1456          */
1457         nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1458         /* rt6_mtu_change() might lower mtu on ort.
1459          * Only insert this exception route if its mtu
1460          * is less than ort's mtu value.
1461          */
1462         if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1463                 err = -EINVAL;
1464                 goto out;
1465         }
1466
1467         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1468                                                src_key);
1469         if (rt6_ex)
1470                 rt6_remove_exception(bucket, rt6_ex);
1471
1472         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1473         if (!rt6_ex) {
1474                 err = -ENOMEM;
1475                 goto out;
1476         }
1477         rt6_ex->rt6i = nrt;
1478         rt6_ex->stamp = jiffies;
1479         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1480         bucket->depth++;
1481         net->ipv6.rt6_stats->fib_rt_cache++;
1482
1483         if (bucket->depth > FIB6_MAX_DEPTH)
1484                 rt6_exception_remove_oldest(bucket);
1485
1486 out:
1487         spin_unlock_bh(&rt6_exception_lock);
1488
1489         /* Update fn->fn_sernum to invalidate all cached dst */
1490         if (!err) {
1491                 spin_lock_bh(&ort->fib6_table->tb6_lock);
1492                 fib6_update_sernum(net, ort);
1493                 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1494                 fib6_force_start_gc(net);
1495         }
1496
1497         return err;
1498 }
1499
1500 void rt6_flush_exceptions(struct fib6_info *rt)
1501 {
1502         struct rt6_exception_bucket *bucket;
1503         struct rt6_exception *rt6_ex;
1504         struct hlist_node *tmp;
1505         int i;
1506
1507         spin_lock_bh(&rt6_exception_lock);
1508         /* Prevent rt6_insert_exception() to recreate the bucket list */
1509         rt->exception_bucket_flushed = 1;
1510
1511         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1512                                     lockdep_is_held(&rt6_exception_lock));
1513         if (!bucket)
1514                 goto out;
1515
1516         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1517                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1518                         rt6_remove_exception(bucket, rt6_ex);
1519                 WARN_ON_ONCE(bucket->depth);
1520                 bucket++;
1521         }
1522
1523 out:
1524         spin_unlock_bh(&rt6_exception_lock);
1525 }
1526
1527 /* Find cached rt in the hash table inside passed in rt
1528  * Caller has to hold rcu_read_lock()
1529  */
1530 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1531                                            struct in6_addr *daddr,
1532                                            struct in6_addr *saddr)
1533 {
1534         struct rt6_exception_bucket *bucket;
1535         struct in6_addr *src_key = NULL;
1536         struct rt6_exception *rt6_ex;
1537         struct rt6_info *res = NULL;
1538
1539         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1540
1541 #ifdef CONFIG_IPV6_SUBTREES
1542         /* rt6i_src.plen != 0 indicates rt is in subtree
1543          * and exception table is indexed by a hash of
1544          * both rt6i_dst and rt6i_src.
1545          * Otherwise, the exception table is indexed by
1546          * a hash of only rt6i_dst.
1547          */
1548         if (rt->fib6_src.plen)
1549                 src_key = saddr;
1550 #endif
1551         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1552
1553         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1554                 res = rt6_ex->rt6i;
1555
1556         return res;
1557 }
1558
1559 /* Remove the passed in cached rt from the hash table that contains it */
1560 static int rt6_remove_exception_rt(struct rt6_info *rt)
1561 {
1562         struct rt6_exception_bucket *bucket;
1563         struct in6_addr *src_key = NULL;
1564         struct rt6_exception *rt6_ex;
1565         struct fib6_info *from;
1566         int err;
1567
1568         from = rcu_dereference(rt->from);
1569         if (!from ||
1570             !(rt->rt6i_flags & RTF_CACHE))
1571                 return -EINVAL;
1572
1573         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1574                 return -ENOENT;
1575
1576         spin_lock_bh(&rt6_exception_lock);
1577         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1578                                     lockdep_is_held(&rt6_exception_lock));
1579 #ifdef CONFIG_IPV6_SUBTREES
1580         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1581          * and exception table is indexed by a hash of
1582          * both rt6i_dst and rt6i_src.
1583          * Otherwise, the exception table is indexed by
1584          * a hash of only rt6i_dst.
1585          */
1586         if (from->fib6_src.plen)
1587                 src_key = &rt->rt6i_src.addr;
1588 #endif
1589         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1590                                                &rt->rt6i_dst.addr,
1591                                                src_key);
1592         if (rt6_ex) {
1593                 rt6_remove_exception(bucket, rt6_ex);
1594                 err = 0;
1595         } else {
1596                 err = -ENOENT;
1597         }
1598
1599         spin_unlock_bh(&rt6_exception_lock);
1600         return err;
1601 }
1602
1603 /* Find rt6_ex which contains the passed in rt cache and
1604  * refresh its stamp
1605  */
1606 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1607 {
1608         struct rt6_exception_bucket *bucket;
1609         struct fib6_info *from = rt->from;
1610         struct in6_addr *src_key = NULL;
1611         struct rt6_exception *rt6_ex;
1612
1613         if (!from ||
1614             !(rt->rt6i_flags & RTF_CACHE))
1615                 return;
1616
1617         rcu_read_lock();
1618         bucket = rcu_dereference(from->rt6i_exception_bucket);
1619
1620 #ifdef CONFIG_IPV6_SUBTREES
1621         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1622          * and exception table is indexed by a hash of
1623          * both rt6i_dst and rt6i_src.
1624          * Otherwise, the exception table is indexed by
1625          * a hash of only rt6i_dst.
1626          */
1627         if (from->fib6_src.plen)
1628                 src_key = &rt->rt6i_src.addr;
1629 #endif
1630         rt6_ex = __rt6_find_exception_rcu(&bucket,
1631                                           &rt->rt6i_dst.addr,
1632                                           src_key);
1633         if (rt6_ex)
1634                 rt6_ex->stamp = jiffies;
1635
1636         rcu_read_unlock();
1637 }
1638
1639 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1640 {
1641         struct rt6_exception_bucket *bucket;
1642         struct rt6_exception *rt6_ex;
1643         int i;
1644
1645         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1646                                         lockdep_is_held(&rt6_exception_lock));
1647
1648         if (bucket) {
1649                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1650                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1651                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1652                         }
1653                         bucket++;
1654                 }
1655         }
1656 }
1657
1658 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1659                                          struct rt6_info *rt, int mtu)
1660 {
1661         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1662          * lowest MTU in the path: always allow updating the route PMTU to
1663          * reflect PMTU decreases.
1664          *
1665          * If the new MTU is higher, and the route PMTU is equal to the local
1666          * MTU, this means the old MTU is the lowest in the path, so allow
1667          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1668          * handle this.
1669          */
1670
1671         if (dst_mtu(&rt->dst) >= mtu)
1672                 return true;
1673
1674         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1675                 return true;
1676
1677         return false;
1678 }
1679
1680 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1681                                        struct fib6_info *rt, int mtu)
1682 {
1683         struct rt6_exception_bucket *bucket;
1684         struct rt6_exception *rt6_ex;
1685         int i;
1686
1687         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1688                                         lockdep_is_held(&rt6_exception_lock));
1689
1690         if (!bucket)
1691                 return;
1692
1693         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1694                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1695                         struct rt6_info *entry = rt6_ex->rt6i;
1696
1697                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1698                          * route), the metrics of its rt->from have already
1699                          * been updated.
1700                          */
1701                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1702                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1703                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1704                 }
1705                 bucket++;
1706         }
1707 }
1708
1709 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1710
1711 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1712                                         struct in6_addr *gateway)
1713 {
1714         struct rt6_exception_bucket *bucket;
1715         struct rt6_exception *rt6_ex;
1716         struct hlist_node *tmp;
1717         int i;
1718
1719         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1720                 return;
1721
1722         spin_lock_bh(&rt6_exception_lock);
1723         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1724                                      lockdep_is_held(&rt6_exception_lock));
1725
1726         if (bucket) {
1727                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1728                         hlist_for_each_entry_safe(rt6_ex, tmp,
1729                                                   &bucket->chain, hlist) {
1730                                 struct rt6_info *entry = rt6_ex->rt6i;
1731
1732                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1733                                     RTF_CACHE_GATEWAY &&
1734                                     ipv6_addr_equal(gateway,
1735                                                     &entry->rt6i_gateway)) {
1736                                         rt6_remove_exception(bucket, rt6_ex);
1737                                 }
1738                         }
1739                         bucket++;
1740                 }
1741         }
1742
1743         spin_unlock_bh(&rt6_exception_lock);
1744 }
1745
1746 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1747                                       struct rt6_exception *rt6_ex,
1748                                       struct fib6_gc_args *gc_args,
1749                                       unsigned long now)
1750 {
1751         struct rt6_info *rt = rt6_ex->rt6i;
1752
1753         /* we are pruning and obsoleting aged-out and non gateway exceptions
1754          * even if others have still references to them, so that on next
1755          * dst_check() such references can be dropped.
1756          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1757          * expired, independently from their aging, as per RFC 8201 section 4
1758          */
1759         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1760                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1761                         RT6_TRACE("aging clone %p\n", rt);
1762                         rt6_remove_exception(bucket, rt6_ex);
1763                         return;
1764                 }
1765         } else if (time_after(jiffies, rt->dst.expires)) {
1766                 RT6_TRACE("purging expired route %p\n", rt);
1767                 rt6_remove_exception(bucket, rt6_ex);
1768                 return;
1769         }
1770
1771         if (rt->rt6i_flags & RTF_GATEWAY) {
1772                 struct neighbour *neigh;
1773                 __u8 neigh_flags = 0;
1774
1775                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1776                 if (neigh)
1777                         neigh_flags = neigh->flags;
1778
1779                 if (!(neigh_flags & NTF_ROUTER)) {
1780                         RT6_TRACE("purging route %p via non-router but gateway\n",
1781                                   rt);
1782                         rt6_remove_exception(bucket, rt6_ex);
1783                         return;
1784                 }
1785         }
1786
1787         gc_args->more++;
1788 }
1789
1790 void rt6_age_exceptions(struct fib6_info *rt,
1791                         struct fib6_gc_args *gc_args,
1792                         unsigned long now)
1793 {
1794         struct rt6_exception_bucket *bucket;
1795         struct rt6_exception *rt6_ex;
1796         struct hlist_node *tmp;
1797         int i;
1798
1799         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1800                 return;
1801
1802         rcu_read_lock_bh();
1803         spin_lock(&rt6_exception_lock);
1804         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1805                                     lockdep_is_held(&rt6_exception_lock));
1806
1807         if (bucket) {
1808                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1809                         hlist_for_each_entry_safe(rt6_ex, tmp,
1810                                                   &bucket->chain, hlist) {
1811                                 rt6_age_examine_exception(bucket, rt6_ex,
1812                                                           gc_args, now);
1813                         }
1814                         bucket++;
1815                 }
1816         }
1817         spin_unlock(&rt6_exception_lock);
1818         rcu_read_unlock_bh();
1819 }
1820
1821 /* must be called with rcu lock held */
1822 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1823                                     int oif, struct flowi6 *fl6, int strict)
1824 {
1825         struct fib6_node *fn, *saved_fn;
1826         struct fib6_info *f6i;
1827
1828         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1829         saved_fn = fn;
1830
1831         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1832                 oif = 0;
1833
1834 redo_rt6_select:
1835         f6i = rt6_select(net, fn, oif, strict);
1836         if (f6i == net->ipv6.fib6_null_entry) {
1837                 fn = fib6_backtrack(fn, &fl6->saddr);
1838                 if (fn)
1839                         goto redo_rt6_select;
1840                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1841                         /* also consider unreachable route */
1842                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1843                         fn = saved_fn;
1844                         goto redo_rt6_select;
1845                 }
1846         }
1847
1848         trace_fib6_table_lookup(net, f6i, table, fl6);
1849
1850         return f6i;
1851 }
1852
1853 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1854                                int oif, struct flowi6 *fl6,
1855                                const struct sk_buff *skb, int flags)
1856 {
1857         struct fib6_info *f6i;
1858         struct rt6_info *rt;
1859         int strict = 0;
1860
1861         strict |= flags & RT6_LOOKUP_F_IFACE;
1862         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1863         if (net->ipv6.devconf_all->forwarding == 0)
1864                 strict |= RT6_LOOKUP_F_REACHABLE;
1865
1866         rcu_read_lock();
1867
1868         f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1869         if (f6i->fib6_nsiblings)
1870                 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1871
1872         if (f6i == net->ipv6.fib6_null_entry) {
1873                 rt = net->ipv6.ip6_null_entry;
1874                 rcu_read_unlock();
1875                 dst_hold(&rt->dst);
1876                 return rt;
1877         }
1878
1879         /*Search through exception table */
1880         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1881         if (rt) {
1882                 if (ip6_hold_safe(net, &rt, true))
1883                         dst_use_noref(&rt->dst, jiffies);
1884
1885                 rcu_read_unlock();
1886                 return rt;
1887         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1888                             !(f6i->fib6_flags & RTF_GATEWAY))) {
1889                 /* Create a RTF_CACHE clone which will not be
1890                  * owned by the fib6 tree.  It is for the special case where
1891                  * the daddr in the skb during the neighbor look-up is different
1892                  * from the fl6->daddr used to look-up route here.
1893                  */
1894                 struct rt6_info *uncached_rt;
1895
1896                 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1897
1898                 rcu_read_unlock();
1899
1900                 if (uncached_rt) {
1901                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1902                          * No need for another dst_hold()
1903                          */
1904                         rt6_uncached_list_add(uncached_rt);
1905                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1906                 } else {
1907                         uncached_rt = net->ipv6.ip6_null_entry;
1908                         dst_hold(&uncached_rt->dst);
1909                 }
1910
1911                 return uncached_rt;
1912         } else {
1913                 /* Get a percpu copy */
1914
1915                 struct rt6_info *pcpu_rt;
1916
1917                 local_bh_disable();
1918                 pcpu_rt = rt6_get_pcpu_route(f6i);
1919
1920                 if (!pcpu_rt)
1921                         pcpu_rt = rt6_make_pcpu_route(net, f6i);
1922
1923                 local_bh_enable();
1924                 rcu_read_unlock();
1925
1926                 return pcpu_rt;
1927         }
1928 }
1929 EXPORT_SYMBOL_GPL(ip6_pol_route);
1930
1931 static struct rt6_info *ip6_pol_route_input(struct net *net,
1932                                             struct fib6_table *table,
1933                                             struct flowi6 *fl6,
1934                                             const struct sk_buff *skb,
1935                                             int flags)
1936 {
1937         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1938 }
1939
1940 struct dst_entry *ip6_route_input_lookup(struct net *net,
1941                                          struct net_device *dev,
1942                                          struct flowi6 *fl6,
1943                                          const struct sk_buff *skb,
1944                                          int flags)
1945 {
1946         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1947                 flags |= RT6_LOOKUP_F_IFACE;
1948
1949         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1950 }
1951 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1952
1953 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1954                                   struct flow_keys *keys,
1955                                   struct flow_keys *flkeys)
1956 {
1957         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1958         const struct ipv6hdr *key_iph = outer_iph;
1959         struct flow_keys *_flkeys = flkeys;
1960         const struct ipv6hdr *inner_iph;
1961         const struct icmp6hdr *icmph;
1962         struct ipv6hdr _inner_iph;
1963         struct icmp6hdr _icmph;
1964
1965         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1966                 goto out;
1967
1968         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1969                                    sizeof(_icmph), &_icmph);
1970         if (!icmph)
1971                 goto out;
1972
1973         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1974             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1975             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1976             icmph->icmp6_type != ICMPV6_PARAMPROB)
1977                 goto out;
1978
1979         inner_iph = skb_header_pointer(skb,
1980                                        skb_transport_offset(skb) + sizeof(*icmph),
1981                                        sizeof(_inner_iph), &_inner_iph);
1982         if (!inner_iph)
1983                 goto out;
1984
1985         key_iph = inner_iph;
1986         _flkeys = NULL;
1987 out:
1988         if (_flkeys) {
1989                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1990                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1991                 keys->tags.flow_label = _flkeys->tags.flow_label;
1992                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1993         } else {
1994                 keys->addrs.v6addrs.src = key_iph->saddr;
1995                 keys->addrs.v6addrs.dst = key_iph->daddr;
1996                 keys->tags.flow_label = ip6_flowlabel(key_iph);
1997                 keys->basic.ip_proto = key_iph->nexthdr;
1998         }
1999 }
2000
2001 /* if skb is set it will be used and fl6 can be NULL */
2002 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2003                        const struct sk_buff *skb, struct flow_keys *flkeys)
2004 {
2005         struct flow_keys hash_keys;
2006         u32 mhash;
2007
2008         switch (ip6_multipath_hash_policy(net)) {
2009         case 0:
2010                 memset(&hash_keys, 0, sizeof(hash_keys));
2011                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2012                 if (skb) {
2013                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2014                 } else {
2015                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2016                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2017                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2018                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2019                 }
2020                 break;
2021         case 1:
2022                 if (skb) {
2023                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2024                         struct flow_keys keys;
2025
2026                         /* short-circuit if we already have L4 hash present */
2027                         if (skb->l4_hash)
2028                                 return skb_get_hash_raw(skb) >> 1;
2029
2030                         memset(&hash_keys, 0, sizeof(hash_keys));
2031
2032                         if (!flkeys) {
2033                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2034                                 flkeys = &keys;
2035                         }
2036                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2037                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2038                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2039                         hash_keys.ports.src = flkeys->ports.src;
2040                         hash_keys.ports.dst = flkeys->ports.dst;
2041                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2042                 } else {
2043                         memset(&hash_keys, 0, sizeof(hash_keys));
2044                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2045                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2046                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2047                         hash_keys.ports.src = fl6->fl6_sport;
2048                         hash_keys.ports.dst = fl6->fl6_dport;
2049                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2050                 }
2051                 break;
2052         }
2053         mhash = flow_hash_from_keys(&hash_keys);
2054
2055         return mhash >> 1;
2056 }
2057
2058 void ip6_route_input(struct sk_buff *skb)
2059 {
2060         const struct ipv6hdr *iph = ipv6_hdr(skb);
2061         struct net *net = dev_net(skb->dev);
2062         int flags = RT6_LOOKUP_F_HAS_SADDR;
2063         struct ip_tunnel_info *tun_info;
2064         struct flowi6 fl6 = {
2065                 .flowi6_iif = skb->dev->ifindex,
2066                 .daddr = iph->daddr,
2067                 .saddr = iph->saddr,
2068                 .flowlabel = ip6_flowinfo(iph),
2069                 .flowi6_mark = skb->mark,
2070                 .flowi6_proto = iph->nexthdr,
2071         };
2072         struct flow_keys *flkeys = NULL, _flkeys;
2073
2074         tun_info = skb_tunnel_info(skb);
2075         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2076                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2077
2078         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2079                 flkeys = &_flkeys;
2080
2081         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2082                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2083         skb_dst_drop(skb);
2084         skb_dst_set(skb,
2085                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2086 }
2087
2088 static struct rt6_info *ip6_pol_route_output(struct net *net,
2089                                              struct fib6_table *table,
2090                                              struct flowi6 *fl6,
2091                                              const struct sk_buff *skb,
2092                                              int flags)
2093 {
2094         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2095 }
2096
2097 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2098                                          struct flowi6 *fl6, int flags)
2099 {
2100         bool any_src;
2101
2102         if (rt6_need_strict(&fl6->daddr)) {
2103                 struct dst_entry *dst;
2104
2105                 dst = l3mdev_link_scope_lookup(net, fl6);
2106                 if (dst)
2107                         return dst;
2108         }
2109
2110         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2111
2112         any_src = ipv6_addr_any(&fl6->saddr);
2113         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2114             (fl6->flowi6_oif && any_src))
2115                 flags |= RT6_LOOKUP_F_IFACE;
2116
2117         if (!any_src)
2118                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2119         else if (sk)
2120                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2121
2122         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2123 }
2124 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2125
2126 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2127 {
2128         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2129         struct net_device *loopback_dev = net->loopback_dev;
2130         struct dst_entry *new = NULL;
2131
2132         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2133                        DST_OBSOLETE_DEAD, 0);
2134         if (rt) {
2135                 rt6_info_init(rt);
2136                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2137
2138                 new = &rt->dst;
2139                 new->__use = 1;
2140                 new->input = dst_discard;
2141                 new->output = dst_discard_out;
2142
2143                 dst_copy_metrics(new, &ort->dst);
2144
2145                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2146                 rt->rt6i_gateway = ort->rt6i_gateway;
2147                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2148
2149                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2150 #ifdef CONFIG_IPV6_SUBTREES
2151                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2152 #endif
2153         }
2154
2155         dst_release(dst_orig);
2156         return new ? new : ERR_PTR(-ENOMEM);
2157 }
2158
2159 /*
2160  *      Destination cache support functions
2161  */
2162
2163 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2164 {
2165         u32 rt_cookie = 0;
2166
2167         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2168                 return false;
2169
2170         if (fib6_check_expired(f6i))
2171                 return false;
2172
2173         return true;
2174 }
2175
2176 static struct dst_entry *rt6_check(struct rt6_info *rt,
2177                                    struct fib6_info *from,
2178                                    u32 cookie)
2179 {
2180         u32 rt_cookie = 0;
2181
2182         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2183             rt_cookie != cookie)
2184                 return NULL;
2185
2186         if (rt6_check_expired(rt))
2187                 return NULL;
2188
2189         return &rt->dst;
2190 }
2191
2192 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2193                                             struct fib6_info *from,
2194                                             u32 cookie)
2195 {
2196         if (!__rt6_check_expired(rt) &&
2197             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2198             fib6_check(from, cookie))
2199                 return &rt->dst;
2200         else
2201                 return NULL;
2202 }
2203
2204 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2205 {
2206         struct dst_entry *dst_ret;
2207         struct fib6_info *from;
2208         struct rt6_info *rt;
2209
2210         rt = container_of(dst, struct rt6_info, dst);
2211
2212         rcu_read_lock();
2213
2214         /* All IPV6 dsts are created with ->obsolete set to the value
2215          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2216          * into this function always.
2217          */
2218
2219         from = rcu_dereference(rt->from);
2220
2221         if (from && (rt->rt6i_flags & RTF_PCPU ||
2222             unlikely(!list_empty(&rt->rt6i_uncached))))
2223                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2224         else
2225                 dst_ret = rt6_check(rt, from, cookie);
2226
2227         rcu_read_unlock();
2228
2229         return dst_ret;
2230 }
2231
2232 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2233 {
2234         struct rt6_info *rt = (struct rt6_info *) dst;
2235
2236         if (rt) {
2237                 if (rt->rt6i_flags & RTF_CACHE) {
2238                         rcu_read_lock();
2239                         if (rt6_check_expired(rt)) {
2240                                 rt6_remove_exception_rt(rt);
2241                                 dst = NULL;
2242                         }
2243                         rcu_read_unlock();
2244                 } else {
2245                         dst_release(dst);
2246                         dst = NULL;
2247                 }
2248         }
2249         return dst;
2250 }
2251
2252 static void ip6_link_failure(struct sk_buff *skb)
2253 {
2254         struct rt6_info *rt;
2255
2256         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2257
2258         rt = (struct rt6_info *) skb_dst(skb);
2259         if (rt) {
2260                 rcu_read_lock();
2261                 if (rt->rt6i_flags & RTF_CACHE) {
2262                         if (dst_hold_safe(&rt->dst))
2263                                 rt6_remove_exception_rt(rt);
2264                 } else {
2265                         struct fib6_info *from;
2266                         struct fib6_node *fn;
2267
2268                         from = rcu_dereference(rt->from);
2269                         if (from) {
2270                                 fn = rcu_dereference(from->fib6_node);
2271                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2272                                         fn->fn_sernum = -1;
2273                         }
2274                 }
2275                 rcu_read_unlock();
2276         }
2277 }
2278
2279 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2280 {
2281         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2282                 struct fib6_info *from;
2283
2284                 rcu_read_lock();
2285                 from = rcu_dereference(rt0->from);
2286                 if (from)
2287                         rt0->dst.expires = from->expires;
2288                 rcu_read_unlock();
2289         }
2290
2291         dst_set_expires(&rt0->dst, timeout);
2292         rt0->rt6i_flags |= RTF_EXPIRES;
2293 }
2294
2295 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2296 {
2297         struct net *net = dev_net(rt->dst.dev);
2298
2299         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2300         rt->rt6i_flags |= RTF_MODIFIED;
2301         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2302 }
2303
2304 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2305 {
2306         bool from_set;
2307
2308         rcu_read_lock();
2309         from_set = !!rcu_dereference(rt->from);
2310         rcu_read_unlock();
2311
2312         return !(rt->rt6i_flags & RTF_CACHE) &&
2313                 (rt->rt6i_flags & RTF_PCPU || from_set);
2314 }
2315
2316 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2317                                  const struct ipv6hdr *iph, u32 mtu)
2318 {
2319         const struct in6_addr *daddr, *saddr;
2320         struct rt6_info *rt6 = (struct rt6_info *)dst;
2321
2322         if (dst_metric_locked(dst, RTAX_MTU))
2323                 return;
2324
2325         if (iph) {
2326                 daddr = &iph->daddr;
2327                 saddr = &iph->saddr;
2328         } else if (sk) {
2329                 daddr = &sk->sk_v6_daddr;
2330                 saddr = &inet6_sk(sk)->saddr;
2331         } else {
2332                 daddr = NULL;
2333                 saddr = NULL;
2334         }
2335         dst_confirm_neigh(dst, daddr);
2336         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2337         if (mtu >= dst_mtu(dst))
2338                 return;
2339
2340         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2341                 rt6_do_update_pmtu(rt6, mtu);
2342                 /* update rt6_ex->stamp for cache */
2343                 if (rt6->rt6i_flags & RTF_CACHE)
2344                         rt6_update_exception_stamp_rt(rt6);
2345         } else if (daddr) {
2346                 struct fib6_info *from;
2347                 struct rt6_info *nrt6;
2348
2349                 rcu_read_lock();
2350                 from = rcu_dereference(rt6->from);
2351                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2352                 if (nrt6) {
2353                         rt6_do_update_pmtu(nrt6, mtu);
2354                         if (rt6_insert_exception(nrt6, from))
2355                                 dst_release_immediate(&nrt6->dst);
2356                 }
2357                 rcu_read_unlock();
2358         }
2359 }
2360
2361 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2362                                struct sk_buff *skb, u32 mtu)
2363 {
2364         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2365 }
2366
2367 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2368                      int oif, u32 mark, kuid_t uid)
2369 {
2370         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2371         struct dst_entry *dst;
2372         struct flowi6 fl6;
2373
2374         memset(&fl6, 0, sizeof(fl6));
2375         fl6.flowi6_oif = oif;
2376         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2377         fl6.daddr = iph->daddr;
2378         fl6.saddr = iph->saddr;
2379         fl6.flowlabel = ip6_flowinfo(iph);
2380         fl6.flowi6_uid = uid;
2381
2382         dst = ip6_route_output(net, NULL, &fl6);
2383         if (!dst->error)
2384                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2385         dst_release(dst);
2386 }
2387 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2388
2389 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2390 {
2391         struct dst_entry *dst;
2392
2393         ip6_update_pmtu(skb, sock_net(sk), mtu,
2394                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2395
2396         dst = __sk_dst_get(sk);
2397         if (!dst || !dst->obsolete ||
2398             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2399                 return;
2400
2401         bh_lock_sock(sk);
2402         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2403                 ip6_datagram_dst_update(sk, false);
2404         bh_unlock_sock(sk);
2405 }
2406 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2407
2408 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2409                            const struct flowi6 *fl6)
2410 {
2411 #ifdef CONFIG_IPV6_SUBTREES
2412         struct ipv6_pinfo *np = inet6_sk(sk);
2413 #endif
2414
2415         ip6_dst_store(sk, dst,
2416                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2417                       &sk->sk_v6_daddr : NULL,
2418 #ifdef CONFIG_IPV6_SUBTREES
2419                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2420                       &np->saddr :
2421 #endif
2422                       NULL);
2423 }
2424
2425 /* Handle redirects */
2426 struct ip6rd_flowi {
2427         struct flowi6 fl6;
2428         struct in6_addr gateway;
2429 };
2430
2431 static struct rt6_info *__ip6_route_redirect(struct net *net,
2432                                              struct fib6_table *table,
2433                                              struct flowi6 *fl6,
2434                                              const struct sk_buff *skb,
2435                                              int flags)
2436 {
2437         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2438         struct rt6_info *ret = NULL, *rt_cache;
2439         struct fib6_info *rt;
2440         struct fib6_node *fn;
2441
2442         /* Get the "current" route for this destination and
2443          * check if the redirect has come from appropriate router.
2444          *
2445          * RFC 4861 specifies that redirects should only be
2446          * accepted if they come from the nexthop to the target.
2447          * Due to the way the routes are chosen, this notion
2448          * is a bit fuzzy and one might need to check all possible
2449          * routes.
2450          */
2451
2452         rcu_read_lock();
2453         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2454 restart:
2455         for_each_fib6_node_rt_rcu(fn) {
2456                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2457                         continue;
2458                 if (fib6_check_expired(rt))
2459                         continue;
2460                 if (rt->fib6_flags & RTF_REJECT)
2461                         break;
2462                 if (!(rt->fib6_flags & RTF_GATEWAY))
2463                         continue;
2464                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2465                         continue;
2466                 /* rt_cache's gateway might be different from its 'parent'
2467                  * in the case of an ip redirect.
2468                  * So we keep searching in the exception table if the gateway
2469                  * is different.
2470                  */
2471                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2472                         rt_cache = rt6_find_cached_rt(rt,
2473                                                       &fl6->daddr,
2474                                                       &fl6->saddr);
2475                         if (rt_cache &&
2476                             ipv6_addr_equal(&rdfl->gateway,
2477                                             &rt_cache->rt6i_gateway)) {
2478                                 ret = rt_cache;
2479                                 break;
2480                         }
2481                         continue;
2482                 }
2483                 break;
2484         }
2485
2486         if (!rt)
2487                 rt = net->ipv6.fib6_null_entry;
2488         else if (rt->fib6_flags & RTF_REJECT) {
2489                 ret = net->ipv6.ip6_null_entry;
2490                 goto out;
2491         }
2492
2493         if (rt == net->ipv6.fib6_null_entry) {
2494                 fn = fib6_backtrack(fn, &fl6->saddr);
2495                 if (fn)
2496                         goto restart;
2497         }
2498
2499 out:
2500         if (ret)
2501                 ip6_hold_safe(net, &ret, true);
2502         else
2503                 ret = ip6_create_rt_rcu(rt);
2504
2505         rcu_read_unlock();
2506
2507         trace_fib6_table_lookup(net, rt, table, fl6);
2508         return ret;
2509 };
2510
2511 static struct dst_entry *ip6_route_redirect(struct net *net,
2512                                             const struct flowi6 *fl6,
2513                                             const struct sk_buff *skb,
2514                                             const struct in6_addr *gateway)
2515 {
2516         int flags = RT6_LOOKUP_F_HAS_SADDR;
2517         struct ip6rd_flowi rdfl;
2518
2519         rdfl.fl6 = *fl6;
2520         rdfl.gateway = *gateway;
2521
2522         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2523                                 flags, __ip6_route_redirect);
2524 }
2525
2526 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2527                   kuid_t uid)
2528 {
2529         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2530         struct dst_entry *dst;
2531         struct flowi6 fl6;
2532
2533         memset(&fl6, 0, sizeof(fl6));
2534         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2535         fl6.flowi6_oif = oif;
2536         fl6.flowi6_mark = mark;
2537         fl6.daddr = iph->daddr;
2538         fl6.saddr = iph->saddr;
2539         fl6.flowlabel = ip6_flowinfo(iph);
2540         fl6.flowi6_uid = uid;
2541
2542         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2543         rt6_do_redirect(dst, NULL, skb);
2544         dst_release(dst);
2545 }
2546 EXPORT_SYMBOL_GPL(ip6_redirect);
2547
2548 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2549                             u32 mark)
2550 {
2551         const struct ipv6hdr *iph = ipv6_hdr(skb);
2552         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2553         struct dst_entry *dst;
2554         struct flowi6 fl6;
2555
2556         memset(&fl6, 0, sizeof(fl6));
2557         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2558         fl6.flowi6_oif = oif;
2559         fl6.flowi6_mark = mark;
2560         fl6.daddr = msg->dest;
2561         fl6.saddr = iph->daddr;
2562         fl6.flowi6_uid = sock_net_uid(net, NULL);
2563
2564         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2565         rt6_do_redirect(dst, NULL, skb);
2566         dst_release(dst);
2567 }
2568
2569 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2570 {
2571         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2572                      sk->sk_uid);
2573 }
2574 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2575
2576 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2577 {
2578         struct net_device *dev = dst->dev;
2579         unsigned int mtu = dst_mtu(dst);
2580         struct net *net = dev_net(dev);
2581
2582         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2583
2584         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2585                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2586
2587         /*
2588          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2589          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2590          * IPV6_MAXPLEN is also valid and means: "any MSS,
2591          * rely only on pmtu discovery"
2592          */
2593         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2594                 mtu = IPV6_MAXPLEN;
2595         return mtu;
2596 }
2597
2598 static unsigned int ip6_mtu(const struct dst_entry *dst)
2599 {
2600         struct inet6_dev *idev;
2601         unsigned int mtu;
2602
2603         mtu = dst_metric_raw(dst, RTAX_MTU);
2604         if (mtu)
2605                 goto out;
2606
2607         mtu = IPV6_MIN_MTU;
2608
2609         rcu_read_lock();
2610         idev = __in6_dev_get(dst->dev);
2611         if (idev)
2612                 mtu = idev->cnf.mtu6;
2613         rcu_read_unlock();
2614
2615 out:
2616         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2617
2618         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2619 }
2620
2621 /* MTU selection:
2622  * 1. mtu on route is locked - use it
2623  * 2. mtu from nexthop exception
2624  * 3. mtu from egress device
2625  *
2626  * based on ip6_dst_mtu_forward and exception logic of
2627  * rt6_find_cached_rt; called with rcu_read_lock
2628  */
2629 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2630                       struct in6_addr *saddr)
2631 {
2632         struct rt6_exception_bucket *bucket;
2633         struct rt6_exception *rt6_ex;
2634         struct in6_addr *src_key;
2635         struct inet6_dev *idev;
2636         u32 mtu = 0;
2637
2638         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2639                 mtu = f6i->fib6_pmtu;
2640                 if (mtu)
2641                         goto out;
2642         }
2643
2644         src_key = NULL;
2645 #ifdef CONFIG_IPV6_SUBTREES
2646         if (f6i->fib6_src.plen)
2647                 src_key = saddr;
2648 #endif
2649
2650         bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2651         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2652         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2653                 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2654
2655         if (likely(!mtu)) {
2656                 struct net_device *dev = fib6_info_nh_dev(f6i);
2657
2658                 mtu = IPV6_MIN_MTU;
2659                 idev = __in6_dev_get(dev);
2660                 if (idev && idev->cnf.mtu6 > mtu)
2661                         mtu = idev->cnf.mtu6;
2662         }
2663
2664         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2665 out:
2666         return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2667 }
2668
2669 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2670                                   struct flowi6 *fl6)
2671 {
2672         struct dst_entry *dst;
2673         struct rt6_info *rt;
2674         struct inet6_dev *idev = in6_dev_get(dev);
2675         struct net *net = dev_net(dev);
2676
2677         if (unlikely(!idev))
2678                 return ERR_PTR(-ENODEV);
2679
2680         rt = ip6_dst_alloc(net, dev, 0);
2681         if (unlikely(!rt)) {
2682                 in6_dev_put(idev);
2683                 dst = ERR_PTR(-ENOMEM);
2684                 goto out;
2685         }
2686
2687         rt->dst.flags |= DST_HOST;
2688         rt->dst.input = ip6_input;
2689         rt->dst.output  = ip6_output;
2690         rt->rt6i_gateway  = fl6->daddr;
2691         rt->rt6i_dst.addr = fl6->daddr;
2692         rt->rt6i_dst.plen = 128;
2693         rt->rt6i_idev     = idev;
2694         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2695
2696         /* Add this dst into uncached_list so that rt6_disable_ip() can
2697          * do proper release of the net_device
2698          */
2699         rt6_uncached_list_add(rt);
2700         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2701
2702         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2703
2704 out:
2705         return dst;
2706 }
2707
2708 static int ip6_dst_gc(struct dst_ops *ops)
2709 {
2710         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2711         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2712         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2713         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2714         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2715         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2716         int entries;
2717
2718         entries = dst_entries_get_fast(ops);
2719         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2720             entries <= rt_max_size)
2721                 goto out;
2722
2723         net->ipv6.ip6_rt_gc_expire++;
2724         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2725         entries = dst_entries_get_slow(ops);
2726         if (entries < ops->gc_thresh)
2727                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2728 out:
2729         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2730         return entries > rt_max_size;
2731 }
2732
2733 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2734                                struct fib6_config *cfg)
2735 {
2736         struct dst_metrics *p;
2737
2738         if (!cfg->fc_mx)
2739                 return 0;
2740
2741         p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2742         if (unlikely(!p))
2743                 return -ENOMEM;
2744
2745         refcount_set(&p->refcnt, 1);
2746         rt->fib6_metrics = p;
2747
2748         return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2749 }
2750
2751 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2752                                             struct fib6_config *cfg,
2753                                             const struct in6_addr *gw_addr,
2754                                             u32 tbid, int flags)
2755 {
2756         struct flowi6 fl6 = {
2757                 .flowi6_oif = cfg->fc_ifindex,
2758                 .daddr = *gw_addr,
2759                 .saddr = cfg->fc_prefsrc,
2760         };
2761         struct fib6_table *table;
2762         struct rt6_info *rt;
2763
2764         table = fib6_get_table(net, tbid);
2765         if (!table)
2766                 return NULL;
2767
2768         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2769                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2770
2771         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2772         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2773
2774         /* if table lookup failed, fall back to full lookup */
2775         if (rt == net->ipv6.ip6_null_entry) {
2776                 ip6_rt_put(rt);
2777                 rt = NULL;
2778         }
2779
2780         return rt;
2781 }
2782
2783 static int ip6_route_check_nh_onlink(struct net *net,
2784                                      struct fib6_config *cfg,
2785                                      const struct net_device *dev,
2786                                      struct netlink_ext_ack *extack)
2787 {
2788         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2789         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2790         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2791         struct rt6_info *grt;
2792         int err;
2793
2794         err = 0;
2795         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2796         if (grt) {
2797                 if (!grt->dst.error &&
2798                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2799                         NL_SET_ERR_MSG(extack,
2800                                        "Nexthop has invalid gateway or device mismatch");
2801                         err = -EINVAL;
2802                 }
2803
2804                 ip6_rt_put(grt);
2805         }
2806
2807         return err;
2808 }
2809
2810 static int ip6_route_check_nh(struct net *net,
2811                               struct fib6_config *cfg,
2812                               struct net_device **_dev,
2813                               struct inet6_dev **idev)
2814 {
2815         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2816         struct net_device *dev = _dev ? *_dev : NULL;
2817         struct rt6_info *grt = NULL;
2818         int err = -EHOSTUNREACH;
2819
2820         if (cfg->fc_table) {
2821                 int flags = RT6_LOOKUP_F_IFACE;
2822
2823                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2824                                           cfg->fc_table, flags);
2825                 if (grt) {
2826                         if (grt->rt6i_flags & RTF_GATEWAY ||
2827                             (dev && dev != grt->dst.dev)) {
2828                                 ip6_rt_put(grt);
2829                                 grt = NULL;
2830                         }
2831                 }
2832         }
2833
2834         if (!grt)
2835                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2836
2837         if (!grt)
2838                 goto out;
2839
2840         if (dev) {
2841                 if (dev != grt->dst.dev) {
2842                         ip6_rt_put(grt);
2843                         goto out;
2844                 }
2845         } else {
2846                 *_dev = dev = grt->dst.dev;
2847                 *idev = grt->rt6i_idev;
2848                 dev_hold(dev);
2849                 in6_dev_hold(grt->rt6i_idev);
2850         }
2851
2852         if (!(grt->rt6i_flags & RTF_GATEWAY))
2853                 err = 0;
2854
2855         ip6_rt_put(grt);
2856
2857 out:
2858         return err;
2859 }
2860
2861 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2862                            struct net_device **_dev, struct inet6_dev **idev,
2863                            struct netlink_ext_ack *extack)
2864 {
2865         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2866         int gwa_type = ipv6_addr_type(gw_addr);
2867         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2868         const struct net_device *dev = *_dev;
2869         bool need_addr_check = !dev;
2870         int err = -EINVAL;
2871
2872         /* if gw_addr is local we will fail to detect this in case
2873          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2874          * will return already-added prefix route via interface that
2875          * prefix route was assigned to, which might be non-loopback.
2876          */
2877         if (dev &&
2878             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2879                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2880                 goto out;
2881         }
2882
2883         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2884                 /* IPv6 strictly inhibits using not link-local
2885                  * addresses as nexthop address.
2886                  * Otherwise, router will not able to send redirects.
2887                  * It is very good, but in some (rare!) circumstances
2888                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2889                  * some exceptions. --ANK
2890                  * We allow IPv4-mapped nexthops to support RFC4798-type
2891                  * addressing
2892                  */
2893                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2894                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2895                         goto out;
2896                 }
2897
2898                 if (cfg->fc_flags & RTNH_F_ONLINK)
2899                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2900                 else
2901                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2902
2903                 if (err)
2904                         goto out;
2905         }
2906
2907         /* reload in case device was changed */
2908         dev = *_dev;
2909
2910         err = -EINVAL;
2911         if (!dev) {
2912                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2913                 goto out;
2914         } else if (dev->flags & IFF_LOOPBACK) {
2915                 NL_SET_ERR_MSG(extack,
2916                                "Egress device can not be loopback device for this route");
2917                 goto out;
2918         }
2919
2920         /* if we did not check gw_addr above, do so now that the
2921          * egress device has been resolved.
2922          */
2923         if (need_addr_check &&
2924             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2925                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2926                 goto out;
2927         }
2928
2929         err = 0;
2930 out:
2931         return err;
2932 }
2933
/*
 * Build a fib6_info from a route configuration without inserting it.
 *
 * Validates @cfg (the internal flags RTF_PCPU and RTF_CACHE are refused,
 * as are an out-of-range route type or prefix length, and a source prefix
 * when CONFIG_IPV6_SUBTREES is off), resolves the egress device and its
 * inet6_dev, looks up or creates the FIB table, allocates the fib6_info
 * with @gfp_flags and fills it in from @cfg.
 *
 * Side effects on @cfg: a zero fc_metric becomes IP6_RT_PRIO_USER,
 * RTPROT_UNSPEC becomes RTPROT_BOOT, and on success fc_nlinfo.nl_net is
 * set to the netns of the chosen device.
 *
 * Returns the new fib6_info, or an ERR_PTR() on failure.  On success the
 * device reference held in 'dev' is not dropped here (the pointer is
 * stored in rt->fib6_nh.nh_dev); the inet6_dev reference is dropped.
 */
2934 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2935                                               gfp_t gfp_flags,
2936                                               struct netlink_ext_ack *extack)
2937 {
2938         struct net *net = cfg->fc_nlinfo.nl_net;
2939         struct fib6_info *rt = NULL;
2940         struct net_device *dev = NULL;
2941         struct inet6_dev *idev = NULL;
2942         struct fib6_table *table;
2943         int addr_type;
2944         int err = -EINVAL;
2945
2946         /* RTF_PCPU is an internal flag; can not be set by userspace */
2947         if (cfg->fc_flags & RTF_PCPU) {
2948                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2949                 goto out;
2950         }
2951
2952         /* RTF_CACHE is an internal flag; can not be set by userspace */
2953         if (cfg->fc_flags & RTF_CACHE) {
2954                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2955                 goto out;
2956         }
2957
2958         if (cfg->fc_type > RTN_MAX) {
2959                 NL_SET_ERR_MSG(extack, "Invalid route type");
2960                 goto out;
2961         }
2962
2963         if (cfg->fc_dst_len > 128) {
2964                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2965                 goto out;
2966         }
2967         if (cfg->fc_src_len > 128) {
2968                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2969                 goto out;
2970         }
2971 #ifndef CONFIG_IPV6_SUBTREES
2972         if (cfg->fc_src_len) {
2973                 NL_SET_ERR_MSG(extack,
2974                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2975                 goto out;
2976         }
2977 #endif
             /* An explicit ifindex pins the egress device: take a reference
              * on the device and on its inet6_dev.  Both are dropped at
              * 'out' on failure; no extack message is set for -ENODEV here.
              */
2978         if (cfg->fc_ifindex) {
2979                 err = -ENODEV;
2980                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2981                 if (!dev)
2982                         goto out;
2983                 idev = in6_dev_get(dev);
2984                 if (!idev)
2985                         goto out;
2986         }
2987
2988         if (cfg->fc_metric == 0)
2989                 cfg->fc_metric = IP6_RT_PRIO_USER;
2990
             /* onlink nexthops need an explicit device, and it must be up */
2991         if (cfg->fc_flags & RTNH_F_ONLINK) {
2992                 if (!dev) {
2993                         NL_SET_ERR_MSG(extack,
2994                                        "Nexthop device required for onlink");
2995                         err = -ENODEV;
2996                         goto out;
2997                 }
2998
2999                 if (!(dev->flags & IFF_UP)) {
3000                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3001                         err = -ENETDOWN;
3002                         goto out;
3003                 }
3004         }
3005
             /* For a netlink request without NLM_F_CREATE the table is only
              * looked up first; if it does not exist a warning is logged and
              * it is created anyway.  All other requests create the table as
              * needed.  A NULL table ends in -ENOBUFS.
              */
3006         err = -ENOBUFS;
3007         if (cfg->fc_nlinfo.nlh &&
3008             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3009                 table = fib6_get_table(net, cfg->fc_table);
3010                 if (!table) {
3011                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3012                         table = fib6_new_table(net, cfg->fc_table);
3013                 }
3014         } else {
3015                 table = fib6_new_table(net, cfg->fc_table);
3016         }
3017
3018         if (!table)
3019                 goto out;
3020
3021         err = -ENOMEM;
3022         rt = fib6_info_alloc(gfp_flags);
3023         if (!rt)
3024                 goto out;
3025
3026         if (cfg->fc_flags & RTF_ADDRCONF)
3027                 rt->dst_nocount = true;
3028
3029         err = ip6_convert_metrics(net, rt, cfg);
3030         if (err < 0)
3031                 goto out;
3032
3033         if (cfg->fc_flags & RTF_EXPIRES)
3034                 fib6_set_expires(rt, jiffies +
3035                                 clock_t_to_jiffies(cfg->fc_expires));
3036         else
3037                 fib6_clean_expires(rt);
3038
3039         if (cfg->fc_protocol == RTPROT_UNSPEC)
3040                 cfg->fc_protocol = RTPROT_BOOT;
3041         rt->fib6_protocol = cfg->fc_protocol;
3042
3043         addr_type = ipv6_addr_type(&cfg->fc_dst);
3044
3045         if (cfg->fc_encap) {
3046                 struct lwtunnel_state *lwtstate;
3047
3048                 err = lwtunnel_build_state(cfg->fc_encap_type,
3049                                            cfg->fc_encap, AF_INET6, cfg,
3050                                            &lwtstate, extack);
3051                 if (err)
3052                         goto out;
3053                 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3054         }
3055
3056         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3057         rt->fib6_dst.plen = cfg->fc_dst_len;
3058         if (rt->fib6_dst.plen == 128)
3059                 rt->dst_host = true;
3060
3061 #ifdef CONFIG_IPV6_SUBTREES
3062         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3063         rt->fib6_src.plen = cfg->fc_src_len;
3064 #endif
3065
3066         rt->fib6_metric = cfg->fc_metric;
3067         rt->fib6_nh.nh_weight = 1;
3068
3069         rt->fib6_type = cfg->fc_type;
3070
3071         /* We cannot add true routes via loopback here,
3072            they would result in kernel looping; promote them to reject routes
3073          */
3074         if ((cfg->fc_flags & RTF_REJECT) ||
3075             (dev && (dev->flags & IFF_LOOPBACK) &&
3076              !(addr_type & IPV6_ADDR_LOOPBACK) &&
3077              !(cfg->fc_flags & RTF_LOCAL))) {
3078                 /* hold loopback dev/idev if we haven't done so. */
3079                 if (dev != net->loopback_dev) {
3080                         if (dev) {
3081                                 dev_put(dev);
3082                                 in6_dev_put(idev);
3083                         }
3084                         dev = net->loopback_dev;
3085                         dev_hold(dev);
3086                         idev = in6_dev_get(dev);
3087                         if (!idev) {
3088                                 err = -ENODEV;
3089                                 goto out;
3090                         }
3091                 }
                     /* reject routes skip gateway, device-state and prefsrc
                      * validation and get a fixed flag set
                      */
3092                 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3093                 goto install_route;
3094         }
3095
             /* ip6_validate_gw() is handed &dev/&idev and may fill them in
              * when no ifindex was given
              */
3096         if (cfg->fc_flags & RTF_GATEWAY) {
3097                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3098                 if (err)
3099                         goto out;
3100
3101                 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3102         }
3103
3104         err = -ENODEV;
3105         if (!dev)
3106                 goto out;
3107
3108         if (idev->cnf.disable_ipv6) {
3109                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3110                 err = -EACCES;
3111                 goto out;
3112         }
3113
3114         if (!(dev->flags & IFF_UP)) {
3115                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3116                 err = -ENETDOWN;
3117                 goto out;
3118         }
3119
             /* a preferred source must pass ipv6_chk_addr() against dev */
3120         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3121                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3122                         NL_SET_ERR_MSG(extack, "Invalid source address");
3123                         err = -EINVAL;
3124                         goto out;
3125                 }
3126                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3127                 rt->fib6_prefsrc.plen = 128;
3128         } else
3129                 rt->fib6_prefsrc.plen = 0;
3130
3131         rt->fib6_flags = cfg->fc_flags;
3132
3133 install_route:
             /* mark the nexthop link-down when the device has no carrier,
              * except for local and anycast routes
              */
3134         if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3135             !netif_carrier_ok(dev))
3136                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3137         rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3138         rt->fib6_nh.nh_dev = dev;
3139         rt->fib6_table = table;
3140
3141         cfg->fc_nlinfo.nl_net = dev_net(dev);
3142
3143         if (idev)
3144                 in6_dev_put(idev);
3145
3146         return rt;
3147 out:
             /* error unwind: drop whatever device references are held */
3148         if (dev)
3149                 dev_put(dev);
3150         if (idev)
3151                 in6_dev_put(idev);
3152
             /* NOTE(review): rt is still NULL on the early failure paths, so
              * fib6_info_release() presumably tolerates NULL -- confirm.
              */
3153         fib6_info_release(rt);
3154         return ERR_PTR(err);
3155 }
3156
3157 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3158                   struct netlink_ext_ack *extack)
3159 {
3160         struct fib6_info *rt;
3161         int err;
3162
3163         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3164         if (IS_ERR(rt))
3165                 return PTR_ERR(rt);
3166
3167         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3168         fib6_info_release(rt);
3169
3170         return err;
3171 }
3172
3173 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3174 {
3175         struct net *net = info->nl_net;
3176         struct fib6_table *table;
3177         int err;
3178
3179         if (rt == net->ipv6.fib6_null_entry) {
3180                 err = -ENOENT;
3181                 goto out;
3182         }
3183
3184         table = rt->fib6_table;
3185         spin_lock_bh(&table->tb6_lock);
3186         err = fib6_del(rt, info);
3187         spin_unlock_bh(&table->tb6_lock);
3188
3189 out:
3190         fib6_info_release(rt);
3191         return err;
3192 }
3193
3194 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3195 {
3196         struct nl_info info = { .nl_net = net };
3197
3198         return __ip6_del_rt(rt, &info);
3199 }
3200
/*
 * Delete @rt and, when cfg->fc_delete_all_nh is set and @rt has siblings,
 * every sibling route as well.
 *
 * All deletions happen under the table lock.  One reference on @rt is
 * always dropped.  Returns -ENOENT for the null entry, the error of the
 * first sibling deletion that fails, or otherwise the result of deleting
 * @rt itself.
 */
3201 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3202 {
3203         struct nl_info *info = &cfg->fc_nlinfo;
3204         struct net *net = info->nl_net;
3205         struct sk_buff *skb = NULL;
3206         struct fib6_table *table;
3207         int err = -ENOENT;
3208
3209         if (rt == net->ipv6.fib6_null_entry)
3210                 goto out_put;
3211         table = rt->fib6_table;
3212         spin_lock_bh(&table->tb6_lock);
3213
3214         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3215                 struct fib6_info *sibling, *next_sibling;
3216
3217                 /* prefer to send a single notification with all hops */
3218                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3219                 if (skb) {
3220                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3221
                             /* Only when the combined RTM_DELROUTE message
                              * was built is skip_notify set (presumably
                              * suppressing the per-route notifications from
                              * fib6_del() -- confirm); on failure the skb is
                              * dropped and skip_notify is left alone.
                              */
3222                         if (rt6_fill_node(net, skb, rt, NULL,
3223                                           NULL, NULL, 0, RTM_DELROUTE,
3224                                           info->portid, seq, 0) < 0) {
3225                                 kfree_skb(skb);
3226                                 skb = NULL;
3227                         } else
3228                                 info->skip_notify = 1;
3229                 }
3230
                     /* _safe iteration: next_sibling is saved before each
                      * fib6_del() call on the current sibling
                      */
3231                 list_for_each_entry_safe(sibling, next_sibling,
3232                                          &rt->fib6_siblings,
3233                                          fib6_siblings) {
3234                         err = fib6_del(sibling, info);
3235                         if (err)
3236                                 goto out_unlock;
3237                 }
3238         }
3239
3240         err = fib6_del(rt, info);
3241 out_unlock:
3242         spin_unlock_bh(&table->tb6_lock);
3243 out_put:
3244         fib6_info_release(rt);
3245
             /* The combined notification, if one was built, is sent after the
              * table lock has been released -- also when a deletion above
              * failed.
              */
3246         if (skb) {
3247                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3248                             info->nlh, gfp_any());
3249         }
3250         return err;
3251 }
3252
3253 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3254 {
3255         int rc = -ESRCH;
3256
3257         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3258                 goto out;
3259
3260         if (cfg->fc_flags & RTF_GATEWAY &&
3261             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3262                 goto out;
3263         if (dst_hold_safe(&rt->dst))
3264                 rc = rt6_remove_exception_rt(rt);
3265 out:
3266         return rc;
3267 }
3268
/*
 * Delete the route described by @cfg.
 *
 * The fib6 node for fc_dst/fc_src is located in table fc_table and its
 * routes are walked under rcu_read_lock().
 *
 * With RTF_CACHE set, the cached route for fc_dst/fc_src is looked up on
 * each entry and handed to ip6_del_cached_rt(); the first result other
 * than -ESRCH is returned.
 *
 * Otherwise the first entry passing the optional filters (ifindex,
 * gateway, metric, protocol) on which a reference can be taken is deleted:
 * just that nexthop when a gateway was given, else through
 * __ip6_del_rt_siblings().
 *
 * Returns 0 or a negative errno; -ESRCH when the table does not exist or
 * nothing matched.
 */
3269 static int ip6_route_del(struct fib6_config *cfg,
3270                          struct netlink_ext_ack *extack)
3271 {
3272         struct rt6_info *rt_cache;
3273         struct fib6_table *table;
3274         struct fib6_info *rt;
3275         struct fib6_node *fn;
3276         int err = -ESRCH;
3277
3278         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3279         if (!table) {
3280                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3281                 return err;
3282         }
3283
3284         rcu_read_lock();
3285
3286         fn = fib6_locate(&table->tb6_root,
3287                          &cfg->fc_dst, cfg->fc_dst_len,
3288                          &cfg->fc_src, cfg->fc_src_len,
3289                          !(cfg->fc_flags & RTF_CACHE));
3290
3291         if (fn) {
                     /* the iterator macro walks the routes of fn through the
                      * local variable 'rt'
                      */
3292                 for_each_fib6_node_rt_rcu(fn) {
3293                         if (cfg->fc_flags & RTF_CACHE) {
3294                                 int rc;
3295
3296                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3297                                                               &cfg->fc_src);
3298                                 if (rt_cache) {
3299                                         rc = ip6_del_cached_rt(rt_cache, cfg);
                                             /* -ESRCH: this cached route did
                                              * not match, try the next entry
                                              */
3300                                         if (rc != -ESRCH) {
3301                                                 rcu_read_unlock();
3302                                                 return rc;
3303                                         }
3304                                 }
3305                                 continue;
3306                         }
                             /* each filter applies only when cfg sets it */
3307                         if (cfg->fc_ifindex &&
3308                             (!rt->fib6_nh.nh_dev ||
3309                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3310                                 continue;
3311                         if (cfg->fc_flags & RTF_GATEWAY &&
3312                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3313                                 continue;
3314                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3315                                 continue;
3316                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3317                                 continue;
                             /* take a reference before leaving the RCU
                              * section; the delete helpers below drop it
                              */
3318                         if (!fib6_info_hold_safe(rt))
3319                                 continue;
3320                         rcu_read_unlock();
3321
3322                         /* if gateway was specified only delete the one hop */
3323                         if (cfg->fc_flags & RTF_GATEWAY)
3324                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3325
3326                         return __ip6_del_rt_siblings(rt, cfg);
3327                 }
3328         }
3329         rcu_read_unlock();
3330
3331         return err;
3332 }
3333
3334 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3335 {
3336         struct netevent_redirect netevent;
3337         struct rt6_info *rt, *nrt = NULL;
3338         struct ndisc_options ndopts;
3339         struct inet6_dev *in6_dev;
3340         struct neighbour *neigh;
3341         struct fib6_info *from;
3342         struct rd_msg *msg;
3343         int optlen, on_link;
3344         u8 *lladdr;
3345
3346         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3347         optlen -= sizeof(*msg);
3348
3349         if (optlen < 0) {
3350                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3351                 return;
3352         }
3353
3354         msg = (struct rd_msg *)icmp6_hdr(skb);
3355
3356         if (ipv6_addr_is_multicast(&msg->dest)) {
3357                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3358                 return;
3359         }
3360
3361         on_link = 0;
3362         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3363                 on_link = 1;
3364         } else if (ipv6_addr_type(&msg->target) !=
3365                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3366                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3367                 return;
3368         }
3369
3370         in6_dev = __in6_dev_get(skb->dev);
3371         if (!in6_dev)
3372                 return;
3373         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3374                 return;
3375
3376         /* RFC2461 8.1:
3377          *      The IP source address of the Redirect MUST be the same as the current
3378          *      first-hop router for the specified ICMP Destination Address.
3379          */
3380
3381         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3382                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3383                 return;
3384         }
3385
3386         lladdr = NULL;
3387         if (ndopts.nd_opts_tgt_lladdr) {
3388                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3389                                              skb->dev);
3390                 if (!lladdr) {
3391                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3392                         return;
3393                 }
3394         }
3395
3396         rt = (struct rt6_info *) dst;
3397         if (rt->rt6i_flags & RTF_REJECT) {
3398                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3399                 return;
3400         }
3401
3402         /* Redirect received -> path was valid.
3403          * Look, redirects are sent only in response to data packets,
3404          * so that this nexthop apparently is reachable. --ANK
3405          */
3406         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3407
3408         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3409         if (!neigh)
3410                 return;
3411
3412         /*
3413          *      We have finally decided to accept it.
3414          */
3415
3416         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3417                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3418                      NEIGH_UPDATE_F_OVERRIDE|
3419                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3420                                      NEIGH_UPDATE_F_ISROUTER)),
3421                      NDISC_REDIRECT, &ndopts);
3422
3423         rcu_read_lock();
3424         from = rcu_dereference(rt->from);
3425         /* This fib6_info_hold() is safe here because we hold reference to rt
3426          * and rt already holds reference to fib6_info.
3427          */
3428         fib6_info_hold(from);
3429         rcu_read_unlock();
3430
3431         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3432         if (!nrt)
3433                 goto out;
3434
3435         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3436         if (on_link)
3437                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3438
3439         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3440
3441         /* No need to remove rt from the exception table if rt is
3442          * a cached route because rt6_insert_exception() will
3443          * takes care of it
3444          */
3445         if (rt6_insert_exception(nrt, from)) {
3446                 dst_release_immediate(&nrt->dst);
3447                 goto out;
3448         }
3449
3450         netevent.old = &rt->dst;
3451         netevent.new = &nrt->dst;
3452         netevent.daddr = &msg->dest;
3453         netevent.neigh = neigh;
3454         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3455
3456 out:
3457         fib6_info_release(from);
3458         neigh_release(neigh);
3459 }
3460
3461 #ifdef CONFIG_IPV6_ROUTE_INFO
3462 static struct fib6_info *rt6_get_route_info(struct net *net,
3463                                            const struct in6_addr *prefix, int prefixlen,
3464                                            const struct in6_addr *gwaddr,
3465                                            struct net_device *dev)
3466 {
3467         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3468         int ifindex = dev->ifindex;
3469         struct fib6_node *fn;
3470         struct fib6_info *rt = NULL;
3471         struct fib6_table *table;
3472
3473         table = fib6_get_table(net, tb_id);
3474         if (!table)
3475                 return NULL;
3476
3477         rcu_read_lock();
3478         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3479         if (!fn)
3480                 goto out;
3481
3482         for_each_fib6_node_rt_rcu(fn) {
3483                 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3484                         continue;
3485                 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3486                         continue;
3487                 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3488                         continue;
3489                 if (!fib6_info_hold_safe(rt))
3490                         continue;
3491                 break;
3492         }
3493 out:
3494         rcu_read_unlock();
3495         return rt;
3496 }
3497
3498 static struct fib6_info *rt6_add_route_info(struct net *net,
3499                                            const struct in6_addr *prefix, int prefixlen,
3500                                            const struct in6_addr *gwaddr,
3501                                            struct net_device *dev,
3502                                            unsigned int pref)
3503 {
3504         struct fib6_config cfg = {
3505                 .fc_metric      = IP6_RT_PRIO_USER,
3506                 .fc_ifindex     = dev->ifindex,
3507                 .fc_dst_len     = prefixlen,
3508                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3509                                   RTF_UP | RTF_PREF(pref),
3510                 .fc_protocol = RTPROT_RA,
3511                 .fc_type = RTN_UNICAST,
3512                 .fc_nlinfo.portid = 0,
3513                 .fc_nlinfo.nlh = NULL,
3514                 .fc_nlinfo.nl_net = net,
3515         };
3516
3517         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3518         cfg.fc_dst = *prefix;
3519         cfg.fc_gateway = *gwaddr;
3520
3521         /* We should treat it as a default route if prefix length is 0. */
3522         if (!prefixlen)
3523                 cfg.fc_flags |= RTF_DEFAULT;
3524
3525         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3526
3527         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3528 }
3529 #endif
3530
3531 struct fib6_info *rt6_get_dflt_router(struct net *net,
3532                                      const struct in6_addr *addr,
3533                                      struct net_device *dev)
3534 {
3535         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3536         struct fib6_info *rt;
3537         struct fib6_table *table;
3538
3539         table = fib6_get_table(net, tb_id);
3540         if (!table)
3541                 return NULL;
3542
3543         rcu_read_lock();
3544         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3545                 if (dev == rt->fib6_nh.nh_dev &&
3546                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3547                     ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3548                         break;
3549         }
3550         if (rt && !fib6_info_hold_safe(rt))
3551                 rt = NULL;
3552         rcu_read_unlock();
3553         return rt;
3554 }
3555
3556 struct fib6_info *rt6_add_dflt_router(struct net *net,
3557                                      const struct in6_addr *gwaddr,
3558                                      struct net_device *dev,
3559                                      unsigned int pref)
3560 {
3561         struct fib6_config cfg = {
3562                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3563                 .fc_metric      = IP6_RT_PRIO_USER,
3564                 .fc_ifindex     = dev->ifindex,
3565                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3566                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3567                 .fc_protocol = RTPROT_RA,
3568                 .fc_type = RTN_UNICAST,
3569                 .fc_nlinfo.portid = 0,
3570                 .fc_nlinfo.nlh = NULL,
3571                 .fc_nlinfo.nl_net = net,
3572         };
3573
3574         cfg.fc_gateway = *gwaddr;
3575
3576         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3577                 struct fib6_table *table;
3578
3579                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3580                 if (table)
3581                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3582         }
3583
3584         return rt6_get_dflt_router(net, gwaddr, dev);
3585 }
3586
3587 static void __rt6_purge_dflt_routers(struct net *net,
3588                                      struct fib6_table *table)
3589 {
3590         struct fib6_info *rt;
3591
3592 restart:
3593         rcu_read_lock();
3594         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3595                 struct net_device *dev = fib6_info_nh_dev(rt);
3596                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3597
3598                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3599                     (!idev || idev->cnf.accept_ra != 2) &&
3600                     fib6_info_hold_safe(rt)) {
3601                         rcu_read_unlock();
3602                         ip6_del_rt(net, rt);
3603                         goto restart;
3604                 }
3605         }
3606         rcu_read_unlock();
3607
3608         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3609 }
3610
3611 void rt6_purge_dflt_routers(struct net *net)
3612 {
3613         struct fib6_table *table;
3614         struct hlist_head *head;
3615         unsigned int h;
3616
3617         rcu_read_lock();
3618
3619         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3620                 head = &net->ipv6.fib_table_hash[h];
3621                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3622                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3623                                 __rt6_purge_dflt_routers(net, table);
3624                 }
3625         }
3626
3627         rcu_read_unlock();
3628 }
3629
3630 static void rtmsg_to_fib6_config(struct net *net,
3631                                  struct in6_rtmsg *rtmsg,
3632                                  struct fib6_config *cfg)
3633 {
3634         memset(cfg, 0, sizeof(*cfg));
3635
3636         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3637                          : RT6_TABLE_MAIN;
3638         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3639         cfg->fc_metric = rtmsg->rtmsg_metric;
3640         cfg->fc_expires = rtmsg->rtmsg_info;
3641         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3642         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3643         cfg->fc_flags = rtmsg->rtmsg_flags;
3644         cfg->fc_type = rtmsg->rtmsg_type;
3645
3646         cfg->fc_nlinfo.nl_net = net;
3647
3648         cfg->fc_dst = rtmsg->rtmsg_dst;
3649         cfg->fc_src = rtmsg->rtmsg_src;
3650         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3651 }
3652
3653 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3654 {
3655         struct fib6_config cfg;
3656         struct in6_rtmsg rtmsg;
3657         int err;
3658
3659         switch (cmd) {
3660         case SIOCADDRT:         /* Add a route */
3661         case SIOCDELRT:         /* Delete a route */
3662                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3663                         return -EPERM;
3664                 err = copy_from_user(&rtmsg, arg,
3665                                      sizeof(struct in6_rtmsg));
3666                 if (err)
3667                         return -EFAULT;
3668
3669                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3670
3671                 rtnl_lock();
3672                 switch (cmd) {
3673                 case SIOCADDRT:
3674                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3675                         break;
3676                 case SIOCDELRT:
3677                         err = ip6_route_del(&cfg, NULL);
3678                         break;
3679                 default:
3680                         err = -EINVAL;
3681                 }
3682                 rtnl_unlock();
3683
3684                 return err;
3685         }
3686
3687         return -EINVAL;
3688 }
3689
3690 /*
3691  *      Drop the packet on the floor
3692  */
3693
3694 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3695 {
3696         int type;
3697         struct dst_entry *dst = skb_dst(skb);
3698         switch (ipstats_mib_noroutes) {
3699         case IPSTATS_MIB_INNOROUTES:
3700                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3701                 if (type == IPV6_ADDR_ANY) {
3702                         IP6_INC_STATS(dev_net(dst->dev),
3703                                       __in6_dev_get_safely(skb->dev),
3704                                       IPSTATS_MIB_INADDRERRORS);
3705                         break;
3706                 }
3707                 /* FALLTHROUGH */
3708         case IPSTATS_MIB_OUTNOROUTES:
3709                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3710                               ipstats_mib_noroutes);
3711                 break;
3712         }
3713         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3714         kfree_skb(skb);
3715         return 0;
3716 }
3717
3718 static int ip6_pkt_discard(struct sk_buff *skb)
3719 {
3720         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3721 }
3722
3723 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3724 {
3725         skb->dev = skb_dst(skb)->dev;
3726         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3727 }
3728
3729 static int ip6_pkt_prohibit(struct sk_buff *skb)
3730 {
3731         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3732 }
3733
3734 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3735 {
3736         skb->dev = skb_dst(skb)->dev;
3737         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3738 }
3739
3740 /*
3741  *      Allocate a dst for local (unicast / anycast) address.
3742  */
3743
3744 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3745                                      struct inet6_dev *idev,
3746                                      const struct in6_addr *addr,
3747                                      bool anycast, gfp_t gfp_flags)
3748 {
3749         u32 tb_id;
3750         struct net_device *dev = idev->dev;
3751         struct fib6_info *f6i;
3752
3753         f6i = fib6_info_alloc(gfp_flags);
3754         if (!f6i)
3755                 return ERR_PTR(-ENOMEM);
3756
3757         f6i->dst_nocount = true;
3758         f6i->dst_host = true;
3759         f6i->fib6_protocol = RTPROT_KERNEL;
3760         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3761         if (anycast) {
3762                 f6i->fib6_type = RTN_ANYCAST;
3763                 f6i->fib6_flags |= RTF_ANYCAST;
3764         } else {
3765                 f6i->fib6_type = RTN_LOCAL;
3766                 f6i->fib6_flags |= RTF_LOCAL;
3767         }
3768
3769         f6i->fib6_nh.nh_gw = *addr;
3770         dev_hold(dev);
3771         f6i->fib6_nh.nh_dev = dev;
3772         f6i->fib6_dst.addr = *addr;
3773         f6i->fib6_dst.plen = 128;
3774         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3775         f6i->fib6_table = fib6_get_table(net, tb_id);
3776
3777         return f6i;
3778 }
3779
/*
 * Argument bundle handed to fib6_remove_prefsrc() when a deleted address has
 * to be removed from the prefsrc of matching routes.
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* restrict to this nexthop device; NULL matches any */
	struct net *net;		/* namespace whose fib6_null_entry is skipped */
	struct in6_addr *addr;		/* address compared against the route's prefsrc */
};
3786
3787 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3788 {
3789         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3790         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3791         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3792
3793         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3794             rt != net->ipv6.fib6_null_entry &&
3795             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3796                 spin_lock_bh(&rt6_exception_lock);
3797                 /* remove prefsrc entry */
3798                 rt->fib6_prefsrc.plen = 0;
3799                 /* need to update cache as well */
3800                 rt6_exceptions_remove_prefsrc(rt);
3801                 spin_unlock_bh(&rt6_exception_lock);
3802         }
3803         return 0;
3804 }
3805
3806 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3807 {
3808         struct net *net = dev_net(ifp->idev->dev);
3809         struct arg_dev_net_ip adni = {
3810                 .dev = ifp->idev->dev,
3811                 .net = net,
3812                 .addr = &ifp->addr,
3813         };
3814         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3815 }
3816
3817 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3818
3819 /* Remove routers and update dst entries when gateway turn into host. */
3820 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3821 {
3822         struct in6_addr *gateway = (struct in6_addr *)arg;
3823
3824         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3825             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3826                 return -1;
3827         }
3828
3829         /* Further clean up cached routes in exception table.
3830          * This is needed because cached route may have a different
3831          * gateway than its 'parent' in the case of an ip redirect.
3832          */
3833         rt6_exceptions_clean_tohost(rt, gateway);
3834
3835         return 0;
3836 }
3837
3838 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3839 {
3840         fib6_clean_all(net, fib6_clean_tohost, gateway);
3841 }
3842
/*
 * Argument bundle for the fib6_clean_all() callbacks that react to device
 * state changes.  Only one member of the union is meaningful per walk.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the walk is about */
	union {
		unsigned int nh_flags;	/* fib6_ifup(): nexthop flags to clear */
		unsigned long event;	/* fib6_ifdown(): NETDEV_* event */
	};
};
3850
3851 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3852 {
3853         struct fib6_info *iter;
3854         struct fib6_node *fn;
3855
3856         fn = rcu_dereference_protected(rt->fib6_node,
3857                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3858         iter = rcu_dereference_protected(fn->leaf,
3859                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3860         while (iter) {
3861                 if (iter->fib6_metric == rt->fib6_metric &&
3862                     rt6_qualify_for_ecmp(iter))
3863                         return iter;
3864                 iter = rcu_dereference_protected(iter->fib6_next,
3865                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3866         }
3867
3868         return NULL;
3869 }
3870
3871 static bool rt6_is_dead(const struct fib6_info *rt)
3872 {
3873         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3874             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3875              fib6_ignore_linkdown(rt)))
3876                 return true;
3877
3878         return false;
3879 }
3880
3881 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3882 {
3883         struct fib6_info *iter;
3884         int total = 0;
3885
3886         if (!rt6_is_dead(rt))
3887                 total += rt->fib6_nh.nh_weight;
3888
3889         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3890                 if (!rt6_is_dead(iter))
3891                         total += iter->fib6_nh.nh_weight;
3892         }
3893
3894         return total;
3895 }
3896
3897 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3898 {
3899         int upper_bound = -1;
3900
3901         if (!rt6_is_dead(rt)) {
3902                 *weight += rt->fib6_nh.nh_weight;
3903                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3904                                                     total) - 1;
3905         }
3906         atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3907 }
3908
3909 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3910 {
3911         struct fib6_info *iter;
3912         int weight = 0;
3913
3914         rt6_upper_bound_set(rt, &weight, total);
3915
3916         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3917                 rt6_upper_bound_set(iter, &weight, total);
3918 }
3919
3920 void rt6_multipath_rebalance(struct fib6_info *rt)
3921 {
3922         struct fib6_info *first;
3923         int total;
3924
3925         /* In case the entire multipath route was marked for flushing,
3926          * then there is no need to rebalance upon the removal of every
3927          * sibling route.
3928          */
3929         if (!rt->fib6_nsiblings || rt->should_flush)
3930                 return;
3931
3932         /* During lookup routes are evaluated in order, so we need to
3933          * make sure upper bounds are assigned from the first sibling
3934          * onwards.
3935          */
3936         first = rt6_multipath_first_sibling(rt);
3937         if (WARN_ON_ONCE(!first))
3938                 return;
3939
3940         total = rt6_multipath_total_weight(first);
3941         rt6_multipath_upper_bound_set(first, total);
3942 }
3943
3944 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3945 {
3946         const struct arg_netdev_event *arg = p_arg;
3947         struct net *net = dev_net(arg->dev);
3948
3949         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3950                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3951                 fib6_update_sernum_upto_root(net, rt);
3952                 rt6_multipath_rebalance(rt);
3953         }
3954
3955         return 0;
3956 }
3957
3958 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3959 {
3960         struct arg_netdev_event arg = {
3961                 .dev = dev,
3962                 {
3963                         .nh_flags = nh_flags,
3964                 },
3965         };
3966
3967         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3968                 arg.nh_flags |= RTNH_F_LINKDOWN;
3969
3970         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3971 }
3972
3973 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3974                                    const struct net_device *dev)
3975 {
3976         struct fib6_info *iter;
3977
3978         if (rt->fib6_nh.nh_dev == dev)
3979                 return true;
3980         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3981                 if (iter->fib6_nh.nh_dev == dev)
3982                         return true;
3983
3984         return false;
3985 }
3986
3987 static void rt6_multipath_flush(struct fib6_info *rt)
3988 {
3989         struct fib6_info *iter;
3990
3991         rt->should_flush = 1;
3992         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3993                 iter->should_flush = 1;
3994 }
3995
3996 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3997                                              const struct net_device *down_dev)
3998 {
3999         struct fib6_info *iter;
4000         unsigned int dead = 0;
4001
4002         if (rt->fib6_nh.nh_dev == down_dev ||
4003             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4004                 dead++;
4005         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4006                 if (iter->fib6_nh.nh_dev == down_dev ||
4007                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4008                         dead++;
4009
4010         return dead;
4011 }
4012
4013 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4014                                        const struct net_device *dev,
4015                                        unsigned int nh_flags)
4016 {
4017         struct fib6_info *iter;
4018
4019         if (rt->fib6_nh.nh_dev == dev)
4020                 rt->fib6_nh.nh_flags |= nh_flags;
4021         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4022                 if (iter->fib6_nh.nh_dev == dev)
4023                         iter->fib6_nh.nh_flags |= nh_flags;
4024 }
4025
4026 /* called with write lock held for table with rt */
4027 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4028 {
4029         const struct arg_netdev_event *arg = p_arg;
4030         const struct net_device *dev = arg->dev;
4031         struct net *net = dev_net(dev);
4032
4033         if (rt == net->ipv6.fib6_null_entry)
4034                 return 0;
4035
4036         switch (arg->event) {
4037         case NETDEV_UNREGISTER:
4038                 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4039         case NETDEV_DOWN:
4040                 if (rt->should_flush)
4041                         return -1;
4042                 if (!rt->fib6_nsiblings)
4043                         return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4044                 if (rt6_multipath_uses_dev(rt, dev)) {
4045                         unsigned int count;
4046
4047                         count = rt6_multipath_dead_count(rt, dev);
4048                         if (rt->fib6_nsiblings + 1 == count) {
4049                                 rt6_multipath_flush(rt);
4050                                 return -1;
4051                         }
4052                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4053                                                    RTNH_F_LINKDOWN);
4054                         fib6_update_sernum(net, rt);
4055                         rt6_multipath_rebalance(rt);
4056                 }
4057                 return -2;
4058         case NETDEV_CHANGE:
4059                 if (rt->fib6_nh.nh_dev != dev ||
4060                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4061                         break;
4062                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4063                 rt6_multipath_rebalance(rt);
4064                 break;
4065         }
4066
4067         return 0;
4068 }
4069
4070 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4071 {
4072         struct arg_netdev_event arg = {
4073                 .dev = dev,
4074                 {
4075                         .event = event,
4076                 },
4077         };
4078
4079         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4080 }
4081
4082 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4083 {
4084         rt6_sync_down_dev(dev, event);
4085         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4086         neigh_ifdown(&nd_tbl, dev);
4087 }
4088
/* Argument bundle handed to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
4093
4094 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4095 {
4096         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4097         struct inet6_dev *idev;
4098
4099         /* In IPv6 pmtu discovery is not optional,
4100            so that RTAX_MTU lock cannot disable it.
4101            We still use this lock to block changes
4102            caused by addrconf/ndisc.
4103         */
4104
4105         idev = __in6_dev_get(arg->dev);
4106         if (!idev)
4107                 return 0;
4108
4109         /* For administrative MTU increase, there is no way to discover
4110            IPv6 PMTU increase, so PMTU increase should be updated here.
4111            Since RFC 1981 doesn't include administrative MTU increase
4112            update PMTU increase is a MUST. (i.e. jumbo frame)
4113          */
4114         if (rt->fib6_nh.nh_dev == arg->dev &&
4115             !fib6_metric_locked(rt, RTAX_MTU)) {
4116                 u32 mtu = rt->fib6_pmtu;
4117
4118                 if (mtu >= arg->mtu ||
4119                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4120                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4121
4122                 spin_lock_bh(&rt6_exception_lock);
4123                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4124                 spin_unlock_bh(&rt6_exception_lock);
4125         }
4126         return 0;
4127 }
4128
4129 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4130 {
4131         struct rt6_mtu_change_arg arg = {
4132                 .dev = dev,
4133                 .mtu = mtu,
4134         };
4135
4136         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4137 }
4138
4139 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4140         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4141         [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
4142         [RTA_OIF]               = { .type = NLA_U32 },
4143         [RTA_IIF]               = { .type = NLA_U32 },
4144         [RTA_PRIORITY]          = { .type = NLA_U32 },
4145         [RTA_METRICS]           = { .type = NLA_NESTED },
4146         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
4147         [RTA_PREF]              = { .type = NLA_U8 },
4148         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
4149         [RTA_ENCAP]             = { .type = NLA_NESTED },
4150         [RTA_EXPIRES]           = { .type = NLA_U32 },
4151         [RTA_UID]               = { .type = NLA_U32 },
4152         [RTA_MARK]              = { .type = NLA_U32 },
4153         [RTA_TABLE]             = { .type = NLA_U32 },
4154         [RTA_IP_PROTO]          = { .type = NLA_U8 },
4155         [RTA_SPORT]             = { .type = NLA_U16 },
4156         [RTA_DPORT]             = { .type = NLA_U16 },
4157 };
4158
4159 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4160                               struct fib6_config *cfg,
4161                               struct netlink_ext_ack *extack)
4162 {
4163         struct rtmsg *rtm;
4164         struct nlattr *tb[RTA_MAX+1];
4165         unsigned int pref;
4166         int err;
4167
4168         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4169                           NULL);
4170         if (err < 0)
4171                 goto errout;
4172
4173         err = -EINVAL;
4174         rtm = nlmsg_data(nlh);
4175         memset(cfg, 0, sizeof(*cfg));
4176
4177         cfg->fc_table = rtm->rtm_table;
4178         cfg->fc_dst_len = rtm->rtm_dst_len;
4179         cfg->fc_src_len = rtm->rtm_src_len;
4180         cfg->fc_flags = RTF_UP;
4181         cfg->fc_protocol = rtm->rtm_protocol;
4182         cfg->fc_type = rtm->rtm_type;
4183
4184         if (rtm->rtm_type == RTN_UNREACHABLE ||
4185             rtm->rtm_type == RTN_BLACKHOLE ||
4186             rtm->rtm_type == RTN_PROHIBIT ||
4187             rtm->rtm_type == RTN_THROW)
4188                 cfg->fc_flags |= RTF_REJECT;
4189
4190         if (rtm->rtm_type == RTN_LOCAL)
4191                 cfg->fc_flags |= RTF_LOCAL;
4192
4193         if (rtm->rtm_flags & RTM_F_CLONED)
4194                 cfg->fc_flags |= RTF_CACHE;
4195
4196         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4197
4198         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4199         cfg->fc_nlinfo.nlh = nlh;
4200         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4201
4202         if (tb[RTA_GATEWAY]) {
4203                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4204                 cfg->fc_flags |= RTF_GATEWAY;
4205         }
4206
4207         if (tb[RTA_DST]) {
4208                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4209
4210                 if (nla_len(tb[RTA_DST]) < plen)
4211                         goto errout;
4212
4213                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4214         }
4215
4216         if (tb[RTA_SRC]) {
4217                 int plen = (rtm->rtm_src_len + 7) >> 3;
4218
4219                 if (nla_len(tb[RTA_SRC]) < plen)
4220                         goto errout;
4221
4222                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4223         }
4224
4225         if (tb[RTA_PREFSRC])
4226                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4227
4228         if (tb[RTA_OIF])
4229                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4230
4231         if (tb[RTA_PRIORITY])
4232                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4233
4234         if (tb[RTA_METRICS]) {
4235                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4236                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4237         }
4238
4239         if (tb[RTA_TABLE])
4240                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4241
4242         if (tb[RTA_MULTIPATH]) {
4243                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4244                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4245
4246                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4247                                                      cfg->fc_mp_len, extack);
4248                 if (err < 0)
4249                         goto errout;
4250         }
4251
4252         if (tb[RTA_PREF]) {
4253                 pref = nla_get_u8(tb[RTA_PREF]);
4254                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4255                     pref != ICMPV6_ROUTER_PREF_HIGH)
4256                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4257                 cfg->fc_flags |= RTF_PREF(pref);
4258         }
4259
4260         if (tb[RTA_ENCAP])
4261                 cfg->fc_encap = tb[RTA_ENCAP];
4262
4263         if (tb[RTA_ENCAP_TYPE]) {
4264                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4265
4266                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4267                 if (err < 0)
4268                         goto errout;
4269         }
4270
4271         if (tb[RTA_EXPIRES]) {
4272                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4273
4274                 if (addrconf_finite_timeout(timeout)) {
4275                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4276                         cfg->fc_flags |= RTF_EXPIRES;
4277                 }
4278         }
4279
4280         err = 0;
4281 errout:
4282         return err;
4283 }
4284
/* One element of the temporary per-nexthop list that
 * ip6_route_multipath_add() builds while processing RTA_MULTIPATH.
 */
struct rt6_nh {
        /* route created for this nexthop */
        struct fib6_info *fib6_info;
        /* copy of the per-nexthop config; used for rollback and logging */
        struct fib6_config r_cfg;
        /* linkage in the caller's rt6_nh_list */
        struct list_head next;
};
4290
4291 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4292 {
4293         struct rt6_nh *nh;
4294
4295         list_for_each_entry(nh, rt6_nh_list, next) {
4296                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4297                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4298                         nh->r_cfg.fc_ifindex);
4299         }
4300 }
4301
4302 static int ip6_route_info_append(struct net *net,
4303                                  struct list_head *rt6_nh_list,
4304                                  struct fib6_info *rt,
4305                                  struct fib6_config *r_cfg)
4306 {
4307         struct rt6_nh *nh;
4308         int err = -EEXIST;
4309
4310         list_for_each_entry(nh, rt6_nh_list, next) {
4311                 /* check if fib6_info already exists */
4312                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4313                         return err;
4314         }
4315
4316         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4317         if (!nh)
4318                 return -ENOMEM;
4319         nh->fib6_info = rt;
4320         err = ip6_convert_metrics(net, rt, r_cfg);
4321         if (err) {
4322                 kfree(nh);
4323                 return err;
4324         }
4325         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4326         list_add_tail(&nh->next, rt6_nh_list);
4327
4328         return 0;
4329 }
4330
4331 static void ip6_route_mpath_notify(struct fib6_info *rt,
4332                                    struct fib6_info *rt_last,
4333                                    struct nl_info *info,
4334                                    __u16 nlflags)
4335 {
4336         /* if this is an APPEND route, then rt points to the first route
4337          * inserted and rt_last points to last route inserted. Userspace
4338          * wants a consistent dump of the route which starts at the first
4339          * nexthop. Since sibling routes are always added at the end of
4340          * the list, find the first sibling of the last route appended
4341          */
4342         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4343                 rt = list_first_entry(&rt_last->fib6_siblings,
4344                                       struct fib6_info,
4345                                       fib6_siblings);
4346         }
4347
4348         if (rt)
4349                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4350 }
4351
4352 static int ip6_route_multipath_add(struct fib6_config *cfg,
4353                                    struct netlink_ext_ack *extack)
4354 {
4355         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4356         struct nl_info *info = &cfg->fc_nlinfo;
4357         struct fib6_config r_cfg;
4358         struct rtnexthop *rtnh;
4359         struct fib6_info *rt;
4360         struct rt6_nh *err_nh;
4361         struct rt6_nh *nh, *nh_safe;
4362         __u16 nlflags;
4363         int remaining;
4364         int attrlen;
4365         int err = 1;
4366         int nhn = 0;
4367         int replace = (cfg->fc_nlinfo.nlh &&
4368                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4369         LIST_HEAD(rt6_nh_list);
4370
4371         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4372         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4373                 nlflags |= NLM_F_APPEND;
4374
4375         remaining = cfg->fc_mp_len;
4376         rtnh = (struct rtnexthop *)cfg->fc_mp;
4377
4378         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4379          * fib6_info structs per nexthop
4380          */
4381         while (rtnh_ok(rtnh, remaining)) {
4382                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4383                 if (rtnh->rtnh_ifindex)
4384                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4385
4386                 attrlen = rtnh_attrlen(rtnh);
4387                 if (attrlen > 0) {
4388                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4389
4390                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4391                         if (nla) {
4392                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4393                                 r_cfg.fc_flags |= RTF_GATEWAY;
4394                         }
4395                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4396                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4397                         if (nla)
4398                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4399                 }
4400
4401                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4402                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4403                 if (IS_ERR(rt)) {
4404                         err = PTR_ERR(rt);
4405                         rt = NULL;
4406                         goto cleanup;
4407                 }
4408                 if (!rt6_qualify_for_ecmp(rt)) {
4409                         err = -EINVAL;
4410                         NL_SET_ERR_MSG(extack,
4411                                        "Device only routes can not be added for IPv6 using the multipath API.");
4412                         fib6_info_release(rt);
4413                         goto cleanup;
4414                 }
4415
4416                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4417
4418                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4419                                             rt, &r_cfg);
4420                 if (err) {
4421                         fib6_info_release(rt);
4422                         goto cleanup;
4423                 }
4424
4425                 rtnh = rtnh_next(rtnh, &remaining);
4426         }
4427
4428         /* for add and replace send one notification with all nexthops.
4429          * Skip the notification in fib6_add_rt2node and send one with
4430          * the full route when done
4431          */
4432         info->skip_notify = 1;
4433
4434         err_nh = NULL;
4435         list_for_each_entry(nh, &rt6_nh_list, next) {
4436                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4437                 fib6_info_release(nh->fib6_info);
4438
4439                 if (!err) {
4440                         /* save reference to last route successfully inserted */
4441                         rt_last = nh->fib6_info;
4442
4443                         /* save reference to first route for notification */
4444                         if (!rt_notif)
4445                                 rt_notif = nh->fib6_info;
4446                 }
4447
4448                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4449                 nh->fib6_info = NULL;
4450                 if (err) {
4451                         if (replace && nhn)
4452                                 ip6_print_replace_route_err(&rt6_nh_list);
4453                         err_nh = nh;
4454                         goto add_errout;
4455                 }
4456
4457                 /* Because each route is added like a single route we remove
4458                  * these flags after the first nexthop: if there is a collision,
4459                  * we have already failed to add the first nexthop:
4460                  * fib6_add_rt2node() has rejected it; when replacing, old
4461                  * nexthops have been replaced by first new, the rest should
4462                  * be added to it.
4463                  */
4464                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4465                                                      NLM_F_REPLACE);
4466                 nhn++;
4467         }
4468
4469         /* success ... tell user about new route */
4470         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4471         goto cleanup;
4472
4473 add_errout:
4474         /* send notification for routes that were added so that
4475          * the delete notifications sent by ip6_route_del are
4476          * coherent
4477          */
4478         if (rt_notif)
4479                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4480
4481         /* Delete routes that were already added */
4482         list_for_each_entry(nh, &rt6_nh_list, next) {
4483                 if (err_nh == nh)
4484                         break;
4485                 ip6_route_del(&nh->r_cfg, extack);
4486         }
4487
4488 cleanup:
4489         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4490                 if (nh->fib6_info)
4491                         fib6_info_release(nh->fib6_info);
4492                 list_del(&nh->next);
4493                 kfree(nh);
4494         }
4495
4496         return err;
4497 }
4498
4499 static int ip6_route_multipath_del(struct fib6_config *cfg,
4500                                    struct netlink_ext_ack *extack)
4501 {
4502         struct fib6_config r_cfg;
4503         struct rtnexthop *rtnh;
4504         int remaining;
4505         int attrlen;
4506         int err = 1, last_err = 0;
4507
4508         remaining = cfg->fc_mp_len;
4509         rtnh = (struct rtnexthop *)cfg->fc_mp;
4510
4511         /* Parse a Multipath Entry */
4512         while (rtnh_ok(rtnh, remaining)) {
4513                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4514                 if (rtnh->rtnh_ifindex)
4515                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4516
4517                 attrlen = rtnh_attrlen(rtnh);
4518                 if (attrlen > 0) {
4519                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4520
4521                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4522                         if (nla) {
4523                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4524                                 r_cfg.fc_flags |= RTF_GATEWAY;
4525                         }
4526                 }
4527                 err = ip6_route_del(&r_cfg, extack);
4528                 if (err)
4529                         last_err = err;
4530
4531                 rtnh = rtnh_next(rtnh, &remaining);
4532         }
4533
4534         return last_err;
4535 }
4536
4537 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4538                               struct netlink_ext_ack *extack)
4539 {
4540         struct fib6_config cfg;
4541         int err;
4542
4543         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4544         if (err < 0)
4545                 return err;
4546
4547         if (cfg.fc_mp)
4548                 return ip6_route_multipath_del(&cfg, extack);
4549         else {
4550                 cfg.fc_delete_all_nh = 1;
4551                 return ip6_route_del(&cfg, extack);
4552         }
4553 }
4554
4555 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4556                               struct netlink_ext_ack *extack)
4557 {
4558         struct fib6_config cfg;
4559         int err;
4560
4561         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4562         if (err < 0)
4563                 return err;
4564
4565         if (cfg.fc_mp)
4566                 return ip6_route_multipath_add(&cfg, extack);
4567         else
4568                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4569 }
4570
4571 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4572 {
4573         int nexthop_len = 0;
4574
4575         if (rt->fib6_nsiblings) {
4576                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4577                             + NLA_ALIGN(sizeof(struct rtnexthop))
4578                             + nla_total_size(16) /* RTA_GATEWAY */
4579                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4580
4581                 nexthop_len *= rt->fib6_nsiblings;
4582         }
4583
4584         return NLMSG_ALIGN(sizeof(struct rtmsg))
4585                + nla_total_size(16) /* RTA_SRC */
4586                + nla_total_size(16) /* RTA_DST */
4587                + nla_total_size(16) /* RTA_GATEWAY */
4588                + nla_total_size(16) /* RTA_PREFSRC */
4589                + nla_total_size(4) /* RTA_TABLE */
4590                + nla_total_size(4) /* RTA_IIF */
4591                + nla_total_size(4) /* RTA_OIF */
4592                + nla_total_size(4) /* RTA_PRIORITY */
4593                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4594                + nla_total_size(sizeof(struct rta_cacheinfo))
4595                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4596                + nla_total_size(1) /* RTA_PREF */
4597                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4598                + nexthop_len;
4599 }
4600
4601 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4602                             unsigned int *flags, bool skip_oif)
4603 {
4604         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4605                 *flags |= RTNH_F_DEAD;
4606
4607         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4608                 *flags |= RTNH_F_LINKDOWN;
4609
4610                 rcu_read_lock();
4611                 if (fib6_ignore_linkdown(rt))
4612                         *flags |= RTNH_F_DEAD;
4613                 rcu_read_unlock();
4614         }
4615
4616         if (rt->fib6_flags & RTF_GATEWAY) {
4617                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4618                         goto nla_put_failure;
4619         }
4620
4621         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4622         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4623                 *flags |= RTNH_F_OFFLOAD;
4624
4625         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4626         if (!skip_oif && rt->fib6_nh.nh_dev &&
4627             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4628                 goto nla_put_failure;
4629
4630         if (rt->fib6_nh.nh_lwtstate &&
4631             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4632                 goto nla_put_failure;
4633
4634         return 0;
4635
4636 nla_put_failure:
4637         return -EMSGSIZE;
4638 }
4639
4640 /* add multipath next hop */
4641 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4642 {
4643         const struct net_device *dev = rt->fib6_nh.nh_dev;
4644         struct rtnexthop *rtnh;
4645         unsigned int flags = 0;
4646
4647         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4648         if (!rtnh)
4649                 goto nla_put_failure;
4650
4651         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4652         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4653
4654         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4655                 goto nla_put_failure;
4656
4657         rtnh->rtnh_flags = flags;
4658
4659         /* length of rtnetlink header + attributes */
4660         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4661
4662         return 0;
4663
4664 nla_put_failure:
4665         return -EMSGSIZE;
4666 }
4667
/* Build one rtnetlink route message describing @rt in @skb.
 *
 * @net:    network namespace, used for multicast and source address lookups
 * @skb:    message buffer being filled
 * @rt:     FIB entry to describe
 * @dst:    optional dst entry; when non-NULL its metrics, expiry and error
 *          are reported instead of the values stored in @rt
 * @dest:   optional destination address; when non-NULL it is emitted as
 *          RTA_DST and the reported prefix length is forced to 128
 * @src:    same as @dest for RTA_SRC (only with CONFIG_IPV6_SUBTREES)
 * @iif:    input interface index, 0 if none
 * @type:   netlink message type
 * @portid: netlink port id for the message header
 * @seq:    netlink sequence number for the message header
 * @flags:  netlink message flags
 *
 * Returns 0 on success.  Returns -EMSGSIZE if the header or any attribute
 * does not fit; in the attribute case the partially built message is
 * cancelled.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                         struct fib6_info *rt, struct dst_entry *dst,
                         struct in6_addr *dest, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags)
{
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires = 0;
        u32 *pmetrics;
        u32 table;

        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->fib6_dst.plen;
        rtm->rtm_src_len = rt->fib6_src.plen;
        rtm->rtm_tos = 0;
        if (rt->fib6_table)
                table = rt->fib6_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* the table id is reported both in the header and as RTA_TABLE */
        rtm->rtm_table = table;
        if (nla_put_u32(skb, RTA_TABLE, table))
                goto nla_put_failure;

        rtm->rtm_type = rt->fib6_type;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        rtm->rtm_protocol = rt->fib6_protocol;

        if (rt->fib6_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* an explicit @dest overrides the route's own prefix */
        if (dest) {
                if (nla_put_in6_addr(skb, RTA_DST, dest))
                        goto nla_put_failure;
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
                        goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                if (nla_put_in6_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len &&
                   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
                goto nla_put_failure;
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, portid);

                        /* 0 ends processing here with success, a negative
                         * value is treated as failure, a positive value
                         * continues with the attributes below
                         */
                        if (err == 0)
                                return 0;
                        if (err < 0)
                                goto nla_put_failure;
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, iif))
                                goto nla_put_failure;
        } else if (dest) {
                /* no input interface: report the source address selected
                 * for @dest, if one can be determined
                 */
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
                    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        if (rt->fib6_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->fib6_prefsrc.addr;
                if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        /* metrics come from @dst when one was supplied, else from @rt */
        pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
        if (rtnetlink_put_metrics(skb, pmetrics) < 0)
                goto nla_put_failure;

        if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
                goto nla_put_failure;

        /* For multipath routes, walk the siblings list and add
         * each as a nexthop within RTA_MULTIPATH.
         */
        if (rt->fib6_nsiblings) {
                struct fib6_info *sibling, *next_sibling;
                struct nlattr *mp;

                mp = nla_nest_start(skb, RTA_MULTIPATH);
                if (!mp)
                        goto nla_put_failure;

                /* @rt itself is the first nexthop, followed by its siblings */
                if (rt6_add_nexthop(skb, rt) < 0)
                        goto nla_put_failure;

                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings, fib6_siblings) {
                        if (rt6_add_nexthop(skb, sibling) < 0)
                                goto nla_put_failure;
                }

                nla_nest_end(skb, mp);
        } else {
                /* single nexthop: attributes go directly into the message
                 * and nexthop flags into rtm_flags
                 */
                if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
                        goto nla_put_failure;
        }

        /* expiry is reported relative to the current jiffies value */
        if (rt->fib6_flags & RTF_EXPIRES) {
                expires = dst ? dst->expires : rt->expires;
                expires -= jiffies;
        }

        if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
                goto nla_put_failure;

        if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
                goto nla_put_failure;


        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
4800
4801 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4802 {
4803         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4804         struct net *net = arg->net;
4805
4806         if (rt == net->ipv6.fib6_null_entry)
4807                 return 0;
4808
4809         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4810                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4811
4812                 /* user wants prefix routes only */
4813                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4814                     !(rt->fib6_flags & RTF_PREFIX_RT)) {
4815                         /* success since this is not a prefix route */
4816                         return 1;
4817                 }
4818         }
4819
4820         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4821                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4822                              arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4823 }
4824
4825 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4826                               struct netlink_ext_ack *extack)
4827 {
4828         struct net *net = sock_net(in_skb->sk);
4829         struct nlattr *tb[RTA_MAX+1];
4830         int err, iif = 0, oif = 0;
4831         struct fib6_info *from;
4832         struct dst_entry *dst;
4833         struct rt6_info *rt;
4834         struct sk_buff *skb;
4835         struct rtmsg *rtm;
4836         struct flowi6 fl6;
4837         bool fibmatch;
4838
4839         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4840                           extack);
4841         if (err < 0)
4842                 goto errout;
4843
4844         err = -EINVAL;
4845         memset(&fl6, 0, sizeof(fl6));
4846         rtm = nlmsg_data(nlh);
4847         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4848         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4849
4850         if (tb[RTA_SRC]) {
4851                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4852                         goto errout;
4853
4854                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4855         }
4856
4857         if (tb[RTA_DST]) {
4858                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4859                         goto errout;
4860
4861                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4862         }
4863
4864         if (tb[RTA_IIF])
4865                 iif = nla_get_u32(tb[RTA_IIF]);
4866
4867         if (tb[RTA_OIF])
4868                 oif = nla_get_u32(tb[RTA_OIF]);
4869
4870         if (tb[RTA_MARK])
4871                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4872
4873         if (tb[RTA_UID])
4874                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4875                                            nla_get_u32(tb[RTA_UID]));
4876         else
4877                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4878
4879         if (tb[RTA_SPORT])
4880                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4881
4882         if (tb[RTA_DPORT])
4883                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4884
4885         if (tb[RTA_IP_PROTO]) {
4886                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4887                                                   &fl6.flowi6_proto, extack);
4888                 if (err)
4889                         goto errout;
4890         }
4891
4892         if (iif) {
4893                 struct net_device *dev;
4894                 int flags = 0;
4895
4896                 rcu_read_lock();
4897
4898                 dev = dev_get_by_index_rcu(net, iif);
4899                 if (!dev) {
4900                         rcu_read_unlock();
4901                         err = -ENODEV;
4902                         goto errout;
4903                 }
4904
4905                 fl6.flowi6_iif = iif;
4906
4907                 if (!ipv6_addr_any(&fl6.saddr))
4908                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4909
4910                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4911
4912                 rcu_read_unlock();
4913         } else {
4914                 fl6.flowi6_oif = oif;
4915
4916                 dst = ip6_route_output(net, NULL, &fl6);
4917         }
4918
4919
4920         rt = container_of(dst, struct rt6_info, dst);
4921         if (rt->dst.error) {
4922                 err = rt->dst.error;
4923                 ip6_rt_put(rt);
4924                 goto errout;
4925         }
4926
4927         if (rt == net->ipv6.ip6_null_entry) {
4928                 err = rt->dst.error;
4929                 ip6_rt_put(rt);
4930                 goto errout;
4931         }
4932
4933         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4934         if (!skb) {
4935                 ip6_rt_put(rt);
4936                 err = -ENOBUFS;
4937                 goto errout;
4938         }
4939
4940         skb_dst_set(skb, &rt->dst);
4941
4942         rcu_read_lock();
4943         from = rcu_dereference(rt->from);
4944
4945         if (fibmatch)
4946                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4947                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4948                                     nlh->nlmsg_seq, 0);
4949         else
4950                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4951                                     &fl6.saddr, iif, RTM_NEWROUTE,
4952                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4953                                     0);
4954         rcu_read_unlock();
4955
4956         if (err < 0) {
4957                 kfree_skb(skb);
4958                 goto errout;
4959         }
4960
4961         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4962 errout:
4963         return err;
4964 }
4965
4966 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4967                      unsigned int nlm_flags)
4968 {
4969         struct sk_buff *skb;
4970         struct net *net = info->nl_net;
4971         u32 seq;
4972         int err;
4973
4974         err = -ENOBUFS;
4975         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4976
4977         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4978         if (!skb)
4979                 goto errout;
4980
4981         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4982                             event, info->portid, seq, nlm_flags);
4983         if (err < 0) {
4984                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4985                 WARN_ON(err == -EMSGSIZE);
4986                 kfree_skb(skb);
4987                 goto errout;
4988         }
4989         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4990                     info->nlh, gfp_any());
4991         return;
4992 errout:
4993         if (err < 0)
4994                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4995 }
4996
4997 static int ip6_route_dev_notify(struct notifier_block *this,
4998                                 unsigned long event, void *ptr)
4999 {
5000         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5001         struct net *net = dev_net(dev);
5002
5003         if (!(dev->flags & IFF_LOOPBACK))
5004                 return NOTIFY_OK;
5005
5006         if (event == NETDEV_REGISTER) {
5007                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5008                 net->ipv6.ip6_null_entry->dst.dev = dev;
5009                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5010 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5011                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5012                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5013                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5014                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5015 #endif
5016          } else if (event == NETDEV_UNREGISTER &&
5017                     dev->reg_state != NETREG_UNREGISTERED) {
5018                 /* NETDEV_UNREGISTER could be fired for multiple times by
5019                  * netdev_wait_allrefs(). Make sure we only call this once.
5020                  */
5021                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5022 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5023                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5024                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5025 #endif
5026         }
5027
5028         return NOTIFY_OK;
5029 }
5030
5031 /*
5032  *      /proc
5033  */
5034
5035 #ifdef CONFIG_PROC_FS
5036 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5037 {
5038         struct net *net = (struct net *)seq->private;
5039         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5040                    net->ipv6.rt6_stats->fib_nodes,
5041                    net->ipv6.rt6_stats->fib_route_nodes,
5042                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5043                    net->ipv6.rt6_stats->fib_rt_entries,
5044                    net->ipv6.rt6_stats->fib_rt_cache,
5045                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5046                    net->ipv6.rt6_stats->fib_discarded_routes);
5047
5048         return 0;
5049 }
5050 #endif  /* CONFIG_PROC_FS */
5051
5052 #ifdef CONFIG_SYSCTL
5053
5054 static
5055 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5056                               void __user *buffer, size_t *lenp, loff_t *ppos)
5057 {
5058         struct net *net;
5059         int delay;
5060         if (!write)
5061                 return -EINVAL;
5062
5063         net = (struct net *)ctl->extra1;
5064         delay = net->ipv6.sysctl.flush_delay;
5065         proc_dointvec(ctl, write, buffer, lenp, ppos);
5066         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5067         return 0;
5068 }
5069
5070 struct ctl_table ipv6_route_table_template[] = {
5071         {
5072                 .procname       =       "flush",
5073                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5074                 .maxlen         =       sizeof(int),
5075                 .mode           =       0200,
5076                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5077         },
5078         {
5079                 .procname       =       "gc_thresh",
5080                 .data           =       &ip6_dst_ops_template.gc_thresh,
5081                 .maxlen         =       sizeof(int),
5082                 .mode           =       0644,
5083                 .proc_handler   =       proc_dointvec,
5084         },
5085         {
5086                 .procname       =       "max_size",
5087                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5088                 .maxlen         =       sizeof(int),
5089                 .mode           =       0644,
5090                 .proc_handler   =       proc_dointvec,
5091         },
5092         {
5093                 .procname       =       "gc_min_interval",
5094                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5095                 .maxlen         =       sizeof(int),
5096                 .mode           =       0644,
5097                 .proc_handler   =       proc_dointvec_jiffies,
5098         },
5099         {
5100                 .procname       =       "gc_timeout",
5101                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5102                 .maxlen         =       sizeof(int),
5103                 .mode           =       0644,
5104                 .proc_handler   =       proc_dointvec_jiffies,
5105         },
5106         {
5107                 .procname       =       "gc_interval",
5108                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5109                 .maxlen         =       sizeof(int),
5110                 .mode           =       0644,
5111                 .proc_handler   =       proc_dointvec_jiffies,
5112         },
5113         {
5114                 .procname       =       "gc_elasticity",
5115                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5116                 .maxlen         =       sizeof(int),
5117                 .mode           =       0644,
5118                 .proc_handler   =       proc_dointvec,
5119         },
5120         {
5121                 .procname       =       "mtu_expires",
5122                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5123                 .maxlen         =       sizeof(int),
5124                 .mode           =       0644,
5125                 .proc_handler   =       proc_dointvec_jiffies,
5126         },
5127         {
5128                 .procname       =       "min_adv_mss",
5129                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5130                 .maxlen         =       sizeof(int),
5131                 .mode           =       0644,
5132                 .proc_handler   =       proc_dointvec,
5133         },
5134         {
5135                 .procname       =       "gc_min_interval_ms",
5136                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5137                 .maxlen         =       sizeof(int),
5138                 .mode           =       0644,
5139                 .proc_handler   =       proc_dointvec_ms_jiffies,
5140         },
5141         { }
5142 };
5143
5144 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5145 {
5146         struct ctl_table *table;
5147
5148         table = kmemdup(ipv6_route_table_template,
5149                         sizeof(ipv6_route_table_template),
5150                         GFP_KERNEL);
5151
5152         if (table) {
5153                 table[0].data = &net->ipv6.sysctl.flush_delay;
5154                 table[0].extra1 = net;
5155                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5156                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5157                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5158                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5159                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5160                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5161                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5162                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5163                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5164
5165                 /* Don't export sysctls to unprivileged users */
5166                 if (net->user_ns != &init_user_ns)
5167                         table[0].procname = NULL;
5168         }
5169
5170         return table;
5171 }
5172 #endif
5173
5174 static int __net_init ip6_route_net_init(struct net *net)
5175 {
5176         int ret = -ENOMEM;
5177
5178         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5179                sizeof(net->ipv6.ip6_dst_ops));
5180
5181         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5182                 goto out_ip6_dst_ops;
5183
5184         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5185                                             sizeof(*net->ipv6.fib6_null_entry),
5186                                             GFP_KERNEL);
5187         if (!net->ipv6.fib6_null_entry)
5188                 goto out_ip6_dst_entries;
5189
5190         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5191                                            sizeof(*net->ipv6.ip6_null_entry),
5192                                            GFP_KERNEL);
5193         if (!net->ipv6.ip6_null_entry)
5194                 goto out_fib6_null_entry;
5195         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5196         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5197                          ip6_template_metrics, true);
5198
5199 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5200         net->ipv6.fib6_has_custom_rules = false;
5201         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5202                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5203                                                GFP_KERNEL);
5204         if (!net->ipv6.ip6_prohibit_entry)
5205                 goto out_ip6_null_entry;
5206         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5207         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5208                          ip6_template_metrics, true);
5209
5210         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5211                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5212                                                GFP_KERNEL);
5213         if (!net->ipv6.ip6_blk_hole_entry)
5214                 goto out_ip6_prohibit_entry;
5215         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5216         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5217                          ip6_template_metrics, true);
5218 #endif
5219
5220         net->ipv6.sysctl.flush_delay = 0;
5221         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5222         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5223         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5224         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5225         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5226         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5227         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5228
5229         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5230
5231         ret = 0;
5232 out:
5233         return ret;
5234
5235 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5236 out_ip6_prohibit_entry:
5237         kfree(net->ipv6.ip6_prohibit_entry);
5238 out_ip6_null_entry:
5239         kfree(net->ipv6.ip6_null_entry);
5240 #endif
5241 out_fib6_null_entry:
5242         kfree(net->ipv6.fib6_null_entry);
5243 out_ip6_dst_entries:
5244         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5245 out_ip6_dst_ops:
5246         goto out;
5247 }
5248
5249 static void __net_exit ip6_route_net_exit(struct net *net)
5250 {
5251         kfree(net->ipv6.fib6_null_entry);
5252         kfree(net->ipv6.ip6_null_entry);
5253 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5254         kfree(net->ipv6.ip6_prohibit_entry);
5255         kfree(net->ipv6.ip6_blk_hole_entry);
5256 #endif
5257         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5258 }
5259
5260 static int __net_init ip6_route_net_init_late(struct net *net)
5261 {
5262 #ifdef CONFIG_PROC_FS
5263         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5264                         sizeof(struct ipv6_route_iter));
5265         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5266                         rt6_stats_seq_show, NULL);
5267 #endif
5268         return 0;
5269 }
5270
5271 static void __net_exit ip6_route_net_exit_late(struct net *net)
5272 {
5273 #ifdef CONFIG_PROC_FS
5274         remove_proc_entry("ipv6_route", net->proc_net);
5275         remove_proc_entry("rt6_stats", net->proc_net);
5276 #endif
5277 }
5278
5279 static struct pernet_operations ip6_route_net_ops = {
5280         .init = ip6_route_net_init,
5281         .exit = ip6_route_net_exit,
5282 };
5283
5284 static int __net_init ipv6_inetpeer_init(struct net *net)
5285 {
5286         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5287
5288         if (!bp)
5289                 return -ENOMEM;
5290         inet_peer_base_init(bp);
5291         net->ipv6.peers = bp;
5292         return 0;
5293 }
5294
5295 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5296 {
5297         struct inet_peer_base *bp = net->ipv6.peers;
5298
5299         net->ipv6.peers = NULL;
5300         inetpeer_invalidate_tree(bp);
5301         kfree(bp);
5302 }
5303
5304 static struct pernet_operations ipv6_inetpeer_ops = {
5305         .init   =       ipv6_inetpeer_init,
5306         .exit   =       ipv6_inetpeer_exit,
5307 };
5308
5309 static struct pernet_operations ip6_route_net_late_ops = {
5310         .init = ip6_route_net_init_late,
5311         .exit = ip6_route_net_exit_late,
5312 };
5313
5314 static struct notifier_block ip6_route_dev_notifier = {
5315         .notifier_call = ip6_route_dev_notify,
5316         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5317 };
5318
5319 void __init ip6_route_init_special_entries(void)
5320 {
5321         /* Registering of the loopback is done before this portion of code,
5322          * the loopback reference in rt6_info will not be taken, do it
5323          * manually for init_net */
5324         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5325         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5326         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5327   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5328         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5329         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5330         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5331         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5332   #endif
5333 }
5334
5335 int __init ip6_route_init(void)
5336 {
5337         int ret;
5338         int cpu;
5339
5340         ret = -ENOMEM;
5341         ip6_dst_ops_template.kmem_cachep =
5342                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5343                                   SLAB_HWCACHE_ALIGN, NULL);
5344         if (!ip6_dst_ops_template.kmem_cachep)
5345                 goto out;
5346
5347         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5348         if (ret)
5349                 goto out_kmem_cache;
5350
5351         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5352         if (ret)
5353                 goto out_dst_entries;
5354
5355         ret = register_pernet_subsys(&ip6_route_net_ops);
5356         if (ret)
5357                 goto out_register_inetpeer;
5358
5359         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5360
5361         ret = fib6_init();
5362         if (ret)
5363                 goto out_register_subsys;
5364
5365         ret = xfrm6_init();
5366         if (ret)
5367                 goto out_fib6_init;
5368
5369         ret = fib6_rules_init();
5370         if (ret)
5371                 goto xfrm6_init;
5372
5373         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5374         if (ret)
5375                 goto fib6_rules_init;
5376
5377         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5378                                    inet6_rtm_newroute, NULL, 0);
5379         if (ret < 0)
5380                 goto out_register_late_subsys;
5381
5382         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5383                                    inet6_rtm_delroute, NULL, 0);
5384         if (ret < 0)
5385                 goto out_register_late_subsys;
5386
5387         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5388                                    inet6_rtm_getroute, NULL,
5389                                    RTNL_FLAG_DOIT_UNLOCKED);
5390         if (ret < 0)
5391                 goto out_register_late_subsys;
5392
5393         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5394         if (ret)
5395                 goto out_register_late_subsys;
5396
5397         for_each_possible_cpu(cpu) {
5398                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5399
5400                 INIT_LIST_HEAD(&ul->head);
5401                 spin_lock_init(&ul->lock);
5402         }
5403
5404 out:
5405         return ret;
5406
5407 out_register_late_subsys:
5408         rtnl_unregister_all(PF_INET6);
5409         unregister_pernet_subsys(&ip6_route_net_late_ops);
5410 fib6_rules_init:
5411         fib6_rules_cleanup();
5412 xfrm6_init:
5413         xfrm6_fini();
5414 out_fib6_init:
5415         fib6_gc_cleanup();
5416 out_register_subsys:
5417         unregister_pernet_subsys(&ip6_route_net_ops);
5418 out_register_inetpeer:
5419         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5420 out_dst_entries:
5421         dst_entries_destroy(&ip6_dst_blackhole_ops);
5422 out_kmem_cache:
5423         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5424         goto out;
5425 }
5426
5427 void ip6_route_cleanup(void)
5428 {
5429         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5430         unregister_pernet_subsys(&ip6_route_net_late_ops);
5431         fib6_rules_cleanup();
5432         xfrm6_fini();
5433         fib6_gc_cleanup();
5434         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5435         unregister_pernet_subsys(&ip6_route_net_ops);
5436         dst_entries_destroy(&ip6_dst_blackhole_ops);
5437         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5438 }