Merge branch '40GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net...
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Result of a next-hop reachability check (see rt6_check_neigh() and
 * rt6_score_route()); negative values are failures of decreasing severity.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable for this lookup */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED; worth probing */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; try round-robin */
	RT6_NUD_SUCCEED = 1		/* next hop (probably) reachable */
};
86
/* Forward declarations for the dst_ops callbacks and lookup helpers
 * defined later in this file.
 */
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
/* RFC 4191 Route Information Option support (rt6_route_rcv() below). */
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif
127
/* Per-cpu list of rt6_info entries that are not attached to a fib6 tree
 * node; entries are added by rt6_uncached_list_add() and walked on
 * device removal by rt6_uncached_list_flush_dev().
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
/* Attach @rt to the current CPU's uncached list.  The entry records which
 * list it joined so rt6_uncached_list_del() can remove it from any CPU.
 * NOTE(review): the fib_rt_uncache counter decremented on removal is
 * presumably incremented by the callers of this function - confirm at
 * the call sites.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
145
/* Remove @rt from the uncached list it was added to, if any, and drop
 * the per-netns uncached-route statistic.  Safe to call on entries that
 * were never added (list node is initialized empty in rt6_info_init()).
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
158
/* Device-teardown helper: retarget every uncached rt6_info that still
 * references @dev to the netns loopback device, fixing up both the
 * inet6_dev and net_device references.  Called when @dev goes away;
 * nothing to do when @dev is already the loopback device.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* Swap the idev reference before releasing the old one. */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* Take the new device reference before dropping the old. */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213
214         n = neigh_create(&nd_tbl, daddr, dev);
215         return IS_ERR(n) ? NULL : n;
216 }
217
218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
219                                               struct sk_buff *skb,
220                                               const void *daddr)
221 {
222         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
223
224         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
225 }
226
/* dst_ops->confirm_neigh callback: confirm reachability of the neighbour
 * for this dst.  Skips devices that do not do neighbour resolution and
 * multicast destinations, which have no neighbour entry to confirm.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
241
/* dst_ops template for regular IPv6 routes; copied into each netns'
 * ipv6.ip6_dst_ops.  The callbacks are the static functions declared at
 * the top of this file.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
260
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262 {
263         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
264
265         return mtu ? : dst->dev->mtu;
266 }
267
/* Blackhole dsts deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
272
/* Blackhole dsts deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
277
/* dst_ops for blackhole routes: same lookup/metrics plumbing as the
 * regular template, but PMTU updates and redirects are no-ops.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
289
/* Metrics template for the special route entries below; hop limit 0
 * means "use the per-device/namespace default".
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
293
/* Template for the per-netns fib6_null_entry: a reject route returned
 * when a lookup matches nothing.  Worst possible metric so it never
 * wins a comparison against a real route.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
302
/* dst-level counterpart of fib6_null_entry: discards traffic with
 * -ENETUNREACH.  Starts with an elevated refcount so it is never freed.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
314
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Policy-routing "prohibit" action: reject traffic with -EACCES
 * (administratively prohibited).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/* Policy-routing "blackhole" action: drop traffic silently via the
 * generic dst_discard handlers.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
342
/* Zero everything in the rt6_info that follows the embedded dst_entry
 * (dst_alloc already initialized the dst itself), then make the uncached
 * list node self-consistent so rt6_uncached_list_del() is always safe.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
350
/* Allocate an rt6_info from this netns' ip6_dst_ops, initialize the
 * non-dst portion, and bump the per-netns allocation statistic.
 * Returns NULL on allocation failure.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
366
/* dst_ops->destroy callback: release everything an rt6_info holds -
 * metrics, uncached-list membership, the idev reference, and the RCU-
 * protected back-pointer to its originating fib6_info.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* Clear rt->from under RCU so concurrent readers see either the
	 * old fib6_info or NULL, then drop our reference to it.
	 */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
388
/* dst_ops->ifdown callback: when the route's device goes down, repoint
 * the idev reference at the netns loopback device (taking a new ref
 * before dropping the old one).
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
405
406 static bool __rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES)
409                 return time_after(jiffies, rt->dst.expires);
410         else
411                 return false;
412 }
413
/* Like __rt6_check_expired() but, for routes without their own expiry,
 * also consider the originating fib6_info: the cached dst is stale when
 * it is no longer marked DST_OBSOLETE_FORCE_CHK or the fib entry itself
 * has expired.  Caller must hold rcu_read_lock (rt->from is
 * rcu_dereference'd).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
429
/* Select one route out of an ECMP group by flow hash.  @match is the
 * first route of the group; its siblings are scanned in order and the
 * first one whose hash upper bound covers fl6->mp_hash (and that scores
 * as usable) is chosen.  Falls back to @match itself.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* Hash bucket found; if the sibling is unusable, keep the
		 * original match rather than scanning further.
		 */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
462
463 /*
464  *      Route lookup. rcu_read_lock() should be held.
465  */
466
/* Walk the routes starting at @rt and return the first live one that
 * matches the requested output interface @oif, or (when no oif is
 * given) whose device owns @saddr.  Returns the fib6_null_entry when a
 * strict interface match was required but none was found, or when @rt
 * itself is dead.  rcu_read_lock must be held (fib6_next is
 * rcu_dereference'd).
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* Fast path: nothing to match against and the route is alive. */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
500
#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for a router reachability probe: the gateway
 * address to solicit and the (held) device to send on.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
507
/* Workqueue handler for rt6_probe(): send a Neighbor Solicitation to
 * the target's solicited-node multicast address, then drop the device
 * reference taken when the work was queued and free the context.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
519
/* Schedule a router reachability probe for gateway routes whose
 * neighbour state is unknown or stale, rate-limited by the device's
 * rtr_probe_interval.  The actual NS transmission happens from a
 * workqueue (rt6_probe_deferred) since we are in rcu_read_lock_bh here.
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* Re-check state under the neighbour lock; only allocate
		 * work if the probe interval has elapsed.
		 */
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		/* No neighbour entry: rate-limit via the route itself. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released in rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF, router probing is compiled out. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
579
580 /*
581  * Default Router Selection (RFC 2461 6.3.6)
582  */
583 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
584 {
585         const struct net_device *dev = rt->fib6_nh.nh_dev;
586
587         if (!oif || dev->ifindex == oif)
588                 return 2;
589         return 0;
590 }
591
/* Neighbour component of the route score (RFC 4861 6.3.6 reachability).
 * Non-gateway and RTF_NONEXTHOP routes always succeed.  With
 * CONFIG_IPV6_ROUTER_PREF, any state short of NUD_FAILED counts as
 * reachable and a missing entry still succeeds; without it, a missing
 * entry requests round-robin fallback.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
623
/* Combined route score: interface match (rt6_check_dev), router
 * preference bits when configured, and - under RT6_LOOKUP_F_REACHABLE -
 * neighbour reachability.  Returns a negative rt6_nud_state on failure,
 * otherwise a non-negative score where higher is better.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
641
642 /* called with rc_read_lock held */
643 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
644 {
645         const struct net_device *dev = fib6_info_nh_dev(f6i);
646         bool rc = false;
647
648         if (dev) {
649                 const struct inet6_dev *idev = __in6_dev_get(dev);
650
651                 rc = !!idev->cnf.ignore_routes_with_linkdown;
652         }
653
654         return rc;
655 }
656
/* Compare @rt against the current best @match: skip dead, link-down
 * (when not ignored) and expired routes, score the rest, and return the
 * higher-scoring of the two.  *mpri tracks the best score seen so far;
 * *do_rr is set when the best route asked for round-robin fallback.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
695
/* Scan the routes of one metric group for the best match, starting at
 * the round-robin head @rr_head and wrapping around via @leaf.  If
 * nothing in the group matched, continue into the next metric group
 * (first route remembered in @cont).  rcu_read_lock must be held.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* First half: from the round-robin head to the end of the group. */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* Second half: wrap from the leaf back up to the head. */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* Nothing usable in this metric group; try the worse-metric routes. */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
734
/* Default router selection (RFC 4861 6.3.6) for fib node @fn: pick the
 * best route starting from the node's round-robin pointer, and advance
 * that pointer when the chosen route requested round-robin.  Returns
 * fib6_null_entry when the node has no usable routes.  rcu_read_lock
 * must be held.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
784
785 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
786 {
787         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
788 }
789
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information Option received from router
 * @gwaddr on @dev: validate the option, then add, refresh or delete the
 * corresponding specific route (or default route for prefix length 0)
 * according to the advertised lifetime and preference.
 * Returns 0 on success or -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need at least 2 units */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* Prefix length 0 means "default route via this router". */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
865
866 /*
867  *      Misc support functions
868  */
869
870 /* called with rcu_lock held */
871 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
872 {
873         struct net_device *dev = rt->fib6_nh.nh_dev;
874
875         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
876                 /* for copies of local routes, dst->dev needs to be the
877                  * device if it is a master device, the master device if
878                  * device is enslaved, and the loopback as the default
879                  */
880                 if (netif_is_l3_slave(dev) &&
881                     !rt6_need_strict(&rt->fib6_dst.addr))
882                         dev = l3mdev_master_dev_rcu(dev);
883                 else if (!netif_is_l3_master(dev))
884                         dev = dev_net(dev)->loopback_dev;
885                 /* last case is netif_is_l3_master(dev) is true in which
886                  * case we want dev returned to be dev
887                  */
888         }
889
890         return dev;
891 }
892
/* Per route type (RTN_*) dst error code used for reject routes;
 * zero means the type delivers/forwards normally and carries no error.
 */
static const int fib6_prop[RTN_MAX + 1] = {
        [RTN_UNSPEC]    = 0,
        [RTN_UNICAST]   = 0,
        [RTN_LOCAL]     = 0,
        [RTN_BROADCAST] = 0,
        [RTN_ANYCAST]   = 0,
        [RTN_MULTICAST] = 0,
        [RTN_BLACKHOLE] = -EINVAL,
        [RTN_UNREACHABLE] = -EHOSTUNREACH,
        [RTN_PROHIBIT]  = -EACCES,
        [RTN_THROW]     = -EAGAIN,
        [RTN_NAT]       = -EINVAL,
        [RTN_XRESOLVE]  = -EINVAL,
};
907
/* Map a fib6_type (RTN_*) to the dst.error value for reject routes;
 * returns 0 for types that are not rejects.
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
        return fib6_prop[fib6_type];
}
912
913 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
914 {
915         unsigned short flags = 0;
916
917         if (rt->dst_nocount)
918                 flags |= DST_NOCOUNT;
919         if (rt->dst_nopolicy)
920                 flags |= DST_NOPOLICY;
921         if (rt->dst_host)
922                 flags |= DST_HOST;
923
924         return flags;
925 }
926
927 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
928 {
929         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
930
931         switch (ort->fib6_type) {
932         case RTN_BLACKHOLE:
933                 rt->dst.output = dst_discard_out;
934                 rt->dst.input = dst_discard;
935                 break;
936         case RTN_PROHIBIT:
937                 rt->dst.output = ip6_pkt_prohibit_out;
938                 rt->dst.input = ip6_pkt_prohibit;
939                 break;
940         case RTN_THROW:
941         case RTN_UNREACHABLE:
942         default:
943                 rt->dst.output = ip6_pkt_discard_out;
944                 rt->dst.input = ip6_pkt_discard;
945                 break;
946         }
947 }
948
949 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
950 {
951         if (ort->fib6_flags & RTF_REJECT) {
952                 ip6_rt_init_dst_reject(rt, ort);
953                 return;
954         }
955
956         rt->dst.error = 0;
957         rt->dst.output = ip6_output;
958
959         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
960                 rt->dst.input = ip6_input;
961         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962                 rt->dst.input = ip6_mc_input;
963         } else {
964                 rt->dst.input = ip6_forward;
965         }
966
967         if (ort->fib6_nh.nh_lwtstate) {
968                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
969                 lwtunnel_set_redirect(&rt->dst);
970         }
971
972         rt->dst.lastuse = jiffies;
973 }
974
/* Bind cached route @rt to its parent fib entry @from and initialize
 * its dst metrics from @from's.  RTF_EXPIRES is cleared on the copy.
 * Caller must already hold a reference to @from.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
        rt->rt6i_flags &= ~RTF_EXPIRES;
        rcu_assign_pointer(rt->from, from);
        ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
982
/* Initialize rt6_info @rt as a copy of fib entry @ort: dst handlers,
 * destination/source prefixes, gateway, flags and parent linkage.
 * Caller must already hold a reference to @ort.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
        struct net_device *dev = fib6_info_nh_dev(ort);

        ip6_rt_init_dst(rt, ort);

        rt->rt6i_dst = ort->fib6_dst;
        /* dev may be NULL; then no idev reference is taken */
        rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
        rt->rt6i_gateway = ort->fib6_nh.nh_gw;
        rt->rt6i_flags = ort->fib6_flags;
        rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
        rt->rt6i_src = ort->fib6_src;
#endif
}
999
/* Walk back up the fib tree from @fn to the next node carrying route
 * info (RTN_RTINFO).  When a parent owns a subtree (and we are not
 * coming from it), the subtree is searched by @saddr first.  Returns
 * NULL once the table root is reached.  Parent pointers are
 * rcu_dereference()d, so the caller must be in an rcu read-side
 * critical section.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
                                        struct in6_addr *saddr)
{
        struct fib6_node *pn, *sn;
        while (1) {
                if (fn->fn_flags & RTN_TL_ROOT)
                        return NULL;
                pn = rcu_dereference(fn->parent);
                sn = FIB6_SUBTREE(pn);
                if (sn && sn != fn)
                        fn = fib6_node_lookup(sn, NULL, saddr);
                else
                        fn = pn;
                if (fn->fn_flags & RTN_RTINFO)
                        return fn;
        }
}
1017
1018 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1019                           bool null_fallback)
1020 {
1021         struct rt6_info *rt = *prt;
1022
1023         if (dst_hold_safe(&rt->dst))
1024                 return true;
1025         if (null_fallback) {
1026                 rt = net->ipv6.ip6_null_entry;
1027                 dst_hold(&rt->dst);
1028         } else {
1029                 rt = NULL;
1030         }
1031         *prt = rt;
1032         return false;
1033 }
1034
/* Allocate a new rt6_info copy of fib entry @rt.
 * Called with rcu_lock held.  Returns NULL when the fib entry is going
 * away (its refcount could not be taken) or when the dst allocation
 * fails; in the latter case the just-taken fib reference is dropped
 * again.
 */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
        unsigned short flags = fib6_info_dst_flags(rt);
        struct net_device *dev = rt->fib6_nh.nh_dev;
        struct rt6_info *nrt;

        if (!fib6_info_hold_safe(rt))
                return NULL;

        nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
        if (nrt)
                ip6_rt_copy_init(nrt, rt);
        else
                fib6_info_release(rt);

        return nrt;
}
1053
/* Route lookup in @table for flow @fl6 (no policy-rule processing).
 * Walks the fib tree under rcu, backtracking while only the null entry
 * matches, then prefers a cached exception route when one exists.
 * Always returns a dst-held rt6_info; the namespace ip6_null_entry is
 * used when nothing matches or a clone cannot be allocated.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        struct fib6_info *f6i;
        struct fib6_node *fn;
        struct rt6_info *rt;

        /* caller asked to ignore the nexthop device: drop oif checks */
        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
                flags &= ~RT6_LOOKUP_F_IFACE;

        rcu_read_lock();
        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        f6i = rcu_dereference(fn->leaf);
        if (!f6i) {
                f6i = net->ipv6.fib6_null_entry;
        } else {
                f6i = rt6_device_match(net, f6i, &fl6->saddr,
                                      fl6->flowi6_oif, flags);
                /* multipath selection only when oif does not pin the
                 * nexthop
                 */
                if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
                        f6i = fib6_multipath_select(net, f6i, fl6,
                                                    fl6->flowi6_oif, skb,
                                                    flags);
        }
        if (f6i == net->ipv6.fib6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }

        trace_fib6_table_lookup(net, f6i, table, fl6);

        /* Search through exception table */
        rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
        if (rt) {
                if (ip6_hold_safe(net, &rt, true))
                        dst_use_noref(&rt->dst, jiffies);
        } else if (f6i == net->ipv6.fib6_null_entry) {
                rt = net->ipv6.ip6_null_entry;
                dst_hold(&rt->dst);
        } else {
                /* no cached clone yet: create one, falling back to the
                 * null entry if the allocation fails
                 */
                rt = ip6_create_rt_rcu(f6i);
                if (!rt) {
                        rt = net->ipv6.ip6_null_entry;
                        dst_hold(&rt->dst);
                }
        }

        rcu_read_unlock();

        return rt;
}
1109
/* Public lookup entry point: resolves through the policy-rule engine,
 * which falls back to ip6_pol_route_lookup() for the selected table.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb, int flags)
{
        return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1116
1117 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1118                             const struct in6_addr *saddr, int oif,
1119                             const struct sk_buff *skb, int strict)
1120 {
1121         struct flowi6 fl6 = {
1122                 .flowi6_oif = oif,
1123                 .daddr = *daddr,
1124         };
1125         struct dst_entry *dst;
1126         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1127
1128         if (saddr) {
1129                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1130                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1131         }
1132
1133         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1134         if (dst->error == 0)
1135                 return (struct rt6_info *) dst;
1136
1137         dst_release(dst);
1138
1139         return NULL;
1140 }
1141 EXPORT_SYMBOL(rt6_lookup);
1142
1143 /* ip6_ins_rt is called with FREE table->tb6_lock.
1144  * It takes new route entry, the addition fails by any reason the
1145  * route is released.
1146  * Caller must hold dst before calling it.
1147  */
1148
1149 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1150                         struct netlink_ext_ack *extack)
1151 {
1152         int err;
1153         struct fib6_table *table;
1154
1155         table = rt->fib6_table;
1156         spin_lock_bh(&table->tb6_lock);
1157         err = fib6_add(&table->tb6_root, rt, info, extack);
1158         spin_unlock_bh(&table->tb6_lock);
1159
1160         return err;
1161 }
1162
/* Insert fib entry @rt into its table with a minimal netlink info
 * bound to @net (no portid, no flags).
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
        struct nl_info info = { .nl_net = net, };

        return __ip6_ins_rt(rt, &info, NULL);
}
1169
/* Allocate an RTF_CACHE clone of fib entry @ort, pinned to the /128
 * destination @daddr (and, with subtrees, /128 source @saddr).
 * Returns NULL when @ort is being freed or the dst allocation fails.
 * Uses ip6_rt_get_dev_rcu(), so the caller must hold rcu_read_lock.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr)
{
        struct net_device *dev;
        struct rt6_info *rt;

        /*
         *      Clone the route.
         */

        if (!fib6_info_hold_safe(ort))
                return NULL;

        dev = ip6_rt_get_dev_rcu(ort);
        rt = ip6_dst_alloc(dev_net(dev), dev, 0);
        if (!rt) {
                fib6_info_release(ort);
                return NULL;
        }

        ip6_rt_copy_init(rt, ort);
        rt->rt6i_flags |= RTF_CACHE;
        rt->dst.flags |= DST_HOST;
        rt->rt6i_dst.addr = *daddr;
        rt->rt6i_dst.plen = 128;

        if (!rt6_is_gw_or_nonexthop(ort)) {
                /* a non-/128 route matched exactly on its own prefix
                 * address: the clone targets the anycast address
                 */
                if (ort->fib6_dst.plen != 128 &&
                    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
                        rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
                if (rt->rt6i_src.plen && saddr) {
                        rt->rt6i_src.addr = *saddr;
                        rt->rt6i_src.plen = 128;
                }
#endif
        }

        return rt;
}
1211
/* Allocate an RTF_PCPU copy of fib entry @rt for per-cpu caching.
 * Takes its own rcu read lock around the device lookup.  Returns NULL
 * when @rt is being freed or the dst allocation fails (the taken fib
 * reference is dropped again in the latter case).
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
        unsigned short flags = fib6_info_dst_flags(rt);
        struct net_device *dev;
        struct rt6_info *pcpu_rt;

        if (!fib6_info_hold_safe(rt))
                return NULL;

        rcu_read_lock();
        dev = ip6_rt_get_dev_rcu(rt);
        pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
        rcu_read_unlock();
        if (!pcpu_rt) {
                fib6_info_release(rt);
                return NULL;
        }
        ip6_rt_copy_init(pcpu_rt, rt);
        pcpu_rt->rt6i_flags |= RTF_PCPU;
        return pcpu_rt;
}
1233
/* Fetch this cpu's cached copy of fib entry @rt, taking a dst
 * reference on it.  Returns NULL when no copy exists yet, or when its
 * refcount already dropped to zero (ip6_hold_safe with no fallback).
 * It should be called with rcu_read_lock() acquired.
 */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
        struct rt6_info *pcpu_rt, **p;

        p = this_cpu_ptr(rt->rt6i_pcpu);
        pcpu_rt = *p;

        if (pcpu_rt)
                ip6_hold_safe(NULL, &pcpu_rt, false);

        return pcpu_rt;
}
1247
/* Allocate this cpu's cached copy of fib entry @rt and publish it in
 * the per-cpu slot.  Falls back to the dst-held namespace null entry
 * when the allocation fails.  The slot must be empty at this point:
 * the cmpxchg() from NULL is asserted to succeed (BUG otherwise).
 * Returns the new copy with an extra dst reference for the caller.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
                                            struct fib6_info *rt)
{
        struct rt6_info *pcpu_rt, *prev, **p;

        pcpu_rt = ip6_rt_pcpu_alloc(rt);
        if (!pcpu_rt) {
                dst_hold(&net->ipv6.ip6_null_entry->dst);
                return net->ipv6.ip6_null_entry;
        }

        dst_hold(&pcpu_rt->dst);
        p = this_cpu_ptr(rt->rt6i_pcpu);
        prev = cmpxchg(p, NULL, pcpu_rt);
        BUG_ON(prev);

        return pcpu_rt;
}
1266
1267 /* exception hash table implementation
1268  */
1269 static DEFINE_SPINLOCK(rt6_exception_lock);
1270
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 * The hlist unlink is rcu-safe and the entry is freed only after a
 * grace period (kfree_rcu); the cached route's dst reference held by
 * the exception is dropped immediately.
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
                                 struct rt6_exception *rt6_ex)
{
        struct net *net;

        if (!bucket || !rt6_ex)
                return;

        net = dev_net(rt6_ex->rt6i->dst.dev);
        hlist_del_rcu(&rt6_ex->hlist);
        dst_release(&rt6_ex->rt6i->dst);
        kfree_rcu(rt6_ex, rcu);
        WARN_ON_ONCE(!bucket->depth);
        bucket->depth--;
        net->ipv6.rt6_stats->fib_rt_cache--;
}
1290
1291 /* Remove oldest rt6_ex in bucket and free the memory
1292  * Caller must hold rt6_exception_lock
1293  */
1294 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1295 {
1296         struct rt6_exception *rt6_ex, *oldest = NULL;
1297
1298         if (!bucket)
1299                 return;
1300
1301         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1302                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1303                         oldest = rt6_ex;
1304         }
1305         rt6_remove_exception(bucket, oldest);
1306 }
1307
/* Hash (dst, src) into an exception bucket index.  The jhash seed is
 * initialized lazily, once, with random data.  The source address
 * only participates in the hash when subtrees are compiled in and
 * @src is non-NULL.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
                              const struct in6_addr *src)
{
        static u32 seed __read_mostly;
        u32 val;

        net_get_random_once(&seed, sizeof(seed));
        val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
        if (src)
                val = jhash(src, sizeof(*src), val);
#endif
        return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1323
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 * Note the side effect: *bucket is advanced by the hash value, so the
 * caller ends up with the bucket the (daddr, saddr) pair hashes to
 * even when no entry is found.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
                              const struct in6_addr *daddr,
                              const struct in6_addr *saddr)
{
        struct rt6_exception *rt6_ex;
        u32 hval;

        if (!(*bucket) || !daddr)
                return NULL;

        hval = rt6_exception_hash(daddr, saddr);
        *bucket += hval;

        hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
                struct rt6_info *rt6 = rt6_ex->rt6i;
                bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
                if (matched && saddr)
                        matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
                if (matched)
                        return rt6_ex;
        }
        return NULL;
}
1356
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 * rcu twin of __rt6_find_exception_spinlock(): same matching logic and
 * the same *bucket-advancing side effect, but traverses the chain with
 * the rcu-safe iterator instead of under the spinlock.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
                         const struct in6_addr *daddr,
                         const struct in6_addr *saddr)
{
        struct rt6_exception *rt6_ex;
        u32 hval;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (!(*bucket) || !daddr)
                return NULL;

        hval = rt6_exception_hash(daddr, saddr);
        *bucket += hval;

        hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
                struct rt6_info *rt6 = rt6_ex->rt6i;
                bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
                if (matched && saddr)
                        matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
                if (matched)
                        return rt6_ex;
        }
        return NULL;
}
1391
1392 static unsigned int fib6_mtu(const struct fib6_info *rt)
1393 {
1394         unsigned int mtu;
1395
1396         if (rt->fib6_pmtu) {
1397                 mtu = rt->fib6_pmtu;
1398         } else {
1399                 struct net_device *dev = fib6_info_nh_dev(rt);
1400                 struct inet6_dev *idev;
1401
1402                 rcu_read_lock();
1403                 idev = __in6_dev_get(dev);
1404                 mtu = idev->cnf.mtu6;
1405                 rcu_read_unlock();
1406         }
1407
1408         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1409
1410         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1411 }
1412
/* Insert cached route @nrt into the exception table of fib entry @ort.
 *
 * Returns 0 on success; -EINVAL when @ort's exceptions were already
 * flushed (entry is going away) or @nrt's mtu is not smaller than
 * @ort's; -ENOMEM on allocation failure.  An existing exception for
 * the same (daddr, saddr) is replaced, and the oldest entry is evicted
 * once the bucket depth exceeds FIB6_MAX_DEPTH.  On success the fib
 * tree sernum is bumped (invalidating cached dsts) and garbage
 * collection is kicked.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
                                struct fib6_info *ort)
{
        struct net *net = dev_net(nrt->dst.dev);
        struct rt6_exception_bucket *bucket;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        int err = 0;

        spin_lock_bh(&rt6_exception_lock);

        if (ort->exception_bucket_flushed) {
                err = -EINVAL;
                goto out;
        }

        bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
                                        lockdep_is_held(&rt6_exception_lock));
        if (!bucket) {
                /* first exception for this entry: create the buckets */
                bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
                                 GFP_ATOMIC);
                if (!bucket) {
                        err = -ENOMEM;
                        goto out;
                }
                rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
        }

#ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates ort is in subtree
         * and exception table is indexed by a hash of
         * both rt6i_dst and rt6i_src.
         * Otherwise, the exception table is indexed by
         * a hash of only rt6i_dst.
         */
        if (ort->fib6_src.plen)
                src_key = &nrt->rt6i_src.addr;
#endif
        /* rt6_mtu_change() might lower mtu on ort.
         * Only insert this exception route if its mtu
         * is less than ort's mtu value.
         */
        if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
                err = -EINVAL;
                goto out;
        }

        rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
                                               src_key);
        if (rt6_ex)
                rt6_remove_exception(bucket, rt6_ex);

        rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
        if (!rt6_ex) {
                err = -ENOMEM;
                goto out;
        }
        rt6_ex->rt6i = nrt;
        rt6_ex->stamp = jiffies;
        hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
        bucket->depth++;
        net->ipv6.rt6_stats->fib_rt_cache++;

        if (bucket->depth > FIB6_MAX_DEPTH)
                rt6_exception_remove_oldest(bucket);

out:
        spin_unlock_bh(&rt6_exception_lock);

        /* Update fn->fn_sernum to invalidate all cached dst */
        if (!err) {
                spin_lock_bh(&ort->fib6_table->tb6_lock);
                fib6_update_sernum(net, ort);
                spin_unlock_bh(&ort->fib6_table->tb6_lock);
                fib6_force_start_gc(net);
        }

        return err;
}
1492
/* Remove and free every cached exception route hanging off fib entry
 * @rt, and mark the entry so rt6_insert_exception() cannot recreate
 * the bucket list afterwards (the entry is going away).
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;

        spin_lock_bh(&rt6_exception_lock);
        /* Prevent rt6_insert_exception() to recreate the bucket list */
        rt->exception_bucket_flushed = 1;

        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
                                    lockdep_is_held(&rt6_exception_lock));
        if (!bucket)
                goto out;

        for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
                        rt6_remove_exception(bucket, rt6_ex);
                WARN_ON_ONCE(bucket->depth);
                bucket++;
        }

out:
        spin_unlock_bh(&rt6_exception_lock);
}
1519
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns the cached clone for (daddr, saddr), or NULL when there is
 * none or the clone has expired.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
                                           struct in6_addr *daddr,
                                           struct in6_addr *saddr)
{
        struct rt6_exception_bucket *bucket;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        struct rt6_info *res = NULL;

        bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates rt is in subtree
         * and exception table is indexed by a hash of
         * both rt6i_dst and rt6i_src.
         * Otherwise, the exception table is indexed by
         * a hash of only rt6i_dst.
         */
        if (rt->fib6_src.plen)
                src_key = saddr;
#endif
        rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

        if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
                res = rt6_ex->rt6i;

        return res;
}
1551
/* Remove the passed in cached rt from the hash table that contains it
 * Returns 0 on success, -EINVAL when @rt is not an RTF_CACHE route or
 * has no parent fib entry, -ENOENT when no matching exception exists.
 * NOTE(review): rt->from is rcu_dereference()d without taking
 * rcu_read_lock here - presumably all callers hold it; verify at the
 * call sites.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
        struct rt6_exception_bucket *bucket;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        struct fib6_info *from;
        int err;

        from = rcu_dereference(rt->from);
        if (!from ||
            !(rt->rt6i_flags & RTF_CACHE))
                return -EINVAL;

        if (!rcu_access_pointer(from->rt6i_exception_bucket))
                return -ENOENT;

        spin_lock_bh(&rt6_exception_lock);
        bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
                                    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates 'from' is in subtree
         * and exception table is indexed by a hash of
         * both rt6i_dst and rt6i_src.
         * Otherwise, the exception table is indexed by
         * a hash of only rt6i_dst.
         */
        if (from->fib6_src.plen)
                src_key = &rt->rt6i_src.addr;
#endif
        rt6_ex = __rt6_find_exception_spinlock(&bucket,
                                               &rt->rt6i_dst.addr,
                                               src_key);
        if (rt6_ex) {
                rt6_remove_exception(bucket, rt6_ex);
                err = 0;
        } else {
                err = -ENOENT;
        }

        spin_unlock_bh(&rt6_exception_lock);
        return err;
}
1595
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 * (the stamp feeds the LRU eviction in rt6_exception_remove_oldest()
 * and the aging in rt6_age_examine_exception()).  Takes its own rcu
 * read lock; silently does nothing when @rt is not a cached route or
 * has no parent fib entry.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
        struct rt6_exception_bucket *bucket;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        struct fib6_info *from;

        rcu_read_lock();
        from = rcu_dereference(rt->from);
        if (!from || !(rt->rt6i_flags & RTF_CACHE))
                goto unlock;

        bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates 'from' is in subtree
         * and exception table is indexed by a hash of
         * both rt6i_dst and rt6i_src.
         * Otherwise, the exception table is indexed by
         * a hash of only rt6i_dst.
         */
        if (from->fib6_src.plen)
                src_key = &rt->rt6i_src.addr;
#endif
        rt6_ex = __rt6_find_exception_rcu(&bucket,
                                          &rt->rt6i_dst.addr,
                                          src_key);
        if (rt6_ex)
                rt6_ex->stamp = jiffies;

unlock:
        rcu_read_unlock();
}
1632
1633 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1634                                          struct rt6_info *rt, int mtu)
1635 {
1636         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1637          * lowest MTU in the path: always allow updating the route PMTU to
1638          * reflect PMTU decreases.
1639          *
1640          * If the new MTU is higher, and the route PMTU is equal to the local
1641          * MTU, this means the old MTU is the lowest in the path, so allow
1642          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1643          * handle this.
1644          */
1645
1646         if (dst_mtu(&rt->dst) >= mtu)
1647                 return true;
1648
1649         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1650                 return true;
1651
1652         return false;
1653 }
1654
/* Propagate a device MTU change (@mtu) to all cached exception routes
 * of fib entry @rt, where rt6_mtu_change_route_allowed() permits it.
 * Caller must hold rt6_exception_lock (the bucket pointer is
 * dereferenced under its protection).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
                                       struct fib6_info *rt, int mtu)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        int i;

        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
                                        lockdep_is_held(&rt6_exception_lock));

        if (!bucket)
                return;

        for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
                        struct rt6_info *entry = rt6_ex->rt6i;

                        /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
                         * route), the metrics of its rt->from have already
                         * been updated.
                         */
                        if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
                            rt6_mtu_change_route_allowed(idev, entry, mtu))
                                dst_metric_set(&entry->dst, RTAX_MTU, mtu);
                }
                bucket++;
        }
}
1683
1684 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1685
/* Drop every cached exception route of fib entry @rt that is both a
 * gateway route and cached (RTF_CACHE_GATEWAY) and whose gateway
 * equals @gateway.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
                                        struct in6_addr *gateway)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;

        /* cheap lockless check first: nothing to clean */
        if (!rcu_access_pointer(rt->rt6i_exception_bucket))
                return;

        spin_lock_bh(&rt6_exception_lock);
        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
                                     lockdep_is_held(&rt6_exception_lock));

        if (bucket) {
                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                        hlist_for_each_entry_safe(rt6_ex, tmp,
                                                  &bucket->chain, hlist) {
                                struct rt6_info *entry = rt6_ex->rt6i;

                                if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
                                    RTF_CACHE_GATEWAY &&
                                    ipv6_addr_equal(gateway,
                                                    &entry->rt6i_gateway)) {
                                        rt6_remove_exception(bucket, rt6_ex);
                                }
                        }
                        bucket++;
                }
        }

        spin_unlock_bh(&rt6_exception_lock);
}
1720
/* Examine one cached exception route during garbage collection and
 * remove it when aged out, expired, or pointing at a gateway that is
 * no longer a router.  Entries that survive bump gc_args->more so the
 * gc timer is re-armed.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
                                      struct rt6_exception *rt6_ex,
                                      struct fib6_gc_args *gc_args,
                                      unsigned long now)
{
        struct rt6_info *rt = rt6_ex->rt6i;

        /* we are pruning and obsoleting aged-out and non gateway exceptions
         * even if others have still references to them, so that on next
         * dst_check() such references can be dropped.
         * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
         * expired, independently from their aging, as per RFC 8201 section 4
         */
        if (!(rt->rt6i_flags & RTF_EXPIRES)) {
                if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
                        RT6_TRACE("aging clone %p\n", rt);
                        rt6_remove_exception(bucket, rt6_ex);
                        return;
                }
        } else if (time_after(jiffies, rt->dst.expires)) {
                RT6_TRACE("purging expired route %p\n", rt);
                rt6_remove_exception(bucket, rt6_ex);
                return;
        }

        if (rt->rt6i_flags & RTF_GATEWAY) {
                struct neighbour *neigh;
                __u8 neigh_flags = 0;

                neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
                if (neigh)
                        neigh_flags = neigh->flags;

                /* the neighbour entry lost its router flag (or is
                 * gone): the cached gateway route is stale
                 */
                if (!(neigh_flags & NTF_ROUTER)) {
                        RT6_TRACE("purging route %p via non-router but gateway\n",
                                  rt);
                        rt6_remove_exception(bucket, rt6_ex);
                        return;
                }
        }

        gc_args->more++;
}
1764
/* Walk every exception bucket hanging off @rt and let
 * rt6_age_examine_exception() decide each entry's fate.
 * Takes rcu_read_lock_bh (for the neighbour lookup done per entry) and
 * rt6_exception_lock (serializes against exception insert/remove).
 */
void rt6_age_exceptions(struct fib6_info *rt,
                        struct fib6_gc_args *gc_args,
                        unsigned long now)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;

        /* cheap lockless precheck: nothing to age */
        if (!rcu_access_pointer(rt->rt6i_exception_bucket))
                return;

        rcu_read_lock_bh();
        spin_lock(&rt6_exception_lock);
        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
                                    lockdep_is_held(&rt6_exception_lock));

        if (bucket) {
                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                        /* _safe variant: examine may unlink the entry */
                        hlist_for_each_entry_safe(rt6_ex, tmp,
                                                  &bucket->chain, hlist) {
                                rt6_age_examine_exception(bucket, rt6_ex,
                                                          gc_args, now);
                        }
                        bucket++;
                }
        }
        spin_unlock(&rt6_exception_lock);
        rcu_read_unlock_bh();
}
1795
/* must be called with rcu lock held */
/* Look up the best fib6 entry for @fl6 in @table.
 * On a miss the lookup backtracks towards less-specific prefixes; if
 * that exhausts the tree while RT6_LOOKUP_F_REACHABLE was requested,
 * the whole selection is redone from the original node with the
 * reachability requirement dropped (unreachable routers then qualify).
 * Returns net->ipv6.fib6_null_entry when nothing matches.
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
                                    int oif, struct flowi6 *fl6, int strict)
{
        struct fib6_node *fn, *saved_fn;
        struct fib6_info *f6i;

        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        saved_fn = fn;

        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
                oif = 0;

redo_rt6_select:
        f6i = rt6_select(net, fn, oif, strict);
        if (f6i == net->ipv6.fib6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto redo_rt6_select;
                else if (strict & RT6_LOOKUP_F_REACHABLE) {
                        /* also consider unreachable route */
                        strict &= ~RT6_LOOKUP_F_REACHABLE;
                        fn = saved_fn;
                        goto redo_rt6_select;
                }
        }

        trace_fib6_table_lookup(net, f6i, table, fl6);

        return f6i;
}
1827
/* Core policy-routing lookup: resolve @fl6 in @table to a rt6_info the
 * caller may use as a dst.  Three outcomes after the fib6 lookup:
 *   1. a matching cached exception route (refcounted and returned),
 *   2. a one-off uncached RTF_CACHE clone for FLOWI_FLAG_KNOWN_NH on a
 *      non-gateway route (owned by the uncached list, not the tree),
 *   3. the common case: a per-cpu copy of the fib6 entry.
 * Always returns a held dst; on total miss it is ip6_null_entry.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6,
                               const struct sk_buff *skb, int flags)
{
        struct fib6_info *f6i;
        struct rt6_info *rt;
        int strict = 0;

        strict |= flags & RT6_LOOKUP_F_IFACE;
        strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
        /* a pure host prefers reachable routers, per RFC 4861 */
        if (net->ipv6.devconf_all->forwarding == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;

        rcu_read_lock();

        f6i = fib6_table_lookup(net, table, oif, fl6, strict);
        if (f6i->fib6_nsiblings)
                f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

        if (f6i == net->ipv6.fib6_null_entry) {
                rt = net->ipv6.ip6_null_entry;
                rcu_read_unlock();
                dst_hold(&rt->dst);
                return rt;
        }

        /* Search through exception table */
        rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
        if (rt) {
                /* only mark the dst used if we could take a reference */
                if (ip6_hold_safe(net, &rt, true))
                        dst_use_noref(&rt->dst, jiffies);

                rcu_read_unlock();
                return rt;
        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
                            !(f6i->fib6_flags & RTF_GATEWAY))) {
                /* Create a RTF_CACHE clone which will not be
                 * owned by the fib6 tree.  It is for the special case where
                 * the daddr in the skb during the neighbor look-up is different
                 * from the fl6->daddr used to look-up route here.
                 */
                struct rt6_info *uncached_rt;

                uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

                rcu_read_unlock();

                if (uncached_rt) {
                        /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
                         * No need for another dst_hold()
                         */
                        rt6_uncached_list_add(uncached_rt);
                        atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
                } else {
                        /* allocation failed: fall back to a held null entry */
                        uncached_rt = net->ipv6.ip6_null_entry;
                        dst_hold(&uncached_rt->dst);
                }

                return uncached_rt;
        } else {
                /* Get a percpu copy */

                struct rt6_info *pcpu_rt;

                /* BH must stay disabled while touching the pcpu slot */
                local_bh_disable();
                pcpu_rt = rt6_get_pcpu_route(f6i);

                if (!pcpu_rt)
                        pcpu_rt = rt6_make_pcpu_route(net, f6i);

                local_bh_enable();
                rcu_read_unlock();

                return pcpu_rt;
        }
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1905
1906 static struct rt6_info *ip6_pol_route_input(struct net *net,
1907                                             struct fib6_table *table,
1908                                             struct flowi6 *fl6,
1909                                             const struct sk_buff *skb,
1910                                             int flags)
1911 {
1912         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1913 }
1914
1915 struct dst_entry *ip6_route_input_lookup(struct net *net,
1916                                          struct net_device *dev,
1917                                          struct flowi6 *fl6,
1918                                          const struct sk_buff *skb,
1919                                          int flags)
1920 {
1921         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1922                 flags |= RT6_LOOKUP_F_IFACE;
1923
1924         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1925 }
1926 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1927
/* Fill @keys (L3 fields only) for multipath hashing of @skb.
 * For ICMPv6 error messages the hash must be computed over the
 * *embedded* offending packet's header, so that errors follow the same
 * path as the flow that triggered them; in that case any pre-dissected
 * @flkeys are ignored because they describe the outer header.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
                                  struct flow_keys *keys,
                                  struct flow_keys *flkeys)
{
        const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
        const struct ipv6hdr *key_iph = outer_iph;
        struct flow_keys *_flkeys = flkeys;
        const struct ipv6hdr *inner_iph;
        const struct icmp6hdr *icmph;
        struct ipv6hdr _inner_iph;
        struct icmp6hdr _icmph;

        if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
                goto out;

        icmph = skb_header_pointer(skb, skb_transport_offset(skb),
                                   sizeof(_icmph), &_icmph);
        if (!icmph)
                goto out;

        /* only ICMPv6 *error* types embed the offending packet */
        if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
            icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
            icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
            icmph->icmp6_type != ICMPV6_PARAMPROB)
                goto out;

        inner_iph = skb_header_pointer(skb,
                                       skb_transport_offset(skb) + sizeof(*icmph),
                                       sizeof(_inner_iph), &_inner_iph);
        if (!inner_iph)
                goto out;

        /* hash on the inner header; drop the (outer) dissected keys */
        key_iph = inner_iph;
        _flkeys = NULL;
out:
        if (_flkeys) {
                keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
                keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
                keys->tags.flow_label = _flkeys->tags.flow_label;
                keys->basic.ip_proto = _flkeys->basic.ip_proto;
        } else {
                keys->addrs.v6addrs.src = key_iph->saddr;
                keys->addrs.v6addrs.dst = key_iph->daddr;
                keys->tags.flow_label = ip6_flowlabel(key_iph);
                keys->basic.ip_proto = key_iph->nexthdr;
        }
}
1975
/* if skb is set it will be used and fl6 can be NULL */
/* Compute the multipath hash for a flow, honoring the per-netns
 * fib_multipath_hash_policy sysctl:
 *   0 - L3 only (addresses, flow label, protocol)
 *   1 - L4 five-tuple (addresses, ports, protocol)
 * The result is shifted right by one so 0 can serve as "no hash".
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
                       const struct sk_buff *skb, struct flow_keys *flkeys)
{
        struct flow_keys hash_keys;
        u32 mhash;

        switch (ip6_multipath_hash_policy(net)) {
        case 0:
                memset(&hash_keys, 0, sizeof(hash_keys));
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                if (skb) {
                        ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
                } else {
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                break;
        case 1:
                if (skb) {
                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
                        struct flow_keys keys;

                        /* short-circuit if we already have L4 hash present */
                        if (skb->l4_hash)
                                return skb_get_hash_raw(skb) >> 1;

                        memset(&hash_keys, 0, sizeof(hash_keys));

                        /* dissect now if the caller did not do it for us */
                        if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys, flag);
                                flkeys = &keys;
                        }
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
                        hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
                        hash_keys.ports.src = flkeys->ports.src;
                        hash_keys.ports.dst = flkeys->ports.dst;
                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
                } else {
                        memset(&hash_keys, 0, sizeof(hash_keys));
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.ports.src = fl6->fl6_sport;
                        hash_keys.ports.dst = fl6->fl6_dport;
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                break;
        }
        mhash = flow_hash_from_keys(&hash_keys);

        return mhash >> 1;
}
2032
/* Route an incoming skb: build a flowi6 from its IPv6 header (plus any
 * collected tunnel metadata), compute a multipath hash for ICMPv6 so
 * errors stay on the path of the flow they refer to, and attach the
 * looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        struct net *net = dev_net(skb->dev);
        int flags = RT6_LOOKUP_F_HAS_SADDR;
        struct ip_tunnel_info *tun_info;
        struct flowi6 fl6 = {
                .flowi6_iif = skb->dev->ifindex,
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
                .flowi6_mark = skb->mark,
                .flowi6_proto = iph->nexthdr,
        };
        struct flow_keys *flkeys = NULL, _flkeys;

        /* carry RX tunnel key into the lookup (metadata-based tunnels) */
        tun_info = skb_tunnel_info(skb);
        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
                fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

        if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
                flkeys = &_flkeys;

        if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
                fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
        skb_dst_drop(skb);
        skb_dst_set(skb,
                    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2062
2063 static struct rt6_info *ip6_pol_route_output(struct net *net,
2064                                              struct fib6_table *table,
2065                                              struct flowi6 *fl6,
2066                                              const struct sk_buff *skb,
2067                                              int flags)
2068 {
2069         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2070 }
2071
/* Output-path route lookup: derive the strictness flags from the
 * socket and flow (bound device, link-scope destination, oif with an
 * unspecified source) and dispatch to the rule-based lookup.
 * Link-scope destinations may be answered directly by an l3mdev.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
                                         struct flowi6 *fl6, int flags)
{
        bool any_src;

        if (ipv6_addr_type(&fl6->daddr) &
            (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
                struct dst_entry *dst;

                /* VRF/l3mdev may own the link-scope lookup entirely */
                dst = l3mdev_link_scope_lookup(net, fl6);
                if (dst)
                        return dst;
        }

        fl6->flowi6_iif = LOOPBACK_IFINDEX;

        any_src = ipv6_addr_any(&fl6->saddr);
        if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
            (fl6->flowi6_oif && any_src))
                flags |= RT6_LOOKUP_F_IFACE;

        if (!any_src)
                flags |= RT6_LOOKUP_F_HAS_SADDR;
        else if (sk)
                /* let the socket's address-selection prefs steer saddr */
                flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

        return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2101
/* Clone @dst_orig into a blackhole dst (input/output both discard)
 * bound to the loopback device, copying metrics and route keys so the
 * clone still looks like the original to metric readers.  Used e.g. by
 * xfrm when a route must be neutralized without breaking holders.
 * Consumes the caller's reference on @dst_orig.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
        struct net_device *loopback_dev = net->loopback_dev;
        struct dst_entry *new = NULL;

        rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
                       DST_OBSOLETE_DEAD, 0);
        if (rt) {
                rt6_info_init(rt);
                atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

                new = &rt->dst;
                new->__use = 1;
                new->input = dst_discard;
                new->output = dst_discard_out;

                dst_copy_metrics(new, &ort->dst);

                rt->rt6i_idev = in6_dev_get(loopback_dev);
                rt->rt6i_gateway = ort->rt6i_gateway;
                /* never mark the clone as a per-cpu route */
                rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
        }

        dst_release(dst_orig);
        return new ? new : ERR_PTR(-ENOMEM);
}
2134
2135 /*
2136  *      Destination cache support functions
2137  */
2138
2139 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2140 {
2141         u32 rt_cookie = 0;
2142
2143         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2144                 return false;
2145
2146         if (fib6_check_expired(f6i))
2147                 return false;
2148
2149         return true;
2150 }
2151
2152 static struct dst_entry *rt6_check(struct rt6_info *rt,
2153                                    struct fib6_info *from,
2154                                    u32 cookie)
2155 {
2156         u32 rt_cookie = 0;
2157
2158         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2159             rt_cookie != cookie)
2160                 return NULL;
2161
2162         if (rt6_check_expired(rt))
2163                 return NULL;
2164
2165         return &rt->dst;
2166 }
2167
2168 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2169                                             struct fib6_info *from,
2170                                             u32 cookie)
2171 {
2172         if (!__rt6_check_expired(rt) &&
2173             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2174             fib6_check(from, cookie))
2175                 return &rt->dst;
2176         else
2177                 return NULL;
2178 }
2179
/* dst_ops->check for IPv6: revalidate a cached dst against @cookie.
 * Per-cpu and uncached clones are checked through their originating
 * fib6 entry (rt6_dst_from_check); everything else through rt6_check.
 * Returns the dst when still valid, NULL to force a fresh lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct dst_entry *dst_ret;
        struct fib6_info *from;
        struct rt6_info *rt;

        rt = container_of(dst, struct rt6_info, dst);

        rcu_read_lock();

        /* All IPV6 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         */

        from = rcu_dereference(rt->from);

        if (from && (rt->rt6i_flags & RTF_PCPU ||
            unlikely(!list_empty(&rt->rt6i_uncached))))
                dst_ret = rt6_dst_from_check(rt, from, cookie);
        else
                dst_ret = rt6_check(rt, from, cookie);

        rcu_read_unlock();

        return dst_ret;
}
2207
/* dst_ops->negative_advice: the caller signals the dst looks bad.
 * Expired cached exceptions are unlinked from their exception table;
 * any other route just has the caller's reference dropped.  Returning
 * NULL tells the caller to stop using the dst.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
        struct rt6_info *rt = (struct rt6_info *) dst;

        if (rt) {
                if (rt->rt6i_flags & RTF_CACHE) {
                        rcu_read_lock();
                        if (rt6_check_expired(rt)) {
                                rt6_remove_exception_rt(rt);
                                dst = NULL;
                        }
                        rcu_read_unlock();
                } else {
                        dst_release(dst);
                        dst = NULL;
                }
        }
        return dst;
}
2227
/* dst_ops->link_failure: report unreachability to the sender and make
 * sure the failed route is not reused - cached exceptions are removed,
 * while default routes get their fib6 node's sernum poisoned (-1) so
 * cached dsts referencing it fail the next ip6_dst_check().
 */
static void ip6_link_failure(struct sk_buff *skb)
{
        struct rt6_info *rt;

        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

        rt = (struct rt6_info *) skb_dst(skb);
        if (rt) {
                rcu_read_lock();
                if (rt->rt6i_flags & RTF_CACHE) {
                        rt6_remove_exception_rt(rt);
                } else {
                        struct fib6_info *from;
                        struct fib6_node *fn;

                        /* from may be NULL if the fib6 entry was deleted */
                        from = rcu_dereference(rt->from);
                        if (from) {
                                fn = rcu_dereference(from->fib6_node);
                                if (fn && (rt->rt6i_flags & RTF_DEFAULT))
                                        fn->fn_sernum = -1;
                        }
                }
                rcu_read_unlock();
        }
}
2253
/* Arm (or re-arm) the expiry timer on @rt0 for @timeout jiffies and set
 * RTF_EXPIRES.  If the route did not expire before, first seed
 * dst.expires from the originating fib6 entry so dst_set_expires() -
 * which only ever shortens an existing deadline - starts from the
 * inherited value.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
        if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
                struct fib6_info *from;

                rcu_read_lock();
                from = rcu_dereference(rt0->from);
                if (from)
                        rt0->dst.expires = from->expires;
                rcu_read_unlock();
        }

        dst_set_expires(&rt0->dst, timeout);
        rt0->rt6i_flags |= RTF_EXPIRES;
}
2269
2270 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2271 {
2272         struct net *net = dev_net(rt->dst.dev);
2273
2274         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2275         rt->rt6i_flags |= RTF_MODIFIED;
2276         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2277 }
2278
2279 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2280 {
2281         bool from_set;
2282
2283         rcu_read_lock();
2284         from_set = !!rcu_dereference(rt->from);
2285         rcu_read_unlock();
2286
2287         return !(rt->rt6i_flags & RTF_CACHE) &&
2288                 (rt->rt6i_flags & RTF_PCPU || from_set);
2289 }
2290
2291 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2292                                  const struct ipv6hdr *iph, u32 mtu)
2293 {
2294         const struct in6_addr *daddr, *saddr;
2295         struct rt6_info *rt6 = (struct rt6_info *)dst;
2296
2297         if (dst_metric_locked(dst, RTAX_MTU))
2298                 return;
2299
2300         if (iph) {
2301                 daddr = &iph->daddr;
2302                 saddr = &iph->saddr;
2303         } else if (sk) {
2304                 daddr = &sk->sk_v6_daddr;
2305                 saddr = &inet6_sk(sk)->saddr;
2306         } else {
2307                 daddr = NULL;
2308                 saddr = NULL;
2309         }
2310         dst_confirm_neigh(dst, daddr);
2311         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2312         if (mtu >= dst_mtu(dst))
2313                 return;
2314
2315         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2316                 rt6_do_update_pmtu(rt6, mtu);
2317                 /* update rt6_ex->stamp for cache */
2318                 if (rt6->rt6i_flags & RTF_CACHE)
2319                         rt6_update_exception_stamp_rt(rt6);
2320         } else if (daddr) {
2321                 struct fib6_info *from;
2322                 struct rt6_info *nrt6;
2323
2324                 rcu_read_lock();
2325                 from = rcu_dereference(rt6->from);
2326                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2327                 if (nrt6) {
2328                         rt6_do_update_pmtu(nrt6, mtu);
2329                         if (rt6_insert_exception(nrt6, from))
2330                                 dst_release_immediate(&nrt6->dst);
2331                 }
2332                 rcu_read_unlock();
2333         }
2334 }
2335
2336 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2337                                struct sk_buff *skb, u32 mtu)
2338 {
2339         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2340 }
2341
/* Apply a PMTU update for the flow described by the offending packet
 * carried in @skb->data (an IPv6 header, e.g. embedded in an ICMPv6
 * Packet Too Big): look up the matching output route and update it.
 * @mtu arrives in network byte order.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
                     int oif, u32 mark, kuid_t uid)
{
        const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
        struct dst_entry *dst;
        struct flowi6 fl6 = {
                .flowi6_oif = oif,
                .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
                .flowi6_uid = uid,
        };

        dst = ip6_route_output(net, NULL, &fl6);
        if (!dst->error)
                __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
        dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2362
/* Socket-scoped PMTU update: route the update through the socket's
 * bound device (or its l3 master), then, if the socket's cached dst no
 * longer validates, refresh a connected datagram socket's route.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
        int oif = sk->sk_bound_dev_if;
        struct dst_entry *dst;

        if (!oif && skb->dev)
                oif = l3mdev_master_ifindex(skb->dev);

        ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

        /* nothing more to do unless the cached dst is now stale */
        dst = __sk_dst_get(sk);
        if (!dst || !dst->obsolete ||
            dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
                return;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
                ip6_datagram_dst_update(sk, false);
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2384
/* Cache @dst on @sk.  A destination (and, with CONFIG_IPV6_SUBTREES, a
 * source) address is recorded only when the flow's address equals the
 * socket's, so a later address mismatch invalidates the cached route.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
                           const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
        struct ipv6_pinfo *np = inet6_sk(sk);
#endif

        ip6_dst_store(sk, dst,
                      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
                      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
                      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
                      &np->saddr :
#endif
                      NULL);
}
2401
/* Handle redirects */
/* Flow wrapper for redirect processing: fl6 must stay first so the
 * generic fib6 lookup callbacks can cast flowi6* back to ip6rd_flowi*.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;              /* flow of the redirected traffic */
        struct in6_addr gateway;        /* source of the received redirect */
};
2407
/* Lookup backend for redirect processing: find the route currently
 * used for rdfl->fl6 and verify the redirect's sender (rdfl->gateway)
 * is the next hop of that route, checking cached exception routes when
 * the parent's gateway differs.  Returns a held rt6_info; reject
 * routes yield ip6_null_entry.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
        struct rt6_info *ret = NULL, *rt_cache;
        struct fib6_info *rt;
        struct fib6_node *fn;

        /* Get the "current" route for this destination and
         * check if the redirect has come from appropriate router.
         *
         * RFC 4861 specifies that redirects should only be
         * accepted if they come from the nexthop to the target.
         * Due to the way the routes are chosen, this notion
         * is a bit fuzzy and one might need to check all possible
         * routes.
         */

        rcu_read_lock();
        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        for_each_fib6_node_rt_rcu(fn) {
                if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
                        continue;
                if (fib6_check_expired(rt))
                        continue;
                if (rt->fib6_flags & RTF_REJECT)
                        break;
                if (!(rt->fib6_flags & RTF_GATEWAY))
                        continue;
                if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
                        continue;
                /* rt_cache's gateway might be different from its 'parent'
                 * in the case of an ip redirect.
                 * So we keep searching in the exception table if the gateway
                 * is different.
                 */
                if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
                        rt_cache = rt6_find_cached_rt(rt,
                                                      &fl6->daddr,
                                                      &fl6->saddr);
                        if (rt_cache &&
                            ipv6_addr_equal(&rdfl->gateway,
                                            &rt_cache->rt6i_gateway)) {
                                ret = rt_cache;
                                break;
                        }
                        continue;
                }
                break;
        }

        if (!rt)
                rt = net->ipv6.fib6_null_entry;
        else if (rt->fib6_flags & RTF_REJECT) {
                ret = net->ipv6.ip6_null_entry;
                goto out;
        }

        /* nothing matched at this node: retry at a shorter prefix */
        if (rt == net->ipv6.fib6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }

out:
        if (ret)
                ip6_hold_safe(net, &ret, true);
        else
                /* no cached clone matched: materialize one from rt */
                ret = ip6_create_rt_rcu(rt);

        rcu_read_unlock();

        trace_fib6_table_lookup(net, rt, table, fl6);
        return ret;
};
2487
2488 static struct dst_entry *ip6_route_redirect(struct net *net,
2489                                             const struct flowi6 *fl6,
2490                                             const struct sk_buff *skb,
2491                                             const struct in6_addr *gateway)
2492 {
2493         int flags = RT6_LOOKUP_F_HAS_SADDR;
2494         struct ip6rd_flowi rdfl;
2495
2496         rdfl.fl6 = *fl6;
2497         rdfl.gateway = *gateway;
2498
2499         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2500                                 flags, __ip6_route_redirect);
2501 }
2502
/* Process an ICMPv6 redirect: skb->data points at the embedded
 * offending IPv6 header; the redirect's sender is the outer header's
 * source address.  Looks up the affected route and applies the
 * redirect via rt6_do_redirect().
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
                  kuid_t uid)
{
        const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
        struct dst_entry *dst;
        struct flowi6 fl6 = {
                .flowi6_iif = LOOPBACK_IFINDEX,
                .flowi6_oif = oif,
                .flowi6_mark = mark,
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
                .flowi6_uid = uid,
        };

        dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
        rt6_do_redirect(dst, NULL, skb);
        dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2523
/* Process a redirect (rd_msg) that carries no usable embedded header:
 * the lookup destination comes from the redirect message itself.
 * NOTE(review): .saddr is set from the outer header's *daddr* - this
 * appears intentional (the redirected flow's source is the address the
 * redirect was sent to), but confirm against rt6_do_redirect's
 * expectations before changing.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
        struct dst_entry *dst;
        struct flowi6 fl6 = {
                .flowi6_iif = LOOPBACK_IFINDEX,
                .flowi6_oif = oif,
                .daddr = msg->dest,
                .saddr = iph->daddr,
                .flowi6_uid = sock_net_uid(net, NULL),
        };

        dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
        rt6_do_redirect(dst, NULL, skb);
        dst_release(dst);
}
2541
/* Socket-scoped redirect handling: forward to ip6_redirect() using the
 * socket's bound device, mark and uid for the route lookup.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2548
2549 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2550 {
2551         struct net_device *dev = dst->dev;
2552         unsigned int mtu = dst_mtu(dst);
2553         struct net *net = dev_net(dev);
2554
2555         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2556
2557         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2558                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2559
2560         /*
2561          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2562          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2563          * IPV6_MAXPLEN is also valid and means: "any MSS,
2564          * rely only on pmtu discovery"
2565          */
2566         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2567                 mtu = IPV6_MAXPLEN;
2568         return mtu;
2569 }
2570
2571 static unsigned int ip6_mtu(const struct dst_entry *dst)
2572 {
2573         struct inet6_dev *idev;
2574         unsigned int mtu;
2575
2576         mtu = dst_metric_raw(dst, RTAX_MTU);
2577         if (mtu)
2578                 goto out;
2579
2580         mtu = IPV6_MIN_MTU;
2581
2582         rcu_read_lock();
2583         idev = __in6_dev_get(dst->dev);
2584         if (idev)
2585                 mtu = idev->cnf.mtu6;
2586         rcu_read_unlock();
2587
2588 out:
2589         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2590
2591         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2592 }
2593
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 *
 * Returns the selected MTU capped at IP6_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom on the nexthop.
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* Case 1: a locked MTU metric always wins. */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	/* With source-specific routing, exceptions are keyed on
	 * (daddr, saddr); otherwise daddr alone suffices.
	 */
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	/* Case 2: a still-valid pmtu exception for this destination. */
	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* Case 3: fall back to the egress device's IPv6 MTU. */
	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2641
/* Allocate a standalone (non-FIB) dst for sending an ICMPv6 packet.
 *
 * The rt6_info is never inserted into the FIB tree; it is put on the
 * uncached list instead so device teardown can release it.  Returns the
 * xfrm-resolved dst, or ERR_PTR(-ENODEV/-ENOMEM) on failure.  On success
 * the idev reference taken here is owned by the new route.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	/* Hop limit 0 metric: use the device/socket default at send time. */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2680
/* dst_ops garbage-collection callback for the IPv6 routing cache.
 *
 * Skips collection while within the min-interval and under rt_max_size;
 * otherwise runs fib6_run_gc() with a progressively growing expire value
 * that decays by the elasticity shift each call.  Returns nonzero when the
 * cache is still over rt_max_size (i.e. allocation should fail).
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* Each unsuccessful round makes the next pass more aggressive. */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* Exponential decay of the expire pressure. */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2705
/* Resolve a nexthop gateway address within one specific FIB table.
 *
 * Used during route configuration to validate cfg->fc_gateway.  Returns a
 * held rt6_info on a real match, or NULL if the table does not exist or
 * the lookup only hits the null entry (caller then falls back to a full
 * lookup).  The caller owns the returned reference (ip6_rt_put).
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	/* Link state is ignored: the route being added may be what brings
	 * the path up.
	 */
	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
2737
/* Validate the gateway of an RTNH_F_ONLINK route.
 *
 * An onlink nexthop must not resolve (other than via the default route)
 * to a local/anycast/reject route or to a different device than the one
 * configured.  Returns 0 on success, -EINVAL with an extack message on a
 * gateway/device mismatch.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}
2770
/* Resolve the egress device/idev for a gateway route being added.
 *
 * Looks up the gateway (first in cfg->fc_table, then via a full lookup)
 * and either verifies it egresses via the caller-supplied device, or fills
 * in *_dev/*idev from the lookup result (taking references the caller then
 * owns).  Returns 0 only when the gateway is directly reachable (no
 * RTF_GATEWAY on the resolved route), else -EHOSTUNREACH.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* Discard a match that is itself behind a gateway,
			 * or that egresses a different device.
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* Adopt device and idev from the lookup; references taken
		 * here are handed to the caller.
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2821
/* Validate cfg->fc_gateway for a new route and resolve the egress device.
 *
 * Rejects local addresses as gateways, enforces the link-local-unicast
 * (or IPv4-mapped, RFC 4798) nexthop rule for global gateways, and runs
 * the onlink or regular nexthop check, which may fill in *_dev/*idev.
 * Returns 0 on success; on failure, a negative errno with extack set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* For link-local gateways the address may legitimately also exist
	 * locally on another interface, so only skip the device itself.
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2894
/* Build a fib6_info from a route configuration (netlink or ioctl).
 *
 * Validates cfg, resolves the FIB table, egress device and gateway, and
 * initialises the fib6_info.  On success the returned route holds
 * references to the device and idev is released; the route is NOT yet
 * inserted into the FIB — callers insert it via __ip6_ins_rt().
 * Returns ERR_PTR(-errno) on failure, with extack populated.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* Onlink routes require an explicit, up nexthop device. */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	/* Without NLM_F_CREATE only look up an existing table, warning
	 * (but still creating, for compatibility) when it is missing.
	 */
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* May replace dev/idev with the resolved egress device. */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	/* dev reference is transferred to the route; idev is dropped. */
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	/* NOTE(review): rt may still be NULL here (pre-allocation errors);
	 * fib6_info_release() is expected to tolerate that — confirm.
	 */
	fib6_info_release(rt);
	return ERR_PTR(err);
}
3120
/* Create a fib6_info from cfg and insert it into the FIB.
 *
 * The local reference from ip6_route_info_create() is dropped after
 * insertion; the FIB holds its own reference on success.  Returns 0 or a
 * negative errno.
 */
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}
3136
/* Delete a single fib6_info from its table under the table lock.
 *
 * Consumes the caller's reference to rt in all cases (including the
 * -ENOENT null-entry case).  Returns 0 or a negative errno from fib6_del().
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	/* The null entry is a sentinel, never deletable. */
	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}
3157
/* Public helper: delete rt with default netlink info (no requesting
 * socket).  Consumes the caller's reference to rt.
 */
int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}
3164
/* Delete a multipath route: rt plus, when fc_delete_all_nh is set, all of
 * its siblings, under one table lock.
 *
 * Tries to emit a single RTM_DELROUTE notification covering every hop
 * (suppressing the per-hop notifications via info->skip_notify); falls
 * back to per-hop notifications if the skb cannot be built.  Consumes the
 * caller's reference to rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* Combined notification is sent after the lock is dropped. */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3216
3217 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3218 {
3219         int rc = -ESRCH;
3220
3221         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3222                 goto out;
3223
3224         if (cfg->fc_flags & RTF_GATEWAY &&
3225             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3226                 goto out;
3227
3228         rc = rt6_remove_exception_rt(rt);
3229 out:
3230         return rc;
3231 }
3232
/* Delete the route described by cfg from its FIB table.
 *
 * Walks the matching fib6_node under RCU.  For RTF_CACHE requests it
 * deletes matching exception (cached) routes; otherwise it matches on
 * device, gateway, metric and protocol and deletes either the single hop
 * (gateway given) or the whole multipath group.  Returns -ESRCH when
 * nothing matches.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* For cached routes, allow a non-exact node match so the exception
	 * table of a covering route can be searched.
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			/* Match the optional device/gateway/metric/protocol
			 * filters from the request.
			 */
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* Route may be going away concurrently; skip if its
			 * refcount has already hit zero.
			 */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3297
/* Process a validated ICMPv6 redirect for dst.
 *
 * Performs the RFC 4861 section 8 sanity checks on the redirect message
 * and its ND options, updates the neighbour cache for the new first hop,
 * clones the origin route into a RTF_CACHE exception pointing at the new
 * gateway, and fires a NETEVENT_REDIRECT notification.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct fib6_info *from;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* Target == destination means the destination itself is on-link;
	 * otherwise the target must be a link-local router address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Routers, and hosts configured to ignore redirects, drop here. */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	/* Only a redirect to another router marks the neighbour ISROUTER. */
	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	/* This fib6_info_hold() is safe here because we hold reference to rt
	 * and rt already holds reference to fib6_info.
	 */
	fib6_info_hold(from);
	rcu_read_unlock();

	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	fib6_info_release(from);
	neigh_release(neigh);
}
3424
3425 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up a route learned from an RA Route Information option (RFC 4191)
 * for @prefix/@prefixlen that was installed via @dev with gateway @gwaddr.
 * Returns the entry with a reference held, or NULL if none matches.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* exact_match lookup: we want the node for this precise prefix only */
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* iterator assigns @rt; on normal loop exit rt is NULL */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		/* entry may be going away; only return it if we got a ref */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3461
3462 static struct fib6_info *rt6_add_route_info(struct net *net,
3463                                            const struct in6_addr *prefix, int prefixlen,
3464                                            const struct in6_addr *gwaddr,
3465                                            struct net_device *dev,
3466                                            unsigned int pref)
3467 {
3468         struct fib6_config cfg = {
3469                 .fc_metric      = IP6_RT_PRIO_USER,
3470                 .fc_ifindex     = dev->ifindex,
3471                 .fc_dst_len     = prefixlen,
3472                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3473                                   RTF_UP | RTF_PREF(pref),
3474                 .fc_protocol = RTPROT_RA,
3475                 .fc_type = RTN_UNICAST,
3476                 .fc_nlinfo.portid = 0,
3477                 .fc_nlinfo.nlh = NULL,
3478                 .fc_nlinfo.nl_net = net,
3479         };
3480
3481         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3482         cfg.fc_dst = *prefix;
3483         cfg.fc_gateway = *gwaddr;
3484
3485         /* We should treat it as a default route if prefix length is 0. */
3486         if (!prefixlen)
3487                 cfg.fc_flags |= RTF_DEFAULT;
3488
3489         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3490
3491         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3492 }
3493 #endif
3494
/* Find the RA-learned default route through gateway @addr on @dev.
 * Returns the entry with a reference held, or NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* iterator assigns @rt; on normal loop exit rt is NULL */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	/* entry may be going away; only return it if we got a ref */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3519
3520 struct fib6_info *rt6_add_dflt_router(struct net *net,
3521                                      const struct in6_addr *gwaddr,
3522                                      struct net_device *dev,
3523                                      unsigned int pref)
3524 {
3525         struct fib6_config cfg = {
3526                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3527                 .fc_metric      = IP6_RT_PRIO_USER,
3528                 .fc_ifindex     = dev->ifindex,
3529                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3530                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3531                 .fc_protocol = RTPROT_RA,
3532                 .fc_type = RTN_UNICAST,
3533                 .fc_nlinfo.portid = 0,
3534                 .fc_nlinfo.nlh = NULL,
3535                 .fc_nlinfo.nl_net = net,
3536         };
3537
3538         cfg.fc_gateway = *gwaddr;
3539
3540         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3541                 struct fib6_table *table;
3542
3543                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3544                 if (table)
3545                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3546         }
3547
3548         return rt6_get_dflt_router(net, gwaddr, dev);
3549 }
3550
/* Delete all RA-learned default routers from @table, except on interfaces
 * configured with accept_ra == 2 (accept RA even when forwarding).
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			/* ip6_del_rt() sleeps on the table lock, so drop the
			 * RCU read side and rescan the tree from the top.
			 */
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3574
3575 void rt6_purge_dflt_routers(struct net *net)
3576 {
3577         struct fib6_table *table;
3578         struct hlist_head *head;
3579         unsigned int h;
3580
3581         rcu_read_lock();
3582
3583         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3584                 head = &net->ipv6.fib_table_hash[h];
3585                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3586                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3587                                 __rt6_purge_dflt_routers(net, table);
3588                 }
3589         }
3590
3591         rcu_read_unlock();
3592 }
3593
3594 static void rtmsg_to_fib6_config(struct net *net,
3595                                  struct in6_rtmsg *rtmsg,
3596                                  struct fib6_config *cfg)
3597 {
3598         *cfg = (struct fib6_config){
3599                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3600                          : RT6_TABLE_MAIN,
3601                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3602                 .fc_metric = rtmsg->rtmsg_metric,
3603                 .fc_expires = rtmsg->rtmsg_info,
3604                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3605                 .fc_src_len = rtmsg->rtmsg_src_len,
3606                 .fc_flags = rtmsg->rtmsg_flags,
3607                 .fc_type = rtmsg->rtmsg_type,
3608
3609                 .fc_nlinfo.nl_net = net,
3610
3611                 .fc_dst = rtmsg->rtmsg_dst,
3612                 .fc_src = rtmsg->rtmsg_src,
3613                 .fc_gateway = rtmsg->rtmsg_gateway,
3614         };
3615 }
3616
3617 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3618 {
3619         struct fib6_config cfg;
3620         struct in6_rtmsg rtmsg;
3621         int err;
3622
3623         switch (cmd) {
3624         case SIOCADDRT:         /* Add a route */
3625         case SIOCDELRT:         /* Delete a route */
3626                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3627                         return -EPERM;
3628                 err = copy_from_user(&rtmsg, arg,
3629                                      sizeof(struct in6_rtmsg));
3630                 if (err)
3631                         return -EFAULT;
3632
3633                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3634
3635                 rtnl_lock();
3636                 switch (cmd) {
3637                 case SIOCADDRT:
3638                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3639                         break;
3640                 case SIOCDELRT:
3641                         err = ip6_route_del(&cfg, NULL);
3642                         break;
3643                 default:
3644                         err = -EINVAL;
3645                 }
3646                 rtnl_unlock();
3647
3648                 return err;
3649         }
3650
3651         return -EINVAL;
3652 }
3653
3654 /*
3655  *      Drop the packet on the floor
3656  */
3657
3658 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3659 {
3660         int type;
3661         struct dst_entry *dst = skb_dst(skb);
3662         switch (ipstats_mib_noroutes) {
3663         case IPSTATS_MIB_INNOROUTES:
3664                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3665                 if (type == IPV6_ADDR_ANY) {
3666                         IP6_INC_STATS(dev_net(dst->dev),
3667                                       __in6_dev_get_safely(skb->dev),
3668                                       IPSTATS_MIB_INADDRERRORS);
3669                         break;
3670                 }
3671                 /* FALLTHROUGH */
3672         case IPSTATS_MIB_OUTNOROUTES:
3673                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3674                               ipstats_mib_noroutes);
3675                 break;
3676         }
3677         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3678         kfree_skb(skb);
3679         return 0;
3680 }
3681
/* dst input handler for blackhole/unreachable routes */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3686
/* dst output handler for blackhole/unreachable routes */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3692
/* dst input handler for administratively prohibited routes */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3697
/* dst output handler for administratively prohibited routes */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3703
3704 /*
3705  *      Allocate a dst for local (unicast / anycast) address.
3706  */
3707
3708 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3709                                      struct inet6_dev *idev,
3710                                      const struct in6_addr *addr,
3711                                      bool anycast, gfp_t gfp_flags)
3712 {
3713         u32 tb_id;
3714         struct net_device *dev = idev->dev;
3715         struct fib6_info *f6i;
3716
3717         f6i = fib6_info_alloc(gfp_flags);
3718         if (!f6i)
3719                 return ERR_PTR(-ENOMEM);
3720
3721         f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3722         f6i->dst_nocount = true;
3723         f6i->dst_host = true;
3724         f6i->fib6_protocol = RTPROT_KERNEL;
3725         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3726         if (anycast) {
3727                 f6i->fib6_type = RTN_ANYCAST;
3728                 f6i->fib6_flags |= RTF_ANYCAST;
3729         } else {
3730                 f6i->fib6_type = RTN_LOCAL;
3731                 f6i->fib6_flags |= RTF_LOCAL;
3732         }
3733
3734         f6i->fib6_nh.nh_gw = *addr;
3735         dev_hold(dev);
3736         f6i->fib6_nh.nh_dev = dev;
3737         f6i->fib6_dst.addr = *addr;
3738         f6i->fib6_dst.plen = 128;
3739         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3740         f6i->fib6_table = fib6_get_table(net, tb_id);
3741
3742         return f6i;
3743 }
3744
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct net *net;	/* namespace being walked */
	struct in6_addr *addr;	/* preferred source address being removed */
};
3751
3752 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3753 {
3754         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3755         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3756         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3757
3758         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3759             rt != net->ipv6.fib6_null_entry &&
3760             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3761                 spin_lock_bh(&rt6_exception_lock);
3762                 /* remove prefsrc entry */
3763                 rt->fib6_prefsrc.plen = 0;
3764                 spin_unlock_bh(&rt6_exception_lock);
3765         }
3766         return 0;
3767 }
3768
/* Called when address @ifp is deleted: scrub it from all routes that
 * use it as their preferred source address.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3779
3780 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3781
3782 /* Remove routers and update dst entries when gateway turn into host. */
3783 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3784 {
3785         struct in6_addr *gateway = (struct in6_addr *)arg;
3786
3787         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3788             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3789                 return -1;
3790         }
3791
3792         /* Further clean up cached routes in exception table.
3793          * This is needed because cached route may have a different
3794          * gateway than its 'parent' in the case of an ip redirect.
3795          */
3796         rt6_exceptions_clean_tohost(rt, gateway);
3797
3798         return 0;
3799 }
3800
/* Drop RA-learned routes (and cached exceptions) via @gateway once it is
 * known to no longer be a router.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3805
/* Argument for the fib6_ifup/fib6_ifdown walkers. */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event is about */
	union {
		unsigned int nh_flags;	/* RTNH_F_* flags to set/clear (sync up) */
		unsigned long event;	/* NETDEV_* event code (sync down) */
	};
};
3813
3814 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3815 {
3816         struct fib6_info *iter;
3817         struct fib6_node *fn;
3818
3819         fn = rcu_dereference_protected(rt->fib6_node,
3820                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3821         iter = rcu_dereference_protected(fn->leaf,
3822                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3823         while (iter) {
3824                 if (iter->fib6_metric == rt->fib6_metric &&
3825                     rt6_qualify_for_ecmp(iter))
3826                         return iter;
3827                 iter = rcu_dereference_protected(iter->fib6_next,
3828                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3829         }
3830
3831         return NULL;
3832 }
3833
3834 static bool rt6_is_dead(const struct fib6_info *rt)
3835 {
3836         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3837             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3838              fib6_ignore_linkdown(rt)))
3839                 return true;
3840
3841         return false;
3842 }
3843
3844 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3845 {
3846         struct fib6_info *iter;
3847         int total = 0;
3848
3849         if (!rt6_is_dead(rt))
3850                 total += rt->fib6_nh.nh_weight;
3851
3852         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3853                 if (!rt6_is_dead(iter))
3854                         total += iter->fib6_nh.nh_weight;
3855         }
3856
3857         return total;
3858 }
3859
/* Set the hash-threshold upper bound for @rt's next hop.  @*weight
 * accumulates the weight of live next hops seen so far; the bound maps
 * the cumulative fraction of @total into the 31-bit hash space.  A dead
 * next hop gets -1 so it can never be selected.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3871
3872 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3873 {
3874         struct fib6_info *iter;
3875         int weight = 0;
3876
3877         rt6_upper_bound_set(rt, &weight, total);
3878
3879         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3880                 rt6_upper_bound_set(iter, &weight, total);
3881 }
3882
3883 void rt6_multipath_rebalance(struct fib6_info *rt)
3884 {
3885         struct fib6_info *first;
3886         int total;
3887
3888         /* In case the entire multipath route was marked for flushing,
3889          * then there is no need to rebalance upon the removal of every
3890          * sibling route.
3891          */
3892         if (!rt->fib6_nsiblings || rt->should_flush)
3893                 return;
3894
3895         /* During lookup routes are evaluated in order, so we need to
3896          * make sure upper bounds are assigned from the first sibling
3897          * onwards.
3898          */
3899         first = rt6_multipath_first_sibling(rt);
3900         if (WARN_ON_ONCE(!first))
3901                 return;
3902
3903         total = rt6_multipath_total_weight(first);
3904         rt6_multipath_upper_bound_set(first, total);
3905 }
3906
3907 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3908 {
3909         const struct arg_netdev_event *arg = p_arg;
3910         struct net *net = dev_net(arg->dev);
3911
3912         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3913                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3914                 fib6_update_sernum_upto_root(net, rt);
3915                 rt6_multipath_rebalance(rt);
3916         }
3917
3918         return 0;
3919 }
3920
/* Clear @nh_flags from all routes using @dev after it came (back) up. */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	/* when un-deadening a device whose carrier is up, clear its
	 * link-down marking as well
	 */
	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3935
3936 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3937                                    const struct net_device *dev)
3938 {
3939         struct fib6_info *iter;
3940
3941         if (rt->fib6_nh.nh_dev == dev)
3942                 return true;
3943         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3944                 if (iter->fib6_nh.nh_dev == dev)
3945                         return true;
3946
3947         return false;
3948 }
3949
3950 static void rt6_multipath_flush(struct fib6_info *rt)
3951 {
3952         struct fib6_info *iter;
3953
3954         rt->should_flush = 1;
3955         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3956                 iter->should_flush = 1;
3957 }
3958
3959 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3960                                              const struct net_device *down_dev)
3961 {
3962         struct fib6_info *iter;
3963         unsigned int dead = 0;
3964
3965         if (rt->fib6_nh.nh_dev == down_dev ||
3966             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3967                 dead++;
3968         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3969                 if (iter->fib6_nh.nh_dev == down_dev ||
3970                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3971                         dead++;
3972
3973         return dead;
3974 }
3975
3976 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3977                                        const struct net_device *dev,
3978                                        unsigned int nh_flags)
3979 {
3980         struct fib6_info *iter;
3981
3982         if (rt->fib6_nh.nh_dev == dev)
3983                 rt->fib6_nh.nh_flags |= nh_flags;
3984         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3985                 if (iter->fib6_nh.nh_dev == dev)
3986                         iter->fib6_nh.nh_flags |= nh_flags;
3987 }
3988
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for device down/unregister events.  Return
 * value is interpreted by the fib6 walker: 0 = keep the route, a negative
 * value = delete it (NOTE(review): -2 appears to pair with the
 * skip_notify_on_dev_down path in rt6_sync_down_dev() — confirm against
 * fib6_clean_node()).
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* the null entry is never removed */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device is going away entirely: delete its routes */
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		/* single-path route: delete if it uses this device */
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			/* every next hop dead: flush the whole group */
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			/* otherwise mark this device's next hops dead and
			 * redistribute the hash space over the survivors
			 */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: only mark link-down, never local routes */
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4032
/* Walk all routes and apply the down/unregister @event for @dev,
 * optionally suppressing RTM_DELROUTE notifications per sysctl.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};
	struct net *net = dev_net(dev);

	if (net->ipv6.sysctl.skip_notify_on_dev_down)
		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
	else
		fib6_clean_all(net, fib6_ifdown, &arg);
}
4048
/* Tear down IPv6 state for @dev: drop routes, flush uncached dsts and
 * remove its neighbour cache entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4055
/* Argument for the rt6_mtu_change_route() walker. */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* new MTU */
};
4060
/* fib6_clean_all() callback: propagate a device MTU change into routes
 * (and their cached exceptions) that use the device.
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* shrink always; grow only if the route tracked the old
		 * device MTU rather than a discovered PMTU
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4095
4096 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4097 {
4098         struct rt6_mtu_change_arg arg = {
4099                 .dev = dev,
4100                 .mtu = mtu,
4101         };
4102
4103         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4104 }
4105
/* Netlink attribute policy for RTM_{NEW,DEL,GET}ROUTE requests. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4125
/* Parse an RTM_{NEW,DEL}ROUTE netlink message into a fib6_config.
 * Returns 0 on success or a negative errno; on success @cfg's pointer
 * members (fc_mx, fc_mp, fc_encap) alias the message payload in @skb.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	/* defaults from the rtmsg header; attributes below may override */
	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	/* reject-style route types all map onto RTF_REJECT */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* attribute may carry only prefix-length worth of bytes */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the 8-bit rtm_table header field */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		/* unknown preference values fall back to medium (RFC 4191) */
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* infinite timeouts leave RTF_EXPIRES unset */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4252
/* One pending nexthop parsed out of an RTA_MULTIPATH request; queued on a
 * local list by ip6_route_multipath_add() until every nexthop has been
 * validated, then inserted (or torn down) as a unit.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request config */
	struct list_head next;		/* linkage in the caller's rt6_nh_list */
};
4258
4259 static int ip6_route_info_append(struct net *net,
4260                                  struct list_head *rt6_nh_list,
4261                                  struct fib6_info *rt,
4262                                  struct fib6_config *r_cfg)
4263 {
4264         struct rt6_nh *nh;
4265         int err = -EEXIST;
4266
4267         list_for_each_entry(nh, rt6_nh_list, next) {
4268                 /* check if fib6_info already exists */
4269                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4270                         return err;
4271         }
4272
4273         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4274         if (!nh)
4275                 return -ENOMEM;
4276         nh->fib6_info = rt;
4277         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4278         list_add_tail(&nh->next, rt6_nh_list);
4279
4280         return 0;
4281 }
4282
4283 static void ip6_route_mpath_notify(struct fib6_info *rt,
4284                                    struct fib6_info *rt_last,
4285                                    struct nl_info *info,
4286                                    __u16 nlflags)
4287 {
4288         /* if this is an APPEND route, then rt points to the first route
4289          * inserted and rt_last points to last route inserted. Userspace
4290          * wants a consistent dump of the route which starts at the first
4291          * nexthop. Since sibling routes are always added at the end of
4292          * the list, find the first sibling of the last route appended
4293          */
4294         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4295                 rt = list_first_entry(&rt_last->fib6_siblings,
4296                                       struct fib6_info,
4297                                       fib6_siblings);
4298         }
4299
4300         if (rt)
4301                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4302 }
4303
/* Add (or replace/append) a multipath route described by an RTA_MULTIPATH
 * attribute.  Two phases: (1) parse and validate every nexthop, building a
 * private rt6_nh_list; (2) insert them one by one, sending a single
 * aggregate notification at the end.  On a phase-2 failure, routes already
 * inserted are deleted again so userspace sees coherent notifications.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* each nexthop starts from the request config, then applies
		 * its own ifindex/gateway/encap overrides
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops is weight-1 on the wire */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		/* on success the list takes over the reference held on rt */
		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		/* drop our parse-time reference; on success the FIB tree
		 * holds its own reference, keeping the entry alive for the
		 * rt_last/rt_notif bookkeeping below
		 */
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		/* NOTE(review): nlh is dereferenced unconditionally here —
		 * presumably non-NULL because this path is only reached via
		 * inet6_rtm_newroute(); confirm against all callers.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any nexthops never handed to the FIB and free the list */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4451
4452 static int ip6_route_multipath_del(struct fib6_config *cfg,
4453                                    struct netlink_ext_ack *extack)
4454 {
4455         struct fib6_config r_cfg;
4456         struct rtnexthop *rtnh;
4457         int remaining;
4458         int attrlen;
4459         int err = 1, last_err = 0;
4460
4461         remaining = cfg->fc_mp_len;
4462         rtnh = (struct rtnexthop *)cfg->fc_mp;
4463
4464         /* Parse a Multipath Entry */
4465         while (rtnh_ok(rtnh, remaining)) {
4466                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4467                 if (rtnh->rtnh_ifindex)
4468                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4469
4470                 attrlen = rtnh_attrlen(rtnh);
4471                 if (attrlen > 0) {
4472                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4473
4474                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4475                         if (nla) {
4476                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4477                                 r_cfg.fc_flags |= RTF_GATEWAY;
4478                         }
4479                 }
4480                 err = ip6_route_del(&r_cfg, extack);
4481                 if (err)
4482                         last_err = err;
4483
4484                 rtnh = rtnh_next(rtnh, &remaining);
4485         }
4486
4487         return last_err;
4488 }
4489
4490 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4491                               struct netlink_ext_ack *extack)
4492 {
4493         struct fib6_config cfg;
4494         int err;
4495
4496         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4497         if (err < 0)
4498                 return err;
4499
4500         if (cfg.fc_mp)
4501                 return ip6_route_multipath_del(&cfg, extack);
4502         else {
4503                 cfg.fc_delete_all_nh = 1;
4504                 return ip6_route_del(&cfg, extack);
4505         }
4506 }
4507
4508 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4509                               struct netlink_ext_ack *extack)
4510 {
4511         struct fib6_config cfg;
4512         int err;
4513
4514         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4515         if (err < 0)
4516                 return err;
4517
4518         if (cfg.fc_mp)
4519                 return ip6_route_multipath_add(&cfg, extack);
4520         else
4521                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4522 }
4523
4524 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4525 {
4526         int nexthop_len = 0;
4527
4528         if (rt->fib6_nsiblings) {
4529                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4530                             + NLA_ALIGN(sizeof(struct rtnexthop))
4531                             + nla_total_size(16) /* RTA_GATEWAY */
4532                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4533
4534                 nexthop_len *= rt->fib6_nsiblings;
4535         }
4536
4537         return NLMSG_ALIGN(sizeof(struct rtmsg))
4538                + nla_total_size(16) /* RTA_SRC */
4539                + nla_total_size(16) /* RTA_DST */
4540                + nla_total_size(16) /* RTA_GATEWAY */
4541                + nla_total_size(16) /* RTA_PREFSRC */
4542                + nla_total_size(4) /* RTA_TABLE */
4543                + nla_total_size(4) /* RTA_IIF */
4544                + nla_total_size(4) /* RTA_OIF */
4545                + nla_total_size(4) /* RTA_PRIORITY */
4546                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4547                + nla_total_size(sizeof(struct rta_cacheinfo))
4548                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4549                + nla_total_size(1) /* RTA_PREF */
4550                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4551                + nexthop_len;
4552 }
4553
/* Emit the nexthop attributes of @rt into @skb and accumulate the
 * corresponding RTNH_F_* bits into *@flags.
 *
 * @skip_oif: true when called for an RTA_MULTIPATH entry, where the
 *            ifindex lives in the rtnexthop struct instead of RTA_OIF.
 *
 * Returns 0 on success, -EMSGSIZE if the skb ran out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* report linkdown nexthops as dead when the device policy
		 * says to ignore routes on linkdown interfaces
		 */
		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4592
4593 /* add multipath next hop */
4594 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4595 {
4596         const struct net_device *dev = rt->fib6_nh.nh_dev;
4597         struct rtnexthop *rtnh;
4598         unsigned int flags = 0;
4599
4600         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4601         if (!rtnh)
4602                 goto nla_put_failure;
4603
4604         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4605         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4606
4607         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4608                 goto nla_put_failure;
4609
4610         rtnh->rtnh_flags = flags;
4611
4612         /* length of rtnetlink header + attributes */
4613         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4614
4615         return 0;
4616
4617 nla_put_failure:
4618         return -EMSGSIZE;
4619 }
4620
/* Fill one RTM_NEWROUTE/RTM_DELROUTE message for @rt into @skb.
 *
 * @rt:   FIB entry being dumped (must not be NULL; callers guarantee this)
 * @dst:  optional dst cache entry; when set, cached values (rt6i_dst,
 *        rt6i_flags, metrics, expiry) override the FIB entry's
 * @dest/@src: optional exact addresses from a route lookup; force /128
 *        prefix lengths in the dump
 * @iif:  input interface for input-route dumps, 0 otherwise
 *
 * Returns 0 on success, -EMSGSIZE if the skb is too small (message is
 * cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* prefer the cached dst's keys/flags over the FIB entry's */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit lookup destination is reported as a host route */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast routes are resolved by the mroute code, which
		 * may fill the message itself (0), defer (>0) or fail (<0)
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* cached dst metrics win over the FIB entry's metrics */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	if (rt6) {
		/* cache entry: report its gateway and device directly */
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		/* For multipath routes, walk the siblings list and add
		 * each as a nexthop within RTA_MULTIPATH.
		 */
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4771
4772 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4773                                const struct net_device *dev)
4774 {
4775         if (f6i->fib6_nh.nh_dev == dev)
4776                 return true;
4777
4778         if (f6i->fib6_nsiblings) {
4779                 struct fib6_info *sibling, *next_sibling;
4780
4781                 list_for_each_entry_safe(sibling, next_sibling,
4782                                          &f6i->fib6_siblings, fib6_siblings) {
4783                         if (sibling->fib6_nh.nh_dev == dev)
4784                                 return true;
4785                 }
4786         }
4787
4788         return false;
4789 }
4790
4791 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4792 {
4793         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4794         struct fib_dump_filter *filter = &arg->filter;
4795         unsigned int flags = NLM_F_MULTI;
4796         struct net *net = arg->net;
4797
4798         if (rt == net->ipv6.fib6_null_entry)
4799                 return 0;
4800
4801         if ((filter->flags & RTM_F_PREFIX) &&
4802             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4803                 /* success since this is not a prefix route */
4804                 return 1;
4805         }
4806         if (filter->filter_set) {
4807                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4808                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4809                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4810                         return 1;
4811                 }
4812                 flags |= NLM_F_DUMP_FILTERED;
4813         }
4814
4815         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4816                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4817                              arg->cb->nlh->nlmsg_seq, flags);
4818 }
4819
4820 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4821                               struct netlink_ext_ack *extack)
4822 {
4823         struct net *net = sock_net(in_skb->sk);
4824         struct nlattr *tb[RTA_MAX+1];
4825         int err, iif = 0, oif = 0;
4826         struct fib6_info *from;
4827         struct dst_entry *dst;
4828         struct rt6_info *rt;
4829         struct sk_buff *skb;
4830         struct rtmsg *rtm;
4831         struct flowi6 fl6 = {};
4832         bool fibmatch;
4833
4834         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4835                           extack);
4836         if (err < 0)
4837                 goto errout;
4838
4839         err = -EINVAL;
4840         rtm = nlmsg_data(nlh);
4841         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4842         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4843
4844         if (tb[RTA_SRC]) {
4845                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4846                         goto errout;
4847
4848                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4849         }
4850
4851         if (tb[RTA_DST]) {
4852                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4853                         goto errout;
4854
4855                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4856         }
4857
4858         if (tb[RTA_IIF])
4859                 iif = nla_get_u32(tb[RTA_IIF]);
4860
4861         if (tb[RTA_OIF])
4862                 oif = nla_get_u32(tb[RTA_OIF]);
4863
4864         if (tb[RTA_MARK])
4865                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4866
4867         if (tb[RTA_UID])
4868                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4869                                            nla_get_u32(tb[RTA_UID]));
4870         else
4871                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4872
4873         if (tb[RTA_SPORT])
4874                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4875
4876         if (tb[RTA_DPORT])
4877                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4878
4879         if (tb[RTA_IP_PROTO]) {
4880                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4881                                                   &fl6.flowi6_proto, extack);
4882                 if (err)
4883                         goto errout;
4884         }
4885
4886         if (iif) {
4887                 struct net_device *dev;
4888                 int flags = 0;
4889
4890                 rcu_read_lock();
4891
4892                 dev = dev_get_by_index_rcu(net, iif);
4893                 if (!dev) {
4894                         rcu_read_unlock();
4895                         err = -ENODEV;
4896                         goto errout;
4897                 }
4898
4899                 fl6.flowi6_iif = iif;
4900
4901                 if (!ipv6_addr_any(&fl6.saddr))
4902                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4903
4904                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4905
4906                 rcu_read_unlock();
4907         } else {
4908                 fl6.flowi6_oif = oif;
4909
4910                 dst = ip6_route_output(net, NULL, &fl6);
4911         }
4912
4913
4914         rt = container_of(dst, struct rt6_info, dst);
4915         if (rt->dst.error) {
4916                 err = rt->dst.error;
4917                 ip6_rt_put(rt);
4918                 goto errout;
4919         }
4920
4921         if (rt == net->ipv6.ip6_null_entry) {
4922                 err = rt->dst.error;
4923                 ip6_rt_put(rt);
4924                 goto errout;
4925         }
4926
4927         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4928         if (!skb) {
4929                 ip6_rt_put(rt);
4930                 err = -ENOBUFS;
4931                 goto errout;
4932         }
4933
4934         skb_dst_set(skb, &rt->dst);
4935
4936         rcu_read_lock();
4937         from = rcu_dereference(rt->from);
4938
4939         if (fibmatch)
4940                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4941                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4942                                     nlh->nlmsg_seq, 0);
4943         else
4944                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4945                                     &fl6.saddr, iif, RTM_NEWROUTE,
4946                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4947                                     0);
4948         rcu_read_unlock();
4949
4950         if (err < 0) {
4951                 kfree_skb(skb);
4952                 goto errout;
4953         }
4954
4955         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4956 errout:
4957         return err;
4958 }
4959
4960 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4961                      unsigned int nlm_flags)
4962 {
4963         struct sk_buff *skb;
4964         struct net *net = info->nl_net;
4965         u32 seq;
4966         int err;
4967
4968         err = -ENOBUFS;
4969         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4970
4971         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4972         if (!skb)
4973                 goto errout;
4974
4975         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4976                             event, info->portid, seq, nlm_flags);
4977         if (err < 0) {
4978                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4979                 WARN_ON(err == -EMSGSIZE);
4980                 kfree_skb(skb);
4981                 goto errout;
4982         }
4983         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4984                     info->nlh, gfp_any());
4985         return;
4986 errout:
4987         if (err < 0)
4988                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4989 }
4990
/* Netdevice notifier: keep the special routing entries (null, and with
 * multiple tables also prohibit/blackhole) bound to the loopback device
 * of the device's netns across its register/unregister lifecycle.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the per-netns loopback device anchors these entries */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
5024
5025 /*
5026  *      /proc
5027  */
5028
5029 #ifdef CONFIG_PROC_FS
/* /proc/net/rt6_stats: dump per-netns FIB statistics as seven hex fields
 * (nodes, route nodes, allocs, entries, cache entries, dst entries,
 * discarded routes).
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
5044 #endif  /* CONFIG_PROC_FS */
5045
5046 #ifdef CONFIG_SYSCTL
5047
5048 static
5049 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5050                               void __user *buffer, size_t *lenp, loff_t *ppos)
5051 {
5052         struct net *net;
5053         int delay;
5054         int ret;
5055         if (!write)
5056                 return -EINVAL;
5057
5058         net = (struct net *)ctl->extra1;
5059         delay = net->ipv6.sysctl.flush_delay;
5060         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5061         if (ret)
5062                 return ret;
5063
5064         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5065         return 0;
5066 }
5067
/* 0/1 bounds used as .extra1/.extra2 for boolean sysctl entries below. */
static int zero;
static int one = 1;
5070
5071 static struct ctl_table ipv6_route_table_template[] = {
5072         {
5073                 .procname       =       "flush",
5074                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5075                 .maxlen         =       sizeof(int),
5076                 .mode           =       0200,
5077                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5078         },
5079         {
5080                 .procname       =       "gc_thresh",
5081                 .data           =       &ip6_dst_ops_template.gc_thresh,
5082                 .maxlen         =       sizeof(int),
5083                 .mode           =       0644,
5084                 .proc_handler   =       proc_dointvec,
5085         },
5086         {
5087                 .procname       =       "max_size",
5088                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5089                 .maxlen         =       sizeof(int),
5090                 .mode           =       0644,
5091                 .proc_handler   =       proc_dointvec,
5092         },
5093         {
5094                 .procname       =       "gc_min_interval",
5095                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5096                 .maxlen         =       sizeof(int),
5097                 .mode           =       0644,
5098                 .proc_handler   =       proc_dointvec_jiffies,
5099         },
5100         {
5101                 .procname       =       "gc_timeout",
5102                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5103                 .maxlen         =       sizeof(int),
5104                 .mode           =       0644,
5105                 .proc_handler   =       proc_dointvec_jiffies,
5106         },
5107         {
5108                 .procname       =       "gc_interval",
5109                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5110                 .maxlen         =       sizeof(int),
5111                 .mode           =       0644,
5112                 .proc_handler   =       proc_dointvec_jiffies,
5113         },
5114         {
5115                 .procname       =       "gc_elasticity",
5116                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5117                 .maxlen         =       sizeof(int),
5118                 .mode           =       0644,
5119                 .proc_handler   =       proc_dointvec,
5120         },
5121         {
5122                 .procname       =       "mtu_expires",
5123                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5124                 .maxlen         =       sizeof(int),
5125                 .mode           =       0644,
5126                 .proc_handler   =       proc_dointvec_jiffies,
5127         },
5128         {
5129                 .procname       =       "min_adv_mss",
5130                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5131                 .maxlen         =       sizeof(int),
5132                 .mode           =       0644,
5133                 .proc_handler   =       proc_dointvec,
5134         },
5135         {
5136                 .procname       =       "gc_min_interval_ms",
5137                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5138                 .maxlen         =       sizeof(int),
5139                 .mode           =       0644,
5140                 .proc_handler   =       proc_dointvec_ms_jiffies,
5141         },
5142         {
5143                 .procname       =       "skip_notify_on_dev_down",
5144                 .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5145                 .maxlen         =       sizeof(int),
5146                 .mode           =       0644,
5147                 .proc_handler   =       proc_dointvec,
5148                 .extra1         =       &zero,
5149                 .extra2         =       &one,
5150         },
5151         { }
5152 };
5153
5154 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5155 {
5156         struct ctl_table *table;
5157
5158         table = kmemdup(ipv6_route_table_template,
5159                         sizeof(ipv6_route_table_template),
5160                         GFP_KERNEL);
5161
5162         if (table) {
5163                 table[0].data = &net->ipv6.sysctl.flush_delay;
5164                 table[0].extra1 = net;
5165                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5166                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5167                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5168                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5169                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5170                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5171                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5172                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5173                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5174                 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5175
5176                 /* Don't export sysctls to unprivileged users */
5177                 if (net->user_ns != &init_user_ns)
5178                         table[0].procname = NULL;
5179         }
5180
5181         return table;
5182 }
5183 #endif
5184
/* Per-netns setup of the IPv6 routing core: clone the dst_ops template,
 * allocate the special null (and, with policy routing, prohibit and
 * blackhole) route entries and seed the routing sysctl defaults.
 * Returns 0 on success or -ENOMEM; the goto ladder below unwinds, in
 * reverse order, exactly what was set up before the failure point.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	/* Policy routing adds per-netns prohibit and blackhole entries. */
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default values for the per-netns routing sysctls. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwinding, newest allocation first. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5260
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and release the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5271
5272 static int __net_init ip6_route_net_init_late(struct net *net)
5273 {
5274 #ifdef CONFIG_PROC_FS
5275         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5276                         sizeof(struct ipv6_route_iter));
5277         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5278                         rt6_stats_seq_show, NULL);
5279 #endif
5280         return 0;
5281 }
5282
/* Late per-netns teardown: remove the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5290
/* Core per-netns routing state (special entries, dst_ops, sysctl defaults). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5295
5296 static int __net_init ipv6_inetpeer_init(struct net *net)
5297 {
5298         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5299
5300         if (!bp)
5301                 return -ENOMEM;
5302         inet_peer_base_init(bp);
5303         net->ipv6.peers = bp;
5304         return 0;
5305 }
5306
/* Tear down the per-netns inetpeer base: detach it from the netns
 * first, then invalidate the tree and free the base itself.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
5315
/* Per-netns inetpeer base lifecycle. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5320
/* Late per-netns setup/teardown (proc entries). */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5325
/* Netdevice notifier; runs after addrconf's notifier (lower priority)
 * so the special route entries are handled in the right order.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5330
/* Anchor init_net's special route entries on its loopback device.
 * Mirrors what ip6_route_dev_notify() does on NETDEV_REGISTER for
 * other namespaces.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5346
/* Boot-time initialisation of the IPv6 routing subsystem: dst cache,
 * pernet subsystems, FIB, xfrm hooks, rtnetlink handlers, the device
 * notifier and the per-cpu uncached route lists.  Returns 0 or a
 * negative errno; the goto ladder unwinds registrations in reverse.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* The blackhole ops share the same slab cache as the regular ops. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* All three rtnetlink failures unwind via rtnl_unregister_all(),
	 * so they share one error label.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Initialise the per-cpu lists of uncached routes. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5438
/* Module teardown: undo everything ip6_route_init() registered, in
 * reverse registration order.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}