Merge tag 'pxa-fixes-5.0' of https://github.com/rjarzmik/linux into arm/fixes
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
/* Forward declaration; the definition below looks the type up in fib6_prop[]. */
static int ip6_rt_type_to_error(u8 fib6_type);

/* CREATE_TRACE_POINTS is defined only around this include and undefined
 * again right after it; the fib6_table_lookup tracepoint is then exported.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
79
/* Result of the neighbour reachability check done by rt6_check_neigh().
 * Negative values are failures; rt6_score_route() returns them unchanged
 * to its caller.
 */
enum rt6_nud_state {
        RT6_NUD_FAIL_HARD = -3,         /* find_match() rejects the route outright */
        RT6_NUD_FAIL_PROBE = -2,        /* neighbour is NUD_FAILED (ROUTER_PREF builds) */
        RT6_NUD_FAIL_DO_RR = -1,        /* find_match() scores it 0 and requests round-robin */
        RT6_NUD_SUCCEED = 1
};
86
/* Callbacks wired into ip6_dst_ops_template / ip6_dst_blackhole_ops below. */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
static unsigned int      ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

/* Input/output handlers used by the null and prohibit entry templates,
 * followed by the remaining dst_ops callbacks.
 */
static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int              ip6_pkt_prohibit(struct sk_buff *skb);
static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
/* File-local helpers that are referenced before their definitions. */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                         struct fib6_info *rt, struct dst_entry *dst,
                         struct in6_addr *dest, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
                                           struct in6_addr *daddr,
                                           struct in6_addr *saddr);
115
/* Helpers used by rt6_route_rcv(); only built with CONFIG_IPV6_ROUTE_INFO. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev,
                                           unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev);
#endif
127
/* One list of rt6_info entries (linked through rt6i_uncached); an instance
 * exists per CPU in rt6_uncached_list.
 */
struct uncached_list {
        spinlock_t              lock;   /* taken with _bh around every list access below */
        struct list_head        head;
};
132
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148         if (!list_empty(&rt->rt6i_uncached)) {
149                 struct uncached_list *ul = rt->rt6i_uncached_list;
150                 struct net *net = dev_net(rt->dst.dev);
151
152                 spin_lock_bh(&ul->lock);
153                 list_del(&rt->rt6i_uncached);
154                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155                 spin_unlock_bh(&ul->lock);
156         }
157 }
158
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161         struct net_device *loopback_dev = net->loopback_dev;
162         int cpu;
163
164         if (dev == loopback_dev)
165                 return;
166
167         for_each_possible_cpu(cpu) {
168                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169                 struct rt6_info *rt;
170
171                 spin_lock_bh(&ul->lock);
172                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173                         struct inet6_dev *rt_idev = rt->rt6i_idev;
174                         struct net_device *rt_dev = rt->dst.dev;
175
176                         if (rt_idev->dev == dev) {
177                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
178                                 in6_dev_put(rt_idev);
179                         }
180
181                         if (rt_dev == dev) {
182                                 rt->dst.dev = loopback_dev;
183                                 dev_hold(rt->dst.dev);
184                                 dev_put(rt_dev);
185                         }
186                 }
187                 spin_unlock_bh(&ul->lock);
188         }
189 }
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213
214         n = neigh_create(&nd_tbl, daddr, dev);
215         return IS_ERR(n) ? NULL : n;
216 }
217
218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
219                                               struct sk_buff *skb,
220                                               const void *daddr)
221 {
222         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
223
224         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
225 }
226
227 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
228 {
229         struct net_device *dev = dst->dev;
230         struct rt6_info *rt = (struct rt6_info *)dst;
231
232         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
233         if (!daddr)
234                 return;
235         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
236                 return;
237         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
238                 return;
239         __ipv6_confirm_neigh(dev, daddr);
240 }
241
/* dst_ops for regular IPv6 routes. ip6_dst_alloc() allocates against the
 * per-netns net->ipv6.ip6_dst_ops; presumably that is copied from this
 * template elsewhere in the file.
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                 =       AF_INET6,
        .gc                     =       ip6_dst_gc,
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .default_advmss         =       ip6_default_advmss,
        .mtu                    =       ip6_mtu,
        .cow_metrics            =       dst_cow_metrics_generic,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .redirect               =       rt6_do_redirect,
        .local_out              =       __ip6_local_out,
        .neigh_lookup           =       ip6_dst_neigh_lookup,
        .confirm_neigh          =       ip6_confirm_neigh,
};
260
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262 {
263         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
264
265         return mtu ? : dst->dev->mtu;
266 }
267
/* dst_ops->update_pmtu for blackhole dsts: intentionally a no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                         struct sk_buff *skb, u32 mtu)
{
}
272
/* dst_ops->redirect for blackhole dsts: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                      struct sk_buff *skb)
{
}
277
/* dst_ops for blackhole dsts: shares destroy/check/advmss/neigh_lookup with
 * the regular ops, but PMTU updates and redirects are no-ops and the MTU
 * comes from ip6_blackhole_mtu().
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 =       AF_INET6,
        .destroy                =       ip6_dst_destroy,
        .check                  =       ip6_dst_check,
        .mtu                    =       ip6_blackhole_mtu,
        .default_advmss         =       ip6_default_advmss,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
        .redirect               =       ip6_rt_blackhole_redirect,
        .cow_metrics            =       dst_cow_metrics_generic,
        .neigh_lookup           =       ip6_dst_neigh_lookup,
};
289
/* All-zero metrics array; the hop limit slot is spelled out explicitly.
 * NOTE(review): no user is visible in this part of the file.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 0,
};
293
/* Template for the "no route" fib6_info: a reject entry of type
 * RTN_UNREACHABLE with the largest possible metric and the default metrics.
 * Presumably the source of net->ipv6.fib6_null_entry, which the lookup
 * helpers below return when nothing matches — confirm at the init site.
 */
static const struct fib6_info fib6_null_entry_template = {
        .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .fib6_protocol  = RTPROT_KERNEL,
        .fib6_metric    = ~(u32)0,
        .fib6_ref       = ATOMIC_INIT(1),
        .fib6_type      = RTN_UNREACHABLE,
        .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
};
302
/* Template dst for unreachable destinations: error is -ENETUNREACH and both
 * the input and output paths go to the ip6_pkt_discard handlers.
 */
static const struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -ENETUNREACH,
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
};
314
315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
316
/* Template dst for prohibited destinations (CONFIG_IPV6_MULTIPLE_TABLES
 * only): error is -EACCES and traffic goes to the ip6_pkt_prohibit handlers.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -EACCES,
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
};
328
/* Template dst for blackhole routes (CONFIG_IPV6_MULTIPLE_TABLES only):
 * error is -EINVAL and packets are handed to the generic dst_discard
 * handlers.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -EINVAL,
                .input          = dst_discard,
                .output         = dst_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
};
340
341 #endif
342
343 static void rt6_info_init(struct rt6_info *rt)
344 {
345         struct dst_entry *dst = &rt->dst;
346
347         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
348         INIT_LIST_HEAD(&rt->rt6i_uncached);
349 }
350
351 /* allocate dst with ip6_dst_ops */
352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
353                                int flags)
354 {
355         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356                                         1, DST_OBSOLETE_FORCE_CHK, flags);
357
358         if (rt) {
359                 rt6_info_init(rt);
360                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
361         }
362
363         return rt;
364 }
365 EXPORT_SYMBOL(ip6_dst_alloc);
366
/* dst_ops->destroy: release everything an rt6_info holds — its metrics,
 * its slot on the uncached list, the inet6_dev reference and the reference
 * on the fib6_info it was created from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
        struct rt6_info *rt = (struct rt6_info *)dst;
        struct fib6_info *from;
        struct inet6_dev *idev;

        ip_dst_metrics_put(dst);
        rt6_uncached_list_del(rt);

        /* clear the pointer before dropping the reference */
        idev = rt->rt6i_idev;
        if (idev) {
                rt->rt6i_idev = NULL;
                in6_dev_put(idev);
        }

        /* detach from the originating fib6_info and drop that reference */
        rcu_read_lock();
        from = rcu_dereference(rt->from);
        rcu_assign_pointer(rt->from, NULL);
        fib6_info_release(from);
        rcu_read_unlock();
}
388
389 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
390                            int how)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct inet6_dev *idev = rt->rt6i_idev;
394         struct net_device *loopback_dev =
395                 dev_net(dev)->loopback_dev;
396
397         if (idev && idev->dev != loopback_dev) {
398                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
399                 if (loopback_idev) {
400                         rt->rt6i_idev = loopback_idev;
401                         in6_dev_put(idev);
402                 }
403         }
404 }
405
406 static bool __rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES)
409                 return time_after(jiffies, rt->dst.expires);
410         else
411                 return false;
412 }
413
414 static bool rt6_check_expired(const struct rt6_info *rt)
415 {
416         struct fib6_info *from;
417
418         from = rcu_dereference(rt->from);
419
420         if (rt->rt6i_flags & RTF_EXPIRES) {
421                 if (time_after(jiffies, rt->dst.expires))
422                         return true;
423         } else if (from) {
424                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
425                         fib6_check_expired(from);
426         }
427         return false;
428 }
429
430 struct fib6_info *fib6_multipath_select(const struct net *net,
431                                         struct fib6_info *match,
432                                         struct flowi6 *fl6, int oif,
433                                         const struct sk_buff *skb,
434                                         int strict)
435 {
436         struct fib6_info *sibling, *next_sibling;
437
438         /* We might have already computed the hash for ICMPv6 errors. In such
439          * case it will always be non-zero. Otherwise now is the time to do it.
440          */
441         if (!fl6->mp_hash)
442                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
443
444         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
445                 return match;
446
447         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
448                                  fib6_siblings) {
449                 int nh_upper_bound;
450
451                 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
452                 if (fl6->mp_hash > nh_upper_bound)
453                         continue;
454                 if (rt6_score_route(sibling, oif, strict) < 0)
455                         break;
456                 match = sibling;
457                 break;
458         }
459
460         return match;
461 }
462
463 /*
464  *      Route lookup. rcu_read_lock() should be held.
465  */
466
467 static inline struct fib6_info *rt6_device_match(struct net *net,
468                                                  struct fib6_info *rt,
469                                                     const struct in6_addr *saddr,
470                                                     int oif,
471                                                     int flags)
472 {
473         struct fib6_info *sprt;
474
475         if (!oif && ipv6_addr_any(saddr) &&
476             !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
477                 return rt;
478
479         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
480                 const struct net_device *dev = sprt->fib6_nh.nh_dev;
481
482                 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
483                         continue;
484
485                 if (oif) {
486                         if (dev->ifindex == oif)
487                                 return sprt;
488                 } else {
489                         if (ipv6_chk_addr(net, saddr, dev,
490                                           flags & RT6_LOOKUP_F_IFACE))
491                                 return sprt;
492                 }
493         }
494
495         if (oif && flags & RT6_LOOKUP_F_IFACE)
496                 return net->ipv6.fib6_null_entry;
497
498         return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
499 }
500
501 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router probe request, allocated by rt6_probe() and consumed
 * (and freed) by rt6_probe_deferred().
 */
struct __rt6_probe_work {
        struct work_struct work;
        struct in6_addr target;         /* gateway address to solicit */
        struct net_device *dev;         /* held via dev_hold() until the work runs */
};
507
508 static void rt6_probe_deferred(struct work_struct *w)
509 {
510         struct in6_addr mcaddr;
511         struct __rt6_probe_work *work =
512                 container_of(w, struct __rt6_probe_work, work);
513
514         addrconf_addr_solict_mult(&work->target, &mcaddr);
515         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
516         dev_put(work->dev);
517         kfree(work);
518 }
519
520 static void rt6_probe(struct fib6_info *rt)
521 {
522         struct __rt6_probe_work *work = NULL;
523         const struct in6_addr *nh_gw;
524         struct neighbour *neigh;
525         struct net_device *dev;
526         struct inet6_dev *idev;
527
528         /*
529          * Okay, this does not seem to be appropriate
530          * for now, however, we need to check if it
531          * is really so; aka Router Reachability Probing.
532          *
533          * Router Reachability Probe MUST be rate-limited
534          * to no more than one per minute.
535          */
536         if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
537                 return;
538
539         nh_gw = &rt->fib6_nh.nh_gw;
540         dev = rt->fib6_nh.nh_dev;
541         rcu_read_lock_bh();
542         idev = __in6_dev_get(dev);
543         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
544         if (neigh) {
545                 if (neigh->nud_state & NUD_VALID)
546                         goto out;
547
548                 write_lock(&neigh->lock);
549                 if (!(neigh->nud_state & NUD_VALID) &&
550                     time_after(jiffies,
551                                neigh->updated + idev->cnf.rtr_probe_interval)) {
552                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
553                         if (work)
554                                 __neigh_set_probe_once(neigh);
555                 }
556                 write_unlock(&neigh->lock);
557         } else if (time_after(jiffies, rt->last_probe +
558                                        idev->cnf.rtr_probe_interval)) {
559                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
560         }
561
562         if (work) {
563                 rt->last_probe = jiffies;
564                 INIT_WORK(&work->work, rt6_probe_deferred);
565                 work->target = *nh_gw;
566                 dev_hold(dev);
567                 work->dev = dev;
568                 schedule_work(&work->work);
569         }
570
571 out:
572         rcu_read_unlock_bh();
573 }
574 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
578 #endif
579
580 /*
581  * Default Router Selection (RFC 2461 6.3.6)
582  */
583 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
584 {
585         const struct net_device *dev = rt->fib6_nh.nh_dev;
586
587         if (!oif || dev->ifindex == oif)
588                 return 2;
589         return 0;
590 }
591
592 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
593 {
594         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
595         struct neighbour *neigh;
596
597         if (rt->fib6_flags & RTF_NONEXTHOP ||
598             !(rt->fib6_flags & RTF_GATEWAY))
599                 return RT6_NUD_SUCCEED;
600
601         rcu_read_lock_bh();
602         neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
603                                           &rt->fib6_nh.nh_gw);
604         if (neigh) {
605                 read_lock(&neigh->lock);
606                 if (neigh->nud_state & NUD_VALID)
607                         ret = RT6_NUD_SUCCEED;
608 #ifdef CONFIG_IPV6_ROUTER_PREF
609                 else if (!(neigh->nud_state & NUD_FAILED))
610                         ret = RT6_NUD_SUCCEED;
611                 else
612                         ret = RT6_NUD_FAIL_PROBE;
613 #endif
614                 read_unlock(&neigh->lock);
615         } else {
616                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
617                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
618         }
619         rcu_read_unlock_bh();
620
621         return ret;
622 }
623
624 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
625 {
626         int m;
627
628         m = rt6_check_dev(rt, oif);
629         if (!m && (strict & RT6_LOOKUP_F_IFACE))
630                 return RT6_NUD_FAIL_HARD;
631 #ifdef CONFIG_IPV6_ROUTER_PREF
632         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
633 #endif
634         if (strict & RT6_LOOKUP_F_REACHABLE) {
635                 int n = rt6_check_neigh(rt);
636                 if (n < 0)
637                         return n;
638         }
639         return m;
640 }
641
642 /* called with rc_read_lock held */
643 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
644 {
645         const struct net_device *dev = fib6_info_nh_dev(f6i);
646         bool rc = false;
647
648         if (dev) {
649                 const struct inet6_dev *idev = __in6_dev_get(dev);
650
651                 rc = !!idev->cnf.ignore_routes_with_linkdown;
652         }
653
654         return rc;
655 }
656
657 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
658                                    int *mpri, struct fib6_info *match,
659                                    bool *do_rr)
660 {
661         int m;
662         bool match_do_rr = false;
663
664         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
665                 goto out;
666
667         if (fib6_ignore_linkdown(rt) &&
668             rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
669             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
670                 goto out;
671
672         if (fib6_check_expired(rt))
673                 goto out;
674
675         m = rt6_score_route(rt, oif, strict);
676         if (m == RT6_NUD_FAIL_DO_RR) {
677                 match_do_rr = true;
678                 m = 0; /* lowest valid score */
679         } else if (m == RT6_NUD_FAIL_HARD) {
680                 goto out;
681         }
682
683         if (strict & RT6_LOOKUP_F_REACHABLE)
684                 rt6_probe(rt);
685
686         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
687         if (m > *mpri) {
688                 *do_rr = match_do_rr;
689                 *mpri = m;
690                 match = rt;
691         }
692 out:
693         return match;
694 }
695
696 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
697                                      struct fib6_info *leaf,
698                                      struct fib6_info *rr_head,
699                                      u32 metric, int oif, int strict,
700                                      bool *do_rr)
701 {
702         struct fib6_info *rt, *match, *cont;
703         int mpri = -1;
704
705         match = NULL;
706         cont = NULL;
707         for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
708                 if (rt->fib6_metric != metric) {
709                         cont = rt;
710                         break;
711                 }
712
713                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
714         }
715
716         for (rt = leaf; rt && rt != rr_head;
717              rt = rcu_dereference(rt->fib6_next)) {
718                 if (rt->fib6_metric != metric) {
719                         cont = rt;
720                         break;
721                 }
722
723                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
724         }
725
726         if (match || !cont)
727                 return match;
728
729         for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
730                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
731
732         return match;
733 }
734
735 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
736                                    int oif, int strict)
737 {
738         struct fib6_info *leaf = rcu_dereference(fn->leaf);
739         struct fib6_info *match, *rt0;
740         bool do_rr = false;
741         int key_plen;
742
743         if (!leaf || leaf == net->ipv6.fib6_null_entry)
744                 return net->ipv6.fib6_null_entry;
745
746         rt0 = rcu_dereference(fn->rr_ptr);
747         if (!rt0)
748                 rt0 = leaf;
749
750         /* Double check to make sure fn is not an intermediate node
751          * and fn->leaf does not points to its child's leaf
752          * (This might happen if all routes under fn are deleted from
753          * the tree and fib6_repair_tree() is called on the node.)
754          */
755         key_plen = rt0->fib6_dst.plen;
756 #ifdef CONFIG_IPV6_SUBTREES
757         if (rt0->fib6_src.plen)
758                 key_plen = rt0->fib6_src.plen;
759 #endif
760         if (fn->fn_bit != key_plen)
761                 return net->ipv6.fib6_null_entry;
762
763         match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
764                              &do_rr);
765
766         if (do_rr) {
767                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
768
769                 /* no entries matched; do round-robin */
770                 if (!next || next->fib6_metric != rt0->fib6_metric)
771                         next = leaf;
772
773                 if (next != rt0) {
774                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
775                         /* make sure next is not being deleted from the tree */
776                         if (next->fib6_node)
777                                 rcu_assign_pointer(fn->rr_ptr, next);
778                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
779                 }
780         }
781
782         return match ? match : net->ipv6.fib6_null_entry;
783 }
784
785 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
786 {
787         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
788 }
789
790 #ifdef CONFIG_IPV6_ROUTE_INFO
791 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
792                   const struct in6_addr *gwaddr)
793 {
794         struct net *net = dev_net(dev);
795         struct route_info *rinfo = (struct route_info *) opt;
796         struct in6_addr prefix_buf, *prefix;
797         unsigned int pref;
798         unsigned long lifetime;
799         struct fib6_info *rt;
800
801         if (len < sizeof(struct route_info)) {
802                 return -EINVAL;
803         }
804
805         /* Sanity check for prefix_len and length */
806         if (rinfo->length > 3) {
807                 return -EINVAL;
808         } else if (rinfo->prefix_len > 128) {
809                 return -EINVAL;
810         } else if (rinfo->prefix_len > 64) {
811                 if (rinfo->length < 2) {
812                         return -EINVAL;
813                 }
814         } else if (rinfo->prefix_len > 0) {
815                 if (rinfo->length < 1) {
816                         return -EINVAL;
817                 }
818         }
819
820         pref = rinfo->route_pref;
821         if (pref == ICMPV6_ROUTER_PREF_INVALID)
822                 return -EINVAL;
823
824         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
825
826         if (rinfo->length == 3)
827                 prefix = (struct in6_addr *)rinfo->prefix;
828         else {
829                 /* this function is safe */
830                 ipv6_addr_prefix(&prefix_buf,
831                                  (struct in6_addr *)rinfo->prefix,
832                                  rinfo->prefix_len);
833                 prefix = &prefix_buf;
834         }
835
836         if (rinfo->prefix_len == 0)
837                 rt = rt6_get_dflt_router(net, gwaddr, dev);
838         else
839                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
840                                         gwaddr, dev);
841
842         if (rt && !lifetime) {
843                 ip6_del_rt(net, rt);
844                 rt = NULL;
845         }
846
847         if (!rt && lifetime)
848                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
849                                         dev, pref);
850         else if (rt)
851                 rt->fib6_flags = RTF_ROUTEINFO |
852                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
853
854         if (rt) {
855                 if (!addrconf_finite_timeout(lifetime))
856                         fib6_clean_expires(rt);
857                 else
858                         fib6_set_expires(rt, jiffies + HZ * lifetime);
859
860                 fib6_info_release(rt);
861         }
862         return 0;
863 }
864 #endif
865
866 /*
867  *      Misc support functions
868  */
869
870 /* called with rcu_lock held */
871 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
872 {
873         struct net_device *dev = rt->fib6_nh.nh_dev;
874
875         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
876                 /* for copies of local routes, dst->dev needs to be the
877                  * device if it is a master device, the master device if
878                  * device is enslaved, and the loopback as the default
879                  */
880                 if (netif_is_l3_slave(dev) &&
881                     !rt6_need_strict(&rt->fib6_dst.addr))
882                         dev = l3mdev_master_dev_rcu(dev);
883                 else if (!netif_is_l3_master(dev))
884                         dev = dev_net(dev)->loopback_dev;
885                 /* last case is netif_is_l3_master(dev) is true in which
886                  * case we want dev returned to be dev
887                  */
888         }
889
890         return dev;
891 }
892
893 static const int fib6_prop[RTN_MAX + 1] = {
894         [RTN_UNSPEC]    = 0,
895         [RTN_UNICAST]   = 0,
896         [RTN_LOCAL]     = 0,
897         [RTN_BROADCAST] = 0,
898         [RTN_ANYCAST]   = 0,
899         [RTN_MULTICAST] = 0,
900         [RTN_BLACKHOLE] = -EINVAL,
901         [RTN_UNREACHABLE] = -EHOSTUNREACH,
902         [RTN_PROHIBIT]  = -EACCES,
903         [RTN_THROW]     = -EAGAIN,
904         [RTN_NAT]       = -EINVAL,
905         [RTN_XRESOLVE]  = -EINVAL,
906 };
907
908 static int ip6_rt_type_to_error(u8 fib6_type)
909 {
910         return fib6_prop[fib6_type];
911 }
912
913 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
914 {
915         unsigned short flags = 0;
916
917         if (rt->dst_nocount)
918                 flags |= DST_NOCOUNT;
919         if (rt->dst_nopolicy)
920                 flags |= DST_NOPOLICY;
921         if (rt->dst_host)
922                 flags |= DST_HOST;
923
924         return flags;
925 }
926
927 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
928 {
929         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
930
931         switch (ort->fib6_type) {
932         case RTN_BLACKHOLE:
933                 rt->dst.output = dst_discard_out;
934                 rt->dst.input = dst_discard;
935                 break;
936         case RTN_PROHIBIT:
937                 rt->dst.output = ip6_pkt_prohibit_out;
938                 rt->dst.input = ip6_pkt_prohibit;
939                 break;
940         case RTN_THROW:
941         case RTN_UNREACHABLE:
942         default:
943                 rt->dst.output = ip6_pkt_discard_out;
944                 rt->dst.input = ip6_pkt_discard;
945                 break;
946         }
947 }
948
949 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
950 {
951         if (ort->fib6_flags & RTF_REJECT) {
952                 ip6_rt_init_dst_reject(rt, ort);
953                 return;
954         }
955
956         rt->dst.error = 0;
957         rt->dst.output = ip6_output;
958
959         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
960                 rt->dst.input = ip6_input;
961         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962                 rt->dst.input = ip6_mc_input;
963         } else {
964                 rt->dst.input = ip6_forward;
965         }
966
967         if (ort->fib6_nh.nh_lwtstate) {
968                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
969                 lwtunnel_set_redirect(&rt->dst);
970         }
971
972         rt->dst.lastuse = jiffies;
973 }
974
975 /* Caller must already hold reference to @from */
976 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
977 {
978         rt->rt6i_flags &= ~RTF_EXPIRES;
979         rcu_assign_pointer(rt->from, from);
980         ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
981 }
982
983 /* Caller must already hold reference to @ort */
984 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
985 {
986         struct net_device *dev = fib6_info_nh_dev(ort);
987
988         ip6_rt_init_dst(rt, ort);
989
990         rt->rt6i_dst = ort->fib6_dst;
991         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
992         rt->rt6i_gateway = ort->fib6_nh.nh_gw;
993         rt->rt6i_flags = ort->fib6_flags;
994         rt6_set_from(rt, ort);
995 #ifdef CONFIG_IPV6_SUBTREES
996         rt->rt6i_src = ort->fib6_src;
997 #endif
998 }
999
1000 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1001                                         struct in6_addr *saddr)
1002 {
1003         struct fib6_node *pn, *sn;
1004         while (1) {
1005                 if (fn->fn_flags & RTN_TL_ROOT)
1006                         return NULL;
1007                 pn = rcu_dereference(fn->parent);
1008                 sn = FIB6_SUBTREE(pn);
1009                 if (sn && sn != fn)
1010                         fn = fib6_node_lookup(sn, NULL, saddr);
1011                 else
1012                         fn = pn;
1013                 if (fn->fn_flags & RTN_RTINFO)
1014                         return fn;
1015         }
1016 }
1017
1018 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1019                           bool null_fallback)
1020 {
1021         struct rt6_info *rt = *prt;
1022
1023         if (dst_hold_safe(&rt->dst))
1024                 return true;
1025         if (null_fallback) {
1026                 rt = net->ipv6.ip6_null_entry;
1027                 dst_hold(&rt->dst);
1028         } else {
1029                 rt = NULL;
1030         }
1031         *prt = rt;
1032         return false;
1033 }
1034
1035 /* called with rcu_lock held */
1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1037 {
1038         unsigned short flags = fib6_info_dst_flags(rt);
1039         struct net_device *dev = rt->fib6_nh.nh_dev;
1040         struct rt6_info *nrt;
1041
1042         if (!fib6_info_hold_safe(rt))
1043                 return NULL;
1044
1045         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1046         if (nrt)
1047                 ip6_rt_copy_init(nrt, rt);
1048         else
1049                 fib6_info_release(rt);
1050
1051         return nrt;
1052 }
1053
1054 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1055                                              struct fib6_table *table,
1056                                              struct flowi6 *fl6,
1057                                              const struct sk_buff *skb,
1058                                              int flags)
1059 {
1060         struct fib6_info *f6i;
1061         struct fib6_node *fn;
1062         struct rt6_info *rt;
1063
1064         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1065                 flags &= ~RT6_LOOKUP_F_IFACE;
1066
1067         rcu_read_lock();
1068         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1069 restart:
1070         f6i = rcu_dereference(fn->leaf);
1071         if (!f6i) {
1072                 f6i = net->ipv6.fib6_null_entry;
1073         } else {
1074                 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1075                                       fl6->flowi6_oif, flags);
1076                 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1077                         f6i = fib6_multipath_select(net, f6i, fl6,
1078                                                     fl6->flowi6_oif, skb,
1079                                                     flags);
1080         }
1081         if (f6i == net->ipv6.fib6_null_entry) {
1082                 fn = fib6_backtrack(fn, &fl6->saddr);
1083                 if (fn)
1084                         goto restart;
1085         }
1086
1087         trace_fib6_table_lookup(net, f6i, table, fl6);
1088
1089         /* Search through exception table */
1090         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1091         if (rt) {
1092                 if (ip6_hold_safe(net, &rt, true))
1093                         dst_use_noref(&rt->dst, jiffies);
1094         } else if (f6i == net->ipv6.fib6_null_entry) {
1095                 rt = net->ipv6.ip6_null_entry;
1096                 dst_hold(&rt->dst);
1097         } else {
1098                 rt = ip6_create_rt_rcu(f6i);
1099                 if (!rt) {
1100                         rt = net->ipv6.ip6_null_entry;
1101                         dst_hold(&rt->dst);
1102                 }
1103         }
1104
1105         rcu_read_unlock();
1106
1107         return rt;
1108 }
1109
1110 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1111                                    const struct sk_buff *skb, int flags)
1112 {
1113         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1114 }
1115 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1116
1117 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1118                             const struct in6_addr *saddr, int oif,
1119                             const struct sk_buff *skb, int strict)
1120 {
1121         struct flowi6 fl6 = {
1122                 .flowi6_oif = oif,
1123                 .daddr = *daddr,
1124         };
1125         struct dst_entry *dst;
1126         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1127
1128         if (saddr) {
1129                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1130                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1131         }
1132
1133         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1134         if (dst->error == 0)
1135                 return (struct rt6_info *) dst;
1136
1137         dst_release(dst);
1138
1139         return NULL;
1140 }
1141 EXPORT_SYMBOL(rt6_lookup);
1142
1143 /* ip6_ins_rt is called with FREE table->tb6_lock.
1144  * It takes new route entry, the addition fails by any reason the
1145  * route is released.
1146  * Caller must hold dst before calling it.
1147  */
1148
1149 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1150                         struct netlink_ext_ack *extack)
1151 {
1152         int err;
1153         struct fib6_table *table;
1154
1155         table = rt->fib6_table;
1156         spin_lock_bh(&table->tb6_lock);
1157         err = fib6_add(&table->tb6_root, rt, info, extack);
1158         spin_unlock_bh(&table->tb6_lock);
1159
1160         return err;
1161 }
1162
1163 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1164 {
1165         struct nl_info info = { .nl_net = net, };
1166
1167         return __ip6_ins_rt(rt, &info, NULL);
1168 }
1169
1170 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1171                                            const struct in6_addr *daddr,
1172                                            const struct in6_addr *saddr)
1173 {
1174         struct net_device *dev;
1175         struct rt6_info *rt;
1176
1177         /*
1178          *      Clone the route.
1179          */
1180
1181         if (!fib6_info_hold_safe(ort))
1182                 return NULL;
1183
1184         dev = ip6_rt_get_dev_rcu(ort);
1185         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1186         if (!rt) {
1187                 fib6_info_release(ort);
1188                 return NULL;
1189         }
1190
1191         ip6_rt_copy_init(rt, ort);
1192         rt->rt6i_flags |= RTF_CACHE;
1193         rt->dst.flags |= DST_HOST;
1194         rt->rt6i_dst.addr = *daddr;
1195         rt->rt6i_dst.plen = 128;
1196
1197         if (!rt6_is_gw_or_nonexthop(ort)) {
1198                 if (ort->fib6_dst.plen != 128 &&
1199                     ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1200                         rt->rt6i_flags |= RTF_ANYCAST;
1201 #ifdef CONFIG_IPV6_SUBTREES
1202                 if (rt->rt6i_src.plen && saddr) {
1203                         rt->rt6i_src.addr = *saddr;
1204                         rt->rt6i_src.plen = 128;
1205                 }
1206 #endif
1207         }
1208
1209         return rt;
1210 }
1211
1212 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1213 {
1214         unsigned short flags = fib6_info_dst_flags(rt);
1215         struct net_device *dev;
1216         struct rt6_info *pcpu_rt;
1217
1218         if (!fib6_info_hold_safe(rt))
1219                 return NULL;
1220
1221         rcu_read_lock();
1222         dev = ip6_rt_get_dev_rcu(rt);
1223         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1224         rcu_read_unlock();
1225         if (!pcpu_rt) {
1226                 fib6_info_release(rt);
1227                 return NULL;
1228         }
1229         ip6_rt_copy_init(pcpu_rt, rt);
1230         pcpu_rt->rt6i_flags |= RTF_PCPU;
1231         return pcpu_rt;
1232 }
1233
1234 /* It should be called with rcu_read_lock() acquired */
1235 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1236 {
1237         struct rt6_info *pcpu_rt, **p;
1238
1239         p = this_cpu_ptr(rt->rt6i_pcpu);
1240         pcpu_rt = *p;
1241
1242         if (pcpu_rt)
1243                 ip6_hold_safe(NULL, &pcpu_rt, false);
1244
1245         return pcpu_rt;
1246 }
1247
1248 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1249                                             struct fib6_info *rt)
1250 {
1251         struct rt6_info *pcpu_rt, *prev, **p;
1252
1253         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1254         if (!pcpu_rt) {
1255                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1256                 return net->ipv6.ip6_null_entry;
1257         }
1258
1259         dst_hold(&pcpu_rt->dst);
1260         p = this_cpu_ptr(rt->rt6i_pcpu);
1261         prev = cmpxchg(p, NULL, pcpu_rt);
1262         BUG_ON(prev);
1263
1264         return pcpu_rt;
1265 }
1266
1267 /* exception hash table implementation
1268  */
1269 static DEFINE_SPINLOCK(rt6_exception_lock);
1270
1271 /* Remove rt6_ex from hash table and free the memory
1272  * Caller must hold rt6_exception_lock
1273  */
1274 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1275                                  struct rt6_exception *rt6_ex)
1276 {
1277         struct net *net;
1278
1279         if (!bucket || !rt6_ex)
1280                 return;
1281
1282         net = dev_net(rt6_ex->rt6i->dst.dev);
1283         hlist_del_rcu(&rt6_ex->hlist);
1284         dst_release(&rt6_ex->rt6i->dst);
1285         kfree_rcu(rt6_ex, rcu);
1286         WARN_ON_ONCE(!bucket->depth);
1287         bucket->depth--;
1288         net->ipv6.rt6_stats->fib_rt_cache--;
1289 }
1290
1291 /* Remove oldest rt6_ex in bucket and free the memory
1292  * Caller must hold rt6_exception_lock
1293  */
1294 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1295 {
1296         struct rt6_exception *rt6_ex, *oldest = NULL;
1297
1298         if (!bucket)
1299                 return;
1300
1301         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1302                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1303                         oldest = rt6_ex;
1304         }
1305         rt6_remove_exception(bucket, oldest);
1306 }
1307
1308 static u32 rt6_exception_hash(const struct in6_addr *dst,
1309                               const struct in6_addr *src)
1310 {
1311         static u32 seed __read_mostly;
1312         u32 val;
1313
1314         net_get_random_once(&seed, sizeof(seed));
1315         val = jhash(dst, sizeof(*dst), seed);
1316
1317 #ifdef CONFIG_IPV6_SUBTREES
1318         if (src)
1319                 val = jhash(src, sizeof(*src), val);
1320 #endif
1321         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1322 }
1323
1324 /* Helper function to find the cached rt in the hash table
1325  * and update bucket pointer to point to the bucket for this
1326  * (daddr, saddr) pair
1327  * Caller must hold rt6_exception_lock
1328  */
1329 static struct rt6_exception *
1330 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1331                               const struct in6_addr *daddr,
1332                               const struct in6_addr *saddr)
1333 {
1334         struct rt6_exception *rt6_ex;
1335         u32 hval;
1336
1337         if (!(*bucket) || !daddr)
1338                 return NULL;
1339
1340         hval = rt6_exception_hash(daddr, saddr);
1341         *bucket += hval;
1342
1343         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1344                 struct rt6_info *rt6 = rt6_ex->rt6i;
1345                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1346
1347 #ifdef CONFIG_IPV6_SUBTREES
1348                 if (matched && saddr)
1349                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1350 #endif
1351                 if (matched)
1352                         return rt6_ex;
1353         }
1354         return NULL;
1355 }
1356
1357 /* Helper function to find the cached rt in the hash table
1358  * and update bucket pointer to point to the bucket for this
1359  * (daddr, saddr) pair
1360  * Caller must hold rcu_read_lock()
1361  */
1362 static struct rt6_exception *
1363 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1364                          const struct in6_addr *daddr,
1365                          const struct in6_addr *saddr)
1366 {
1367         struct rt6_exception *rt6_ex;
1368         u32 hval;
1369
1370         WARN_ON_ONCE(!rcu_read_lock_held());
1371
1372         if (!(*bucket) || !daddr)
1373                 return NULL;
1374
1375         hval = rt6_exception_hash(daddr, saddr);
1376         *bucket += hval;
1377
1378         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1379                 struct rt6_info *rt6 = rt6_ex->rt6i;
1380                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1381
1382 #ifdef CONFIG_IPV6_SUBTREES
1383                 if (matched && saddr)
1384                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1385 #endif
1386                 if (matched)
1387                         return rt6_ex;
1388         }
1389         return NULL;
1390 }
1391
1392 static unsigned int fib6_mtu(const struct fib6_info *rt)
1393 {
1394         unsigned int mtu;
1395
1396         if (rt->fib6_pmtu) {
1397                 mtu = rt->fib6_pmtu;
1398         } else {
1399                 struct net_device *dev = fib6_info_nh_dev(rt);
1400                 struct inet6_dev *idev;
1401
1402                 rcu_read_lock();
1403                 idev = __in6_dev_get(dev);
1404                 mtu = idev->cnf.mtu6;
1405                 rcu_read_unlock();
1406         }
1407
1408         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1409
1410         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1411 }
1412
1413 static int rt6_insert_exception(struct rt6_info *nrt,
1414                                 struct fib6_info *ort)
1415 {
1416         struct net *net = dev_net(nrt->dst.dev);
1417         struct rt6_exception_bucket *bucket;
1418         struct in6_addr *src_key = NULL;
1419         struct rt6_exception *rt6_ex;
1420         int err = 0;
1421
1422         spin_lock_bh(&rt6_exception_lock);
1423
1424         if (ort->exception_bucket_flushed) {
1425                 err = -EINVAL;
1426                 goto out;
1427         }
1428
1429         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1430                                         lockdep_is_held(&rt6_exception_lock));
1431         if (!bucket) {
1432                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1433                                  GFP_ATOMIC);
1434                 if (!bucket) {
1435                         err = -ENOMEM;
1436                         goto out;
1437                 }
1438                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1439         }
1440
1441 #ifdef CONFIG_IPV6_SUBTREES
1442         /* rt6i_src.plen != 0 indicates ort is in subtree
1443          * and exception table is indexed by a hash of
1444          * both rt6i_dst and rt6i_src.
1445          * Otherwise, the exception table is indexed by
1446          * a hash of only rt6i_dst.
1447          */
1448         if (ort->fib6_src.plen)
1449                 src_key = &nrt->rt6i_src.addr;
1450 #endif
1451         /* rt6_mtu_change() might lower mtu on ort.
1452          * Only insert this exception route if its mtu
1453          * is less than ort's mtu value.
1454          */
1455         if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1456                 err = -EINVAL;
1457                 goto out;
1458         }
1459
1460         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1461                                                src_key);
1462         if (rt6_ex)
1463                 rt6_remove_exception(bucket, rt6_ex);
1464
1465         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1466         if (!rt6_ex) {
1467                 err = -ENOMEM;
1468                 goto out;
1469         }
1470         rt6_ex->rt6i = nrt;
1471         rt6_ex->stamp = jiffies;
1472         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1473         bucket->depth++;
1474         net->ipv6.rt6_stats->fib_rt_cache++;
1475
1476         if (bucket->depth > FIB6_MAX_DEPTH)
1477                 rt6_exception_remove_oldest(bucket);
1478
1479 out:
1480         spin_unlock_bh(&rt6_exception_lock);
1481
1482         /* Update fn->fn_sernum to invalidate all cached dst */
1483         if (!err) {
1484                 spin_lock_bh(&ort->fib6_table->tb6_lock);
1485                 fib6_update_sernum(net, ort);
1486                 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1487                 fib6_force_start_gc(net);
1488         }
1489
1490         return err;
1491 }
1492
1493 void rt6_flush_exceptions(struct fib6_info *rt)
1494 {
1495         struct rt6_exception_bucket *bucket;
1496         struct rt6_exception *rt6_ex;
1497         struct hlist_node *tmp;
1498         int i;
1499
1500         spin_lock_bh(&rt6_exception_lock);
1501         /* Prevent rt6_insert_exception() to recreate the bucket list */
1502         rt->exception_bucket_flushed = 1;
1503
1504         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1505                                     lockdep_is_held(&rt6_exception_lock));
1506         if (!bucket)
1507                 goto out;
1508
1509         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1510                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1511                         rt6_remove_exception(bucket, rt6_ex);
1512                 WARN_ON_ONCE(bucket->depth);
1513                 bucket++;
1514         }
1515
1516 out:
1517         spin_unlock_bh(&rt6_exception_lock);
1518 }
1519
1520 /* Find cached rt in the hash table inside passed in rt
1521  * Caller has to hold rcu_read_lock()
1522  */
1523 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1524                                            struct in6_addr *daddr,
1525                                            struct in6_addr *saddr)
1526 {
1527         struct rt6_exception_bucket *bucket;
1528         struct in6_addr *src_key = NULL;
1529         struct rt6_exception *rt6_ex;
1530         struct rt6_info *res = NULL;
1531
1532         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1533
1534 #ifdef CONFIG_IPV6_SUBTREES
1535         /* rt6i_src.plen != 0 indicates rt is in subtree
1536          * and exception table is indexed by a hash of
1537          * both rt6i_dst and rt6i_src.
1538          * Otherwise, the exception table is indexed by
1539          * a hash of only rt6i_dst.
1540          */
1541         if (rt->fib6_src.plen)
1542                 src_key = saddr;
1543 #endif
1544         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1545
1546         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1547                 res = rt6_ex->rt6i;
1548
1549         return res;
1550 }
1551
1552 /* Remove the passed in cached rt from the hash table that contains it */
1553 static int rt6_remove_exception_rt(struct rt6_info *rt)
1554 {
1555         struct rt6_exception_bucket *bucket;
1556         struct in6_addr *src_key = NULL;
1557         struct rt6_exception *rt6_ex;
1558         struct fib6_info *from;
1559         int err;
1560
1561         from = rcu_dereference(rt->from);
1562         if (!from ||
1563             !(rt->rt6i_flags & RTF_CACHE))
1564                 return -EINVAL;
1565
1566         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1567                 return -ENOENT;
1568
1569         spin_lock_bh(&rt6_exception_lock);
1570         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1571                                     lockdep_is_held(&rt6_exception_lock));
1572 #ifdef CONFIG_IPV6_SUBTREES
1573         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1574          * and exception table is indexed by a hash of
1575          * both rt6i_dst and rt6i_src.
1576          * Otherwise, the exception table is indexed by
1577          * a hash of only rt6i_dst.
1578          */
1579         if (from->fib6_src.plen)
1580                 src_key = &rt->rt6i_src.addr;
1581 #endif
1582         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1583                                                &rt->rt6i_dst.addr,
1584                                                src_key);
1585         if (rt6_ex) {
1586                 rt6_remove_exception(bucket, rt6_ex);
1587                 err = 0;
1588         } else {
1589                 err = -ENOENT;
1590         }
1591
1592         spin_unlock_bh(&rt6_exception_lock);
1593         return err;
1594 }
1595
1596 /* Find rt6_ex which contains the passed in rt cache and
1597  * refresh its stamp
1598  */
1599 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1600 {
1601         struct rt6_exception_bucket *bucket;
1602         struct fib6_info *from = rt->from;
1603         struct in6_addr *src_key = NULL;
1604         struct rt6_exception *rt6_ex;
1605
1606         if (!from ||
1607             !(rt->rt6i_flags & RTF_CACHE))
1608                 return;
1609
1610         rcu_read_lock();
1611         bucket = rcu_dereference(from->rt6i_exception_bucket);
1612
1613 #ifdef CONFIG_IPV6_SUBTREES
1614         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1615          * and exception table is indexed by a hash of
1616          * both rt6i_dst and rt6i_src.
1617          * Otherwise, the exception table is indexed by
1618          * a hash of only rt6i_dst.
1619          */
1620         if (from->fib6_src.plen)
1621                 src_key = &rt->rt6i_src.addr;
1622 #endif
1623         rt6_ex = __rt6_find_exception_rcu(&bucket,
1624                                           &rt->rt6i_dst.addr,
1625                                           src_key);
1626         if (rt6_ex)
1627                 rt6_ex->stamp = jiffies;
1628
1629         rcu_read_unlock();
1630 }
1631
1632 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1633                                          struct rt6_info *rt, int mtu)
1634 {
1635         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1636          * lowest MTU in the path: always allow updating the route PMTU to
1637          * reflect PMTU decreases.
1638          *
1639          * If the new MTU is higher, and the route PMTU is equal to the local
1640          * MTU, this means the old MTU is the lowest in the path, so allow
1641          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1642          * handle this.
1643          */
1644
1645         if (dst_mtu(&rt->dst) >= mtu)
1646                 return true;
1647
1648         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1649                 return true;
1650
1651         return false;
1652 }
1653
1654 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1655                                        struct fib6_info *rt, int mtu)
1656 {
1657         struct rt6_exception_bucket *bucket;
1658         struct rt6_exception *rt6_ex;
1659         int i;
1660
1661         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1662                                         lockdep_is_held(&rt6_exception_lock));
1663
1664         if (!bucket)
1665                 return;
1666
1667         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1668                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1669                         struct rt6_info *entry = rt6_ex->rt6i;
1670
1671                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1672                          * route), the metrics of its rt->from have already
1673                          * been updated.
1674                          */
1675                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1676                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1677                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1678                 }
1679                 bucket++;
1680         }
1681 }
1682
1683 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1684
1685 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1686                                         struct in6_addr *gateway)
1687 {
1688         struct rt6_exception_bucket *bucket;
1689         struct rt6_exception *rt6_ex;
1690         struct hlist_node *tmp;
1691         int i;
1692
1693         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1694                 return;
1695
1696         spin_lock_bh(&rt6_exception_lock);
1697         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1698                                      lockdep_is_held(&rt6_exception_lock));
1699
1700         if (bucket) {
1701                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1702                         hlist_for_each_entry_safe(rt6_ex, tmp,
1703                                                   &bucket->chain, hlist) {
1704                                 struct rt6_info *entry = rt6_ex->rt6i;
1705
1706                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1707                                     RTF_CACHE_GATEWAY &&
1708                                     ipv6_addr_equal(gateway,
1709                                                     &entry->rt6i_gateway)) {
1710                                         rt6_remove_exception(bucket, rt6_ex);
1711                                 }
1712                         }
1713                         bucket++;
1714                 }
1715         }
1716
1717         spin_unlock_bh(&rt6_exception_lock);
1718 }
1719
1720 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1721                                       struct rt6_exception *rt6_ex,
1722                                       struct fib6_gc_args *gc_args,
1723                                       unsigned long now)
1724 {
1725         struct rt6_info *rt = rt6_ex->rt6i;
1726
1727         /* we are pruning and obsoleting aged-out and non gateway exceptions
1728          * even if others have still references to them, so that on next
1729          * dst_check() such references can be dropped.
1730          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1731          * expired, independently from their aging, as per RFC 8201 section 4
1732          */
1733         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1734                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1735                         RT6_TRACE("aging clone %p\n", rt);
1736                         rt6_remove_exception(bucket, rt6_ex);
1737                         return;
1738                 }
1739         } else if (time_after(jiffies, rt->dst.expires)) {
1740                 RT6_TRACE("purging expired route %p\n", rt);
1741                 rt6_remove_exception(bucket, rt6_ex);
1742                 return;
1743         }
1744
1745         if (rt->rt6i_flags & RTF_GATEWAY) {
1746                 struct neighbour *neigh;
1747                 __u8 neigh_flags = 0;
1748
1749                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1750                 if (neigh)
1751                         neigh_flags = neigh->flags;
1752
1753                 if (!(neigh_flags & NTF_ROUTER)) {
1754                         RT6_TRACE("purging route %p via non-router but gateway\n",
1755                                   rt);
1756                         rt6_remove_exception(bucket, rt6_ex);
1757                         return;
1758                 }
1759         }
1760
1761         gc_args->more++;
1762 }
1763
1764 void rt6_age_exceptions(struct fib6_info *rt,
1765                         struct fib6_gc_args *gc_args,
1766                         unsigned long now)
1767 {
1768         struct rt6_exception_bucket *bucket;
1769         struct rt6_exception *rt6_ex;
1770         struct hlist_node *tmp;
1771         int i;
1772
1773         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1774                 return;
1775
1776         rcu_read_lock_bh();
1777         spin_lock(&rt6_exception_lock);
1778         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1779                                     lockdep_is_held(&rt6_exception_lock));
1780
1781         if (bucket) {
1782                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1783                         hlist_for_each_entry_safe(rt6_ex, tmp,
1784                                                   &bucket->chain, hlist) {
1785                                 rt6_age_examine_exception(bucket, rt6_ex,
1786                                                           gc_args, now);
1787                         }
1788                         bucket++;
1789                 }
1790         }
1791         spin_unlock(&rt6_exception_lock);
1792         rcu_read_unlock_bh();
1793 }
1794
1795 /* must be called with rcu lock held */
1796 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1797                                     int oif, struct flowi6 *fl6, int strict)
1798 {
1799         struct fib6_node *fn, *saved_fn;
1800         struct fib6_info *f6i;
1801
1802         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1803         saved_fn = fn;
1804
1805         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1806                 oif = 0;
1807
1808 redo_rt6_select:
1809         f6i = rt6_select(net, fn, oif, strict);
1810         if (f6i == net->ipv6.fib6_null_entry) {
1811                 fn = fib6_backtrack(fn, &fl6->saddr);
1812                 if (fn)
1813                         goto redo_rt6_select;
1814                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1815                         /* also consider unreachable route */
1816                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1817                         fn = saved_fn;
1818                         goto redo_rt6_select;
1819                 }
1820         }
1821
1822         trace_fib6_table_lookup(net, f6i, table, fl6);
1823
1824         return f6i;
1825 }
1826
1827 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1828                                int oif, struct flowi6 *fl6,
1829                                const struct sk_buff *skb, int flags)
1830 {
1831         struct fib6_info *f6i;
1832         struct rt6_info *rt;
1833         int strict = 0;
1834
1835         strict |= flags & RT6_LOOKUP_F_IFACE;
1836         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1837         if (net->ipv6.devconf_all->forwarding == 0)
1838                 strict |= RT6_LOOKUP_F_REACHABLE;
1839
1840         rcu_read_lock();
1841
1842         f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1843         if (f6i->fib6_nsiblings)
1844                 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1845
1846         if (f6i == net->ipv6.fib6_null_entry) {
1847                 rt = net->ipv6.ip6_null_entry;
1848                 rcu_read_unlock();
1849                 dst_hold(&rt->dst);
1850                 return rt;
1851         }
1852
1853         /*Search through exception table */
1854         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1855         if (rt) {
1856                 if (ip6_hold_safe(net, &rt, true))
1857                         dst_use_noref(&rt->dst, jiffies);
1858
1859                 rcu_read_unlock();
1860                 return rt;
1861         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1862                             !(f6i->fib6_flags & RTF_GATEWAY))) {
1863                 /* Create a RTF_CACHE clone which will not be
1864                  * owned by the fib6 tree.  It is for the special case where
1865                  * the daddr in the skb during the neighbor look-up is different
1866                  * from the fl6->daddr used to look-up route here.
1867                  */
1868                 struct rt6_info *uncached_rt;
1869
1870                 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1871
1872                 rcu_read_unlock();
1873
1874                 if (uncached_rt) {
1875                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1876                          * No need for another dst_hold()
1877                          */
1878                         rt6_uncached_list_add(uncached_rt);
1879                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1880                 } else {
1881                         uncached_rt = net->ipv6.ip6_null_entry;
1882                         dst_hold(&uncached_rt->dst);
1883                 }
1884
1885                 return uncached_rt;
1886         } else {
1887                 /* Get a percpu copy */
1888
1889                 struct rt6_info *pcpu_rt;
1890
1891                 local_bh_disable();
1892                 pcpu_rt = rt6_get_pcpu_route(f6i);
1893
1894                 if (!pcpu_rt)
1895                         pcpu_rt = rt6_make_pcpu_route(net, f6i);
1896
1897                 local_bh_enable();
1898                 rcu_read_unlock();
1899
1900                 return pcpu_rt;
1901         }
1902 }
1903 EXPORT_SYMBOL_GPL(ip6_pol_route);
1904
1905 static struct rt6_info *ip6_pol_route_input(struct net *net,
1906                                             struct fib6_table *table,
1907                                             struct flowi6 *fl6,
1908                                             const struct sk_buff *skb,
1909                                             int flags)
1910 {
1911         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1912 }
1913
1914 struct dst_entry *ip6_route_input_lookup(struct net *net,
1915                                          struct net_device *dev,
1916                                          struct flowi6 *fl6,
1917                                          const struct sk_buff *skb,
1918                                          int flags)
1919 {
1920         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1921                 flags |= RT6_LOOKUP_F_IFACE;
1922
1923         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1924 }
1925 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1926
1927 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1928                                   struct flow_keys *keys,
1929                                   struct flow_keys *flkeys)
1930 {
1931         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1932         const struct ipv6hdr *key_iph = outer_iph;
1933         struct flow_keys *_flkeys = flkeys;
1934         const struct ipv6hdr *inner_iph;
1935         const struct icmp6hdr *icmph;
1936         struct ipv6hdr _inner_iph;
1937         struct icmp6hdr _icmph;
1938
1939         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1940                 goto out;
1941
1942         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1943                                    sizeof(_icmph), &_icmph);
1944         if (!icmph)
1945                 goto out;
1946
1947         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1948             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1949             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1950             icmph->icmp6_type != ICMPV6_PARAMPROB)
1951                 goto out;
1952
1953         inner_iph = skb_header_pointer(skb,
1954                                        skb_transport_offset(skb) + sizeof(*icmph),
1955                                        sizeof(_inner_iph), &_inner_iph);
1956         if (!inner_iph)
1957                 goto out;
1958
1959         key_iph = inner_iph;
1960         _flkeys = NULL;
1961 out:
1962         if (_flkeys) {
1963                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1964                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1965                 keys->tags.flow_label = _flkeys->tags.flow_label;
1966                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1967         } else {
1968                 keys->addrs.v6addrs.src = key_iph->saddr;
1969                 keys->addrs.v6addrs.dst = key_iph->daddr;
1970                 keys->tags.flow_label = ip6_flowlabel(key_iph);
1971                 keys->basic.ip_proto = key_iph->nexthdr;
1972         }
1973 }
1974
1975 /* if skb is set it will be used and fl6 can be NULL */
1976 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1977                        const struct sk_buff *skb, struct flow_keys *flkeys)
1978 {
1979         struct flow_keys hash_keys;
1980         u32 mhash;
1981
1982         switch (ip6_multipath_hash_policy(net)) {
1983         case 0:
1984                 memset(&hash_keys, 0, sizeof(hash_keys));
1985                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1986                 if (skb) {
1987                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1988                 } else {
1989                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1990                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1991                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1992                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1993                 }
1994                 break;
1995         case 1:
1996                 if (skb) {
1997                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1998                         struct flow_keys keys;
1999
2000                         /* short-circuit if we already have L4 hash present */
2001                         if (skb->l4_hash)
2002                                 return skb_get_hash_raw(skb) >> 1;
2003
2004                         memset(&hash_keys, 0, sizeof(hash_keys));
2005
2006                         if (!flkeys) {
2007                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2008                                 flkeys = &keys;
2009                         }
2010                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2011                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2012                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2013                         hash_keys.ports.src = flkeys->ports.src;
2014                         hash_keys.ports.dst = flkeys->ports.dst;
2015                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2016                 } else {
2017                         memset(&hash_keys, 0, sizeof(hash_keys));
2018                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2019                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2020                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2021                         hash_keys.ports.src = fl6->fl6_sport;
2022                         hash_keys.ports.dst = fl6->fl6_dport;
2023                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2024                 }
2025                 break;
2026         }
2027         mhash = flow_hash_from_keys(&hash_keys);
2028
2029         return mhash >> 1;
2030 }
2031
2032 void ip6_route_input(struct sk_buff *skb)
2033 {
2034         const struct ipv6hdr *iph = ipv6_hdr(skb);
2035         struct net *net = dev_net(skb->dev);
2036         int flags = RT6_LOOKUP_F_HAS_SADDR;
2037         struct ip_tunnel_info *tun_info;
2038         struct flowi6 fl6 = {
2039                 .flowi6_iif = skb->dev->ifindex,
2040                 .daddr = iph->daddr,
2041                 .saddr = iph->saddr,
2042                 .flowlabel = ip6_flowinfo(iph),
2043                 .flowi6_mark = skb->mark,
2044                 .flowi6_proto = iph->nexthdr,
2045         };
2046         struct flow_keys *flkeys = NULL, _flkeys;
2047
2048         tun_info = skb_tunnel_info(skb);
2049         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2050                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2051
2052         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2053                 flkeys = &_flkeys;
2054
2055         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2056                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2057         skb_dst_drop(skb);
2058         skb_dst_set(skb,
2059                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2060 }
2061
2062 static struct rt6_info *ip6_pol_route_output(struct net *net,
2063                                              struct fib6_table *table,
2064                                              struct flowi6 *fl6,
2065                                              const struct sk_buff *skb,
2066                                              int flags)
2067 {
2068         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2069 }
2070
2071 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2072                                          struct flowi6 *fl6, int flags)
2073 {
2074         bool any_src;
2075
2076         if (ipv6_addr_type(&fl6->daddr) &
2077             (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2078                 struct dst_entry *dst;
2079
2080                 dst = l3mdev_link_scope_lookup(net, fl6);
2081                 if (dst)
2082                         return dst;
2083         }
2084
2085         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2086
2087         any_src = ipv6_addr_any(&fl6->saddr);
2088         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2089             (fl6->flowi6_oif && any_src))
2090                 flags |= RT6_LOOKUP_F_IFACE;
2091
2092         if (!any_src)
2093                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2094         else if (sk)
2095                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2096
2097         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2098 }
2099 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2100
2101 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2102 {
2103         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2104         struct net_device *loopback_dev = net->loopback_dev;
2105         struct dst_entry *new = NULL;
2106
2107         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2108                        DST_OBSOLETE_DEAD, 0);
2109         if (rt) {
2110                 rt6_info_init(rt);
2111                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2112
2113                 new = &rt->dst;
2114                 new->__use = 1;
2115                 new->input = dst_discard;
2116                 new->output = dst_discard_out;
2117
2118                 dst_copy_metrics(new, &ort->dst);
2119
2120                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2121                 rt->rt6i_gateway = ort->rt6i_gateway;
2122                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2123
2124                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2125 #ifdef CONFIG_IPV6_SUBTREES
2126                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2127 #endif
2128         }
2129
2130         dst_release(dst_orig);
2131         return new ? new : ERR_PTR(-ENOMEM);
2132 }
2133
2134 /*
2135  *      Destination cache support functions
2136  */
2137
2138 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2139 {
2140         u32 rt_cookie = 0;
2141
2142         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2143                 return false;
2144
2145         if (fib6_check_expired(f6i))
2146                 return false;
2147
2148         return true;
2149 }
2150
2151 static struct dst_entry *rt6_check(struct rt6_info *rt,
2152                                    struct fib6_info *from,
2153                                    u32 cookie)
2154 {
2155         u32 rt_cookie = 0;
2156
2157         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2158             rt_cookie != cookie)
2159                 return NULL;
2160
2161         if (rt6_check_expired(rt))
2162                 return NULL;
2163
2164         return &rt->dst;
2165 }
2166
2167 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2168                                             struct fib6_info *from,
2169                                             u32 cookie)
2170 {
2171         if (!__rt6_check_expired(rt) &&
2172             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2173             fib6_check(from, cookie))
2174                 return &rt->dst;
2175         else
2176                 return NULL;
2177 }
2178
2179 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2180 {
2181         struct dst_entry *dst_ret;
2182         struct fib6_info *from;
2183         struct rt6_info *rt;
2184
2185         rt = container_of(dst, struct rt6_info, dst);
2186
2187         rcu_read_lock();
2188
2189         /* All IPV6 dsts are created with ->obsolete set to the value
2190          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2191          * into this function always.
2192          */
2193
2194         from = rcu_dereference(rt->from);
2195
2196         if (from && (rt->rt6i_flags & RTF_PCPU ||
2197             unlikely(!list_empty(&rt->rt6i_uncached))))
2198                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2199         else
2200                 dst_ret = rt6_check(rt, from, cookie);
2201
2202         rcu_read_unlock();
2203
2204         return dst_ret;
2205 }
2206
2207 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2208 {
2209         struct rt6_info *rt = (struct rt6_info *) dst;
2210
2211         if (rt) {
2212                 if (rt->rt6i_flags & RTF_CACHE) {
2213                         rcu_read_lock();
2214                         if (rt6_check_expired(rt)) {
2215                                 rt6_remove_exception_rt(rt);
2216                                 dst = NULL;
2217                         }
2218                         rcu_read_unlock();
2219                 } else {
2220                         dst_release(dst);
2221                         dst = NULL;
2222                 }
2223         }
2224         return dst;
2225 }
2226
2227 static void ip6_link_failure(struct sk_buff *skb)
2228 {
2229         struct rt6_info *rt;
2230
2231         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2232
2233         rt = (struct rt6_info *) skb_dst(skb);
2234         if (rt) {
2235                 rcu_read_lock();
2236                 if (rt->rt6i_flags & RTF_CACHE) {
2237                         rt6_remove_exception_rt(rt);
2238                 } else {
2239                         struct fib6_info *from;
2240                         struct fib6_node *fn;
2241
2242                         from = rcu_dereference(rt->from);
2243                         if (from) {
2244                                 fn = rcu_dereference(from->fib6_node);
2245                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2246                                         fn->fn_sernum = -1;
2247                         }
2248                 }
2249                 rcu_read_unlock();
2250         }
2251 }
2252
2253 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2254 {
2255         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2256                 struct fib6_info *from;
2257
2258                 rcu_read_lock();
2259                 from = rcu_dereference(rt0->from);
2260                 if (from)
2261                         rt0->dst.expires = from->expires;
2262                 rcu_read_unlock();
2263         }
2264
2265         dst_set_expires(&rt0->dst, timeout);
2266         rt0->rt6i_flags |= RTF_EXPIRES;
2267 }
2268
2269 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2270 {
2271         struct net *net = dev_net(rt->dst.dev);
2272
2273         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2274         rt->rt6i_flags |= RTF_MODIFIED;
2275         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2276 }
2277
2278 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2279 {
2280         bool from_set;
2281
2282         rcu_read_lock();
2283         from_set = !!rcu_dereference(rt->from);
2284         rcu_read_unlock();
2285
2286         return !(rt->rt6i_flags & RTF_CACHE) &&
2287                 (rt->rt6i_flags & RTF_PCPU || from_set);
2288 }
2289
2290 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2291                                  const struct ipv6hdr *iph, u32 mtu)
2292 {
2293         const struct in6_addr *daddr, *saddr;
2294         struct rt6_info *rt6 = (struct rt6_info *)dst;
2295
2296         if (dst_metric_locked(dst, RTAX_MTU))
2297                 return;
2298
2299         if (iph) {
2300                 daddr = &iph->daddr;
2301                 saddr = &iph->saddr;
2302         } else if (sk) {
2303                 daddr = &sk->sk_v6_daddr;
2304                 saddr = &inet6_sk(sk)->saddr;
2305         } else {
2306                 daddr = NULL;
2307                 saddr = NULL;
2308         }
2309         dst_confirm_neigh(dst, daddr);
2310         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2311         if (mtu >= dst_mtu(dst))
2312                 return;
2313
2314         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2315                 rt6_do_update_pmtu(rt6, mtu);
2316                 /* update rt6_ex->stamp for cache */
2317                 if (rt6->rt6i_flags & RTF_CACHE)
2318                         rt6_update_exception_stamp_rt(rt6);
2319         } else if (daddr) {
2320                 struct fib6_info *from;
2321                 struct rt6_info *nrt6;
2322
2323                 rcu_read_lock();
2324                 from = rcu_dereference(rt6->from);
2325                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2326                 if (nrt6) {
2327                         rt6_do_update_pmtu(nrt6, mtu);
2328                         if (rt6_insert_exception(nrt6, from))
2329                                 dst_release_immediate(&nrt6->dst);
2330                 }
2331                 rcu_read_unlock();
2332         }
2333 }
2334
2335 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2336                                struct sk_buff *skb, u32 mtu)
2337 {
2338         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2339 }
2340
2341 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2342                      int oif, u32 mark, kuid_t uid)
2343 {
2344         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2345         struct dst_entry *dst;
2346         struct flowi6 fl6 = {
2347                 .flowi6_oif = oif,
2348                 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2349                 .daddr = iph->daddr,
2350                 .saddr = iph->saddr,
2351                 .flowlabel = ip6_flowinfo(iph),
2352                 .flowi6_uid = uid,
2353         };
2354
2355         dst = ip6_route_output(net, NULL, &fl6);
2356         if (!dst->error)
2357                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2358         dst_release(dst);
2359 }
2360 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2361
2362 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2363 {
2364         int oif = sk->sk_bound_dev_if;
2365         struct dst_entry *dst;
2366
2367         if (!oif && skb->dev)
2368                 oif = l3mdev_master_ifindex(skb->dev);
2369
2370         ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2371
2372         dst = __sk_dst_get(sk);
2373         if (!dst || !dst->obsolete ||
2374             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2375                 return;
2376
2377         bh_lock_sock(sk);
2378         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2379                 ip6_datagram_dst_update(sk, false);
2380         bh_unlock_sock(sk);
2381 }
2382 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2383
2384 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2385                            const struct flowi6 *fl6)
2386 {
2387 #ifdef CONFIG_IPV6_SUBTREES
2388         struct ipv6_pinfo *np = inet6_sk(sk);
2389 #endif
2390
2391         ip6_dst_store(sk, dst,
2392                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2393                       &sk->sk_v6_daddr : NULL,
2394 #ifdef CONFIG_IPV6_SUBTREES
2395                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2396                       &np->saddr :
2397 #endif
2398                       NULL);
2399 }
2400
2401 /* Handle redirects */
2402 struct ip6rd_flowi {
2403         struct flowi6 fl6;
2404         struct in6_addr gateway;
2405 };
2406
2407 static struct rt6_info *__ip6_route_redirect(struct net *net,
2408                                              struct fib6_table *table,
2409                                              struct flowi6 *fl6,
2410                                              const struct sk_buff *skb,
2411                                              int flags)
2412 {
2413         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2414         struct rt6_info *ret = NULL, *rt_cache;
2415         struct fib6_info *rt;
2416         struct fib6_node *fn;
2417
2418         /* Get the "current" route for this destination and
2419          * check if the redirect has come from appropriate router.
2420          *
2421          * RFC 4861 specifies that redirects should only be
2422          * accepted if they come from the nexthop to the target.
2423          * Due to the way the routes are chosen, this notion
2424          * is a bit fuzzy and one might need to check all possible
2425          * routes.
2426          */
2427
2428         rcu_read_lock();
2429         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2430 restart:
2431         for_each_fib6_node_rt_rcu(fn) {
2432                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2433                         continue;
2434                 if (fib6_check_expired(rt))
2435                         continue;
2436                 if (rt->fib6_flags & RTF_REJECT)
2437                         break;
2438                 if (!(rt->fib6_flags & RTF_GATEWAY))
2439                         continue;
2440                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2441                         continue;
2442                 /* rt_cache's gateway might be different from its 'parent'
2443                  * in the case of an ip redirect.
2444                  * So we keep searching in the exception table if the gateway
2445                  * is different.
2446                  */
2447                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2448                         rt_cache = rt6_find_cached_rt(rt,
2449                                                       &fl6->daddr,
2450                                                       &fl6->saddr);
2451                         if (rt_cache &&
2452                             ipv6_addr_equal(&rdfl->gateway,
2453                                             &rt_cache->rt6i_gateway)) {
2454                                 ret = rt_cache;
2455                                 break;
2456                         }
2457                         continue;
2458                 }
2459                 break;
2460         }
2461
2462         if (!rt)
2463                 rt = net->ipv6.fib6_null_entry;
2464         else if (rt->fib6_flags & RTF_REJECT) {
2465                 ret = net->ipv6.ip6_null_entry;
2466                 goto out;
2467         }
2468
2469         if (rt == net->ipv6.fib6_null_entry) {
2470                 fn = fib6_backtrack(fn, &fl6->saddr);
2471                 if (fn)
2472                         goto restart;
2473         }
2474
2475 out:
2476         if (ret)
2477                 ip6_hold_safe(net, &ret, true);
2478         else
2479                 ret = ip6_create_rt_rcu(rt);
2480
2481         rcu_read_unlock();
2482
2483         trace_fib6_table_lookup(net, rt, table, fl6);
2484         return ret;
2485 };
2486
2487 static struct dst_entry *ip6_route_redirect(struct net *net,
2488                                             const struct flowi6 *fl6,
2489                                             const struct sk_buff *skb,
2490                                             const struct in6_addr *gateway)
2491 {
2492         int flags = RT6_LOOKUP_F_HAS_SADDR;
2493         struct ip6rd_flowi rdfl;
2494
2495         rdfl.fl6 = *fl6;
2496         rdfl.gateway = *gateway;
2497
2498         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2499                                 flags, __ip6_route_redirect);
2500 }
2501
2502 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2503                   kuid_t uid)
2504 {
2505         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2506         struct dst_entry *dst;
2507         struct flowi6 fl6 = {
2508                 .flowi6_iif = LOOPBACK_IFINDEX,
2509                 .flowi6_oif = oif,
2510                 .flowi6_mark = mark,
2511                 .daddr = iph->daddr,
2512                 .saddr = iph->saddr,
2513                 .flowlabel = ip6_flowinfo(iph),
2514                 .flowi6_uid = uid,
2515         };
2516
2517         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2518         rt6_do_redirect(dst, NULL, skb);
2519         dst_release(dst);
2520 }
2521 EXPORT_SYMBOL_GPL(ip6_redirect);
2522
2523 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2524 {
2525         const struct ipv6hdr *iph = ipv6_hdr(skb);
2526         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2527         struct dst_entry *dst;
2528         struct flowi6 fl6 = {
2529                 .flowi6_iif = LOOPBACK_IFINDEX,
2530                 .flowi6_oif = oif,
2531                 .daddr = msg->dest,
2532                 .saddr = iph->daddr,
2533                 .flowi6_uid = sock_net_uid(net, NULL),
2534         };
2535
2536         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2537         rt6_do_redirect(dst, NULL, skb);
2538         dst_release(dst);
2539 }
2540
2541 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2542 {
2543         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2544                      sk->sk_uid);
2545 }
2546 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2547
2548 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2549 {
2550         struct net_device *dev = dst->dev;
2551         unsigned int mtu = dst_mtu(dst);
2552         struct net *net = dev_net(dev);
2553
2554         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2555
2556         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2557                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2558
2559         /*
2560          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2561          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2562          * IPV6_MAXPLEN is also valid and means: "any MSS,
2563          * rely only on pmtu discovery"
2564          */
2565         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2566                 mtu = IPV6_MAXPLEN;
2567         return mtu;
2568 }
2569
2570 static unsigned int ip6_mtu(const struct dst_entry *dst)
2571 {
2572         struct inet6_dev *idev;
2573         unsigned int mtu;
2574
2575         mtu = dst_metric_raw(dst, RTAX_MTU);
2576         if (mtu)
2577                 goto out;
2578
2579         mtu = IPV6_MIN_MTU;
2580
2581         rcu_read_lock();
2582         idev = __in6_dev_get(dst->dev);
2583         if (idev)
2584                 mtu = idev->cnf.mtu6;
2585         rcu_read_unlock();
2586
2587 out:
2588         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2589
2590         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2591 }
2592
2593 /* MTU selection:
2594  * 1. mtu on route is locked - use it
2595  * 2. mtu from nexthop exception
2596  * 3. mtu from egress device
2597  *
2598  * based on ip6_dst_mtu_forward and exception logic of
2599  * rt6_find_cached_rt; called with rcu_read_lock
2600  */
2601 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2602                       struct in6_addr *saddr)
2603 {
2604         struct rt6_exception_bucket *bucket;
2605         struct rt6_exception *rt6_ex;
2606         struct in6_addr *src_key;
2607         struct inet6_dev *idev;
2608         u32 mtu = 0;
2609
2610         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2611                 mtu = f6i->fib6_pmtu;
2612                 if (mtu)
2613                         goto out;
2614         }
2615
2616         src_key = NULL;
2617 #ifdef CONFIG_IPV6_SUBTREES
2618         if (f6i->fib6_src.plen)
2619                 src_key = saddr;
2620 #endif
2621
2622         bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2623         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2624         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2625                 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2626
2627         if (likely(!mtu)) {
2628                 struct net_device *dev = fib6_info_nh_dev(f6i);
2629
2630                 mtu = IPV6_MIN_MTU;
2631                 idev = __in6_dev_get(dev);
2632                 if (idev && idev->cnf.mtu6 > mtu)
2633                         mtu = idev->cnf.mtu6;
2634         }
2635
2636         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2637 out:
2638         return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2639 }
2640
2641 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2642                                   struct flowi6 *fl6)
2643 {
2644         struct dst_entry *dst;
2645         struct rt6_info *rt;
2646         struct inet6_dev *idev = in6_dev_get(dev);
2647         struct net *net = dev_net(dev);
2648
2649         if (unlikely(!idev))
2650                 return ERR_PTR(-ENODEV);
2651
2652         rt = ip6_dst_alloc(net, dev, 0);
2653         if (unlikely(!rt)) {
2654                 in6_dev_put(idev);
2655                 dst = ERR_PTR(-ENOMEM);
2656                 goto out;
2657         }
2658
2659         rt->dst.flags |= DST_HOST;
2660         rt->dst.input = ip6_input;
2661         rt->dst.output  = ip6_output;
2662         rt->rt6i_gateway  = fl6->daddr;
2663         rt->rt6i_dst.addr = fl6->daddr;
2664         rt->rt6i_dst.plen = 128;
2665         rt->rt6i_idev     = idev;
2666         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2667
2668         /* Add this dst into uncached_list so that rt6_disable_ip() can
2669          * do proper release of the net_device
2670          */
2671         rt6_uncached_list_add(rt);
2672         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2673
2674         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2675
2676 out:
2677         return dst;
2678 }
2679
2680 static int ip6_dst_gc(struct dst_ops *ops)
2681 {
2682         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2683         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2684         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2685         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2686         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2687         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2688         int entries;
2689
2690         entries = dst_entries_get_fast(ops);
2691         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2692             entries <= rt_max_size)
2693                 goto out;
2694
2695         net->ipv6.ip6_rt_gc_expire++;
2696         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2697         entries = dst_entries_get_slow(ops);
2698         if (entries < ops->gc_thresh)
2699                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2700 out:
2701         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2702         return entries > rt_max_size;
2703 }
2704
2705 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2706                                             struct fib6_config *cfg,
2707                                             const struct in6_addr *gw_addr,
2708                                             u32 tbid, int flags)
2709 {
2710         struct flowi6 fl6 = {
2711                 .flowi6_oif = cfg->fc_ifindex,
2712                 .daddr = *gw_addr,
2713                 .saddr = cfg->fc_prefsrc,
2714         };
2715         struct fib6_table *table;
2716         struct rt6_info *rt;
2717
2718         table = fib6_get_table(net, tbid);
2719         if (!table)
2720                 return NULL;
2721
2722         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2723                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2724
2725         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2726         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2727
2728         /* if table lookup failed, fall back to full lookup */
2729         if (rt == net->ipv6.ip6_null_entry) {
2730                 ip6_rt_put(rt);
2731                 rt = NULL;
2732         }
2733
2734         return rt;
2735 }
2736
2737 static int ip6_route_check_nh_onlink(struct net *net,
2738                                      struct fib6_config *cfg,
2739                                      const struct net_device *dev,
2740                                      struct netlink_ext_ack *extack)
2741 {
2742         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2743         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2744         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2745         struct rt6_info *grt;
2746         int err;
2747
2748         err = 0;
2749         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2750         if (grt) {
2751                 if (!grt->dst.error &&
2752                     /* ignore match if it is the default route */
2753                     grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) &&
2754                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2755                         NL_SET_ERR_MSG(extack,
2756                                        "Nexthop has invalid gateway or device mismatch");
2757                         err = -EINVAL;
2758                 }
2759
2760                 ip6_rt_put(grt);
2761         }
2762
2763         return err;
2764 }
2765
2766 static int ip6_route_check_nh(struct net *net,
2767                               struct fib6_config *cfg,
2768                               struct net_device **_dev,
2769                               struct inet6_dev **idev)
2770 {
2771         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2772         struct net_device *dev = _dev ? *_dev : NULL;
2773         struct rt6_info *grt = NULL;
2774         int err = -EHOSTUNREACH;
2775
2776         if (cfg->fc_table) {
2777                 int flags = RT6_LOOKUP_F_IFACE;
2778
2779                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2780                                           cfg->fc_table, flags);
2781                 if (grt) {
2782                         if (grt->rt6i_flags & RTF_GATEWAY ||
2783                             (dev && dev != grt->dst.dev)) {
2784                                 ip6_rt_put(grt);
2785                                 grt = NULL;
2786                         }
2787                 }
2788         }
2789
2790         if (!grt)
2791                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2792
2793         if (!grt)
2794                 goto out;
2795
2796         if (dev) {
2797                 if (dev != grt->dst.dev) {
2798                         ip6_rt_put(grt);
2799                         goto out;
2800                 }
2801         } else {
2802                 *_dev = dev = grt->dst.dev;
2803                 *idev = grt->rt6i_idev;
2804                 dev_hold(dev);
2805                 in6_dev_hold(grt->rt6i_idev);
2806         }
2807
2808         if (!(grt->rt6i_flags & RTF_GATEWAY))
2809                 err = 0;
2810
2811         ip6_rt_put(grt);
2812
2813 out:
2814         return err;
2815 }
2816
2817 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2818                            struct net_device **_dev, struct inet6_dev **idev,
2819                            struct netlink_ext_ack *extack)
2820 {
2821         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2822         int gwa_type = ipv6_addr_type(gw_addr);
2823         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2824         const struct net_device *dev = *_dev;
2825         bool need_addr_check = !dev;
2826         int err = -EINVAL;
2827
2828         /* if gw_addr is local we will fail to detect this in case
2829          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2830          * will return already-added prefix route via interface that
2831          * prefix route was assigned to, which might be non-loopback.
2832          */
2833         if (dev &&
2834             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2835                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2836                 goto out;
2837         }
2838
2839         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2840                 /* IPv6 strictly inhibits using not link-local
2841                  * addresses as nexthop address.
2842                  * Otherwise, router will not able to send redirects.
2843                  * It is very good, but in some (rare!) circumstances
2844                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2845                  * some exceptions. --ANK
2846                  * We allow IPv4-mapped nexthops to support RFC4798-type
2847                  * addressing
2848                  */
2849                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2850                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2851                         goto out;
2852                 }
2853
2854                 if (cfg->fc_flags & RTNH_F_ONLINK)
2855                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2856                 else
2857                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2858
2859                 if (err)
2860                         goto out;
2861         }
2862
2863         /* reload in case device was changed */
2864         dev = *_dev;
2865
2866         err = -EINVAL;
2867         if (!dev) {
2868                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2869                 goto out;
2870         } else if (dev->flags & IFF_LOOPBACK) {
2871                 NL_SET_ERR_MSG(extack,
2872                                "Egress device can not be loopback device for this route");
2873                 goto out;
2874         }
2875
2876         /* if we did not check gw_addr above, do so now that the
2877          * egress device has been resolved.
2878          */
2879         if (need_addr_check &&
2880             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2881                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2882                 goto out;
2883         }
2884
2885         err = 0;
2886 out:
2887         return err;
2888 }
2889
2890 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2891                                               gfp_t gfp_flags,
2892                                               struct netlink_ext_ack *extack)
2893 {
2894         struct net *net = cfg->fc_nlinfo.nl_net;
2895         struct fib6_info *rt = NULL;
2896         struct net_device *dev = NULL;
2897         struct inet6_dev *idev = NULL;
2898         struct fib6_table *table;
2899         int addr_type;
2900         int err = -EINVAL;
2901
2902         /* RTF_PCPU is an internal flag; can not be set by userspace */
2903         if (cfg->fc_flags & RTF_PCPU) {
2904                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2905                 goto out;
2906         }
2907
2908         /* RTF_CACHE is an internal flag; can not be set by userspace */
2909         if (cfg->fc_flags & RTF_CACHE) {
2910                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2911                 goto out;
2912         }
2913
2914         if (cfg->fc_type > RTN_MAX) {
2915                 NL_SET_ERR_MSG(extack, "Invalid route type");
2916                 goto out;
2917         }
2918
2919         if (cfg->fc_dst_len > 128) {
2920                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2921                 goto out;
2922         }
2923         if (cfg->fc_src_len > 128) {
2924                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2925                 goto out;
2926         }
2927 #ifndef CONFIG_IPV6_SUBTREES
2928         if (cfg->fc_src_len) {
2929                 NL_SET_ERR_MSG(extack,
2930                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2931                 goto out;
2932         }
2933 #endif
2934         if (cfg->fc_ifindex) {
2935                 err = -ENODEV;
2936                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2937                 if (!dev)
2938                         goto out;
2939                 idev = in6_dev_get(dev);
2940                 if (!idev)
2941                         goto out;
2942         }
2943
2944         if (cfg->fc_metric == 0)
2945                 cfg->fc_metric = IP6_RT_PRIO_USER;
2946
2947         if (cfg->fc_flags & RTNH_F_ONLINK) {
2948                 if (!dev) {
2949                         NL_SET_ERR_MSG(extack,
2950                                        "Nexthop device required for onlink");
2951                         err = -ENODEV;
2952                         goto out;
2953                 }
2954
2955                 if (!(dev->flags & IFF_UP)) {
2956                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2957                         err = -ENETDOWN;
2958                         goto out;
2959                 }
2960         }
2961
2962         err = -ENOBUFS;
2963         if (cfg->fc_nlinfo.nlh &&
2964             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2965                 table = fib6_get_table(net, cfg->fc_table);
2966                 if (!table) {
2967                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2968                         table = fib6_new_table(net, cfg->fc_table);
2969                 }
2970         } else {
2971                 table = fib6_new_table(net, cfg->fc_table);
2972         }
2973
2974         if (!table)
2975                 goto out;
2976
2977         err = -ENOMEM;
2978         rt = fib6_info_alloc(gfp_flags);
2979         if (!rt)
2980                 goto out;
2981
2982         rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
2983                                                extack);
2984         if (IS_ERR(rt->fib6_metrics)) {
2985                 err = PTR_ERR(rt->fib6_metrics);
2986                 /* Do not leave garbage there. */
2987                 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
2988                 goto out;
2989         }
2990
2991         if (cfg->fc_flags & RTF_ADDRCONF)
2992                 rt->dst_nocount = true;
2993
2994         if (cfg->fc_flags & RTF_EXPIRES)
2995                 fib6_set_expires(rt, jiffies +
2996                                 clock_t_to_jiffies(cfg->fc_expires));
2997         else
2998                 fib6_clean_expires(rt);
2999
3000         if (cfg->fc_protocol == RTPROT_UNSPEC)
3001                 cfg->fc_protocol = RTPROT_BOOT;
3002         rt->fib6_protocol = cfg->fc_protocol;
3003
3004         addr_type = ipv6_addr_type(&cfg->fc_dst);
3005
3006         if (cfg->fc_encap) {
3007                 struct lwtunnel_state *lwtstate;
3008
3009                 err = lwtunnel_build_state(cfg->fc_encap_type,
3010                                            cfg->fc_encap, AF_INET6, cfg,
3011                                            &lwtstate, extack);
3012                 if (err)
3013                         goto out;
3014                 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3015         }
3016
3017         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3018         rt->fib6_dst.plen = cfg->fc_dst_len;
3019         if (rt->fib6_dst.plen == 128)
3020                 rt->dst_host = true;
3021
3022 #ifdef CONFIG_IPV6_SUBTREES
3023         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3024         rt->fib6_src.plen = cfg->fc_src_len;
3025 #endif
3026
3027         rt->fib6_metric = cfg->fc_metric;
3028         rt->fib6_nh.nh_weight = 1;
3029
3030         rt->fib6_type = cfg->fc_type;
3031
3032         /* We cannot add true routes via loopback here,
3033            they would result in kernel looping; promote them to reject routes
3034          */
3035         if ((cfg->fc_flags & RTF_REJECT) ||
3036             (dev && (dev->flags & IFF_LOOPBACK) &&
3037              !(addr_type & IPV6_ADDR_LOOPBACK) &&
3038              !(cfg->fc_flags & RTF_LOCAL))) {
3039                 /* hold loopback dev/idev if we haven't done so. */
3040                 if (dev != net->loopback_dev) {
3041                         if (dev) {
3042                                 dev_put(dev);
3043                                 in6_dev_put(idev);
3044                         }
3045                         dev = net->loopback_dev;
3046                         dev_hold(dev);
3047                         idev = in6_dev_get(dev);
3048                         if (!idev) {
3049                                 err = -ENODEV;
3050                                 goto out;
3051                         }
3052                 }
3053                 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3054                 goto install_route;
3055         }
3056
3057         if (cfg->fc_flags & RTF_GATEWAY) {
3058                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3059                 if (err)
3060                         goto out;
3061
3062                 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3063         }
3064
3065         err = -ENODEV;
3066         if (!dev)
3067                 goto out;
3068
3069         if (idev->cnf.disable_ipv6) {
3070                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3071                 err = -EACCES;
3072                 goto out;
3073         }
3074
3075         if (!(dev->flags & IFF_UP)) {
3076                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3077                 err = -ENETDOWN;
3078                 goto out;
3079         }
3080
3081         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3082                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3083                         NL_SET_ERR_MSG(extack, "Invalid source address");
3084                         err = -EINVAL;
3085                         goto out;
3086                 }
3087                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3088                 rt->fib6_prefsrc.plen = 128;
3089         } else
3090                 rt->fib6_prefsrc.plen = 0;
3091
3092         rt->fib6_flags = cfg->fc_flags;
3093
3094 install_route:
3095         if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3096             !netif_carrier_ok(dev))
3097                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3098         rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3099         rt->fib6_nh.nh_dev = dev;
3100         rt->fib6_table = table;
3101
3102         if (idev)
3103                 in6_dev_put(idev);
3104
3105         return rt;
3106 out:
3107         if (dev)
3108                 dev_put(dev);
3109         if (idev)
3110                 in6_dev_put(idev);
3111
3112         fib6_info_release(rt);
3113         return ERR_PTR(err);
3114 }
3115
3116 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3117                   struct netlink_ext_ack *extack)
3118 {
3119         struct fib6_info *rt;
3120         int err;
3121
3122         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3123         if (IS_ERR(rt))
3124                 return PTR_ERR(rt);
3125
3126         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3127         fib6_info_release(rt);
3128
3129         return err;
3130 }
3131
3132 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3133 {
3134         struct net *net = info->nl_net;
3135         struct fib6_table *table;
3136         int err;
3137
3138         if (rt == net->ipv6.fib6_null_entry) {
3139                 err = -ENOENT;
3140                 goto out;
3141         }
3142
3143         table = rt->fib6_table;
3144         spin_lock_bh(&table->tb6_lock);
3145         err = fib6_del(rt, info);
3146         spin_unlock_bh(&table->tb6_lock);
3147
3148 out:
3149         fib6_info_release(rt);
3150         return err;
3151 }
3152
3153 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3154 {
3155         struct nl_info info = { .nl_net = net };
3156
3157         return __ip6_del_rt(rt, &info);
3158 }
3159
3160 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3161 {
3162         struct nl_info *info = &cfg->fc_nlinfo;
3163         struct net *net = info->nl_net;
3164         struct sk_buff *skb = NULL;
3165         struct fib6_table *table;
3166         int err = -ENOENT;
3167
3168         if (rt == net->ipv6.fib6_null_entry)
3169                 goto out_put;
3170         table = rt->fib6_table;
3171         spin_lock_bh(&table->tb6_lock);
3172
3173         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3174                 struct fib6_info *sibling, *next_sibling;
3175
3176                 /* prefer to send a single notification with all hops */
3177                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3178                 if (skb) {
3179                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3180
3181                         if (rt6_fill_node(net, skb, rt, NULL,
3182                                           NULL, NULL, 0, RTM_DELROUTE,
3183                                           info->portid, seq, 0) < 0) {
3184                                 kfree_skb(skb);
3185                                 skb = NULL;
3186                         } else
3187                                 info->skip_notify = 1;
3188                 }
3189
3190                 list_for_each_entry_safe(sibling, next_sibling,
3191                                          &rt->fib6_siblings,
3192                                          fib6_siblings) {
3193                         err = fib6_del(sibling, info);
3194                         if (err)
3195                                 goto out_unlock;
3196                 }
3197         }
3198
3199         err = fib6_del(rt, info);
3200 out_unlock:
3201         spin_unlock_bh(&table->tb6_lock);
3202 out_put:
3203         fib6_info_release(rt);
3204
3205         if (skb) {
3206                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3207                             info->nlh, gfp_any());
3208         }
3209         return err;
3210 }
3211
3212 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3213 {
3214         int rc = -ESRCH;
3215
3216         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3217                 goto out;
3218
3219         if (cfg->fc_flags & RTF_GATEWAY &&
3220             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3221                 goto out;
3222
3223         rc = rt6_remove_exception_rt(rt);
3224 out:
3225         return rc;
3226 }
3227
3228 static int ip6_route_del(struct fib6_config *cfg,
3229                          struct netlink_ext_ack *extack)
3230 {
3231         struct rt6_info *rt_cache;
3232         struct fib6_table *table;
3233         struct fib6_info *rt;
3234         struct fib6_node *fn;
3235         int err = -ESRCH;
3236
3237         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3238         if (!table) {
3239                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3240                 return err;
3241         }
3242
3243         rcu_read_lock();
3244
3245         fn = fib6_locate(&table->tb6_root,
3246                          &cfg->fc_dst, cfg->fc_dst_len,
3247                          &cfg->fc_src, cfg->fc_src_len,
3248                          !(cfg->fc_flags & RTF_CACHE));
3249
3250         if (fn) {
3251                 for_each_fib6_node_rt_rcu(fn) {
3252                         if (cfg->fc_flags & RTF_CACHE) {
3253                                 int rc;
3254
3255                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3256                                                               &cfg->fc_src);
3257                                 if (rt_cache) {
3258                                         rc = ip6_del_cached_rt(rt_cache, cfg);
3259                                         if (rc != -ESRCH) {
3260                                                 rcu_read_unlock();
3261                                                 return rc;
3262                                         }
3263                                 }
3264                                 continue;
3265                         }
3266                         if (cfg->fc_ifindex &&
3267                             (!rt->fib6_nh.nh_dev ||
3268                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3269                                 continue;
3270                         if (cfg->fc_flags & RTF_GATEWAY &&
3271                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3272                                 continue;
3273                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3274                                 continue;
3275                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3276                                 continue;
3277                         if (!fib6_info_hold_safe(rt))
3278                                 continue;
3279                         rcu_read_unlock();
3280
3281                         /* if gateway was specified only delete the one hop */
3282                         if (cfg->fc_flags & RTF_GATEWAY)
3283                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3284
3285                         return __ip6_del_rt_siblings(rt, cfg);
3286                 }
3287         }
3288         rcu_read_unlock();
3289
3290         return err;
3291 }
3292
3293 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3294 {
3295         struct netevent_redirect netevent;
3296         struct rt6_info *rt, *nrt = NULL;
3297         struct ndisc_options ndopts;
3298         struct inet6_dev *in6_dev;
3299         struct neighbour *neigh;
3300         struct fib6_info *from;
3301         struct rd_msg *msg;
3302         int optlen, on_link;
3303         u8 *lladdr;
3304
3305         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3306         optlen -= sizeof(*msg);
3307
3308         if (optlen < 0) {
3309                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3310                 return;
3311         }
3312
3313         msg = (struct rd_msg *)icmp6_hdr(skb);
3314
3315         if (ipv6_addr_is_multicast(&msg->dest)) {
3316                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3317                 return;
3318         }
3319
3320         on_link = 0;
3321         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3322                 on_link = 1;
3323         } else if (ipv6_addr_type(&msg->target) !=
3324                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3325                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3326                 return;
3327         }
3328
3329         in6_dev = __in6_dev_get(skb->dev);
3330         if (!in6_dev)
3331                 return;
3332         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3333                 return;
3334
3335         /* RFC2461 8.1:
3336          *      The IP source address of the Redirect MUST be the same as the current
3337          *      first-hop router for the specified ICMP Destination Address.
3338          */
3339
3340         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3341                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3342                 return;
3343         }
3344
3345         lladdr = NULL;
3346         if (ndopts.nd_opts_tgt_lladdr) {
3347                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3348                                              skb->dev);
3349                 if (!lladdr) {
3350                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3351                         return;
3352                 }
3353         }
3354
3355         rt = (struct rt6_info *) dst;
3356         if (rt->rt6i_flags & RTF_REJECT) {
3357                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3358                 return;
3359         }
3360
3361         /* Redirect received -> path was valid.
3362          * Look, redirects are sent only in response to data packets,
3363          * so that this nexthop apparently is reachable. --ANK
3364          */
3365         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3366
3367         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3368         if (!neigh)
3369                 return;
3370
3371         /*
3372          *      We have finally decided to accept it.
3373          */
3374
3375         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3376                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3377                      NEIGH_UPDATE_F_OVERRIDE|
3378                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3379                                      NEIGH_UPDATE_F_ISROUTER)),
3380                      NDISC_REDIRECT, &ndopts);
3381
3382         rcu_read_lock();
3383         from = rcu_dereference(rt->from);
3384         /* This fib6_info_hold() is safe here because we hold reference to rt
3385          * and rt already holds reference to fib6_info.
3386          */
3387         fib6_info_hold(from);
3388         rcu_read_unlock();
3389
3390         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3391         if (!nrt)
3392                 goto out;
3393
3394         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3395         if (on_link)
3396                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3397
3398         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3399
3400         /* No need to remove rt from the exception table if rt is
3401          * a cached route because rt6_insert_exception() will
3402          * takes care of it
3403          */
3404         if (rt6_insert_exception(nrt, from)) {
3405                 dst_release_immediate(&nrt->dst);
3406                 goto out;
3407         }
3408
3409         netevent.old = &rt->dst;
3410         netevent.new = &nrt->dst;
3411         netevent.daddr = &msg->dest;
3412         netevent.neigh = neigh;
3413         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3414
3415 out:
3416         fib6_info_release(from);
3417         neigh_release(neigh);
3418 }
3419
3420 #ifdef CONFIG_IPV6_ROUTE_INFO
3421 static struct fib6_info *rt6_get_route_info(struct net *net,
3422                                            const struct in6_addr *prefix, int prefixlen,
3423                                            const struct in6_addr *gwaddr,
3424                                            struct net_device *dev)
3425 {
3426         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3427         int ifindex = dev->ifindex;
3428         struct fib6_node *fn;
3429         struct fib6_info *rt = NULL;
3430         struct fib6_table *table;
3431
3432         table = fib6_get_table(net, tb_id);
3433         if (!table)
3434                 return NULL;
3435
3436         rcu_read_lock();
3437         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3438         if (!fn)
3439                 goto out;
3440
3441         for_each_fib6_node_rt_rcu(fn) {
3442                 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3443                         continue;
3444                 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3445                         continue;
3446                 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3447                         continue;
3448                 if (!fib6_info_hold_safe(rt))
3449                         continue;
3450                 break;
3451         }
3452 out:
3453         rcu_read_unlock();
3454         return rt;
3455 }
3456
3457 static struct fib6_info *rt6_add_route_info(struct net *net,
3458                                            const struct in6_addr *prefix, int prefixlen,
3459                                            const struct in6_addr *gwaddr,
3460                                            struct net_device *dev,
3461                                            unsigned int pref)
3462 {
3463         struct fib6_config cfg = {
3464                 .fc_metric      = IP6_RT_PRIO_USER,
3465                 .fc_ifindex     = dev->ifindex,
3466                 .fc_dst_len     = prefixlen,
3467                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3468                                   RTF_UP | RTF_PREF(pref),
3469                 .fc_protocol = RTPROT_RA,
3470                 .fc_type = RTN_UNICAST,
3471                 .fc_nlinfo.portid = 0,
3472                 .fc_nlinfo.nlh = NULL,
3473                 .fc_nlinfo.nl_net = net,
3474         };
3475
3476         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3477         cfg.fc_dst = *prefix;
3478         cfg.fc_gateway = *gwaddr;
3479
3480         /* We should treat it as a default route if prefix length is 0. */
3481         if (!prefixlen)
3482                 cfg.fc_flags |= RTF_DEFAULT;
3483
3484         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3485
3486         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3487 }
3488 #endif
3489
3490 struct fib6_info *rt6_get_dflt_router(struct net *net,
3491                                      const struct in6_addr *addr,
3492                                      struct net_device *dev)
3493 {
3494         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3495         struct fib6_info *rt;
3496         struct fib6_table *table;
3497
3498         table = fib6_get_table(net, tb_id);
3499         if (!table)
3500                 return NULL;
3501
3502         rcu_read_lock();
3503         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3504                 if (dev == rt->fib6_nh.nh_dev &&
3505                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3506                     ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3507                         break;
3508         }
3509         if (rt && !fib6_info_hold_safe(rt))
3510                 rt = NULL;
3511         rcu_read_unlock();
3512         return rt;
3513 }
3514
3515 struct fib6_info *rt6_add_dflt_router(struct net *net,
3516                                      const struct in6_addr *gwaddr,
3517                                      struct net_device *dev,
3518                                      unsigned int pref)
3519 {
3520         struct fib6_config cfg = {
3521                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3522                 .fc_metric      = IP6_RT_PRIO_USER,
3523                 .fc_ifindex     = dev->ifindex,
3524                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3525                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3526                 .fc_protocol = RTPROT_RA,
3527                 .fc_type = RTN_UNICAST,
3528                 .fc_nlinfo.portid = 0,
3529                 .fc_nlinfo.nlh = NULL,
3530                 .fc_nlinfo.nl_net = net,
3531         };
3532
3533         cfg.fc_gateway = *gwaddr;
3534
3535         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3536                 struct fib6_table *table;
3537
3538                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3539                 if (table)
3540                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3541         }
3542
3543         return rt6_get_dflt_router(net, gwaddr, dev);
3544 }
3545
3546 static void __rt6_purge_dflt_routers(struct net *net,
3547                                      struct fib6_table *table)
3548 {
3549         struct fib6_info *rt;
3550
3551 restart:
3552         rcu_read_lock();
3553         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3554                 struct net_device *dev = fib6_info_nh_dev(rt);
3555                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3556
3557                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3558                     (!idev || idev->cnf.accept_ra != 2) &&
3559                     fib6_info_hold_safe(rt)) {
3560                         rcu_read_unlock();
3561                         ip6_del_rt(net, rt);
3562                         goto restart;
3563                 }
3564         }
3565         rcu_read_unlock();
3566
3567         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3568 }
3569
3570 void rt6_purge_dflt_routers(struct net *net)
3571 {
3572         struct fib6_table *table;
3573         struct hlist_head *head;
3574         unsigned int h;
3575
3576         rcu_read_lock();
3577
3578         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3579                 head = &net->ipv6.fib_table_hash[h];
3580                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3581                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3582                                 __rt6_purge_dflt_routers(net, table);
3583                 }
3584         }
3585
3586         rcu_read_unlock();
3587 }
3588
3589 static void rtmsg_to_fib6_config(struct net *net,
3590                                  struct in6_rtmsg *rtmsg,
3591                                  struct fib6_config *cfg)
3592 {
3593         *cfg = (struct fib6_config){
3594                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3595                          : RT6_TABLE_MAIN,
3596                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3597                 .fc_metric = rtmsg->rtmsg_metric,
3598                 .fc_expires = rtmsg->rtmsg_info,
3599                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3600                 .fc_src_len = rtmsg->rtmsg_src_len,
3601                 .fc_flags = rtmsg->rtmsg_flags,
3602                 .fc_type = rtmsg->rtmsg_type,
3603
3604                 .fc_nlinfo.nl_net = net,
3605
3606                 .fc_dst = rtmsg->rtmsg_dst,
3607                 .fc_src = rtmsg->rtmsg_src,
3608                 .fc_gateway = rtmsg->rtmsg_gateway,
3609         };
3610 }
3611
3612 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3613 {
3614         struct fib6_config cfg;
3615         struct in6_rtmsg rtmsg;
3616         int err;
3617
3618         switch (cmd) {
3619         case SIOCADDRT:         /* Add a route */
3620         case SIOCDELRT:         /* Delete a route */
3621                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3622                         return -EPERM;
3623                 err = copy_from_user(&rtmsg, arg,
3624                                      sizeof(struct in6_rtmsg));
3625                 if (err)
3626                         return -EFAULT;
3627
3628                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3629
3630                 rtnl_lock();
3631                 switch (cmd) {
3632                 case SIOCADDRT:
3633                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3634                         break;
3635                 case SIOCDELRT:
3636                         err = ip6_route_del(&cfg, NULL);
3637                         break;
3638                 default:
3639                         err = -EINVAL;
3640                 }
3641                 rtnl_unlock();
3642
3643                 return err;
3644         }
3645
3646         return -EINVAL;
3647 }
3648
3649 /*
3650  *      Drop the packet on the floor
3651  */
3652
3653 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3654 {
3655         int type;
3656         struct dst_entry *dst = skb_dst(skb);
3657         switch (ipstats_mib_noroutes) {
3658         case IPSTATS_MIB_INNOROUTES:
3659                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3660                 if (type == IPV6_ADDR_ANY) {
3661                         IP6_INC_STATS(dev_net(dst->dev),
3662                                       __in6_dev_get_safely(skb->dev),
3663                                       IPSTATS_MIB_INADDRERRORS);
3664                         break;
3665                 }
3666                 /* FALLTHROUGH */
3667         case IPSTATS_MIB_OUTNOROUTES:
3668                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3669                               ipstats_mib_noroutes);
3670                 break;
3671         }
3672         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3673         kfree_skb(skb);
3674         return 0;
3675 }
3676
3677 static int ip6_pkt_discard(struct sk_buff *skb)
3678 {
3679         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3680 }
3681
3682 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3683 {
3684         skb->dev = skb_dst(skb)->dev;
3685         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3686 }
3687
3688 static int ip6_pkt_prohibit(struct sk_buff *skb)
3689 {
3690         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3691 }
3692
3693 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3694 {
3695         skb->dev = skb_dst(skb)->dev;
3696         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3697 }
3698
3699 /*
3700  *      Allocate a dst for local (unicast / anycast) address.
3701  */
3702
3703 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3704                                      struct inet6_dev *idev,
3705                                      const struct in6_addr *addr,
3706                                      bool anycast, gfp_t gfp_flags)
3707 {
3708         u32 tb_id;
3709         struct net_device *dev = idev->dev;
3710         struct fib6_info *f6i;
3711
3712         f6i = fib6_info_alloc(gfp_flags);
3713         if (!f6i)
3714                 return ERR_PTR(-ENOMEM);
3715
3716         f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3717         f6i->dst_nocount = true;
3718         f6i->dst_host = true;
3719         f6i->fib6_protocol = RTPROT_KERNEL;
3720         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3721         if (anycast) {
3722                 f6i->fib6_type = RTN_ANYCAST;
3723                 f6i->fib6_flags |= RTF_ANYCAST;
3724         } else {
3725                 f6i->fib6_type = RTN_LOCAL;
3726                 f6i->fib6_flags |= RTF_LOCAL;
3727         }
3728
3729         f6i->fib6_nh.nh_gw = *addr;
3730         dev_hold(dev);
3731         f6i->fib6_nh.nh_dev = dev;
3732         f6i->fib6_dst.addr = *addr;
3733         f6i->fib6_dst.plen = 128;
3734         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3735         f6i->fib6_table = fib6_get_table(net, tb_id);
3736
3737         return f6i;
3738 }
3739
3740 /* remove deleted ip from prefsrc entries */
3741 struct arg_dev_net_ip {
3742         struct net_device *dev;
3743         struct net *net;
3744         struct in6_addr *addr;
3745 };
3746
3747 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3748 {
3749         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3750         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3751         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3752
3753         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3754             rt != net->ipv6.fib6_null_entry &&
3755             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3756                 spin_lock_bh(&rt6_exception_lock);
3757                 /* remove prefsrc entry */
3758                 rt->fib6_prefsrc.plen = 0;
3759                 spin_unlock_bh(&rt6_exception_lock);
3760         }
3761         return 0;
3762 }
3763
3764 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3765 {
3766         struct net *net = dev_net(ifp->idev->dev);
3767         struct arg_dev_net_ip adni = {
3768                 .dev = ifp->idev->dev,
3769                 .net = net,
3770                 .addr = &ifp->addr,
3771         };
3772         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3773 }
3774
3775 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3776
3777 /* Remove routers and update dst entries when gateway turn into host. */
3778 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3779 {
3780         struct in6_addr *gateway = (struct in6_addr *)arg;
3781
3782         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3783             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3784                 return -1;
3785         }
3786
3787         /* Further clean up cached routes in exception table.
3788          * This is needed because cached route may have a different
3789          * gateway than its 'parent' in the case of an ip redirect.
3790          */
3791         rt6_exceptions_clean_tohost(rt, gateway);
3792
3793         return 0;
3794 }
3795
3796 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3797 {
3798         fib6_clean_all(net, fib6_clean_tohost, gateway);
3799 }
3800
/*
 * Argument block for the device up/down FIB walkers in this file.
 * fib6_ifup() reads @nh_flags (flags to clear); fib6_ifdown() reads @event
 * (the NETDEV_* event).  Both share storage, so only one is meaningful for
 * a given walk.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event refers to */
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3808
3809 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3810 {
3811         struct fib6_info *iter;
3812         struct fib6_node *fn;
3813
3814         fn = rcu_dereference_protected(rt->fib6_node,
3815                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3816         iter = rcu_dereference_protected(fn->leaf,
3817                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3818         while (iter) {
3819                 if (iter->fib6_metric == rt->fib6_metric &&
3820                     rt6_qualify_for_ecmp(iter))
3821                         return iter;
3822                 iter = rcu_dereference_protected(iter->fib6_next,
3823                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3824         }
3825
3826         return NULL;
3827 }
3828
3829 static bool rt6_is_dead(const struct fib6_info *rt)
3830 {
3831         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3832             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3833              fib6_ignore_linkdown(rt)))
3834                 return true;
3835
3836         return false;
3837 }
3838
3839 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3840 {
3841         struct fib6_info *iter;
3842         int total = 0;
3843
3844         if (!rt6_is_dead(rt))
3845                 total += rt->fib6_nh.nh_weight;
3846
3847         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3848                 if (!rt6_is_dead(iter))
3849                         total += iter->fib6_nh.nh_weight;
3850         }
3851
3852         return total;
3853 }
3854
3855 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3856 {
3857         int upper_bound = -1;
3858
3859         if (!rt6_is_dead(rt)) {
3860                 *weight += rt->fib6_nh.nh_weight;
3861                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3862                                                     total) - 1;
3863         }
3864         atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3865 }
3866
3867 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3868 {
3869         struct fib6_info *iter;
3870         int weight = 0;
3871
3872         rt6_upper_bound_set(rt, &weight, total);
3873
3874         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3875                 rt6_upper_bound_set(iter, &weight, total);
3876 }
3877
3878 void rt6_multipath_rebalance(struct fib6_info *rt)
3879 {
3880         struct fib6_info *first;
3881         int total;
3882
3883         /* In case the entire multipath route was marked for flushing,
3884          * then there is no need to rebalance upon the removal of every
3885          * sibling route.
3886          */
3887         if (!rt->fib6_nsiblings || rt->should_flush)
3888                 return;
3889
3890         /* During lookup routes are evaluated in order, so we need to
3891          * make sure upper bounds are assigned from the first sibling
3892          * onwards.
3893          */
3894         first = rt6_multipath_first_sibling(rt);
3895         if (WARN_ON_ONCE(!first))
3896                 return;
3897
3898         total = rt6_multipath_total_weight(first);
3899         rt6_multipath_upper_bound_set(first, total);
3900 }
3901
3902 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3903 {
3904         const struct arg_netdev_event *arg = p_arg;
3905         struct net *net = dev_net(arg->dev);
3906
3907         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3908                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3909                 fib6_update_sernum_upto_root(net, rt);
3910                 rt6_multipath_rebalance(rt);
3911         }
3912
3913         return 0;
3914 }
3915
3916 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3917 {
3918         struct arg_netdev_event arg = {
3919                 .dev = dev,
3920                 {
3921                         .nh_flags = nh_flags,
3922                 },
3923         };
3924
3925         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3926                 arg.nh_flags |= RTNH_F_LINKDOWN;
3927
3928         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3929 }
3930
3931 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3932                                    const struct net_device *dev)
3933 {
3934         struct fib6_info *iter;
3935
3936         if (rt->fib6_nh.nh_dev == dev)
3937                 return true;
3938         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3939                 if (iter->fib6_nh.nh_dev == dev)
3940                         return true;
3941
3942         return false;
3943 }
3944
3945 static void rt6_multipath_flush(struct fib6_info *rt)
3946 {
3947         struct fib6_info *iter;
3948
3949         rt->should_flush = 1;
3950         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3951                 iter->should_flush = 1;
3952 }
3953
3954 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3955                                              const struct net_device *down_dev)
3956 {
3957         struct fib6_info *iter;
3958         unsigned int dead = 0;
3959
3960         if (rt->fib6_nh.nh_dev == down_dev ||
3961             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3962                 dead++;
3963         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3964                 if (iter->fib6_nh.nh_dev == down_dev ||
3965                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3966                         dead++;
3967
3968         return dead;
3969 }
3970
3971 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3972                                        const struct net_device *dev,
3973                                        unsigned int nh_flags)
3974 {
3975         struct fib6_info *iter;
3976
3977         if (rt->fib6_nh.nh_dev == dev)
3978                 rt->fib6_nh.nh_flags |= nh_flags;
3979         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3980                 if (iter->fib6_nh.nh_dev == dev)
3981                         iter->fib6_nh.nh_flags |= nh_flags;
3982 }
3983
3984 /* called with write lock held for table with rt */
3985 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3986 {
3987         const struct arg_netdev_event *arg = p_arg;
3988         const struct net_device *dev = arg->dev;
3989         struct net *net = dev_net(dev);
3990
3991         if (rt == net->ipv6.fib6_null_entry)
3992                 return 0;
3993
3994         switch (arg->event) {
3995         case NETDEV_UNREGISTER:
3996                 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3997         case NETDEV_DOWN:
3998                 if (rt->should_flush)
3999                         return -1;
4000                 if (!rt->fib6_nsiblings)
4001                         return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4002                 if (rt6_multipath_uses_dev(rt, dev)) {
4003                         unsigned int count;
4004
4005                         count = rt6_multipath_dead_count(rt, dev);
4006                         if (rt->fib6_nsiblings + 1 == count) {
4007                                 rt6_multipath_flush(rt);
4008                                 return -1;
4009                         }
4010                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4011                                                    RTNH_F_LINKDOWN);
4012                         fib6_update_sernum(net, rt);
4013                         rt6_multipath_rebalance(rt);
4014                 }
4015                 return -2;
4016         case NETDEV_CHANGE:
4017                 if (rt->fib6_nh.nh_dev != dev ||
4018                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4019                         break;
4020                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4021                 rt6_multipath_rebalance(rt);
4022                 break;
4023         }
4024
4025         return 0;
4026 }
4027
4028 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4029 {
4030         struct arg_netdev_event arg = {
4031                 .dev = dev,
4032                 {
4033                         .event = event,
4034                 },
4035         };
4036         struct net *net = dev_net(dev);
4037
4038         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4039                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4040         else
4041                 fib6_clean_all(net, fib6_ifdown, &arg);
4042 }
4043
4044 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4045 {
4046         rt6_sync_down_dev(dev, event);
4047         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4048         neigh_ifdown(&nd_tbl, dev);
4049 }
4050
4051 struct rt6_mtu_change_arg {
4052         struct net_device *dev;
4053         unsigned int mtu;
4054 };
4055
4056 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4057 {
4058         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4059         struct inet6_dev *idev;
4060
4061         /* In IPv6 pmtu discovery is not optional,
4062            so that RTAX_MTU lock cannot disable it.
4063            We still use this lock to block changes
4064            caused by addrconf/ndisc.
4065         */
4066
4067         idev = __in6_dev_get(arg->dev);
4068         if (!idev)
4069                 return 0;
4070
4071         /* For administrative MTU increase, there is no way to discover
4072            IPv6 PMTU increase, so PMTU increase should be updated here.
4073            Since RFC 1981 doesn't include administrative MTU increase
4074            update PMTU increase is a MUST. (i.e. jumbo frame)
4075          */
4076         if (rt->fib6_nh.nh_dev == arg->dev &&
4077             !fib6_metric_locked(rt, RTAX_MTU)) {
4078                 u32 mtu = rt->fib6_pmtu;
4079
4080                 if (mtu >= arg->mtu ||
4081                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4082                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4083
4084                 spin_lock_bh(&rt6_exception_lock);
4085                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4086                 spin_unlock_bh(&rt6_exception_lock);
4087         }
4088         return 0;
4089 }
4090
4091 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4092 {
4093         struct rt6_mtu_change_arg arg = {
4094                 .dev = dev,
4095                 .mtu = mtu,
4096         };
4097
4098         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4099 }
4100
4101 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4102         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4103         [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
4104         [RTA_OIF]               = { .type = NLA_U32 },
4105         [RTA_IIF]               = { .type = NLA_U32 },
4106         [RTA_PRIORITY]          = { .type = NLA_U32 },
4107         [RTA_METRICS]           = { .type = NLA_NESTED },
4108         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
4109         [RTA_PREF]              = { .type = NLA_U8 },
4110         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
4111         [RTA_ENCAP]             = { .type = NLA_NESTED },
4112         [RTA_EXPIRES]           = { .type = NLA_U32 },
4113         [RTA_UID]               = { .type = NLA_U32 },
4114         [RTA_MARK]              = { .type = NLA_U32 },
4115         [RTA_TABLE]             = { .type = NLA_U32 },
4116         [RTA_IP_PROTO]          = { .type = NLA_U8 },
4117         [RTA_SPORT]             = { .type = NLA_U16 },
4118         [RTA_DPORT]             = { .type = NLA_U16 },
4119 };
4120
4121 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4122                               struct fib6_config *cfg,
4123                               struct netlink_ext_ack *extack)
4124 {
4125         struct rtmsg *rtm;
4126         struct nlattr *tb[RTA_MAX+1];
4127         unsigned int pref;
4128         int err;
4129
4130         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4131                           extack);
4132         if (err < 0)
4133                 goto errout;
4134
4135         err = -EINVAL;
4136         rtm = nlmsg_data(nlh);
4137
4138         *cfg = (struct fib6_config){
4139                 .fc_table = rtm->rtm_table,
4140                 .fc_dst_len = rtm->rtm_dst_len,
4141                 .fc_src_len = rtm->rtm_src_len,
4142                 .fc_flags = RTF_UP,
4143                 .fc_protocol = rtm->rtm_protocol,
4144                 .fc_type = rtm->rtm_type,
4145
4146                 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4147                 .fc_nlinfo.nlh = nlh,
4148                 .fc_nlinfo.nl_net = sock_net(skb->sk),
4149         };
4150
4151         if (rtm->rtm_type == RTN_UNREACHABLE ||
4152             rtm->rtm_type == RTN_BLACKHOLE ||
4153             rtm->rtm_type == RTN_PROHIBIT ||
4154             rtm->rtm_type == RTN_THROW)
4155                 cfg->fc_flags |= RTF_REJECT;
4156
4157         if (rtm->rtm_type == RTN_LOCAL)
4158                 cfg->fc_flags |= RTF_LOCAL;
4159
4160         if (rtm->rtm_flags & RTM_F_CLONED)
4161                 cfg->fc_flags |= RTF_CACHE;
4162
4163         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4164
4165         if (tb[RTA_GATEWAY]) {
4166                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4167                 cfg->fc_flags |= RTF_GATEWAY;
4168         }
4169
4170         if (tb[RTA_DST]) {
4171                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4172
4173                 if (nla_len(tb[RTA_DST]) < plen)
4174                         goto errout;
4175
4176                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4177         }
4178
4179         if (tb[RTA_SRC]) {
4180                 int plen = (rtm->rtm_src_len + 7) >> 3;
4181
4182                 if (nla_len(tb[RTA_SRC]) < plen)
4183                         goto errout;
4184
4185                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4186         }
4187
4188         if (tb[RTA_PREFSRC])
4189                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4190
4191         if (tb[RTA_OIF])
4192                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4193
4194         if (tb[RTA_PRIORITY])
4195                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4196
4197         if (tb[RTA_METRICS]) {
4198                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4199                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4200         }
4201
4202         if (tb[RTA_TABLE])
4203                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4204
4205         if (tb[RTA_MULTIPATH]) {
4206                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4207                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4208
4209                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4210                                                      cfg->fc_mp_len, extack);
4211                 if (err < 0)
4212                         goto errout;
4213         }
4214
4215         if (tb[RTA_PREF]) {
4216                 pref = nla_get_u8(tb[RTA_PREF]);
4217                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4218                     pref != ICMPV6_ROUTER_PREF_HIGH)
4219                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4220                 cfg->fc_flags |= RTF_PREF(pref);
4221         }
4222
4223         if (tb[RTA_ENCAP])
4224                 cfg->fc_encap = tb[RTA_ENCAP];
4225
4226         if (tb[RTA_ENCAP_TYPE]) {
4227                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4228
4229                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4230                 if (err < 0)
4231                         goto errout;
4232         }
4233
4234         if (tb[RTA_EXPIRES]) {
4235                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4236
4237                 if (addrconf_finite_timeout(timeout)) {
4238                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4239                         cfg->fc_flags |= RTF_EXPIRES;
4240                 }
4241         }
4242
4243         err = 0;
4244 errout:
4245         return err;
4246 }
4247
4248 struct rt6_nh {
4249         struct fib6_info *fib6_info;
4250         struct fib6_config r_cfg;
4251         struct list_head next;
4252 };
4253
4254 static int ip6_route_info_append(struct net *net,
4255                                  struct list_head *rt6_nh_list,
4256                                  struct fib6_info *rt,
4257                                  struct fib6_config *r_cfg)
4258 {
4259         struct rt6_nh *nh;
4260         int err = -EEXIST;
4261
4262         list_for_each_entry(nh, rt6_nh_list, next) {
4263                 /* check if fib6_info already exists */
4264                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4265                         return err;
4266         }
4267
4268         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4269         if (!nh)
4270                 return -ENOMEM;
4271         nh->fib6_info = rt;
4272         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4273         list_add_tail(&nh->next, rt6_nh_list);
4274
4275         return 0;
4276 }
4277
4278 static void ip6_route_mpath_notify(struct fib6_info *rt,
4279                                    struct fib6_info *rt_last,
4280                                    struct nl_info *info,
4281                                    __u16 nlflags)
4282 {
4283         /* if this is an APPEND route, then rt points to the first route
4284          * inserted and rt_last points to last route inserted. Userspace
4285          * wants a consistent dump of the route which starts at the first
4286          * nexthop. Since sibling routes are always added at the end of
4287          * the list, find the first sibling of the last route appended
4288          */
4289         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4290                 rt = list_first_entry(&rt_last->fib6_siblings,
4291                                       struct fib6_info,
4292                                       fib6_siblings);
4293         }
4294
4295         if (rt)
4296                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4297 }
4298
4299 static int ip6_route_multipath_add(struct fib6_config *cfg,
4300                                    struct netlink_ext_ack *extack)
4301 {
4302         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4303         struct nl_info *info = &cfg->fc_nlinfo;
4304         struct fib6_config r_cfg;
4305         struct rtnexthop *rtnh;
4306         struct fib6_info *rt;
4307         struct rt6_nh *err_nh;
4308         struct rt6_nh *nh, *nh_safe;
4309         __u16 nlflags;
4310         int remaining;
4311         int attrlen;
4312         int err = 1;
4313         int nhn = 0;
4314         int replace = (cfg->fc_nlinfo.nlh &&
4315                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4316         LIST_HEAD(rt6_nh_list);
4317
4318         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4319         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4320                 nlflags |= NLM_F_APPEND;
4321
4322         remaining = cfg->fc_mp_len;
4323         rtnh = (struct rtnexthop *)cfg->fc_mp;
4324
4325         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4326          * fib6_info structs per nexthop
4327          */
4328         while (rtnh_ok(rtnh, remaining)) {
4329                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4330                 if (rtnh->rtnh_ifindex)
4331                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4332
4333                 attrlen = rtnh_attrlen(rtnh);
4334                 if (attrlen > 0) {
4335                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4336
4337                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4338                         if (nla) {
4339                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4340                                 r_cfg.fc_flags |= RTF_GATEWAY;
4341                         }
4342                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4343                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4344                         if (nla)
4345                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4346                 }
4347
4348                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4349                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4350                 if (IS_ERR(rt)) {
4351                         err = PTR_ERR(rt);
4352                         rt = NULL;
4353                         goto cleanup;
4354                 }
4355                 if (!rt6_qualify_for_ecmp(rt)) {
4356                         err = -EINVAL;
4357                         NL_SET_ERR_MSG(extack,
4358                                        "Device only routes can not be added for IPv6 using the multipath API.");
4359                         fib6_info_release(rt);
4360                         goto cleanup;
4361                 }
4362
4363                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4364
4365                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4366                                             rt, &r_cfg);
4367                 if (err) {
4368                         fib6_info_release(rt);
4369                         goto cleanup;
4370                 }
4371
4372                 rtnh = rtnh_next(rtnh, &remaining);
4373         }
4374
4375         /* for add and replace send one notification with all nexthops.
4376          * Skip the notification in fib6_add_rt2node and send one with
4377          * the full route when done
4378          */
4379         info->skip_notify = 1;
4380
4381         err_nh = NULL;
4382         list_for_each_entry(nh, &rt6_nh_list, next) {
4383                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4384                 fib6_info_release(nh->fib6_info);
4385
4386                 if (!err) {
4387                         /* save reference to last route successfully inserted */
4388                         rt_last = nh->fib6_info;
4389
4390                         /* save reference to first route for notification */
4391                         if (!rt_notif)
4392                                 rt_notif = nh->fib6_info;
4393                 }
4394
4395                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4396                 nh->fib6_info = NULL;
4397                 if (err) {
4398                         if (replace && nhn)
4399                                 NL_SET_ERR_MSG_MOD(extack,
4400                                                    "multipath route replace failed (check consistency of installed routes)");
4401                         err_nh = nh;
4402                         goto add_errout;
4403                 }
4404
4405                 /* Because each route is added like a single route we remove
4406                  * these flags after the first nexthop: if there is a collision,
4407                  * we have already failed to add the first nexthop:
4408                  * fib6_add_rt2node() has rejected it; when replacing, old
4409                  * nexthops have been replaced by first new, the rest should
4410                  * be added to it.
4411                  */
4412                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4413                                                      NLM_F_REPLACE);
4414                 nhn++;
4415         }
4416
4417         /* success ... tell user about new route */
4418         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4419         goto cleanup;
4420
4421 add_errout:
4422         /* send notification for routes that were added so that
4423          * the delete notifications sent by ip6_route_del are
4424          * coherent
4425          */
4426         if (rt_notif)
4427                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4428
4429         /* Delete routes that were already added */
4430         list_for_each_entry(nh, &rt6_nh_list, next) {
4431                 if (err_nh == nh)
4432                         break;
4433                 ip6_route_del(&nh->r_cfg, extack);
4434         }
4435
4436 cleanup:
4437         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4438                 if (nh->fib6_info)
4439                         fib6_info_release(nh->fib6_info);
4440                 list_del(&nh->next);
4441                 kfree(nh);
4442         }
4443
4444         return err;
4445 }
4446
4447 static int ip6_route_multipath_del(struct fib6_config *cfg,
4448                                    struct netlink_ext_ack *extack)
4449 {
4450         struct fib6_config r_cfg;
4451         struct rtnexthop *rtnh;
4452         int remaining;
4453         int attrlen;
4454         int err = 1, last_err = 0;
4455
4456         remaining = cfg->fc_mp_len;
4457         rtnh = (struct rtnexthop *)cfg->fc_mp;
4458
4459         /* Parse a Multipath Entry */
4460         while (rtnh_ok(rtnh, remaining)) {
4461                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4462                 if (rtnh->rtnh_ifindex)
4463                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4464
4465                 attrlen = rtnh_attrlen(rtnh);
4466                 if (attrlen > 0) {
4467                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4468
4469                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4470                         if (nla) {
4471                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4472                                 r_cfg.fc_flags |= RTF_GATEWAY;
4473                         }
4474                 }
4475                 err = ip6_route_del(&r_cfg, extack);
4476                 if (err)
4477                         last_err = err;
4478
4479                 rtnh = rtnh_next(rtnh, &remaining);
4480         }
4481
4482         return last_err;
4483 }
4484
4485 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4486                               struct netlink_ext_ack *extack)
4487 {
4488         struct fib6_config cfg;
4489         int err;
4490
4491         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4492         if (err < 0)
4493                 return err;
4494
4495         if (cfg.fc_mp)
4496                 return ip6_route_multipath_del(&cfg, extack);
4497         else {
4498                 cfg.fc_delete_all_nh = 1;
4499                 return ip6_route_del(&cfg, extack);
4500         }
4501 }
4502
4503 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4504                               struct netlink_ext_ack *extack)
4505 {
4506         struct fib6_config cfg;
4507         int err;
4508
4509         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4510         if (err < 0)
4511                 return err;
4512
4513         if (cfg.fc_mp)
4514                 return ip6_route_multipath_add(&cfg, extack);
4515         else
4516                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4517 }
4518
4519 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4520 {
4521         int nexthop_len = 0;
4522
4523         if (rt->fib6_nsiblings) {
4524                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4525                             + NLA_ALIGN(sizeof(struct rtnexthop))
4526                             + nla_total_size(16) /* RTA_GATEWAY */
4527                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4528
4529                 nexthop_len *= rt->fib6_nsiblings;
4530         }
4531
4532         return NLMSG_ALIGN(sizeof(struct rtmsg))
4533                + nla_total_size(16) /* RTA_SRC */
4534                + nla_total_size(16) /* RTA_DST */
4535                + nla_total_size(16) /* RTA_GATEWAY */
4536                + nla_total_size(16) /* RTA_PREFSRC */
4537                + nla_total_size(4) /* RTA_TABLE */
4538                + nla_total_size(4) /* RTA_IIF */
4539                + nla_total_size(4) /* RTA_OIF */
4540                + nla_total_size(4) /* RTA_PRIORITY */
4541                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4542                + nla_total_size(sizeof(struct rta_cacheinfo))
4543                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4544                + nla_total_size(1) /* RTA_PREF */
4545                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4546                + nexthop_len;
4547 }
4548
4549 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4550                             unsigned int *flags, bool skip_oif)
4551 {
4552         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4553                 *flags |= RTNH_F_DEAD;
4554
4555         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4556                 *flags |= RTNH_F_LINKDOWN;
4557
4558                 rcu_read_lock();
4559                 if (fib6_ignore_linkdown(rt))
4560                         *flags |= RTNH_F_DEAD;
4561                 rcu_read_unlock();
4562         }
4563
4564         if (rt->fib6_flags & RTF_GATEWAY) {
4565                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4566                         goto nla_put_failure;
4567         }
4568
4569         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4570         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4571                 *flags |= RTNH_F_OFFLOAD;
4572
4573         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4574         if (!skip_oif && rt->fib6_nh.nh_dev &&
4575             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4576                 goto nla_put_failure;
4577
4578         if (rt->fib6_nh.nh_lwtstate &&
4579             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4580                 goto nla_put_failure;
4581
4582         return 0;
4583
4584 nla_put_failure:
4585         return -EMSGSIZE;
4586 }
4587
4588 /* add multipath next hop */
4589 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4590 {
4591         const struct net_device *dev = rt->fib6_nh.nh_dev;
4592         struct rtnexthop *rtnh;
4593         unsigned int flags = 0;
4594
4595         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4596         if (!rtnh)
4597                 goto nla_put_failure;
4598
4599         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4600         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4601
4602         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4603                 goto nla_put_failure;
4604
4605         rtnh->rtnh_flags = flags;
4606
4607         /* length of rtnetlink header + attributes */
4608         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4609
4610         return 0;
4611
4612 nla_put_failure:
4613         return -EMSGSIZE;
4614 }
4615
4616 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4617                          struct fib6_info *rt, struct dst_entry *dst,
4618                          struct in6_addr *dest, struct in6_addr *src,
4619                          int iif, int type, u32 portid, u32 seq,
4620                          unsigned int flags)
4621 {
4622         struct rt6_info *rt6 = (struct rt6_info *)dst;
4623         struct rt6key *rt6_dst, *rt6_src;
4624         u32 *pmetrics, table, rt6_flags;
4625         struct nlmsghdr *nlh;
4626         struct rtmsg *rtm;
4627         long expires = 0;
4628
4629         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4630         if (!nlh)
4631                 return -EMSGSIZE;
4632
4633         if (rt6) {
4634                 rt6_dst = &rt6->rt6i_dst;
4635                 rt6_src = &rt6->rt6i_src;
4636                 rt6_flags = rt6->rt6i_flags;
4637         } else {
4638                 rt6_dst = &rt->fib6_dst;
4639                 rt6_src = &rt->fib6_src;
4640                 rt6_flags = rt->fib6_flags;
4641         }
4642
4643         rtm = nlmsg_data(nlh);
4644         rtm->rtm_family = AF_INET6;
4645         rtm->rtm_dst_len = rt6_dst->plen;
4646         rtm->rtm_src_len = rt6_src->plen;
4647         rtm->rtm_tos = 0;
4648         if (rt->fib6_table)
4649                 table = rt->fib6_table->tb6_id;
4650         else
4651                 table = RT6_TABLE_UNSPEC;
4652         rtm->rtm_table = table;
4653         if (nla_put_u32(skb, RTA_TABLE, table))
4654                 goto nla_put_failure;
4655
4656         rtm->rtm_type = rt->fib6_type;
4657         rtm->rtm_flags = 0;
4658         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4659         rtm->rtm_protocol = rt->fib6_protocol;
4660
4661         if (rt6_flags & RTF_CACHE)
4662                 rtm->rtm_flags |= RTM_F_CLONED;
4663
4664         if (dest) {
4665                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4666                         goto nla_put_failure;
4667                 rtm->rtm_dst_len = 128;
4668         } else if (rtm->rtm_dst_len)
4669                 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4670                         goto nla_put_failure;
4671 #ifdef CONFIG_IPV6_SUBTREES
4672         if (src) {
4673                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4674                         goto nla_put_failure;
4675                 rtm->rtm_src_len = 128;
4676         } else if (rtm->rtm_src_len &&
4677                    nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4678                 goto nla_put_failure;
4679 #endif
4680         if (iif) {
4681 #ifdef CONFIG_IPV6_MROUTE
4682                 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4683                         int err = ip6mr_get_route(net, skb, rtm, portid);
4684
4685                         if (err == 0)
4686                                 return 0;
4687                         if (err < 0)
4688                                 goto nla_put_failure;
4689                 } else
4690 #endif
4691                         if (nla_put_u32(skb, RTA_IIF, iif))
4692                                 goto nla_put_failure;
4693         } else if (dest) {
4694                 struct in6_addr saddr_buf;
4695                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4696                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4697                         goto nla_put_failure;
4698         }
4699
4700         if (rt->fib6_prefsrc.plen) {
4701                 struct in6_addr saddr_buf;
4702                 saddr_buf = rt->fib6_prefsrc.addr;
4703                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4704                         goto nla_put_failure;
4705         }
4706
4707         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4708         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4709                 goto nla_put_failure;
4710
4711         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4712                 goto nla_put_failure;
4713
4714         /* For multipath routes, walk the siblings list and add
4715          * each as a nexthop within RTA_MULTIPATH.
4716          */
4717         if (rt6) {
4718                 if (rt6_flags & RTF_GATEWAY &&
4719                     nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4720                         goto nla_put_failure;
4721
4722                 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4723                         goto nla_put_failure;
4724         } else if (rt->fib6_nsiblings) {
4725                 struct fib6_info *sibling, *next_sibling;
4726                 struct nlattr *mp;
4727
4728                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4729                 if (!mp)
4730                         goto nla_put_failure;
4731
4732                 if (rt6_add_nexthop(skb, rt) < 0)
4733                         goto nla_put_failure;
4734
4735                 list_for_each_entry_safe(sibling, next_sibling,
4736                                          &rt->fib6_siblings, fib6_siblings) {
4737                         if (rt6_add_nexthop(skb, sibling) < 0)
4738                                 goto nla_put_failure;
4739                 }
4740
4741                 nla_nest_end(skb, mp);
4742         } else {
4743                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4744                         goto nla_put_failure;
4745         }
4746
4747         if (rt6_flags & RTF_EXPIRES) {
4748                 expires = dst ? dst->expires : rt->expires;
4749                 expires -= jiffies;
4750         }
4751
4752         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4753                 goto nla_put_failure;
4754
4755         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4756                 goto nla_put_failure;
4757
4758
4759         nlmsg_end(skb, nlh);
4760         return 0;
4761
4762 nla_put_failure:
4763         nlmsg_cancel(skb, nlh);
4764         return -EMSGSIZE;
4765 }
4766
4767 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4768                                const struct net_device *dev)
4769 {
4770         if (f6i->fib6_nh.nh_dev == dev)
4771                 return true;
4772
4773         if (f6i->fib6_nsiblings) {
4774                 struct fib6_info *sibling, *next_sibling;
4775
4776                 list_for_each_entry_safe(sibling, next_sibling,
4777                                          &f6i->fib6_siblings, fib6_siblings) {
4778                         if (sibling->fib6_nh.nh_dev == dev)
4779                                 return true;
4780                 }
4781         }
4782
4783         return false;
4784 }
4785
4786 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4787 {
4788         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4789         struct fib_dump_filter *filter = &arg->filter;
4790         unsigned int flags = NLM_F_MULTI;
4791         struct net *net = arg->net;
4792
4793         if (rt == net->ipv6.fib6_null_entry)
4794                 return 0;
4795
4796         if ((filter->flags & RTM_F_PREFIX) &&
4797             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4798                 /* success since this is not a prefix route */
4799                 return 1;
4800         }
4801         if (filter->filter_set) {
4802                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4803                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4804                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4805                         return 1;
4806                 }
4807                 flags |= NLM_F_DUMP_FILTERED;
4808         }
4809
4810         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4811                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4812                              arg->cb->nlh->nlmsg_seq, flags);
4813 }
4814
4815 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4816                               struct netlink_ext_ack *extack)
4817 {
4818         struct net *net = sock_net(in_skb->sk);
4819         struct nlattr *tb[RTA_MAX+1];
4820         int err, iif = 0, oif = 0;
4821         struct fib6_info *from;
4822         struct dst_entry *dst;
4823         struct rt6_info *rt;
4824         struct sk_buff *skb;
4825         struct rtmsg *rtm;
4826         struct flowi6 fl6 = {};
4827         bool fibmatch;
4828
4829         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4830                           extack);
4831         if (err < 0)
4832                 goto errout;
4833
4834         err = -EINVAL;
4835         rtm = nlmsg_data(nlh);
4836         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4837         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4838
4839         if (tb[RTA_SRC]) {
4840                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4841                         goto errout;
4842
4843                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4844         }
4845
4846         if (tb[RTA_DST]) {
4847                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4848                         goto errout;
4849
4850                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4851         }
4852
4853         if (tb[RTA_IIF])
4854                 iif = nla_get_u32(tb[RTA_IIF]);
4855
4856         if (tb[RTA_OIF])
4857                 oif = nla_get_u32(tb[RTA_OIF]);
4858
4859         if (tb[RTA_MARK])
4860                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4861
4862         if (tb[RTA_UID])
4863                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4864                                            nla_get_u32(tb[RTA_UID]));
4865         else
4866                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4867
4868         if (tb[RTA_SPORT])
4869                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4870
4871         if (tb[RTA_DPORT])
4872                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4873
4874         if (tb[RTA_IP_PROTO]) {
4875                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4876                                                   &fl6.flowi6_proto, extack);
4877                 if (err)
4878                         goto errout;
4879         }
4880
4881         if (iif) {
4882                 struct net_device *dev;
4883                 int flags = 0;
4884
4885                 rcu_read_lock();
4886
4887                 dev = dev_get_by_index_rcu(net, iif);
4888                 if (!dev) {
4889                         rcu_read_unlock();
4890                         err = -ENODEV;
4891                         goto errout;
4892                 }
4893
4894                 fl6.flowi6_iif = iif;
4895
4896                 if (!ipv6_addr_any(&fl6.saddr))
4897                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4898
4899                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4900
4901                 rcu_read_unlock();
4902         } else {
4903                 fl6.flowi6_oif = oif;
4904
4905                 dst = ip6_route_output(net, NULL, &fl6);
4906         }
4907
4908
4909         rt = container_of(dst, struct rt6_info, dst);
4910         if (rt->dst.error) {
4911                 err = rt->dst.error;
4912                 ip6_rt_put(rt);
4913                 goto errout;
4914         }
4915
4916         if (rt == net->ipv6.ip6_null_entry) {
4917                 err = rt->dst.error;
4918                 ip6_rt_put(rt);
4919                 goto errout;
4920         }
4921
4922         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4923         if (!skb) {
4924                 ip6_rt_put(rt);
4925                 err = -ENOBUFS;
4926                 goto errout;
4927         }
4928
4929         skb_dst_set(skb, &rt->dst);
4930
4931         rcu_read_lock();
4932         from = rcu_dereference(rt->from);
4933
4934         if (fibmatch)
4935                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4936                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4937                                     nlh->nlmsg_seq, 0);
4938         else
4939                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4940                                     &fl6.saddr, iif, RTM_NEWROUTE,
4941                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4942                                     0);
4943         rcu_read_unlock();
4944
4945         if (err < 0) {
4946                 kfree_skb(skb);
4947                 goto errout;
4948         }
4949
4950         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4951 errout:
4952         return err;
4953 }
4954
4955 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4956                      unsigned int nlm_flags)
4957 {
4958         struct sk_buff *skb;
4959         struct net *net = info->nl_net;
4960         u32 seq;
4961         int err;
4962
4963         err = -ENOBUFS;
4964         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4965
4966         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4967         if (!skb)
4968                 goto errout;
4969
4970         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4971                             event, info->portid, seq, nlm_flags);
4972         if (err < 0) {
4973                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4974                 WARN_ON(err == -EMSGSIZE);
4975                 kfree_skb(skb);
4976                 goto errout;
4977         }
4978         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4979                     info->nlh, gfp_any());
4980         return;
4981 errout:
4982         if (err < 0)
4983                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4984 }
4985
4986 static int ip6_route_dev_notify(struct notifier_block *this,
4987                                 unsigned long event, void *ptr)
4988 {
4989         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4990         struct net *net = dev_net(dev);
4991
4992         if (!(dev->flags & IFF_LOOPBACK))
4993                 return NOTIFY_OK;
4994
4995         if (event == NETDEV_REGISTER) {
4996                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4997                 net->ipv6.ip6_null_entry->dst.dev = dev;
4998                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4999 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5000                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5001                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5002                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5003                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5004 #endif
5005          } else if (event == NETDEV_UNREGISTER &&
5006                     dev->reg_state != NETREG_UNREGISTERED) {
5007                 /* NETDEV_UNREGISTER could be fired for multiple times by
5008                  * netdev_wait_allrefs(). Make sure we only call this once.
5009                  */
5010                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5011 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5012                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5013                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5014 #endif
5015         }
5016
5017         return NOTIFY_OK;
5018 }
5019
5020 /*
5021  *      /proc
5022  */
5023
5024 #ifdef CONFIG_PROC_FS
5025 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5026 {
5027         struct net *net = (struct net *)seq->private;
5028         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5029                    net->ipv6.rt6_stats->fib_nodes,
5030                    net->ipv6.rt6_stats->fib_route_nodes,
5031                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5032                    net->ipv6.rt6_stats->fib_rt_entries,
5033                    net->ipv6.rt6_stats->fib_rt_cache,
5034                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5035                    net->ipv6.rt6_stats->fib_discarded_routes);
5036
5037         return 0;
5038 }
5039 #endif  /* CONFIG_PROC_FS */
5040
5041 #ifdef CONFIG_SYSCTL
5042
5043 static
5044 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5045                               void __user *buffer, size_t *lenp, loff_t *ppos)
5046 {
5047         struct net *net;
5048         int delay;
5049         int ret;
5050         if (!write)
5051                 return -EINVAL;
5052
5053         net = (struct net *)ctl->extra1;
5054         delay = net->ipv6.sysctl.flush_delay;
5055         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5056         if (ret)
5057                 return ret;
5058
5059         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5060         return 0;
5061 }
5062
5063 static int zero;
5064 static int one = 1;
5065
5066 static struct ctl_table ipv6_route_table_template[] = {
5067         {
5068                 .procname       =       "flush",
5069                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5070                 .maxlen         =       sizeof(int),
5071                 .mode           =       0200,
5072                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5073         },
5074         {
5075                 .procname       =       "gc_thresh",
5076                 .data           =       &ip6_dst_ops_template.gc_thresh,
5077                 .maxlen         =       sizeof(int),
5078                 .mode           =       0644,
5079                 .proc_handler   =       proc_dointvec,
5080         },
5081         {
5082                 .procname       =       "max_size",
5083                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5084                 .maxlen         =       sizeof(int),
5085                 .mode           =       0644,
5086                 .proc_handler   =       proc_dointvec,
5087         },
5088         {
5089                 .procname       =       "gc_min_interval",
5090                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5091                 .maxlen         =       sizeof(int),
5092                 .mode           =       0644,
5093                 .proc_handler   =       proc_dointvec_jiffies,
5094         },
5095         {
5096                 .procname       =       "gc_timeout",
5097                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5098                 .maxlen         =       sizeof(int),
5099                 .mode           =       0644,
5100                 .proc_handler   =       proc_dointvec_jiffies,
5101         },
5102         {
5103                 .procname       =       "gc_interval",
5104                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5105                 .maxlen         =       sizeof(int),
5106                 .mode           =       0644,
5107                 .proc_handler   =       proc_dointvec_jiffies,
5108         },
5109         {
5110                 .procname       =       "gc_elasticity",
5111                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5112                 .maxlen         =       sizeof(int),
5113                 .mode           =       0644,
5114                 .proc_handler   =       proc_dointvec,
5115         },
5116         {
5117                 .procname       =       "mtu_expires",
5118                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5119                 .maxlen         =       sizeof(int),
5120                 .mode           =       0644,
5121                 .proc_handler   =       proc_dointvec_jiffies,
5122         },
5123         {
5124                 .procname       =       "min_adv_mss",
5125                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5126                 .maxlen         =       sizeof(int),
5127                 .mode           =       0644,
5128                 .proc_handler   =       proc_dointvec,
5129         },
5130         {
5131                 .procname       =       "gc_min_interval_ms",
5132                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5133                 .maxlen         =       sizeof(int),
5134                 .mode           =       0644,
5135                 .proc_handler   =       proc_dointvec_ms_jiffies,
5136         },
5137         {
5138                 .procname       =       "skip_notify_on_dev_down",
5139                 .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5140                 .maxlen         =       sizeof(int),
5141                 .mode           =       0644,
5142                 .proc_handler   =       proc_dointvec,
5143                 .extra1         =       &zero,
5144                 .extra2         =       &one,
5145         },
5146         { }
5147 };
5148
5149 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5150 {
5151         struct ctl_table *table;
5152
5153         table = kmemdup(ipv6_route_table_template,
5154                         sizeof(ipv6_route_table_template),
5155                         GFP_KERNEL);
5156
5157         if (table) {
5158                 table[0].data = &net->ipv6.sysctl.flush_delay;
5159                 table[0].extra1 = net;
5160                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5161                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5162                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5163                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5164                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5165                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5166                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5167                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5168                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5169                 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5170
5171                 /* Don't export sysctls to unprivileged users */
5172                 if (net->user_ns != &init_user_ns)
5173                         table[0].procname = NULL;
5174         }
5175
5176         return table;
5177 }
5178 #endif
5179
5180 static int __net_init ip6_route_net_init(struct net *net)
5181 {
5182         int ret = -ENOMEM;
5183
5184         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5185                sizeof(net->ipv6.ip6_dst_ops));
5186
5187         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5188                 goto out_ip6_dst_ops;
5189
5190         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5191                                             sizeof(*net->ipv6.fib6_null_entry),
5192                                             GFP_KERNEL);
5193         if (!net->ipv6.fib6_null_entry)
5194                 goto out_ip6_dst_entries;
5195
5196         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5197                                            sizeof(*net->ipv6.ip6_null_entry),
5198                                            GFP_KERNEL);
5199         if (!net->ipv6.ip6_null_entry)
5200                 goto out_fib6_null_entry;
5201         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5202         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5203                          ip6_template_metrics, true);
5204
5205 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5206         net->ipv6.fib6_has_custom_rules = false;
5207         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5208                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5209                                                GFP_KERNEL);
5210         if (!net->ipv6.ip6_prohibit_entry)
5211                 goto out_ip6_null_entry;
5212         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5213         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5214                          ip6_template_metrics, true);
5215
5216         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5217                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5218                                                GFP_KERNEL);
5219         if (!net->ipv6.ip6_blk_hole_entry)
5220                 goto out_ip6_prohibit_entry;
5221         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5222         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5223                          ip6_template_metrics, true);
5224 #endif
5225
5226         net->ipv6.sysctl.flush_delay = 0;
5227         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5228         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5229         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5230         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5231         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5232         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5233         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5234         net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5235
5236         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5237
5238         ret = 0;
5239 out:
5240         return ret;
5241
5242 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5243 out_ip6_prohibit_entry:
5244         kfree(net->ipv6.ip6_prohibit_entry);
5245 out_ip6_null_entry:
5246         kfree(net->ipv6.ip6_null_entry);
5247 #endif
5248 out_fib6_null_entry:
5249         kfree(net->ipv6.fib6_null_entry);
5250 out_ip6_dst_entries:
5251         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5252 out_ip6_dst_ops:
5253         goto out;
5254 }
5255
5256 static void __net_exit ip6_route_net_exit(struct net *net)
5257 {
5258         kfree(net->ipv6.fib6_null_entry);
5259         kfree(net->ipv6.ip6_null_entry);
5260 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5261         kfree(net->ipv6.ip6_prohibit_entry);
5262         kfree(net->ipv6.ip6_blk_hole_entry);
5263 #endif
5264         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5265 }
5266
5267 static int __net_init ip6_route_net_init_late(struct net *net)
5268 {
5269 #ifdef CONFIG_PROC_FS
5270         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5271                         sizeof(struct ipv6_route_iter));
5272         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5273                         rt6_stats_seq_show, NULL);
5274 #endif
5275         return 0;
5276 }
5277
5278 static void __net_exit ip6_route_net_exit_late(struct net *net)
5279 {
5280 #ifdef CONFIG_PROC_FS
5281         remove_proc_entry("ipv6_route", net->proc_net);
5282         remove_proc_entry("rt6_stats", net->proc_net);
5283 #endif
5284 }
5285
5286 static struct pernet_operations ip6_route_net_ops = {
5287         .init = ip6_route_net_init,
5288         .exit = ip6_route_net_exit,
5289 };
5290
5291 static int __net_init ipv6_inetpeer_init(struct net *net)
5292 {
5293         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5294
5295         if (!bp)
5296                 return -ENOMEM;
5297         inet_peer_base_init(bp);
5298         net->ipv6.peers = bp;
5299         return 0;
5300 }
5301
5302 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5303 {
5304         struct inet_peer_base *bp = net->ipv6.peers;
5305
5306         net->ipv6.peers = NULL;
5307         inetpeer_invalidate_tree(bp);
5308         kfree(bp);
5309 }
5310
5311 static struct pernet_operations ipv6_inetpeer_ops = {
5312         .init   =       ipv6_inetpeer_init,
5313         .exit   =       ipv6_inetpeer_exit,
5314 };
5315
5316 static struct pernet_operations ip6_route_net_late_ops = {
5317         .init = ip6_route_net_init_late,
5318         .exit = ip6_route_net_exit_late,
5319 };
5320
5321 static struct notifier_block ip6_route_dev_notifier = {
5322         .notifier_call = ip6_route_dev_notify,
5323         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5324 };
5325
5326 void __init ip6_route_init_special_entries(void)
5327 {
5328         /* Registering of the loopback is done before this portion of code,
5329          * the loopback reference in rt6_info will not be taken, do it
5330          * manually for init_net */
5331         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5332         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5333         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5334   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5335         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5336         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5337         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5338         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5339   #endif
5340 }
5341
5342 int __init ip6_route_init(void)
5343 {
5344         int ret;
5345         int cpu;
5346
5347         ret = -ENOMEM;
5348         ip6_dst_ops_template.kmem_cachep =
5349                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5350                                   SLAB_HWCACHE_ALIGN, NULL);
5351         if (!ip6_dst_ops_template.kmem_cachep)
5352                 goto out;
5353
5354         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5355         if (ret)
5356                 goto out_kmem_cache;
5357
5358         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5359         if (ret)
5360                 goto out_dst_entries;
5361
5362         ret = register_pernet_subsys(&ip6_route_net_ops);
5363         if (ret)
5364                 goto out_register_inetpeer;
5365
5366         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5367
5368         ret = fib6_init();
5369         if (ret)
5370                 goto out_register_subsys;
5371
5372         ret = xfrm6_init();
5373         if (ret)
5374                 goto out_fib6_init;
5375
5376         ret = fib6_rules_init();
5377         if (ret)
5378                 goto xfrm6_init;
5379
5380         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5381         if (ret)
5382                 goto fib6_rules_init;
5383
5384         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5385                                    inet6_rtm_newroute, NULL, 0);
5386         if (ret < 0)
5387                 goto out_register_late_subsys;
5388
5389         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5390                                    inet6_rtm_delroute, NULL, 0);
5391         if (ret < 0)
5392                 goto out_register_late_subsys;
5393
5394         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5395                                    inet6_rtm_getroute, NULL,
5396                                    RTNL_FLAG_DOIT_UNLOCKED);
5397         if (ret < 0)
5398                 goto out_register_late_subsys;
5399
5400         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5401         if (ret)
5402                 goto out_register_late_subsys;
5403
5404         for_each_possible_cpu(cpu) {
5405                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5406
5407                 INIT_LIST_HEAD(&ul->head);
5408                 spin_lock_init(&ul->lock);
5409         }
5410
5411 out:
5412         return ret;
5413
5414 out_register_late_subsys:
5415         rtnl_unregister_all(PF_INET6);
5416         unregister_pernet_subsys(&ip6_route_net_late_ops);
5417 fib6_rules_init:
5418         fib6_rules_cleanup();
5419 xfrm6_init:
5420         xfrm6_fini();
5421 out_fib6_init:
5422         fib6_gc_cleanup();
5423 out_register_subsys:
5424         unregister_pernet_subsys(&ip6_route_net_ops);
5425 out_register_inetpeer:
5426         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5427 out_dst_entries:
5428         dst_entries_destroy(&ip6_dst_blackhole_ops);
5429 out_kmem_cache:
5430         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5431         goto out;
5432 }
5433
/*
 * Tear down what ip6_route_init() set up: the netdevice notifier, the late
 * per-namespace ops, FIB rules, xfrm6, the FIB, the remaining per-namespace
 * ops, the blackhole dst counters and finally the dst slab cache.
 *
 * NOTE(review): the two pernet subsystems at the end are unregistered in
 * the same order they were registered (inetpeer first), not in reverse;
 * confirm this is intentional before changing it.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	/* Destroyed last: the dst_ops used above allocate from this cache */
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}