2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
67 #include <linux/uaccess.h>
70 #include <linux/sysctl.h>
73 static int ip6_rt_type_to_error(u8 fib6_type);
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
81 RT6_NUD_FAIL_HARD = -3,
82 RT6_NUD_FAIL_PROBE = -2,
83 RT6_NUD_FAIL_DO_RR = -1,
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(struct dst_ops *ops);
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int ip6_pkt_prohibit(struct sk_buff *skb);
99 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void ip6_link_failure(struct sk_buff *skb);
101 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 struct sk_buff *skb, u32 mtu);
103 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108 struct fib6_info *rt, struct dst_entry *dst,
109 struct in6_addr *dest, struct in6_addr *src,
110 int iif, int type, u32 portid, u32 seq,
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113 struct in6_addr *daddr,
114 struct in6_addr *saddr);
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118 const struct in6_addr *prefix, int prefixlen,
119 const struct in6_addr *gwaddr,
120 struct net_device *dev,
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123 const struct in6_addr *prefix, int prefixlen,
124 const struct in6_addr *gwaddr,
125 struct net_device *dev);
128 struct uncached_list {
130 struct list_head head;
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135 void rt6_uncached_list_add(struct rt6_info *rt)
137 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139 rt->rt6i_uncached_list = ul;
141 spin_lock_bh(&ul->lock);
142 list_add_tail(&rt->rt6i_uncached, &ul->head);
143 spin_unlock_bh(&ul->lock);
146 void rt6_uncached_list_del(struct rt6_info *rt)
148 if (!list_empty(&rt->rt6i_uncached)) {
149 struct uncached_list *ul = rt->rt6i_uncached_list;
150 struct net *net = dev_net(rt->dst.dev);
152 spin_lock_bh(&ul->lock);
153 list_del(&rt->rt6i_uncached);
154 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155 spin_unlock_bh(&ul->lock);
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 struct net_device *loopback_dev = net->loopback_dev;
164 if (dev == loopback_dev)
167 for_each_possible_cpu(cpu) {
168 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
171 spin_lock_bh(&ul->lock);
172 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173 struct inet6_dev *rt_idev = rt->rt6i_idev;
174 struct net_device *rt_dev = rt->dst.dev;
176 if (rt_idev->dev == dev) {
177 rt->rt6i_idev = in6_dev_get(loopback_dev);
178 in6_dev_put(rt_idev);
182 rt->dst.dev = loopback_dev;
183 dev_hold(rt->dst.dev);
187 spin_unlock_bh(&ul->lock);
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
195 if (!ipv6_addr_any(p))
196 return (const void *) p;
198 return &ipv6_hdr(skb)->daddr;
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203 struct net_device *dev,
209 daddr = choose_neigh_daddr(gw, skb, daddr);
210 n = __ipv6_neigh_lookup(dev, daddr);
214 n = neigh_create(&nd_tbl, daddr, dev);
215 return IS_ERR(n) ? NULL : n;
218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
222 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
227 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 struct net_device *dev = dst->dev;
230 struct rt6_info *rt = (struct rt6_info *)dst;
232 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
235 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239 __ipv6_confirm_neigh(dev, daddr);
242 static struct dst_ops ip6_dst_ops_template = {
246 .check = ip6_dst_check,
247 .default_advmss = ip6_default_advmss,
249 .cow_metrics = dst_cow_metrics_generic,
250 .destroy = ip6_dst_destroy,
251 .ifdown = ip6_dst_ifdown,
252 .negative_advice = ip6_negative_advice,
253 .link_failure = ip6_link_failure,
254 .update_pmtu = ip6_rt_update_pmtu,
255 .redirect = rt6_do_redirect,
256 .local_out = __ip6_local_out,
257 .neigh_lookup = ip6_dst_neigh_lookup,
258 .confirm_neigh = ip6_confirm_neigh,
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265 return mtu ? : dst->dev->mtu;
268 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
269 struct sk_buff *skb, u32 mtu)
273 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
278 static struct dst_ops ip6_dst_blackhole_ops = {
280 .destroy = ip6_dst_destroy,
281 .check = ip6_dst_check,
282 .mtu = ip6_blackhole_mtu,
283 .default_advmss = ip6_default_advmss,
284 .update_pmtu = ip6_rt_blackhole_update_pmtu,
285 .redirect = ip6_rt_blackhole_redirect,
286 .cow_metrics = dst_cow_metrics_generic,
287 .neigh_lookup = ip6_dst_neigh_lookup,
290 static const u32 ip6_template_metrics[RTAX_MAX] = {
291 [RTAX_HOPLIMIT - 1] = 0,
294 static const struct fib6_info fib6_null_entry_template = {
295 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
296 .fib6_protocol = RTPROT_KERNEL,
297 .fib6_metric = ~(u32)0,
298 .fib6_ref = ATOMIC_INIT(1),
299 .fib6_type = RTN_UNREACHABLE,
300 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
303 static const struct rt6_info ip6_null_entry_template = {
305 .__refcnt = ATOMIC_INIT(1),
307 .obsolete = DST_OBSOLETE_FORCE_CHK,
308 .error = -ENETUNREACH,
309 .input = ip6_pkt_discard,
310 .output = ip6_pkt_discard_out,
312 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317 static const struct rt6_info ip6_prohibit_entry_template = {
319 .__refcnt = ATOMIC_INIT(1),
321 .obsolete = DST_OBSOLETE_FORCE_CHK,
323 .input = ip6_pkt_prohibit,
324 .output = ip6_pkt_prohibit_out,
326 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
329 static const struct rt6_info ip6_blk_hole_entry_template = {
331 .__refcnt = ATOMIC_INIT(1),
333 .obsolete = DST_OBSOLETE_FORCE_CHK,
335 .input = dst_discard,
336 .output = dst_discard_out,
338 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
343 static void rt6_info_init(struct rt6_info *rt)
345 struct dst_entry *dst = &rt->dst;
347 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
348 INIT_LIST_HEAD(&rt->rt6i_uncached);
351 /* allocate dst with ip6_dst_ops */
352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
355 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356 1, DST_OBSOLETE_FORCE_CHK, flags);
360 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
365 EXPORT_SYMBOL(ip6_dst_alloc);
367 static void ip6_dst_destroy(struct dst_entry *dst)
369 struct rt6_info *rt = (struct rt6_info *)dst;
370 struct fib6_info *from;
371 struct inet6_dev *idev;
373 ip_dst_metrics_put(dst);
374 rt6_uncached_list_del(rt);
376 idev = rt->rt6i_idev;
378 rt->rt6i_idev = NULL;
383 from = rcu_dereference(rt->from);
384 rcu_assign_pointer(rt->from, NULL);
385 fib6_info_release(from);
389 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
392 struct rt6_info *rt = (struct rt6_info *)dst;
393 struct inet6_dev *idev = rt->rt6i_idev;
394 struct net_device *loopback_dev =
395 dev_net(dev)->loopback_dev;
397 if (idev && idev->dev != loopback_dev) {
398 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
400 rt->rt6i_idev = loopback_idev;
406 static bool __rt6_check_expired(const struct rt6_info *rt)
408 if (rt->rt6i_flags & RTF_EXPIRES)
409 return time_after(jiffies, rt->dst.expires);
414 static bool rt6_check_expired(const struct rt6_info *rt)
416 struct fib6_info *from;
418 from = rcu_dereference(rt->from);
420 if (rt->rt6i_flags & RTF_EXPIRES) {
421 if (time_after(jiffies, rt->dst.expires))
424 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
425 fib6_check_expired(from);
430 struct fib6_info *fib6_multipath_select(const struct net *net,
431 struct fib6_info *match,
432 struct flowi6 *fl6, int oif,
433 const struct sk_buff *skb,
436 struct fib6_info *sibling, *next_sibling;
438 /* We might have already computed the hash for ICMPv6 errors. In such
439 * case it will always be non-zero. Otherwise now is the time to do it.
442 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
444 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
447 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
451 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
452 if (fl6->mp_hash > nh_upper_bound)
454 if (rt6_score_route(sibling, oif, strict) < 0)
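/* Illustrative sketch of the hash-threshold selection above (numbers are
 * hypothetical): with two equal-weight siblings, the first nexthop's
 * nh_upper_bound sits near the middle of the 31-bit hash range, e.g.
 * 0x3fffffff, so a flow whose mp_hash is 0x30000000 stays on the first
 * nexthop while one hashing to 0x50000000 is carried by the sibling;
 * since mp_hash is derived from the flow keys, every packet of a flow
 * selects the same nexthop.
 */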
464 * Route lookup. rcu_read_lock() should be held.
467 static inline struct fib6_info *rt6_device_match(struct net *net,
468 struct fib6_info *rt,
469 const struct in6_addr *saddr,
473 struct fib6_info *sprt;
475 if (!oif && ipv6_addr_any(saddr) &&
476 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
479 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
480 const struct net_device *dev = sprt->fib6_nh.nh_dev;
482 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
486 if (dev->ifindex == oif)
489 if (ipv6_chk_addr(net, saddr, dev,
490 flags & RT6_LOOKUP_F_IFACE))
495 if (oif && flags & RT6_LOOKUP_F_IFACE)
496 return net->ipv6.fib6_null_entry;
498 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
501 #ifdef CONFIG_IPV6_ROUTER_PREF
502 struct __rt6_probe_work {
503 struct work_struct work;
504 struct in6_addr target;
505 struct net_device *dev;
508 static void rt6_probe_deferred(struct work_struct *w)
510 struct in6_addr mcaddr;
511 struct __rt6_probe_work *work =
512 container_of(w, struct __rt6_probe_work, work);
514 addrconf_addr_solict_mult(&work->target, &mcaddr);
515 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
520 static void rt6_probe(struct fib6_info *rt)
522 struct __rt6_probe_work *work = NULL;
523 const struct in6_addr *nh_gw;
524 struct neighbour *neigh;
525 struct net_device *dev;
526 struct inet6_dev *idev;
529 * Okay, this does not seem to be appropriate
530 * for now, however, we need to check if it
531 * is really so; aka Router Reachability Probing.
533 * Router Reachability Probe MUST be rate-limited
534 * to no more than one per minute.
536 if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
539 nh_gw = &rt->fib6_nh.nh_gw;
540 dev = rt->fib6_nh.nh_dev;
542 idev = __in6_dev_get(dev);
543 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
545 if (neigh->nud_state & NUD_VALID)
548 write_lock(&neigh->lock);
549 if (!(neigh->nud_state & NUD_VALID) &&
551 neigh->updated + idev->cnf.rtr_probe_interval)) {
552 work = kmalloc(sizeof(*work), GFP_ATOMIC);
554 __neigh_set_probe_once(neigh);
556 write_unlock(&neigh->lock);
557 } else if (time_after(jiffies, rt->last_probe +
558 idev->cnf.rtr_probe_interval)) {
559 work = kmalloc(sizeof(*work), GFP_ATOMIC);
563 rt->last_probe = jiffies;
564 INIT_WORK(&work->work, rt6_probe_deferred);
565 work->target = *nh_gw;
568 schedule_work(&work->work);
572 rcu_read_unlock_bh();
575 static inline void rt6_probe(struct fib6_info *rt)
581 * Default Router Selection (RFC 2461 6.3.6)
583 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
585 const struct net_device *dev = rt->fib6_nh.nh_dev;
587 if (!oif || dev->ifindex == oif)
592 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
594 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
595 struct neighbour *neigh;
597 if (rt->fib6_flags & RTF_NONEXTHOP ||
598 !(rt->fib6_flags & RTF_GATEWAY))
599 return RT6_NUD_SUCCEED;
602 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
605 read_lock(&neigh->lock);
606 if (neigh->nud_state & NUD_VALID)
607 ret = RT6_NUD_SUCCEED;
608 #ifdef CONFIG_IPV6_ROUTER_PREF
609 else if (!(neigh->nud_state & NUD_FAILED))
610 ret = RT6_NUD_SUCCEED;
612 ret = RT6_NUD_FAIL_PROBE;
614 read_unlock(&neigh->lock);
616 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
617 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
619 rcu_read_unlock_bh();
624 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
628 m = rt6_check_dev(rt, oif);
629 if (!m && (strict & RT6_LOOKUP_F_IFACE))
630 return RT6_NUD_FAIL_HARD;
631 #ifdef CONFIG_IPV6_ROUTER_PREF
632 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
634 if (strict & RT6_LOOKUP_F_REACHABLE) {
635 int n = rt6_check_neigh(rt);
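/* Illustrative composition of the score returned above (with
 * CONFIG_IPV6_ROUTER_PREF enabled): rt6_check_dev() contributes 2 when
 * the route's device matches oif (or no oif was requested), and the
 * decoded RFC 4191 router preference (low < medium < high) is shifted
 * into bits 2-3, so an interface-matching, high-preference router scores
 * above a matching medium-preference one when find_match() compares
 * candidates of equal metric.
 */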
642 /* called with rcu_read_lock held */
643 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
645 const struct net_device *dev = fib6_info_nh_dev(f6i);
649 const struct inet6_dev *idev = __in6_dev_get(dev);
651 rc = !!idev->cnf.ignore_routes_with_linkdown;
657 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
658 int *mpri, struct fib6_info *match,
662 bool match_do_rr = false;
664 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
667 if (fib6_ignore_linkdown(rt) &&
668 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
669 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
672 if (fib6_check_expired(rt))
675 m = rt6_score_route(rt, oif, strict);
676 if (m == RT6_NUD_FAIL_DO_RR) {
678 m = 0; /* lowest valid score */
679 } else if (m == RT6_NUD_FAIL_HARD) {
683 if (strict & RT6_LOOKUP_F_REACHABLE)
686 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
688 *do_rr = match_do_rr;
696 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
697 struct fib6_info *leaf,
698 struct fib6_info *rr_head,
699 u32 metric, int oif, int strict,
702 struct fib6_info *rt, *match, *cont;
707 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
708 if (rt->fib6_metric != metric) {
713 match = find_match(rt, oif, strict, &mpri, match, do_rr);
716 for (rt = leaf; rt && rt != rr_head;
717 rt = rcu_dereference(rt->fib6_next)) {
718 if (rt->fib6_metric != metric) {
723 match = find_match(rt, oif, strict, &mpri, match, do_rr);
729 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
730 match = find_match(rt, oif, strict, &mpri, match, do_rr);
735 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
738 struct fib6_info *leaf = rcu_dereference(fn->leaf);
739 struct fib6_info *match, *rt0;
743 if (!leaf || leaf == net->ipv6.fib6_null_entry)
744 return net->ipv6.fib6_null_entry;
746 rt0 = rcu_dereference(fn->rr_ptr);
750 /* Double check to make sure fn is not an intermediate node
751 * and fn->leaf does not point to its child's leaf
752 * (This might happen if all routes under fn are deleted from
753 * the tree and fib6_repair_tree() is called on the node.)
755 key_plen = rt0->fib6_dst.plen;
756 #ifdef CONFIG_IPV6_SUBTREES
757 if (rt0->fib6_src.plen)
758 key_plen = rt0->fib6_src.plen;
760 if (fn->fn_bit != key_plen)
761 return net->ipv6.fib6_null_entry;
763 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
767 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
769 /* no entries matched; do round-robin */
770 if (!next || next->fib6_metric != rt0->fib6_metric)
774 spin_lock_bh(&leaf->fib6_table->tb6_lock);
775 /* make sure next is not being deleted from the tree */
777 rcu_assign_pointer(fn->rr_ptr, next);
778 spin_unlock_bh(&leaf->fib6_table->tb6_lock);
782 return match ? match : net->ipv6.fib6_null_entry;
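/* Illustrative round-robin step for rt6_select() above: if equal-metric
 * routes A, B and C hang off this node and the selected route A (the
 * current rr_ptr) was only chosen with a RT6_NUD_FAIL_DO_RR score,
 * fn->rr_ptr is advanced to B so the next lookup starts its scan there,
 * spreading traffic over the equal-metric group; when the pointer would
 * move past the group it wraps back to the node's leaf.
 */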
785 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
787 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
790 #ifdef CONFIG_IPV6_ROUTE_INFO
791 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
792 const struct in6_addr *gwaddr)
794 struct net *net = dev_net(dev);
795 struct route_info *rinfo = (struct route_info *) opt;
796 struct in6_addr prefix_buf, *prefix;
798 unsigned long lifetime;
799 struct fib6_info *rt;
801 if (len < sizeof(struct route_info)) {
805 /* Sanity check for prefix_len and length */
806 if (rinfo->length > 3) {
808 } else if (rinfo->prefix_len > 128) {
810 } else if (rinfo->prefix_len > 64) {
811 if (rinfo->length < 2) {
814 } else if (rinfo->prefix_len > 0) {
815 if (rinfo->length < 1) {
820 pref = rinfo->route_pref;
821 if (pref == ICMPV6_ROUTER_PREF_INVALID)
824 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
826 if (rinfo->length == 3)
827 prefix = (struct in6_addr *)rinfo->prefix;
829 /* this function is safe */
830 ipv6_addr_prefix(&prefix_buf,
831 (struct in6_addr *)rinfo->prefix,
833 prefix = &prefix_buf;
836 if (rinfo->prefix_len == 0)
837 rt = rt6_get_dflt_router(net, gwaddr, dev);
839 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
842 if (rt && !lifetime) {
848 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
851 rt->fib6_flags = RTF_ROUTEINFO |
852 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
855 if (!addrconf_finite_timeout(lifetime))
856 fib6_clean_expires(rt);
858 fib6_set_expires(rt, jiffies + HZ * lifetime);
860 fib6_info_release(rt);
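/* For reference, the RFC 4191 Route Information Option parsed above has
 * a Length field counted in 8-octet units: Length 1 carries no prefix
 * bytes, Length 2 the first 8 bytes of the prefix and Length 3 the full
 * 16 bytes, which is why only Length 3 lets the code take the prefix
 * verbatim while shorter options go through ipv6_addr_prefix() to zero
 * the bits beyond prefix_len.
 */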
867 * Misc support functions
870 /* called with rcu_read_lock held */
871 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
873 struct net_device *dev = rt->fib6_nh.nh_dev;
875 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
876 /* for copies of local routes, dst->dev needs to be the
877 * device itself if it is a master device, the master device if the
878 * device is enslaved, and the loopback device otherwise
880 if (netif_is_l3_slave(dev) &&
881 !rt6_need_strict(&rt->fib6_dst.addr))
882 dev = l3mdev_master_dev_rcu(dev);
883 else if (!netif_is_l3_master(dev))
884 dev = dev_net(dev)->loopback_dev;
885 /* last case is when netif_is_l3_master(dev) is true, in which
886 * case dev itself is the device we want returned
893 static const int fib6_prop[RTN_MAX + 1] = {
900 [RTN_BLACKHOLE] = -EINVAL,
901 [RTN_UNREACHABLE] = -EHOSTUNREACH,
902 [RTN_PROHIBIT] = -EACCES,
903 [RTN_THROW] = -EAGAIN,
905 [RTN_XRESOLVE] = -EINVAL,
908 static int ip6_rt_type_to_error(u8 fib6_type)
910 return fib6_prop[fib6_type];
913 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
915 unsigned short flags = 0;
918 flags |= DST_NOCOUNT;
919 if (rt->dst_nopolicy)
920 flags |= DST_NOPOLICY;
927 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
929 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
931 switch (ort->fib6_type) {
933 rt->dst.output = dst_discard_out;
934 rt->dst.input = dst_discard;
937 rt->dst.output = ip6_pkt_prohibit_out;
938 rt->dst.input = ip6_pkt_prohibit;
941 case RTN_UNREACHABLE:
943 rt->dst.output = ip6_pkt_discard_out;
944 rt->dst.input = ip6_pkt_discard;
949 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
951 if (ort->fib6_flags & RTF_REJECT) {
952 ip6_rt_init_dst_reject(rt, ort);
957 rt->dst.output = ip6_output;
959 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
960 rt->dst.input = ip6_input;
961 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962 rt->dst.input = ip6_mc_input;
964 rt->dst.input = ip6_forward;
967 if (ort->fib6_nh.nh_lwtstate) {
968 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
969 lwtunnel_set_redirect(&rt->dst);
972 rt->dst.lastuse = jiffies;
975 /* Caller must already hold reference to @from */
976 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
978 rt->rt6i_flags &= ~RTF_EXPIRES;
979 rcu_assign_pointer(rt->from, from);
980 ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
983 /* Caller must already hold reference to @ort */
984 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
986 struct net_device *dev = fib6_info_nh_dev(ort);
988 ip6_rt_init_dst(rt, ort);
990 rt->rt6i_dst = ort->fib6_dst;
991 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
992 rt->rt6i_gateway = ort->fib6_nh.nh_gw;
993 rt->rt6i_flags = ort->fib6_flags;
994 rt6_set_from(rt, ort);
995 #ifdef CONFIG_IPV6_SUBTREES
996 rt->rt6i_src = ort->fib6_src;
1000 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1001 struct in6_addr *saddr)
1003 struct fib6_node *pn, *sn;
1005 if (fn->fn_flags & RTN_TL_ROOT)
1007 pn = rcu_dereference(fn->parent);
1008 sn = FIB6_SUBTREE(pn);
1010 fn = fib6_node_lookup(sn, NULL, saddr);
1013 if (fn->fn_flags & RTN_RTINFO)
1018 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1021 struct rt6_info *rt = *prt;
1023 if (dst_hold_safe(&rt->dst))
1025 if (null_fallback) {
1026 rt = net->ipv6.ip6_null_entry;
1035 /* called with rcu_lock held */
1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1038 unsigned short flags = fib6_info_dst_flags(rt);
1039 struct net_device *dev = rt->fib6_nh.nh_dev;
1040 struct rt6_info *nrt;
1042 if (!fib6_info_hold_safe(rt))
1045 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1047 ip6_rt_copy_init(nrt, rt);
1049 fib6_info_release(rt);
1054 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1055 struct fib6_table *table,
1057 const struct sk_buff *skb,
1060 struct fib6_info *f6i;
1061 struct fib6_node *fn;
1062 struct rt6_info *rt;
1064 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1065 flags &= ~RT6_LOOKUP_F_IFACE;
1068 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1070 f6i = rcu_dereference(fn->leaf);
1072 f6i = net->ipv6.fib6_null_entry;
1074 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1075 fl6->flowi6_oif, flags);
1076 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1077 f6i = fib6_multipath_select(net, f6i, fl6,
1078 fl6->flowi6_oif, skb,
1081 if (f6i == net->ipv6.fib6_null_entry) {
1082 fn = fib6_backtrack(fn, &fl6->saddr);
1087 trace_fib6_table_lookup(net, f6i, table, fl6);
1089 /* Search through exception table */
1090 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1092 if (ip6_hold_safe(net, &rt, true))
1093 dst_use_noref(&rt->dst, jiffies);
1094 } else if (f6i == net->ipv6.fib6_null_entry) {
1095 rt = net->ipv6.ip6_null_entry;
1098 rt = ip6_create_rt_rcu(f6i);
1100 rt = net->ipv6.ip6_null_entry;
1110 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1111 const struct sk_buff *skb, int flags)
1113 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1115 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1117 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1118 const struct in6_addr *saddr, int oif,
1119 const struct sk_buff *skb, int strict)
1121 struct flowi6 fl6 = {
1125 struct dst_entry *dst;
1126 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1129 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1130 flags |= RT6_LOOKUP_F_HAS_SADDR;
1133 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1134 if (dst->error == 0)
1135 return (struct rt6_info *) dst;
1141 EXPORT_SYMBOL(rt6_lookup);
1143 /* ip6_ins_rt is called with table->tb6_lock free (the lock is taken here).
1144 * It takes a new route entry; if the addition fails for any reason, the
1145 * route is released.
1146 * Caller must hold dst before calling it.
1149 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1150 struct netlink_ext_ack *extack)
1153 struct fib6_table *table;
1155 table = rt->fib6_table;
1156 spin_lock_bh(&table->tb6_lock);
1157 err = fib6_add(&table->tb6_root, rt, info, extack);
1158 spin_unlock_bh(&table->tb6_lock);
1163 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1165 struct nl_info info = { .nl_net = net, };
1167 return __ip6_ins_rt(rt, &info, NULL);
1170 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1171 const struct in6_addr *daddr,
1172 const struct in6_addr *saddr)
1174 struct net_device *dev;
1175 struct rt6_info *rt;
1181 if (!fib6_info_hold_safe(ort))
1184 dev = ip6_rt_get_dev_rcu(ort);
1185 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1187 fib6_info_release(ort);
1191 ip6_rt_copy_init(rt, ort);
1192 rt->rt6i_flags |= RTF_CACHE;
1193 rt->dst.flags |= DST_HOST;
1194 rt->rt6i_dst.addr = *daddr;
1195 rt->rt6i_dst.plen = 128;
1197 if (!rt6_is_gw_or_nonexthop(ort)) {
1198 if (ort->fib6_dst.plen != 128 &&
1199 ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1200 rt->rt6i_flags |= RTF_ANYCAST;
1201 #ifdef CONFIG_IPV6_SUBTREES
1202 if (rt->rt6i_src.plen && saddr) {
1203 rt->rt6i_src.addr = *saddr;
1204 rt->rt6i_src.plen = 128;
1212 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1214 unsigned short flags = fib6_info_dst_flags(rt);
1215 struct net_device *dev;
1216 struct rt6_info *pcpu_rt;
1218 if (!fib6_info_hold_safe(rt))
1222 dev = ip6_rt_get_dev_rcu(rt);
1223 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1226 fib6_info_release(rt);
1229 ip6_rt_copy_init(pcpu_rt, rt);
1230 pcpu_rt->rt6i_flags |= RTF_PCPU;
1234 /* It should be called with rcu_read_lock() acquired */
1235 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1237 struct rt6_info *pcpu_rt, **p;
1239 p = this_cpu_ptr(rt->rt6i_pcpu);
1243 ip6_hold_safe(NULL, &pcpu_rt, false);
1248 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1249 struct fib6_info *rt)
1251 struct rt6_info *pcpu_rt, *prev, **p;
1253 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1255 dst_hold(&net->ipv6.ip6_null_entry->dst);
1256 return net->ipv6.ip6_null_entry;
1259 dst_hold(&pcpu_rt->dst);
1260 p = this_cpu_ptr(rt->rt6i_pcpu);
1261 prev = cmpxchg(p, NULL, pcpu_rt);
1267 /* exception hash table implementation
1269 static DEFINE_SPINLOCK(rt6_exception_lock);
1271 /* Remove rt6_ex from hash table and free the memory
1272 * Caller must hold rt6_exception_lock
1274 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1275 struct rt6_exception *rt6_ex)
1279 if (!bucket || !rt6_ex)
1282 net = dev_net(rt6_ex->rt6i->dst.dev);
1283 hlist_del_rcu(&rt6_ex->hlist);
1284 dst_release(&rt6_ex->rt6i->dst);
1285 kfree_rcu(rt6_ex, rcu);
1286 WARN_ON_ONCE(!bucket->depth);
1288 net->ipv6.rt6_stats->fib_rt_cache--;
1291 /* Remove oldest rt6_ex in bucket and free the memory
1292 * Caller must hold rt6_exception_lock
1294 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1296 struct rt6_exception *rt6_ex, *oldest = NULL;
1301 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1302 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1305 rt6_remove_exception(bucket, oldest);
1308 static u32 rt6_exception_hash(const struct in6_addr *dst,
1309 const struct in6_addr *src)
1311 static u32 seed __read_mostly;
1314 net_get_random_once(&seed, sizeof(seed));
1315 val = jhash(dst, sizeof(*dst), seed);
1317 #ifdef CONFIG_IPV6_SUBTREES
1319 val = jhash(src, sizeof(*src), val);
1321 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
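/* Illustrative bucket computation for the exception cache: a clone
 * created for a destination such as 2001:db8::1 (a documentation
 * address, used purely as an example) lands in
 *
 *	bucket = rt6i_exception_bucket + rt6_exception_hash(daddr, saddr);
 *
 * i.e. a jhash over the 16-byte destination (and, with subtrees, the
 * source) folded by hash_32() into one of the
 * 1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT buckets.
 */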
1324 /* Helper function to find the cached rt in the hash table
1325 * and update bucket pointer to point to the bucket for this
1326 * (daddr, saddr) pair
1327 * Caller must hold rt6_exception_lock
1329 static struct rt6_exception *
1330 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1331 const struct in6_addr *daddr,
1332 const struct in6_addr *saddr)
1334 struct rt6_exception *rt6_ex;
1337 if (!(*bucket) || !daddr)
1340 hval = rt6_exception_hash(daddr, saddr);
1343 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1344 struct rt6_info *rt6 = rt6_ex->rt6i;
1345 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1347 #ifdef CONFIG_IPV6_SUBTREES
1348 if (matched && saddr)
1349 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1357 /* Helper function to find the cached rt in the hash table
1358 * and update bucket pointer to point to the bucket for this
1359 * (daddr, saddr) pair
1360 * Caller must hold rcu_read_lock()
1362 static struct rt6_exception *
1363 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1364 const struct in6_addr *daddr,
1365 const struct in6_addr *saddr)
1367 struct rt6_exception *rt6_ex;
1370 WARN_ON_ONCE(!rcu_read_lock_held());
1372 if (!(*bucket) || !daddr)
1375 hval = rt6_exception_hash(daddr, saddr);
1378 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1379 struct rt6_info *rt6 = rt6_ex->rt6i;
1380 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1382 #ifdef CONFIG_IPV6_SUBTREES
1383 if (matched && saddr)
1384 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1392 static unsigned int fib6_mtu(const struct fib6_info *rt)
1396 if (rt->fib6_pmtu) {
1397 mtu = rt->fib6_pmtu;
1399 struct net_device *dev = fib6_info_nh_dev(rt);
1400 struct inet6_dev *idev;
1403 idev = __in6_dev_get(dev);
1404 mtu = idev->cnf.mtu6;
1408 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1410 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1413 static int rt6_insert_exception(struct rt6_info *nrt,
1414 struct fib6_info *ort)
1416 struct net *net = dev_net(nrt->dst.dev);
1417 struct rt6_exception_bucket *bucket;
1418 struct in6_addr *src_key = NULL;
1419 struct rt6_exception *rt6_ex;
1422 spin_lock_bh(&rt6_exception_lock);
1424 if (ort->exception_bucket_flushed) {
1429 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1430 lockdep_is_held(&rt6_exception_lock));
1432 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1438 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1441 #ifdef CONFIG_IPV6_SUBTREES
1442 /* rt6i_src.plen != 0 indicates ort is in subtree
1443 * and exception table is indexed by a hash of
1444 * both rt6i_dst and rt6i_src.
1445 * Otherwise, the exception table is indexed by
1446 * a hash of only rt6i_dst.
1448 if (ort->fib6_src.plen)
1449 src_key = &nrt->rt6i_src.addr;
1451 /* rt6_mtu_change() might lower mtu on ort.
1452 * Only insert this exception route if its mtu
1453 * is less than ort's mtu value.
1455 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1460 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1463 rt6_remove_exception(bucket, rt6_ex);
1465 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1471 rt6_ex->stamp = jiffies;
1472 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1474 net->ipv6.rt6_stats->fib_rt_cache++;
1476 if (bucket->depth > FIB6_MAX_DEPTH)
1477 rt6_exception_remove_oldest(bucket);
1480 spin_unlock_bh(&rt6_exception_lock);
1482 /* Update fn->fn_sernum to invalidate all cached dst */
1484 spin_lock_bh(&ort->fib6_table->tb6_lock);
1485 fib6_update_sernum(net, ort);
1486 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1487 fib6_force_start_gc(net);
1493 void rt6_flush_exceptions(struct fib6_info *rt)
1495 struct rt6_exception_bucket *bucket;
1496 struct rt6_exception *rt6_ex;
1497 struct hlist_node *tmp;
1500 spin_lock_bh(&rt6_exception_lock);
1501 /* Prevent rt6_insert_exception() from recreating the bucket list */
1502 rt->exception_bucket_flushed = 1;
1504 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1505 lockdep_is_held(&rt6_exception_lock));
1509 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1510 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1511 rt6_remove_exception(bucket, rt6_ex);
1512 WARN_ON_ONCE(bucket->depth);
1517 spin_unlock_bh(&rt6_exception_lock);
1520 /* Find the cached rt in the hash table embedded in the passed-in rt
1521 * Caller has to hold rcu_read_lock()
1523 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1524 struct in6_addr *daddr,
1525 struct in6_addr *saddr)
1527 struct rt6_exception_bucket *bucket;
1528 struct in6_addr *src_key = NULL;
1529 struct rt6_exception *rt6_ex;
1530 struct rt6_info *res = NULL;
1532 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1534 #ifdef CONFIG_IPV6_SUBTREES
1535 /* rt6i_src.plen != 0 indicates rt is in subtree
1536 * and exception table is indexed by a hash of
1537 * both rt6i_dst and rt6i_src.
1538 * Otherwise, the exception table is indexed by
1539 * a hash of only rt6i_dst.
1541 if (rt->fib6_src.plen)
1544 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1546 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1552 /* Remove the passed in cached rt from the hash table that contains it */
1553 static int rt6_remove_exception_rt(struct rt6_info *rt)
1555 struct rt6_exception_bucket *bucket;
1556 struct in6_addr *src_key = NULL;
1557 struct rt6_exception *rt6_ex;
1558 struct fib6_info *from;
1561 from = rcu_dereference(rt->from);
1563 !(rt->rt6i_flags & RTF_CACHE))
1566 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1569 spin_lock_bh(&rt6_exception_lock);
1570 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1571 lockdep_is_held(&rt6_exception_lock));
1572 #ifdef CONFIG_IPV6_SUBTREES
1573 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1574 * and exception table is indexed by a hash of
1575 * both rt6i_dst and rt6i_src.
1576 * Otherwise, the exception table is indexed by
1577 * a hash of only rt6i_dst.
1579 if (from->fib6_src.plen)
1580 src_key = &rt->rt6i_src.addr;
1582 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1586 rt6_remove_exception(bucket, rt6_ex);
1592 spin_unlock_bh(&rt6_exception_lock);
1596 /* Find the rt6_ex which contains the passed-in cached rt and refresh its stamp */
1599 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1601 struct rt6_exception_bucket *bucket;
1602 struct in6_addr *src_key = NULL;
1603 struct rt6_exception *rt6_ex;
1604 struct fib6_info *from;
1607 from = rcu_dereference(rt->from);
1608 if (!from || !(rt->rt6i_flags & RTF_CACHE))
1611 bucket = rcu_dereference(from->rt6i_exception_bucket);
1613 #ifdef CONFIG_IPV6_SUBTREES
1614 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1615 * and exception table is indexed by a hash of
1616 * both rt6i_dst and rt6i_src.
1617 * Otherwise, the exception table is indexed by
1618 * a hash of only rt6i_dst.
1620 if (from->fib6_src.plen)
1621 src_key = &rt->rt6i_src.addr;
1623 rt6_ex = __rt6_find_exception_rcu(&bucket,
1627 rt6_ex->stamp = jiffies;
1633 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1634 struct rt6_info *rt, int mtu)
1636 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1637 * lowest MTU in the path: always allow updating the route PMTU to
1638 * reflect PMTU decreases.
1640 * If the new MTU is higher, and the route PMTU is equal to the local
1641 * MTU, this means the old MTU is the lowest in the path, so allow
1642 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1646 if (dst_mtu(&rt->dst) >= mtu)
1649 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
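/* Worked example for the rule above (hypothetical numbers): with a local
 * MTU (idev->cnf.mtu6) of 1500 and a cached route PMTU of 1400, lowering
 * the device MTU to 1300 is always propagated (1400 >= 1300); raising it
 * to 9000 only updates the cached PMTU when that PMTU still equals the
 * old local MTU of 1500, because a 1400 value means another hop on the
 * path imposed the limit and PMTU discovery has to rediscover it.
 */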
1655 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1656 struct fib6_info *rt, int mtu)
1658 struct rt6_exception_bucket *bucket;
1659 struct rt6_exception *rt6_ex;
1662 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1663 lockdep_is_held(&rt6_exception_lock));
1668 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1669 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1670 struct rt6_info *entry = rt6_ex->rt6i;
1672 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1673 * route), the metrics of its rt->from have already
1676 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1677 rt6_mtu_change_route_allowed(idev, entry, mtu))
1678 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1684 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1686 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1687 struct in6_addr *gateway)
1689 struct rt6_exception_bucket *bucket;
1690 struct rt6_exception *rt6_ex;
1691 struct hlist_node *tmp;
1694 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1697 spin_lock_bh(&rt6_exception_lock);
1698 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1699 lockdep_is_held(&rt6_exception_lock));
1702 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1703 hlist_for_each_entry_safe(rt6_ex, tmp,
1704 &bucket->chain, hlist) {
1705 struct rt6_info *entry = rt6_ex->rt6i;
1707 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1708 RTF_CACHE_GATEWAY &&
1709 ipv6_addr_equal(gateway,
1710 &entry->rt6i_gateway)) {
1711 rt6_remove_exception(bucket, rt6_ex);
1718 spin_unlock_bh(&rt6_exception_lock);
1721 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1722 struct rt6_exception *rt6_ex,
1723 struct fib6_gc_args *gc_args,
1726 struct rt6_info *rt = rt6_ex->rt6i;
1728 /* we are pruning and obsoleting aged-out and non-gateway exceptions
1729 * even if others still have references to them, so that on next
1730 * dst_check() such references can be dropped.
1731 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1732 * expired, independently from their aging, as per RFC 8201 section 4
1734 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1735 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1736 RT6_TRACE("aging clone %p\n", rt);
1737 rt6_remove_exception(bucket, rt6_ex);
1740 } else if (time_after(jiffies, rt->dst.expires)) {
1741 RT6_TRACE("purging expired route %p\n", rt);
1742 rt6_remove_exception(bucket, rt6_ex);
1746 if (rt->rt6i_flags & RTF_GATEWAY) {
1747 struct neighbour *neigh;
1748 __u8 neigh_flags = 0;
1750 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1752 neigh_flags = neigh->flags;
1754 if (!(neigh_flags & NTF_ROUTER)) {
1755 RT6_TRACE("purging route %p via non-router but gateway\n",
1757 rt6_remove_exception(bucket, rt6_ex);
1765 void rt6_age_exceptions(struct fib6_info *rt,
1766 struct fib6_gc_args *gc_args,
1769 struct rt6_exception_bucket *bucket;
1770 struct rt6_exception *rt6_ex;
1771 struct hlist_node *tmp;
1774 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1778 spin_lock(&rt6_exception_lock);
1779 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1780 lockdep_is_held(&rt6_exception_lock));
1783 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1784 hlist_for_each_entry_safe(rt6_ex, tmp,
1785 &bucket->chain, hlist) {
1786 rt6_age_examine_exception(bucket, rt6_ex,
1792 spin_unlock(&rt6_exception_lock);
1793 rcu_read_unlock_bh();
1796 /* must be called with rcu_read_lock held */
1797 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1798 int oif, struct flowi6 *fl6, int strict)
1800 struct fib6_node *fn, *saved_fn;
1801 struct fib6_info *f6i;
1803 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1806 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1810 f6i = rt6_select(net, fn, oif, strict);
1811 if (f6i == net->ipv6.fib6_null_entry) {
1812 fn = fib6_backtrack(fn, &fl6->saddr);
1814 goto redo_rt6_select;
1815 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1816 /* also consider unreachable route */
1817 strict &= ~RT6_LOOKUP_F_REACHABLE;
1819 goto redo_rt6_select;
1823 trace_fib6_table_lookup(net, f6i, table, fl6);
1828 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1829 int oif, struct flowi6 *fl6,
1830 const struct sk_buff *skb, int flags)
1832 struct fib6_info *f6i;
1833 struct rt6_info *rt;
1836 strict |= flags & RT6_LOOKUP_F_IFACE;
1837 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1838 if (net->ipv6.devconf_all->forwarding == 0)
1839 strict |= RT6_LOOKUP_F_REACHABLE;
1843 f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1844 if (f6i->fib6_nsiblings)
1845 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1847 if (f6i == net->ipv6.fib6_null_entry) {
1848 rt = net->ipv6.ip6_null_entry;
1854 /* Search through exception table */
1855 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1857 if (ip6_hold_safe(net, &rt, true))
1858 dst_use_noref(&rt->dst, jiffies);
1862 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1863 !(f6i->fib6_flags & RTF_GATEWAY))) {
1864 /* Create a RTF_CACHE clone which will not be
1865 * owned by the fib6 tree. It is for the special case where
1866 * the daddr in the skb during the neighbor look-up is different
1867 * from the fl6->daddr used to look up the route here.
1869 struct rt6_info *uncached_rt;
1871 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1876 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1877 * No need for another dst_hold()
1879 rt6_uncached_list_add(uncached_rt);
1880 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1882 uncached_rt = net->ipv6.ip6_null_entry;
1883 dst_hold(&uncached_rt->dst);
1888 /* Get a percpu copy */
1890 struct rt6_info *pcpu_rt;
1893 pcpu_rt = rt6_get_pcpu_route(f6i);
1896 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1904 EXPORT_SYMBOL_GPL(ip6_pol_route);
1906 static struct rt6_info *ip6_pol_route_input(struct net *net,
1907 struct fib6_table *table,
1909 const struct sk_buff *skb,
1912 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1915 struct dst_entry *ip6_route_input_lookup(struct net *net,
1916 struct net_device *dev,
1918 const struct sk_buff *skb,
1921 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1922 flags |= RT6_LOOKUP_F_IFACE;
1924 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1926 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1928 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1929 struct flow_keys *keys,
1930 struct flow_keys *flkeys)
1932 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1933 const struct ipv6hdr *key_iph = outer_iph;
1934 struct flow_keys *_flkeys = flkeys;
1935 const struct ipv6hdr *inner_iph;
1936 const struct icmp6hdr *icmph;
1937 struct ipv6hdr _inner_iph;
1938 struct icmp6hdr _icmph;
1940 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1943 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1944 sizeof(_icmph), &_icmph);
1948 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1949 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1950 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1951 icmph->icmp6_type != ICMPV6_PARAMPROB)
1954 inner_iph = skb_header_pointer(skb,
1955 skb_transport_offset(skb) + sizeof(*icmph),
1956 sizeof(_inner_iph), &_inner_iph);
1960 key_iph = inner_iph;
1964 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1965 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1966 keys->tags.flow_label = _flkeys->tags.flow_label;
1967 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1969 keys->addrs.v6addrs.src = key_iph->saddr;
1970 keys->addrs.v6addrs.dst = key_iph->daddr;
1971 keys->tags.flow_label = ip6_flowlabel(key_iph);
1972 keys->basic.ip_proto = key_iph->nexthdr;
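/* Example of the ICMPv6 handling above: for a Packet Too Big error sent
 * back towards the origin of a flow, the multipath keys are taken from
 * the offending packet's header embedded in the error, not from the
 * error packet itself, so the error is forwarded along the same nexthop
 * as the flow that triggered it.
 */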
1976 /* if skb is set it will be used and fl6 can be NULL */
1977 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1978 const struct sk_buff *skb, struct flow_keys *flkeys)
1980 struct flow_keys hash_keys;
1983 switch (ip6_multipath_hash_policy(net)) {
1985 memset(&hash_keys, 0, sizeof(hash_keys));
1986 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1988 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1990 hash_keys.addrs.v6addrs.src = fl6->saddr;
1991 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1992 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1993 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1998 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1999 struct flow_keys keys;
2001 /* short-circuit if we already have L4 hash present */
2003 return skb_get_hash_raw(skb) >> 1;
2005 memset(&hash_keys, 0, sizeof(hash_keys));
2008 skb_flow_dissect_flow_keys(skb, &keys, flag);
2011 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2012 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2013 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2014 hash_keys.ports.src = flkeys->ports.src;
2015 hash_keys.ports.dst = flkeys->ports.dst;
2016 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2018 memset(&hash_keys, 0, sizeof(hash_keys));
2019 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2020 hash_keys.addrs.v6addrs.src = fl6->saddr;
2021 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2022 hash_keys.ports.src = fl6->fl6_sport;
2023 hash_keys.ports.dst = fl6->fl6_dport;
2024 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2028 mhash = flow_hash_from_keys(&hash_keys);
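/* Illustrative effect of the two policies above: with
 * net.ipv6.fib_multipath_hash_policy = 0 the hash covers only saddr,
 * daddr, flow label and protocol, so every flow between the same pair of
 * hosts (and label) maps to one nexthop; with policy 1 the transport
 * ports are mixed in as well, spreading individual TCP/UDP flows across
 * the equal-cost nexthops.  The result is reduced to 31 bits so that
 * mp_hash stays comparable with nh_upper_bound.
 */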
2033 void ip6_route_input(struct sk_buff *skb)
2035 const struct ipv6hdr *iph = ipv6_hdr(skb);
2036 struct net *net = dev_net(skb->dev);
2037 int flags = RT6_LOOKUP_F_HAS_SADDR;
2038 struct ip_tunnel_info *tun_info;
2039 struct flowi6 fl6 = {
2040 .flowi6_iif = skb->dev->ifindex,
2041 .daddr = iph->daddr,
2042 .saddr = iph->saddr,
2043 .flowlabel = ip6_flowinfo(iph),
2044 .flowi6_mark = skb->mark,
2045 .flowi6_proto = iph->nexthdr,
2047 struct flow_keys *flkeys = NULL, _flkeys;
2049 tun_info = skb_tunnel_info(skb);
2050 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2051 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2053 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2056 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2057 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2060 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2063 static struct rt6_info *ip6_pol_route_output(struct net *net,
2064 struct fib6_table *table,
2066 const struct sk_buff *skb,
2069 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2072 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2073 struct flowi6 *fl6, int flags)
2077 if (ipv6_addr_type(&fl6->daddr) &
2078 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2079 struct dst_entry *dst;
2081 dst = l3mdev_link_scope_lookup(net, fl6);
2086 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2088 any_src = ipv6_addr_any(&fl6->saddr);
2089 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2090 (fl6->flowi6_oif && any_src))
2091 flags |= RT6_LOOKUP_F_IFACE;
2094 flags |= RT6_LOOKUP_F_HAS_SADDR;
2096 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2098 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2100 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2102 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2104 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2105 struct net_device *loopback_dev = net->loopback_dev;
2106 struct dst_entry *new = NULL;
2108 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2109 DST_OBSOLETE_DEAD, 0);
2112 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2116 new->input = dst_discard;
2117 new->output = dst_discard_out;
2119 dst_copy_metrics(new, &ort->dst);
2121 rt->rt6i_idev = in6_dev_get(loopback_dev);
2122 rt->rt6i_gateway = ort->rt6i_gateway;
2123 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2125 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2126 #ifdef CONFIG_IPV6_SUBTREES
2127 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2131 dst_release(dst_orig);
2132 return new ? new : ERR_PTR(-ENOMEM);
2136 * Destination cache support functions
2139 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2143 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2146 if (fib6_check_expired(f6i))
2152 static struct dst_entry *rt6_check(struct rt6_info *rt,
2153 struct fib6_info *from,
2158 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2159 rt_cookie != cookie)
2162 if (rt6_check_expired(rt))
2168 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2169 struct fib6_info *from,
2172 if (!__rt6_check_expired(rt) &&
2173 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2174 fib6_check(from, cookie))
2180 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2182 struct dst_entry *dst_ret;
2183 struct fib6_info *from;
2184 struct rt6_info *rt;
2186 rt = container_of(dst, struct rt6_info, dst);
2190 /* All IPV6 dsts are created with ->obsolete set to the value
2191 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2192 * into this function always.
2195 from = rcu_dereference(rt->from);
2197 if (from && (rt->rt6i_flags & RTF_PCPU ||
2198 unlikely(!list_empty(&rt->rt6i_uncached))))
2199 dst_ret = rt6_dst_from_check(rt, from, cookie);
2201 dst_ret = rt6_check(rt, from, cookie);
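/* Illustrative flow for the check above: a socket caches this dst along
 * with a cookie derived from the owning fib6 node's sernum; when a route
 * under that node is later added or removed, fib6_update_sernum() bumps
 * the sernum, fib6_get_cookie_safe() stops matching the stored cookie
 * and ip6_dst_check() returns NULL, forcing the socket to redo the route
 * lookup.
 */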
2208 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2210 struct rt6_info *rt = (struct rt6_info *) dst;
2213 if (rt->rt6i_flags & RTF_CACHE) {
2215 if (rt6_check_expired(rt)) {
2216 rt6_remove_exception_rt(rt);
2228 static void ip6_link_failure(struct sk_buff *skb)
2230 struct rt6_info *rt;
2232 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2234 rt = (struct rt6_info *) skb_dst(skb);
2237 if (rt->rt6i_flags & RTF_CACHE) {
2238 rt6_remove_exception_rt(rt);
2240 struct fib6_info *from;
2241 struct fib6_node *fn;
2243 from = rcu_dereference(rt->from);
2245 fn = rcu_dereference(from->fib6_node);
2246 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2254 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2256 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2257 struct fib6_info *from;
2260 from = rcu_dereference(rt0->from);
2262 rt0->dst.expires = from->expires;
2266 dst_set_expires(&rt0->dst, timeout);
2267 rt0->rt6i_flags |= RTF_EXPIRES;
2270 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2272 struct net *net = dev_net(rt->dst.dev);
2274 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2275 rt->rt6i_flags |= RTF_MODIFIED;
2276 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2279 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2284 from_set = !!rcu_dereference(rt->from);
2287 return !(rt->rt6i_flags & RTF_CACHE) &&
2288 (rt->rt6i_flags & RTF_PCPU || from_set);
2291 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2292 const struct ipv6hdr *iph, u32 mtu)
2294 const struct in6_addr *daddr, *saddr;
2295 struct rt6_info *rt6 = (struct rt6_info *)dst;
2297 if (dst_metric_locked(dst, RTAX_MTU))
2301 daddr = &iph->daddr;
2302 saddr = &iph->saddr;
2304 daddr = &sk->sk_v6_daddr;
2305 saddr = &inet6_sk(sk)->saddr;
2310 dst_confirm_neigh(dst, daddr);
2311 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2312 if (mtu >= dst_mtu(dst))
2315 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2316 rt6_do_update_pmtu(rt6, mtu);
2317 /* update rt6_ex->stamp for cache */
2318 if (rt6->rt6i_flags & RTF_CACHE)
2319 rt6_update_exception_stamp_rt(rt6);
2321 struct fib6_info *from;
2322 struct rt6_info *nrt6;
2325 from = rcu_dereference(rt6->from);
2326 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2328 rt6_do_update_pmtu(nrt6, mtu);
2329 if (rt6_insert_exception(nrt6, from))
2330 dst_release_immediate(&nrt6->dst);
2336 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2337 struct sk_buff *skb, u32 mtu)
2339 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2342 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2343 int oif, u32 mark, kuid_t uid)
2345 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2346 struct dst_entry *dst;
2347 struct flowi6 fl6 = {
2349 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2350 .daddr = iph->daddr,
2351 .saddr = iph->saddr,
2352 .flowlabel = ip6_flowinfo(iph),
2356 dst = ip6_route_output(net, NULL, &fl6);
2358 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2361 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2363 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2365 int oif = sk->sk_bound_dev_if;
2366 struct dst_entry *dst;
2368 if (!oif && skb->dev)
2369 oif = l3mdev_master_ifindex(skb->dev);
2371 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2373 dst = __sk_dst_get(sk);
2374 if (!dst || !dst->obsolete ||
2375 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2379 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2380 ip6_datagram_dst_update(sk, false);
2383 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2385 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2386 const struct flowi6 *fl6)
2388 #ifdef CONFIG_IPV6_SUBTREES
2389 struct ipv6_pinfo *np = inet6_sk(sk);
2392 ip6_dst_store(sk, dst,
2393 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2394 &sk->sk_v6_daddr : NULL,
2395 #ifdef CONFIG_IPV6_SUBTREES
2396 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2402 /* Handle redirects */
2403 struct ip6rd_flowi {
2405 struct in6_addr gateway;
2408 static struct rt6_info *__ip6_route_redirect(struct net *net,
2409 struct fib6_table *table,
2411 const struct sk_buff *skb,
2414 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2415 struct rt6_info *ret = NULL, *rt_cache;
2416 struct fib6_info *rt;
2417 struct fib6_node *fn;
2419 /* Get the "current" route for this destination and
2420 * check if the redirect has come from appropriate router.
2422 * RFC 4861 specifies that redirects should only be
2423 * accepted if they come from the nexthop to the target.
2424 * Due to the way the routes are chosen, this notion
2425 * is a bit fuzzy and one might need to check all possible routes.
2430 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2432 for_each_fib6_node_rt_rcu(fn) {
2433 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2435 if (fib6_check_expired(rt))
2437 if (rt->fib6_flags & RTF_REJECT)
2439 if (!(rt->fib6_flags & RTF_GATEWAY))
2441 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2443 /* rt_cache's gateway might be different from its 'parent'
2444 * in the case of an ip redirect.
2445 * So we keep searching in the exception table if the gateway is different.
2448 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2449 rt_cache = rt6_find_cached_rt(rt,
2453 ipv6_addr_equal(&rdfl->gateway,
2454 &rt_cache->rt6i_gateway)) {
2464 rt = net->ipv6.fib6_null_entry;
2465 else if (rt->fib6_flags & RTF_REJECT) {
2466 ret = net->ipv6.ip6_null_entry;
2470 if (rt == net->ipv6.fib6_null_entry) {
2471 fn = fib6_backtrack(fn, &fl6->saddr);
2478 ip6_hold_safe(net, &ret, true);
2480 ret = ip6_create_rt_rcu(rt);
2484 trace_fib6_table_lookup(net, rt, table, fl6);
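/* Example of the nexthop validation above (addresses are illustrative
 * only): if the FIB reaches 2001:db8::/32 via gateway fe80::1 on the
 * arrival interface and a Redirect claims to come from fe80::2, that
 * route is only used when a cached exception already lists fe80::2 as
 * its gateway (i.e. an earlier redirect installed it); otherwise the
 * message is not from the current nexthop and is disregarded, per
 * RFC 4861.
 */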
2488 static struct dst_entry *ip6_route_redirect(struct net *net,
2489 const struct flowi6 *fl6,
2490 const struct sk_buff *skb,
2491 const struct in6_addr *gateway)
2493 int flags = RT6_LOOKUP_F_HAS_SADDR;
2494 struct ip6rd_flowi rdfl;
2497 rdfl.gateway = *gateway;
2499 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2500 flags, __ip6_route_redirect);
2503 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2506 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2507 struct dst_entry *dst;
2508 struct flowi6 fl6 = {
2509 .flowi6_iif = LOOPBACK_IFINDEX,
2511 .flowi6_mark = mark,
2512 .daddr = iph->daddr,
2513 .saddr = iph->saddr,
2514 .flowlabel = ip6_flowinfo(iph),
2518 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2519 rt6_do_redirect(dst, NULL, skb);
2522 EXPORT_SYMBOL_GPL(ip6_redirect);
2524 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2526 const struct ipv6hdr *iph = ipv6_hdr(skb);
2527 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2528 struct dst_entry *dst;
2529 struct flowi6 fl6 = {
2530 .flowi6_iif = LOOPBACK_IFINDEX,
2533 .saddr = iph->daddr,
2534 .flowi6_uid = sock_net_uid(net, NULL),
2537 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2538 rt6_do_redirect(dst, NULL, skb);
2542 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2544 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2547 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
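/* Advertised MSS for this dst: the device MTU minus the IPv6 and TCP
 * headers, clamped below by the ip6_rt_min_advmss sysctl and above by
 * IPV6_MAXPLEN - sizeof(struct tcphdr). As a rough example, a 1500-byte
 * MTU yields 1500 - 40 - 20 = 1440.
 */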
2549 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2551 struct net_device *dev = dst->dev;
2552 unsigned int mtu = dst_mtu(dst);
2553 struct net *net = dev_net(dev);
2555 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2557 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2558 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2561 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2562 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2563 * IPV6_MAXPLEN is also valid and means: "any MSS,
2564 * rely only on pmtu discovery"
2566 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2571 static unsigned int ip6_mtu(const struct dst_entry *dst)
2573 struct inet6_dev *idev;
2576 mtu = dst_metric_raw(dst, RTAX_MTU);
2583 idev = __in6_dev_get(dst->dev);
2585 mtu = idev->cnf.mtu6;
2589 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2591 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2595 * 1. mtu on route is locked - use it
2596 * 2. mtu from nexthop exception
2597 * 3. mtu from egress device
2599 * based on ip6_dst_mtu_forward and exception logic of
2600 * rt6_find_cached_rt; called with rcu_read_lock
2602 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2603 struct in6_addr *saddr)
2605 struct rt6_exception_bucket *bucket;
2606 struct rt6_exception *rt6_ex;
2607 struct in6_addr *src_key;
2608 struct inet6_dev *idev;
2611 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2612 mtu = f6i->fib6_pmtu;
2618 #ifdef CONFIG_IPV6_SUBTREES
2619 if (f6i->fib6_src.plen)
2623 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2624 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2625 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2626 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2629 struct net_device *dev = fib6_info_nh_dev(f6i);
2632 idev = __in6_dev_get(dev);
2633 if (idev && idev->cnf.mtu6 > mtu)
2634 mtu = idev->cnf.mtu6;
2637 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2639 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
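/* Allocate an uncached, host-width dst for transmitting ICMPv6/ndisc
 * packets towards fl6->daddr. The entry is kept on uncached_list (not in
 * the FIB) so rt6_disable_ip() can release the device reference, and the
 * result is passed through xfrm_lookup() before being returned.
 */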
2642 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2645 struct dst_entry *dst;
2646 struct rt6_info *rt;
2647 struct inet6_dev *idev = in6_dev_get(dev);
2648 struct net *net = dev_net(dev);
2650 if (unlikely(!idev))
2651 return ERR_PTR(-ENODEV);
2653 rt = ip6_dst_alloc(net, dev, 0);
2654 if (unlikely(!rt)) {
2656 dst = ERR_PTR(-ENOMEM);
2660 rt->dst.flags |= DST_HOST;
2661 rt->dst.input = ip6_input;
2662 rt->dst.output = ip6_output;
2663 rt->rt6i_gateway = fl6->daddr;
2664 rt->rt6i_dst.addr = fl6->daddr;
2665 rt->rt6i_dst.plen = 128;
2666 rt->rt6i_idev = idev;
2667 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2669 /* Add this dst into uncached_list so that rt6_disable_ip() can
2670 * do proper release of the net_device
2672 rt6_uncached_list_add(rt);
2673 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2675 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
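/* dst garbage collection. A run is skipped while gc_min_interval has not
 * elapsed and the entry count is still within ip6_rt_max_size. Each run
 * bumps ip6_rt_gc_expire (making fib6_run_gc() more aggressive); once the
 * count drops below gc_thresh it is reset to half of gc_timeout, and every
 * call decays it by expire >> gc_elasticity. Returns true while the table
 * is still over max_size.
 */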
2681 static int ip6_dst_gc(struct dst_ops *ops)
2683 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2684 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2685 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2686 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2687 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2688 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2691 entries = dst_entries_get_fast(ops);
2692 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2693 entries <= rt_max_size)
2696 net->ipv6.ip6_rt_gc_expire++;
2697 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2698 entries = dst_entries_get_slow(ops);
2699 if (entries < ops->gc_thresh)
2700 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2702 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2703 return entries > rt_max_size;
2706 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2707 struct fib6_config *cfg,
2708 const struct in6_addr *gw_addr,
2709 u32 tbid, int flags)
2711 struct flowi6 fl6 = {
2712 .flowi6_oif = cfg->fc_ifindex,
2714 .saddr = cfg->fc_prefsrc,
2716 struct fib6_table *table;
2717 struct rt6_info *rt;
2719 table = fib6_get_table(net, tbid);
2723 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2724 flags |= RT6_LOOKUP_F_HAS_SADDR;
2726 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2727 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2729 /* if table lookup failed, fall back to full lookup */
2730 if (rt == net->ipv6.ip6_null_entry) {
2738 static int ip6_route_check_nh_onlink(struct net *net,
2739 struct fib6_config *cfg,
2740 const struct net_device *dev,
2741 struct netlink_ext_ack *extack)
2743 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2744 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2745 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2746 struct fib6_info *from;
2747 struct rt6_info *grt;
2751 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2754 from = rcu_dereference(grt->from);
2755 if (!grt->dst.error &&
2756 /* ignore match if it is the default route */
2757 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2758 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2759 NL_SET_ERR_MSG(extack,
2760 "Nexthop has invalid gateway or device mismatch");
2771 static int ip6_route_check_nh(struct net *net,
2772 struct fib6_config *cfg,
2773 struct net_device **_dev,
2774 struct inet6_dev **idev)
2776 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2777 struct net_device *dev = _dev ? *_dev : NULL;
2778 struct rt6_info *grt = NULL;
2779 int err = -EHOSTUNREACH;
2781 if (cfg->fc_table) {
2782 int flags = RT6_LOOKUP_F_IFACE;
2784 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2785 cfg->fc_table, flags);
2787 if (grt->rt6i_flags & RTF_GATEWAY ||
2788 (dev && dev != grt->dst.dev)) {
2796 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2802 if (dev != grt->dst.dev) {
2807 *_dev = dev = grt->dst.dev;
2808 *idev = grt->rt6i_idev;
2810 in6_dev_hold(grt->rt6i_idev);
2813 if (!(grt->rt6i_flags & RTF_GATEWAY))
2822 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2823 struct net_device **_dev, struct inet6_dev **idev,
2824 struct netlink_ext_ack *extack)
2826 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2827 int gwa_type = ipv6_addr_type(gw_addr);
2828 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2829 const struct net_device *dev = *_dev;
2830 bool need_addr_check = !dev;
2833 /* if gw_addr is local we will fail to detect this in case
2834 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2835 * will return already-added prefix route via interface that
2836 * prefix route was assigned to, which might be non-loopback.
2839 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2840 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2844 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2845 /* IPv6 strictly prohibits using non-link-local
2846 * addresses as the nexthop address.
2847 * Otherwise, the router will not be able to send redirects.
2848 * It is very good, but in some (rare!) circumstances
2849 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2850 * some exceptions. --ANK
2851 * We allow IPv4-mapped nexthops to support RFC4798-type
2854 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2855 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2859 if (cfg->fc_flags & RTNH_F_ONLINK)
2860 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2862 err = ip6_route_check_nh(net, cfg, _dev, idev);
2868 /* reload in case device was changed */
2873 NL_SET_ERR_MSG(extack, "Egress device not specified");
2875 } else if (dev->flags & IFF_LOOPBACK) {
2876 NL_SET_ERR_MSG(extack,
2877 "Egress device can not be loopback device for this route");
2881 /* if we did not check gw_addr above, do so now that the
2882 * egress device has been resolved.
2884 if (need_addr_check &&
2885 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2886 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2895 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2897 struct netlink_ext_ack *extack)
2899 struct net *net = cfg->fc_nlinfo.nl_net;
2900 struct fib6_info *rt = NULL;
2901 struct net_device *dev = NULL;
2902 struct inet6_dev *idev = NULL;
2903 struct fib6_table *table;
2907 /* RTF_PCPU is an internal flag; can not be set by userspace */
2908 if (cfg->fc_flags & RTF_PCPU) {
2909 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2913 /* RTF_CACHE is an internal flag; can not be set by userspace */
2914 if (cfg->fc_flags & RTF_CACHE) {
2915 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2919 if (cfg->fc_type > RTN_MAX) {
2920 NL_SET_ERR_MSG(extack, "Invalid route type");
2924 if (cfg->fc_dst_len > 128) {
2925 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2928 if (cfg->fc_src_len > 128) {
2929 NL_SET_ERR_MSG(extack, "Invalid source address length");
2932 #ifndef CONFIG_IPV6_SUBTREES
2933 if (cfg->fc_src_len) {
2934 NL_SET_ERR_MSG(extack,
2935 "Specifying source address requires IPV6_SUBTREES to be enabled");
2939 if (cfg->fc_ifindex) {
2941 dev = dev_get_by_index(net, cfg->fc_ifindex);
2944 idev = in6_dev_get(dev);
2949 if (cfg->fc_metric == 0)
2950 cfg->fc_metric = IP6_RT_PRIO_USER;
2952 if (cfg->fc_flags & RTNH_F_ONLINK) {
2954 NL_SET_ERR_MSG(extack,
2955 "Nexthop device required for onlink");
2960 if (!(dev->flags & IFF_UP)) {
2961 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2968 if (cfg->fc_nlinfo.nlh &&
2969 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2970 table = fib6_get_table(net, cfg->fc_table);
2972 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2973 table = fib6_new_table(net, cfg->fc_table);
2976 table = fib6_new_table(net, cfg->fc_table);
2983 rt = fib6_info_alloc(gfp_flags);
2987 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
2989 if (IS_ERR(rt->fib6_metrics)) {
2990 err = PTR_ERR(rt->fib6_metrics);
2991 /* Do not leave garbage there. */
2992 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
2996 if (cfg->fc_flags & RTF_ADDRCONF)
2997 rt->dst_nocount = true;
2999 if (cfg->fc_flags & RTF_EXPIRES)
3000 fib6_set_expires(rt, jiffies +
3001 clock_t_to_jiffies(cfg->fc_expires));
3003 fib6_clean_expires(rt);
3005 if (cfg->fc_protocol == RTPROT_UNSPEC)
3006 cfg->fc_protocol = RTPROT_BOOT;
3007 rt->fib6_protocol = cfg->fc_protocol;
3009 addr_type = ipv6_addr_type(&cfg->fc_dst);
3011 if (cfg->fc_encap) {
3012 struct lwtunnel_state *lwtstate;
3014 err = lwtunnel_build_state(cfg->fc_encap_type,
3015 cfg->fc_encap, AF_INET6, cfg,
3019 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3022 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3023 rt->fib6_dst.plen = cfg->fc_dst_len;
3024 if (rt->fib6_dst.plen == 128)
3025 rt->dst_host = true;
3027 #ifdef CONFIG_IPV6_SUBTREES
3028 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3029 rt->fib6_src.plen = cfg->fc_src_len;
3032 rt->fib6_metric = cfg->fc_metric;
3033 rt->fib6_nh.nh_weight = 1;
3035 rt->fib6_type = cfg->fc_type;
3037 /* We cannot add true routes via loopback here;
3038 they would result in kernel looping, so promote them to reject routes
3040 if ((cfg->fc_flags & RTF_REJECT) ||
3041 (dev && (dev->flags & IFF_LOOPBACK) &&
3042 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3043 !(cfg->fc_flags & RTF_LOCAL))) {
3044 /* hold loopback dev/idev if we haven't done so. */
3045 if (dev != net->loopback_dev) {
3050 dev = net->loopback_dev;
3052 idev = in6_dev_get(dev);
3058 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3062 if (cfg->fc_flags & RTF_GATEWAY) {
3063 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3067 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3074 if (idev->cnf.disable_ipv6) {
3075 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3080 if (!(dev->flags & IFF_UP)) {
3081 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3086 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3087 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3088 NL_SET_ERR_MSG(extack, "Invalid source address");
3092 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3093 rt->fib6_prefsrc.plen = 128;
3095 rt->fib6_prefsrc.plen = 0;
3097 rt->fib6_flags = cfg->fc_flags;
3100 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3101 !netif_carrier_ok(dev))
3102 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3103 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3104 rt->fib6_nh.nh_dev = dev;
3105 rt->fib6_table = table;
3117 fib6_info_release(rt);
3118 return ERR_PTR(err);
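/* ip6_route_add() is a thin wrapper: create the fib6_info via
 * ip6_route_info_create(), insert it with __ip6_ins_rt(), and drop the
 * creation reference regardless of the outcome.
 */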
3121 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3122 struct netlink_ext_ack *extack)
3124 struct fib6_info *rt;
3127 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3131 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3132 fib6_info_release(rt);
3137 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3139 struct net *net = info->nl_net;
3140 struct fib6_table *table;
3143 if (rt == net->ipv6.fib6_null_entry) {
3148 table = rt->fib6_table;
3149 spin_lock_bh(&table->tb6_lock);
3150 err = fib6_del(rt, info);
3151 spin_unlock_bh(&table->tb6_lock);
3154 fib6_info_release(rt);
3158 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3160 struct nl_info info = { .nl_net = net };
3162 return __ip6_del_rt(rt, &info);
3165 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3167 struct nl_info *info = &cfg->fc_nlinfo;
3168 struct net *net = info->nl_net;
3169 struct sk_buff *skb = NULL;
3170 struct fib6_table *table;
3173 if (rt == net->ipv6.fib6_null_entry)
3175 table = rt->fib6_table;
3176 spin_lock_bh(&table->tb6_lock);
3178 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3179 struct fib6_info *sibling, *next_sibling;
3181 /* prefer to send a single notification with all hops */
3182 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3184 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3186 if (rt6_fill_node(net, skb, rt, NULL,
3187 NULL, NULL, 0, RTM_DELROUTE,
3188 info->portid, seq, 0) < 0) {
3192 info->skip_notify = 1;
3195 list_for_each_entry_safe(sibling, next_sibling,
3198 err = fib6_del(sibling, info);
3204 err = fib6_del(rt, info);
3206 spin_unlock_bh(&table->tb6_lock);
3208 fib6_info_release(rt);
3211 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3212 info->nlh, gfp_any());
3217 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3221 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3224 if (cfg->fc_flags & RTF_GATEWAY &&
3225 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3228 rc = rt6_remove_exception_rt(rt);
3233 static int ip6_route_del(struct fib6_config *cfg,
3234 struct netlink_ext_ack *extack)
3236 struct rt6_info *rt_cache;
3237 struct fib6_table *table;
3238 struct fib6_info *rt;
3239 struct fib6_node *fn;
3242 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3244 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3250 fn = fib6_locate(&table->tb6_root,
3251 &cfg->fc_dst, cfg->fc_dst_len,
3252 &cfg->fc_src, cfg->fc_src_len,
3253 !(cfg->fc_flags & RTF_CACHE));
3256 for_each_fib6_node_rt_rcu(fn) {
3257 if (cfg->fc_flags & RTF_CACHE) {
3260 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3263 rc = ip6_del_cached_rt(rt_cache, cfg);
3271 if (cfg->fc_ifindex &&
3272 (!rt->fib6_nh.nh_dev ||
3273 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3275 if (cfg->fc_flags & RTF_GATEWAY &&
3276 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3278 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3280 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3282 if (!fib6_info_hold_safe(rt))
3286 /* if gateway was specified only delete the one hop */
3287 if (cfg->fc_flags & RTF_GATEWAY)
3288 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3290 return __ip6_del_rt_siblings(rt, cfg);
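/* Process a received ND redirect (RFC 4861). The message is validated
 * (length, non-multicast destination, link-local unicast target, router
 * not forwarding and accept_redirects enabled), the neighbour entry for
 * the new first hop is updated, and a cloned cache route pointing at the
 * new gateway is inserted into the exception table of the originating
 * fib6_info before NETEVENT_REDIRECT is raised.
 */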
3298 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3300 struct netevent_redirect netevent;
3301 struct rt6_info *rt, *nrt = NULL;
3302 struct ndisc_options ndopts;
3303 struct inet6_dev *in6_dev;
3304 struct neighbour *neigh;
3305 struct fib6_info *from;
3307 int optlen, on_link;
3310 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3311 optlen -= sizeof(*msg);
3314 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3318 msg = (struct rd_msg *)icmp6_hdr(skb);
3320 if (ipv6_addr_is_multicast(&msg->dest)) {
3321 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3326 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3328 } else if (ipv6_addr_type(&msg->target) !=
3329 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3330 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3334 in6_dev = __in6_dev_get(skb->dev);
3337 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3341 * The IP source address of the Redirect MUST be the same as the current
3342 * first-hop router for the specified ICMP Destination Address.
3345 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3346 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3351 if (ndopts.nd_opts_tgt_lladdr) {
3352 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3355 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3360 rt = (struct rt6_info *) dst;
3361 if (rt->rt6i_flags & RTF_REJECT) {
3362 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3366 /* Redirect received -> path was valid.
3367 * Look, redirects are sent only in response to data packets,
3368 * so that this nexthop apparently is reachable. --ANK
3370 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3372 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3377 * We have finally decided to accept it.
3380 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3381 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3382 NEIGH_UPDATE_F_OVERRIDE|
3383 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3384 NEIGH_UPDATE_F_ISROUTER)),
3385 NDISC_REDIRECT, &ndopts);
3388 from = rcu_dereference(rt->from);
3389 /* This fib6_info_hold() is safe here because we hold a reference to rt
3390 * and rt already holds a reference to the fib6_info.
3392 fib6_info_hold(from);
3395 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3399 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3401 nrt->rt6i_flags &= ~RTF_GATEWAY;
3403 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3405 /* No need to remove rt from the exception table if rt is
3406 * a cached route because rt6_insert_exception() will
3409 if (rt6_insert_exception(nrt, from)) {
3410 dst_release_immediate(&nrt->dst);
3414 netevent.old = &rt->dst;
3415 netevent.new = &nrt->dst;
3416 netevent.daddr = &msg->dest;
3417 netevent.neigh = neigh;
3418 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3421 fib6_info_release(from);
3422 neigh_release(neigh);
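/* Routes learned from Router Advertisement Route Information options
 * (RFC 4191, CONFIG_IPV6_ROUTE_INFO): looked up and installed per prefix,
 * gateway and interface in the l3mdev table or RT6_TABLE_INFO, flagged
 * RTF_ROUTEINFO | RTF_GATEWAY and owned by RTPROT_RA.
 */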
3425 #ifdef CONFIG_IPV6_ROUTE_INFO
3426 static struct fib6_info *rt6_get_route_info(struct net *net,
3427 const struct in6_addr *prefix, int prefixlen,
3428 const struct in6_addr *gwaddr,
3429 struct net_device *dev)
3431 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3432 int ifindex = dev->ifindex;
3433 struct fib6_node *fn;
3434 struct fib6_info *rt = NULL;
3435 struct fib6_table *table;
3437 table = fib6_get_table(net, tb_id);
3442 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3446 for_each_fib6_node_rt_rcu(fn) {
3447 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3449 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3451 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3453 if (!fib6_info_hold_safe(rt))
3462 static struct fib6_info *rt6_add_route_info(struct net *net,
3463 const struct in6_addr *prefix, int prefixlen,
3464 const struct in6_addr *gwaddr,
3465 struct net_device *dev,
3468 struct fib6_config cfg = {
3469 .fc_metric = IP6_RT_PRIO_USER,
3470 .fc_ifindex = dev->ifindex,
3471 .fc_dst_len = prefixlen,
3472 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3473 RTF_UP | RTF_PREF(pref),
3474 .fc_protocol = RTPROT_RA,
3475 .fc_type = RTN_UNICAST,
3476 .fc_nlinfo.portid = 0,
3477 .fc_nlinfo.nlh = NULL,
3478 .fc_nlinfo.nl_net = net,
3481 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3482 cfg.fc_dst = *prefix;
3483 cfg.fc_gateway = *gwaddr;
3485 /* We should treat it as a default route if prefix length is 0. */
3487 cfg.fc_flags |= RTF_DEFAULT;
3489 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3491 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3495 struct fib6_info *rt6_get_dflt_router(struct net *net,
3496 const struct in6_addr *addr,
3497 struct net_device *dev)
3499 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3500 struct fib6_info *rt;
3501 struct fib6_table *table;
3503 table = fib6_get_table(net, tb_id);
3508 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3509 if (dev == rt->fib6_nh.nh_dev &&
3510 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3511 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3514 if (rt && !fib6_info_hold_safe(rt))
3520 struct fib6_info *rt6_add_dflt_router(struct net *net,
3521 const struct in6_addr *gwaddr,
3522 struct net_device *dev,
3525 struct fib6_config cfg = {
3526 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3527 .fc_metric = IP6_RT_PRIO_USER,
3528 .fc_ifindex = dev->ifindex,
3529 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3530 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3531 .fc_protocol = RTPROT_RA,
3532 .fc_type = RTN_UNICAST,
3533 .fc_nlinfo.portid = 0,
3534 .fc_nlinfo.nlh = NULL,
3535 .fc_nlinfo.nl_net = net,
3538 cfg.fc_gateway = *gwaddr;
3540 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3541 struct fib6_table *table;
3543 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3545 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3548 return rt6_get_dflt_router(net, gwaddr, dev);
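/* Drop RA-learned default routers (RTF_ADDRCONF | RTF_DEFAULT) from a
 * table, skipping interfaces where accept_ra == 2, i.e. where RAs are
 * still honoured even with forwarding enabled.
 */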
3551 static void __rt6_purge_dflt_routers(struct net *net,
3552 struct fib6_table *table)
3554 struct fib6_info *rt;
3558 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3559 struct net_device *dev = fib6_info_nh_dev(rt);
3560 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3562 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3563 (!idev || idev->cnf.accept_ra != 2) &&
3564 fib6_info_hold_safe(rt)) {
3566 ip6_del_rt(net, rt);
3572 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3575 void rt6_purge_dflt_routers(struct net *net)
3577 struct fib6_table *table;
3578 struct hlist_head *head;
3583 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3584 head = &net->ipv6.fib_table_hash[h];
3585 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3586 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3587 __rt6_purge_dflt_routers(net, table);
3594 static void rtmsg_to_fib6_config(struct net *net,
3595 struct in6_rtmsg *rtmsg,
3596 struct fib6_config *cfg)
3598 *cfg = (struct fib6_config){
3599 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3601 .fc_ifindex = rtmsg->rtmsg_ifindex,
3602 .fc_metric = rtmsg->rtmsg_metric,
3603 .fc_expires = rtmsg->rtmsg_info,
3604 .fc_dst_len = rtmsg->rtmsg_dst_len,
3605 .fc_src_len = rtmsg->rtmsg_src_len,
3606 .fc_flags = rtmsg->rtmsg_flags,
3607 .fc_type = rtmsg->rtmsg_type,
3609 .fc_nlinfo.nl_net = net,
3611 .fc_dst = rtmsg->rtmsg_dst,
3612 .fc_src = rtmsg->rtmsg_src,
3613 .fc_gateway = rtmsg->rtmsg_gateway,
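/* Legacy ioctl interface (SIOCADDRT / SIOCDELRT), e.g. what the old
 * "route -A inet6 add ..." command uses (illustrative); requires
 * CAP_NET_ADMIN in the owning user namespace.
 */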
3617 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3619 struct fib6_config cfg;
3620 struct in6_rtmsg rtmsg;
3624 case SIOCADDRT: /* Add a route */
3625 case SIOCDELRT: /* Delete a route */
3626 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3628 err = copy_from_user(&rtmsg, arg,
3629 sizeof(struct in6_rtmsg));
3633 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3638 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3641 err = ip6_route_del(&cfg, NULL);
3655 * Drop the packet on the floor
3658 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3661 struct dst_entry *dst = skb_dst(skb);
3662 switch (ipstats_mib_noroutes) {
3663 case IPSTATS_MIB_INNOROUTES:
3664 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3665 if (type == IPV6_ADDR_ANY) {
3666 IP6_INC_STATS(dev_net(dst->dev),
3667 __in6_dev_get_safely(skb->dev),
3668 IPSTATS_MIB_INADDRERRORS);
3672 case IPSTATS_MIB_OUTNOROUTES:
3673 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3674 ipstats_mib_noroutes);
3677 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3682 static int ip6_pkt_discard(struct sk_buff *skb)
3684 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3687 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3689 skb->dev = skb_dst(skb)->dev;
3690 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3693 static int ip6_pkt_prohibit(struct sk_buff *skb)
3695 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3698 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3700 skb->dev = skb_dst(skb)->dev;
3701 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3705 * Allocate a dst for local (unicast / anycast) address.
3708 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3709 struct inet6_dev *idev,
3710 const struct in6_addr *addr,
3711 bool anycast, gfp_t gfp_flags)
3714 struct net_device *dev = idev->dev;
3715 struct fib6_info *f6i;
3717 f6i = fib6_info_alloc(gfp_flags);
3719 return ERR_PTR(-ENOMEM);
3721 f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3722 f6i->dst_nocount = true;
3723 f6i->dst_host = true;
3724 f6i->fib6_protocol = RTPROT_KERNEL;
3725 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3727 f6i->fib6_type = RTN_ANYCAST;
3728 f6i->fib6_flags |= RTF_ANYCAST;
3730 f6i->fib6_type = RTN_LOCAL;
3731 f6i->fib6_flags |= RTF_LOCAL;
3734 f6i->fib6_nh.nh_gw = *addr;
3736 f6i->fib6_nh.nh_dev = dev;
3737 f6i->fib6_dst.addr = *addr;
3738 f6i->fib6_dst.plen = 128;
3739 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3740 f6i->fib6_table = fib6_get_table(net, tb_id);
3745 /* remove deleted ip from prefsrc entries */
3746 struct arg_dev_net_ip {
3747 struct net_device *dev;
3749 struct in6_addr *addr;
3752 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3754 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3755 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3756 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3758 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3759 rt != net->ipv6.fib6_null_entry &&
3760 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3761 spin_lock_bh(&rt6_exception_lock);
3762 /* remove prefsrc entry */
3763 rt->fib6_prefsrc.plen = 0;
3764 spin_unlock_bh(&rt6_exception_lock);
3769 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3771 struct net *net = dev_net(ifp->idev->dev);
3772 struct arg_dev_net_ip adni = {
3773 .dev = ifp->idev->dev,
3777 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3780 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3782 /* Remove routers and update dst entries when a gateway turns into a host. */
3783 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3785 struct in6_addr *gateway = (struct in6_addr *)arg;
3787 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3788 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3792 /* Further clean up cached routes in the exception table.
3793 * This is needed because a cached route may have a different
3794 * gateway than its 'parent' in the case of an ip redirect.
3796 rt6_exceptions_clean_tohost(rt, gateway);
3801 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3803 fib6_clean_all(net, fib6_clean_tohost, gateway);
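/* Multipath rebalancing helpers. Each live sibling's weight is turned into
 * a cumulative upper bound on the 31-bit flow hash space via
 * DIV_ROUND_CLOSEST_ULL((u64)weight << 31, total); as a rough example,
 * weights 1 and 3 give bounds at about 25% and 100% of the space. Dead
 * nexthops (RTNH_F_DEAD, or RTNH_F_LINKDOWN when link-down routes are
 * ignored) keep an upper bound of -1 and are skipped during selection.
 */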
3806 struct arg_netdev_event {
3807 const struct net_device *dev;
3809 unsigned int nh_flags;
3810 unsigned long event;
3814 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3816 struct fib6_info *iter;
3817 struct fib6_node *fn;
3819 fn = rcu_dereference_protected(rt->fib6_node,
3820 lockdep_is_held(&rt->fib6_table->tb6_lock));
3821 iter = rcu_dereference_protected(fn->leaf,
3822 lockdep_is_held(&rt->fib6_table->tb6_lock));
3824 if (iter->fib6_metric == rt->fib6_metric &&
3825 rt6_qualify_for_ecmp(iter))
3827 iter = rcu_dereference_protected(iter->fib6_next,
3828 lockdep_is_held(&rt->fib6_table->tb6_lock));
3834 static bool rt6_is_dead(const struct fib6_info *rt)
3836 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3837 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3838 fib6_ignore_linkdown(rt)))
3844 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3846 struct fib6_info *iter;
3849 if (!rt6_is_dead(rt))
3850 total += rt->fib6_nh.nh_weight;
3852 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3853 if (!rt6_is_dead(iter))
3854 total += iter->fib6_nh.nh_weight;
3860 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3862 int upper_bound = -1;
3864 if (!rt6_is_dead(rt)) {
3865 *weight += rt->fib6_nh.nh_weight;
3866 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3869 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3872 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3874 struct fib6_info *iter;
3877 rt6_upper_bound_set(rt, &weight, total);
3879 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3880 rt6_upper_bound_set(iter, &weight, total);
3883 void rt6_multipath_rebalance(struct fib6_info *rt)
3885 struct fib6_info *first;
3888 /* If the entire multipath route was marked for flushing,
3889 * there is no need to rebalance upon the removal of every
3892 if (!rt->fib6_nsiblings || rt->should_flush)
3895 /* During lookup routes are evaluated in order, so we need to
3896 * make sure upper bounds are assigned from the first sibling
3899 first = rt6_multipath_first_sibling(rt);
3900 if (WARN_ON_ONCE(!first))
3903 total = rt6_multipath_total_weight(first);
3904 rt6_multipath_upper_bound_set(first, total);
3907 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3909 const struct arg_netdev_event *arg = p_arg;
3910 struct net *net = dev_net(arg->dev);
3912 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3913 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3914 fib6_update_sernum_upto_root(net, rt);
3915 rt6_multipath_rebalance(rt);
3921 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3923 struct arg_netdev_event arg = {
3926 .nh_flags = nh_flags,
3930 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3931 arg.nh_flags |= RTNH_F_LINKDOWN;
3933 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3936 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3937 const struct net_device *dev)
3939 struct fib6_info *iter;
3941 if (rt->fib6_nh.nh_dev == dev)
3943 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3944 if (iter->fib6_nh.nh_dev == dev)
3950 static void rt6_multipath_flush(struct fib6_info *rt)
3952 struct fib6_info *iter;
3954 rt->should_flush = 1;
3955 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3956 iter->should_flush = 1;
3959 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3960 const struct net_device *down_dev)
3962 struct fib6_info *iter;
3963 unsigned int dead = 0;
3965 if (rt->fib6_nh.nh_dev == down_dev ||
3966 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3968 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3969 if (iter->fib6_nh.nh_dev == down_dev ||
3970 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3976 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3977 const struct net_device *dev,
3978 unsigned int nh_flags)
3980 struct fib6_info *iter;
3982 if (rt->fib6_nh.nh_dev == dev)
3983 rt->fib6_nh.nh_flags |= nh_flags;
3984 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3985 if (iter->fib6_nh.nh_dev == dev)
3986 iter->fib6_nh.nh_flags |= nh_flags;
3989 /* called with write lock held for table with rt */
3990 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3992 const struct arg_netdev_event *arg = p_arg;
3993 const struct net_device *dev = arg->dev;
3994 struct net *net = dev_net(dev);
3996 if (rt == net->ipv6.fib6_null_entry)
3999 switch (arg->event) {
4000 case NETDEV_UNREGISTER:
4001 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4003 if (rt->should_flush)
4005 if (!rt->fib6_nsiblings)
4006 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4007 if (rt6_multipath_uses_dev(rt, dev)) {
4010 count = rt6_multipath_dead_count(rt, dev);
4011 if (rt->fib6_nsiblings + 1 == count) {
4012 rt6_multipath_flush(rt);
4015 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4017 fib6_update_sernum(net, rt);
4018 rt6_multipath_rebalance(rt);
4022 if (rt->fib6_nh.nh_dev != dev ||
4023 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4025 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4026 rt6_multipath_rebalance(rt);
4033 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4035 struct arg_netdev_event arg = {
4041 struct net *net = dev_net(dev);
4043 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4044 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4046 fib6_clean_all(net, fib6_ifdown, &arg);
4049 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4051 rt6_sync_down_dev(dev, event);
4052 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4053 neigh_ifdown(&nd_tbl, dev);
4056 struct rt6_mtu_change_arg {
4057 struct net_device *dev;
4061 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4063 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4064 struct inet6_dev *idev;
4066 /* In IPv6 pmtu discovery is not optional,
4067 so that the RTAX_MTU lock cannot disable it.
4068 We still use this lock to block changes
4069 caused by addrconf/ndisc.
4072 idev = __in6_dev_get(arg->dev);
4076 /* For an administrative MTU increase there is no way to discover
4077 an IPv6 PMTU increase, so the PMTU should be updated here.
4078 Since RFC 1981 doesn't cover administrative MTU increases,
4079 updating the PMTU on such an increase is a MUST (e.g. jumbo frames).
4081 if (rt->fib6_nh.nh_dev == arg->dev &&
4082 !fib6_metric_locked(rt, RTAX_MTU)) {
4083 u32 mtu = rt->fib6_pmtu;
4085 if (mtu >= arg->mtu ||
4086 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4087 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4089 spin_lock_bh(&rt6_exception_lock);
4090 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4091 spin_unlock_bh(&rt6_exception_lock);
4096 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4098 struct rt6_mtu_change_arg arg = {
4103 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4106 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4107 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4108 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4109 [RTA_OIF] = { .type = NLA_U32 },
4110 [RTA_IIF] = { .type = NLA_U32 },
4111 [RTA_PRIORITY] = { .type = NLA_U32 },
4112 [RTA_METRICS] = { .type = NLA_NESTED },
4113 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4114 [RTA_PREF] = { .type = NLA_U8 },
4115 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4116 [RTA_ENCAP] = { .type = NLA_NESTED },
4117 [RTA_EXPIRES] = { .type = NLA_U32 },
4118 [RTA_UID] = { .type = NLA_U32 },
4119 [RTA_MARK] = { .type = NLA_U32 },
4120 [RTA_TABLE] = { .type = NLA_U32 },
4121 [RTA_IP_PROTO] = { .type = NLA_U8 },
4122 [RTA_SPORT] = { .type = NLA_U16 },
4123 [RTA_DPORT] = { .type = NLA_U16 },
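/* Netlink attribute policy shared by the RTM_NEWROUTE / RTM_DELROUTE /
 * RTM_GETROUTE handlers. rtm_to_fib6_config() below translates the rtmsg
 * header plus these attributes into a struct fib6_config.
 */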
4126 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4127 struct fib6_config *cfg,
4128 struct netlink_ext_ack *extack)
4131 struct nlattr *tb[RTA_MAX+1];
4135 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4141 rtm = nlmsg_data(nlh);
4143 *cfg = (struct fib6_config){
4144 .fc_table = rtm->rtm_table,
4145 .fc_dst_len = rtm->rtm_dst_len,
4146 .fc_src_len = rtm->rtm_src_len,
4148 .fc_protocol = rtm->rtm_protocol,
4149 .fc_type = rtm->rtm_type,
4151 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4152 .fc_nlinfo.nlh = nlh,
4153 .fc_nlinfo.nl_net = sock_net(skb->sk),
4156 if (rtm->rtm_type == RTN_UNREACHABLE ||
4157 rtm->rtm_type == RTN_BLACKHOLE ||
4158 rtm->rtm_type == RTN_PROHIBIT ||
4159 rtm->rtm_type == RTN_THROW)
4160 cfg->fc_flags |= RTF_REJECT;
4162 if (rtm->rtm_type == RTN_LOCAL)
4163 cfg->fc_flags |= RTF_LOCAL;
4165 if (rtm->rtm_flags & RTM_F_CLONED)
4166 cfg->fc_flags |= RTF_CACHE;
4168 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4170 if (tb[RTA_GATEWAY]) {
4171 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4172 cfg->fc_flags |= RTF_GATEWAY;
4176 int plen = (rtm->rtm_dst_len + 7) >> 3;
4178 if (nla_len(tb[RTA_DST]) < plen)
4181 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4185 int plen = (rtm->rtm_src_len + 7) >> 3;
4187 if (nla_len(tb[RTA_SRC]) < plen)
4190 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4193 if (tb[RTA_PREFSRC])
4194 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4197 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4199 if (tb[RTA_PRIORITY])
4200 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4202 if (tb[RTA_METRICS]) {
4203 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4204 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4208 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4210 if (tb[RTA_MULTIPATH]) {
4211 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4212 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4214 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4215 cfg->fc_mp_len, extack);
4221 pref = nla_get_u8(tb[RTA_PREF]);
4222 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4223 pref != ICMPV6_ROUTER_PREF_HIGH)
4224 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4225 cfg->fc_flags |= RTF_PREF(pref);
4229 cfg->fc_encap = tb[RTA_ENCAP];
4231 if (tb[RTA_ENCAP_TYPE]) {
4232 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4234 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4239 if (tb[RTA_EXPIRES]) {
4240 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4242 if (addrconf_finite_timeout(timeout)) {
4243 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4244 cfg->fc_flags |= RTF_EXPIRES;
4254 struct fib6_info *fib6_info;
4255 struct fib6_config r_cfg;
4256 struct list_head next;
4259 static int ip6_route_info_append(struct net *net,
4260 struct list_head *rt6_nh_list,
4261 struct fib6_info *rt,
4262 struct fib6_config *r_cfg)
4267 list_for_each_entry(nh, rt6_nh_list, next) {
4268 /* check if fib6_info already exists */
4269 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4273 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4277 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4278 list_add_tail(&nh->next, rt6_nh_list);
4283 static void ip6_route_mpath_notify(struct fib6_info *rt,
4284 struct fib6_info *rt_last,
4285 struct nl_info *info,
4288 /* if this is an APPEND route, then rt points to the first route
4289 * inserted and rt_last points to last route inserted. Userspace
4290 * wants a consistent dump of the route which starts at the first
4291 * nexthop. Since sibling routes are always added at the end of
4292 * the list, find the first sibling of the last route appended
4294 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4295 rt = list_first_entry(&rt_last->fib6_siblings,
4301 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
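/* RTM_NEWROUTE with RTA_MULTIPATH: each rtnexthop entry becomes its own
 * fib6_info, the entries are inserted as siblings, and a single combined
 * notification is sent. Roughly what
 *   ip -6 route add 2001:db8::/64 \
 *       nexthop via 2001:db8:1::1 dev eth0 \
 *       nexthop via 2001:db8:2::1 dev eth1
 * generates (illustrative addresses and devices).
 */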
4304 static int ip6_route_multipath_add(struct fib6_config *cfg,
4305 struct netlink_ext_ack *extack)
4307 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4308 struct nl_info *info = &cfg->fc_nlinfo;
4309 struct fib6_config r_cfg;
4310 struct rtnexthop *rtnh;
4311 struct fib6_info *rt;
4312 struct rt6_nh *err_nh;
4313 struct rt6_nh *nh, *nh_safe;
4319 int replace = (cfg->fc_nlinfo.nlh &&
4320 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4321 LIST_HEAD(rt6_nh_list);
4323 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4324 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4325 nlflags |= NLM_F_APPEND;
4327 remaining = cfg->fc_mp_len;
4328 rtnh = (struct rtnexthop *)cfg->fc_mp;
4330 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4331 * fib6_info structs per nexthop
4333 while (rtnh_ok(rtnh, remaining)) {
4334 memcpy(&r_cfg, cfg, sizeof(*cfg));
4335 if (rtnh->rtnh_ifindex)
4336 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4338 attrlen = rtnh_attrlen(rtnh);
4340 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4342 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4344 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4345 r_cfg.fc_flags |= RTF_GATEWAY;
4347 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4348 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4350 r_cfg.fc_encap_type = nla_get_u16(nla);
4353 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4354 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4360 if (!rt6_qualify_for_ecmp(rt)) {
4362 NL_SET_ERR_MSG(extack,
4363 "Device only routes can not be added for IPv6 using the multipath API.");
4364 fib6_info_release(rt);
4368 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4370 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4373 fib6_info_release(rt);
4377 rtnh = rtnh_next(rtnh, &remaining);
4380 /* for add and replace send one notification with all nexthops.
4381 * Skip the notification in fib6_add_rt2node and send one with
4382 * the full route when done
4384 info->skip_notify = 1;
4387 list_for_each_entry(nh, &rt6_nh_list, next) {
4388 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4389 fib6_info_release(nh->fib6_info);
4392 /* save reference to last route successfully inserted */
4393 rt_last = nh->fib6_info;
4395 /* save reference to first route for notification */
4397 rt_notif = nh->fib6_info;
4400 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4401 nh->fib6_info = NULL;
4404 NL_SET_ERR_MSG_MOD(extack,
4405 "multipath route replace failed (check consistency of installed routes)");
4410 /* Because each route is added like a single route we remove
4411 * these flags after the first nexthop: if there is a collision,
4412 * we have already failed to add the first nexthop:
4413 * fib6_add_rt2node() has rejected it; when replacing, old
4414 * nexthops have been replaced by first new, the rest should
4417 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4422 /* success ... tell user about new route */
4423 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4427 /* send notification for routes that were added so that
4428 * the delete notifications sent by ip6_route_del are
4432 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4434 /* Delete routes that were already added */
4435 list_for_each_entry(nh, &rt6_nh_list, next) {
4438 ip6_route_del(&nh->r_cfg, extack);
4442 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4444 fib6_info_release(nh->fib6_info);
4445 list_del(&nh->next);
4452 static int ip6_route_multipath_del(struct fib6_config *cfg,
4453 struct netlink_ext_ack *extack)
4455 struct fib6_config r_cfg;
4456 struct rtnexthop *rtnh;
4459 int err = 1, last_err = 0;
4461 remaining = cfg->fc_mp_len;
4462 rtnh = (struct rtnexthop *)cfg->fc_mp;
4464 /* Parse a Multipath Entry */
4465 while (rtnh_ok(rtnh, remaining)) {
4466 memcpy(&r_cfg, cfg, sizeof(*cfg));
4467 if (rtnh->rtnh_ifindex)
4468 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4470 attrlen = rtnh_attrlen(rtnh);
4472 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4474 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4476 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4477 r_cfg.fc_flags |= RTF_GATEWAY;
4480 err = ip6_route_del(&r_cfg, extack);
4484 rtnh = rtnh_next(rtnh, &remaining);
4490 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4491 struct netlink_ext_ack *extack)
4493 struct fib6_config cfg;
4496 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4501 return ip6_route_multipath_del(&cfg, extack);
4503 cfg.fc_delete_all_nh = 1;
4504 return ip6_route_del(&cfg, extack);
4508 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4509 struct netlink_ext_ack *extack)
4511 struct fib6_config cfg;
4514 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4519 return ip6_route_multipath_add(&cfg, extack);
4521 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4524 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4526 int nexthop_len = 0;
4528 if (rt->fib6_nsiblings) {
4529 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4530 + NLA_ALIGN(sizeof(struct rtnexthop))
4531 + nla_total_size(16) /* RTA_GATEWAY */
4532 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4534 nexthop_len *= rt->fib6_nsiblings;
4537 return NLMSG_ALIGN(sizeof(struct rtmsg))
4538 + nla_total_size(16) /* RTA_SRC */
4539 + nla_total_size(16) /* RTA_DST */
4540 + nla_total_size(16) /* RTA_GATEWAY */
4541 + nla_total_size(16) /* RTA_PREFSRC */
4542 + nla_total_size(4) /* RTA_TABLE */
4543 + nla_total_size(4) /* RTA_IIF */
4544 + nla_total_size(4) /* RTA_OIF */
4545 + nla_total_size(4) /* RTA_PRIORITY */
4546 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4547 + nla_total_size(sizeof(struct rta_cacheinfo))
4548 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4549 + nla_total_size(1) /* RTA_PREF */
4550 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4554 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4555 unsigned int *flags, bool skip_oif)
4557 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4558 *flags |= RTNH_F_DEAD;
4560 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4561 *flags |= RTNH_F_LINKDOWN;
4564 if (fib6_ignore_linkdown(rt))
4565 *flags |= RTNH_F_DEAD;
4569 if (rt->fib6_flags & RTF_GATEWAY) {
4570 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4571 goto nla_put_failure;
4574 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4575 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4576 *flags |= RTNH_F_OFFLOAD;
4578 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4579 if (!skip_oif && rt->fib6_nh.nh_dev &&
4580 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4581 goto nla_put_failure;
4583 if (rt->fib6_nh.nh_lwtstate &&
4584 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4585 goto nla_put_failure;
4593 /* add multipath next hop */
4594 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4596 const struct net_device *dev = rt->fib6_nh.nh_dev;
4597 struct rtnexthop *rtnh;
4598 unsigned int flags = 0;
4600 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4602 goto nla_put_failure;
4604 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4605 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4607 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4608 goto nla_put_failure;
4610 rtnh->rtnh_flags = flags;
4612 /* length of rtnetlink header + attributes */
4613 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
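/* Fill one RTM_NEWROUTE message. When a dst (rt6_info) is supplied the
 * cached/cloned values are reported (rt6i_dst, rt6i_gateway, dst metrics);
 * otherwise the fib6_info's own fields are used, and multipath routes are
 * encoded as an RTA_MULTIPATH nest of rtnexthop entries.
 */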
4621 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4622 struct fib6_info *rt, struct dst_entry *dst,
4623 struct in6_addr *dest, struct in6_addr *src,
4624 int iif, int type, u32 portid, u32 seq,
4627 struct rt6_info *rt6 = (struct rt6_info *)dst;
4628 struct rt6key *rt6_dst, *rt6_src;
4629 u32 *pmetrics, table, rt6_flags;
4630 struct nlmsghdr *nlh;
4634 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4639 rt6_dst = &rt6->rt6i_dst;
4640 rt6_src = &rt6->rt6i_src;
4641 rt6_flags = rt6->rt6i_flags;
4643 rt6_dst = &rt->fib6_dst;
4644 rt6_src = &rt->fib6_src;
4645 rt6_flags = rt->fib6_flags;
4648 rtm = nlmsg_data(nlh);
4649 rtm->rtm_family = AF_INET6;
4650 rtm->rtm_dst_len = rt6_dst->plen;
4651 rtm->rtm_src_len = rt6_src->plen;
4654 table = rt->fib6_table->tb6_id;
4656 table = RT6_TABLE_UNSPEC;
4657 rtm->rtm_table = table;
4658 if (nla_put_u32(skb, RTA_TABLE, table))
4659 goto nla_put_failure;
4661 rtm->rtm_type = rt->fib6_type;
4663 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4664 rtm->rtm_protocol = rt->fib6_protocol;
4666 if (rt6_flags & RTF_CACHE)
4667 rtm->rtm_flags |= RTM_F_CLONED;
4670 if (nla_put_in6_addr(skb, RTA_DST, dest))
4671 goto nla_put_failure;
4672 rtm->rtm_dst_len = 128;
4673 } else if (rtm->rtm_dst_len)
4674 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4675 goto nla_put_failure;
4676 #ifdef CONFIG_IPV6_SUBTREES
4678 if (nla_put_in6_addr(skb, RTA_SRC, src))
4679 goto nla_put_failure;
4680 rtm->rtm_src_len = 128;
4681 } else if (rtm->rtm_src_len &&
4682 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4683 goto nla_put_failure;
4686 #ifdef CONFIG_IPV6_MROUTE
4687 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4688 int err = ip6mr_get_route(net, skb, rtm, portid);
4693 goto nla_put_failure;
4696 if (nla_put_u32(skb, RTA_IIF, iif))
4697 goto nla_put_failure;
4699 struct in6_addr saddr_buf;
4700 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4701 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4702 goto nla_put_failure;
4705 if (rt->fib6_prefsrc.plen) {
4706 struct in6_addr saddr_buf;
4707 saddr_buf = rt->fib6_prefsrc.addr;
4708 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4709 goto nla_put_failure;
4712 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4713 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4714 goto nla_put_failure;
4716 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4717 goto nla_put_failure;
4719 /* For multipath routes, walk the siblings list and add
4720 * each as a nexthop within RTA_MULTIPATH.
4723 if (rt6_flags & RTF_GATEWAY &&
4724 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4725 goto nla_put_failure;
4727 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4728 goto nla_put_failure;
4729 } else if (rt->fib6_nsiblings) {
4730 struct fib6_info *sibling, *next_sibling;
4733 mp = nla_nest_start(skb, RTA_MULTIPATH);
4735 goto nla_put_failure;
4737 if (rt6_add_nexthop(skb, rt) < 0)
4738 goto nla_put_failure;
4740 list_for_each_entry_safe(sibling, next_sibling,
4741 &rt->fib6_siblings, fib6_siblings) {
4742 if (rt6_add_nexthop(skb, sibling) < 0)
4743 goto nla_put_failure;
4746 nla_nest_end(skb, mp);
4748 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4749 goto nla_put_failure;
4752 if (rt6_flags & RTF_EXPIRES) {
4753 expires = dst ? dst->expires : rt->expires;
4757 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4758 goto nla_put_failure;
4760 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4761 goto nla_put_failure;
4764 nlmsg_end(skb, nlh);
4768 nlmsg_cancel(skb, nlh);
4772 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4773 const struct net_device *dev)
4775 if (f6i->fib6_nh.nh_dev == dev)
4778 if (f6i->fib6_nsiblings) {
4779 struct fib6_info *sibling, *next_sibling;
4781 list_for_each_entry_safe(sibling, next_sibling,
4782 &f6i->fib6_siblings, fib6_siblings) {
4783 if (sibling->fib6_nh.nh_dev == dev)
4791 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4793 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4794 struct fib_dump_filter *filter = &arg->filter;
4795 unsigned int flags = NLM_F_MULTI;
4796 struct net *net = arg->net;
4798 if (rt == net->ipv6.fib6_null_entry)
4801 if ((filter->flags & RTM_F_PREFIX) &&
4802 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4803 /* success since this is not a prefix route */
4806 if (filter->filter_set) {
4807 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4808 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4809 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4812 flags |= NLM_F_DUMP_FILTERED;
4815 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4816 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4817 arg->cb->nlh->nlmsg_seq, flags);
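/* RTM_GETROUTE: resolve a single route for the addresses/ports in the
 * request, via ip6_route_input_lookup() when RTA_IIF is given and
 * ip6_route_output() otherwise. With RTM_F_FIB_MATCH the matched FIB
 * entry is reported instead of the resolved dst.
 */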
4820 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4821 struct netlink_ext_ack *extack)
4823 struct net *net = sock_net(in_skb->sk);
4824 struct nlattr *tb[RTA_MAX+1];
4825 int err, iif = 0, oif = 0;
4826 struct fib6_info *from;
4827 struct dst_entry *dst;
4828 struct rt6_info *rt;
4829 struct sk_buff *skb;
4831 struct flowi6 fl6 = {};
4834 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4840 rtm = nlmsg_data(nlh);
4841 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4842 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4845 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4848 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4852 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4855 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4859 iif = nla_get_u32(tb[RTA_IIF]);
4862 oif = nla_get_u32(tb[RTA_OIF]);
4865 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4868 fl6.flowi6_uid = make_kuid(current_user_ns(),
4869 nla_get_u32(tb[RTA_UID]));
4871 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4874 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4877 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4879 if (tb[RTA_IP_PROTO]) {
4880 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4881 &fl6.flowi6_proto, extack);
4887 struct net_device *dev;
4892 dev = dev_get_by_index_rcu(net, iif);
4899 fl6.flowi6_iif = iif;
4901 if (!ipv6_addr_any(&fl6.saddr))
4902 flags |= RT6_LOOKUP_F_HAS_SADDR;
4904 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4908 fl6.flowi6_oif = oif;
4910 dst = ip6_route_output(net, NULL, &fl6);
4914 rt = container_of(dst, struct rt6_info, dst);
4915 if (rt->dst.error) {
4916 err = rt->dst.error;
4921 if (rt == net->ipv6.ip6_null_entry) {
4922 err = rt->dst.error;
4927 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4934 skb_dst_set(skb, &rt->dst);
4937 from = rcu_dereference(rt->from);
4940 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4941 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4944 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4945 &fl6.saddr, iif, RTM_NEWROUTE,
4946 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4955 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4960 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4961 unsigned int nlm_flags)
4963 struct sk_buff *skb;
4964 struct net *net = info->nl_net;
4969 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4971 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4975 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4976 event, info->portid, seq, nlm_flags);
4978 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4979 WARN_ON(err == -EMSGSIZE);
4983 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4984 info->nlh, gfp_any());
4988 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4991 static int ip6_route_dev_notify(struct notifier_block *this,
4992 unsigned long event, void *ptr)
4994 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4995 struct net *net = dev_net(dev);
4997 if (!(dev->flags & IFF_LOOPBACK))
5000 if (event == NETDEV_REGISTER) {
5001 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5002 net->ipv6.ip6_null_entry->dst.dev = dev;
5003 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5004 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5005 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5006 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5007 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5008 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5010 } else if (event == NETDEV_UNREGISTER &&
5011 dev->reg_state != NETREG_UNREGISTERED) {
5012 /* NETDEV_UNREGISTER could be fired multiple times by
5013 * netdev_wait_allrefs(). Make sure we only call this once.
5015 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5016 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5017 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5018 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5029 #ifdef CONFIG_PROC_FS
5030 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5032 struct net *net = (struct net *)seq->private;
5033 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5034 net->ipv6.rt6_stats->fib_nodes,
5035 net->ipv6.rt6_stats->fib_route_nodes,
5036 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5037 net->ipv6.rt6_stats->fib_rt_entries,
5038 net->ipv6.rt6_stats->fib_rt_cache,
5039 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5040 net->ipv6.rt6_stats->fib_discarded_routes);
5044 #endif /* CONFIG_PROC_FS */
5046 #ifdef CONFIG_SYSCTL
5049 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5050 void __user *buffer, size_t *lenp, loff_t *ppos)
5058 net = (struct net *)ctl->extra1;
5059 delay = net->ipv6.sysctl.flush_delay;
5060 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5064 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname = "flush",
		.data = &init_net.ipv6.sysctl.flush_delay,
		.maxlen = sizeof(int),
		.mode = 0200,
		.proc_handler = ipv6_sysctl_rtcache_flush
	},
	{
		.procname = "gc_thresh",
		.data = &ip6_dst_ops_template.gc_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "max_size",
		.data = &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_min_interval",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_timeout",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_interval",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_elasticity",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "mtu_expires",
		.data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "min_adv_mss",
		.data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_min_interval_ms",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_ms_jiffies,
	},
	{
		.procname = "skip_notify_on_dev_down",
		.data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{ }
};
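/* Clone the template for one namespace and point each entry at that
 * namespace's copy of the value; the sysctls are not exported to
 * network namespaces owned by a non-initial user namespace.
 */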
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
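/* Per-namespace init: allocate this namespace's null (and, with multiple
 * routing tables, prohibit/blackhole) route entries and seed the default
 * values for the routing sysctls.
 */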
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
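/* Per-namespace teardown counterpart of ip6_route_net_init(). */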
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
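/* Late per-namespace init/exit: create and remove the procfs files once
 * the routing state they report on exists.
 */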
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
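/* Per-namespace inet_peer base used to track IPv6 peer information. */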
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
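/* Runs once at boot, after init_net's loopback device exists; later
 * namespaces are handled by ip6_route_dev_notify() instead.
 */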
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * so the loopback reference in rt6_info has not been taken yet;
	 * do it manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
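/* Subsystem init: create the dst cache, register the pernet operations,
 * the rtnetlink route handlers and the device notifier, and initialize
 * the per-CPU uncached route lists.
 */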
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
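/* Tear down everything set up by ip6_route_init(), in reverse order. */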
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}