2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
68 #include <linux/uaccess.h>
71 #include <linux/sysctl.h>
75 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2,
77 RT6_NUD_FAIL_DO_RR = -1,
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void ip6_dst_destroy(struct dst_entry *);
87 static void ip6_dst_ifdown(struct dst_entry *,
88 struct net_device *dev, int how);
89 static int ip6_dst_gc(struct dst_ops *ops);
91 static int ip6_pkt_discard(struct sk_buff *skb);
92 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int ip6_pkt_prohibit(struct sk_buff *skb);
94 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void ip6_link_failure(struct sk_buff *skb);
96 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 struct sk_buff *skb, u32 mtu);
98 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 struct in6_addr *daddr,
110 struct in6_addr *saddr);
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 const struct in6_addr *prefix, int prefixlen,
115 const struct in6_addr *gwaddr,
116 struct net_device *dev,
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 const struct in6_addr *prefix, int prefixlen,
120 const struct in6_addr *gwaddr,
121 struct net_device *dev);
124 struct uncached_list {
126 struct list_head head;
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
131 void rt6_uncached_list_add(struct rt6_info *rt)
133 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
135 rt->rt6i_uncached_list = ul;
137 spin_lock_bh(&ul->lock);
138 list_add_tail(&rt->rt6i_uncached, &ul->head);
139 spin_unlock_bh(&ul->lock);
142 void rt6_uncached_list_del(struct rt6_info *rt)
144 if (!list_empty(&rt->rt6i_uncached)) {
145 struct uncached_list *ul = rt->rt6i_uncached_list;
146 struct net *net = dev_net(rt->dst.dev);
148 spin_lock_bh(&ul->lock);
149 list_del(&rt->rt6i_uncached);
150 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 spin_unlock_bh(&ul->lock);
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
157 struct net_device *loopback_dev = net->loopback_dev;
160 if (dev == loopback_dev)
163 for_each_possible_cpu(cpu) {
164 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
167 spin_lock_bh(&ul->lock);
168 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 struct inet6_dev *rt_idev = rt->rt6i_idev;
170 struct net_device *rt_dev = rt->dst.dev;
172 if (rt_idev->dev == dev) {
173 rt->rt6i_idev = in6_dev_get(loopback_dev);
174 in6_dev_put(rt_idev);
178 rt->dst.dev = loopback_dev;
179 dev_hold(rt->dst.dev);
183 spin_unlock_bh(&ul->lock);
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
189 return dst_metrics_write_ptr(&rt->from->dst);
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
194 struct rt6_info *rt = (struct rt6_info *)dst;
196 if (rt->rt6i_flags & RTF_PCPU)
197 return rt6_pcpu_cow_metrics(rt);
198 else if (rt->rt6i_flags & RTF_CACHE)
201 return dst_cow_metrics_generic(dst, old);
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
208 struct in6_addr *p = &rt->rt6i_gateway;
210 if (!ipv6_addr_any(p))
211 return (const void *) p;
213 return &ipv6_hdr(skb)->daddr;
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
221 struct rt6_info *rt = (struct rt6_info *) dst;
224 daddr = choose_neigh_daddr(rt, skb, daddr);
225 n = __ipv6_neigh_lookup(dst->dev, daddr);
228 return neigh_create(&nd_tbl, daddr, dst->dev);
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
233 struct net_device *dev = dst->dev;
234 struct rt6_info *rt = (struct rt6_info *)dst;
236 daddr = choose_neigh_daddr(rt, NULL, daddr);
239 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
241 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
243 __ipv6_confirm_neigh(dev, daddr);
246 static struct dst_ops ip6_dst_ops_template = {
250 .check = ip6_dst_check,
251 .default_advmss = ip6_default_advmss,
253 .cow_metrics = ipv6_cow_metrics,
254 .destroy = ip6_dst_destroy,
255 .ifdown = ip6_dst_ifdown,
256 .negative_advice = ip6_negative_advice,
257 .link_failure = ip6_link_failure,
258 .update_pmtu = ip6_rt_update_pmtu,
259 .redirect = rt6_do_redirect,
260 .local_out = __ip6_local_out,
261 .neigh_lookup = ip6_neigh_lookup,
262 .confirm_neigh = ip6_confirm_neigh,
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
267 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
269 return mtu ? : dst->dev->mtu;
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 struct sk_buff *skb, u32 mtu)
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
282 static struct dst_ops ip6_dst_blackhole_ops = {
284 .destroy = ip6_dst_destroy,
285 .check = ip6_dst_check,
286 .mtu = ip6_blackhole_mtu,
287 .default_advmss = ip6_default_advmss,
288 .update_pmtu = ip6_rt_blackhole_update_pmtu,
289 .redirect = ip6_rt_blackhole_redirect,
290 .cow_metrics = dst_cow_metrics_generic,
291 .neigh_lookup = ip6_neigh_lookup,
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 [RTAX_HOPLIMIT - 1] = 0,
298 static const struct rt6_info ip6_null_entry_template = {
300 .__refcnt = ATOMIC_INIT(1),
302 .obsolete = DST_OBSOLETE_FORCE_CHK,
303 .error = -ENETUNREACH,
304 .input = ip6_pkt_discard,
305 .output = ip6_pkt_discard_out,
307 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
308 .rt6i_protocol = RTPROT_KERNEL,
309 .rt6i_metric = ~(u32) 0,
310 .rt6i_ref = ATOMIC_INIT(1),
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
315 static const struct rt6_info ip6_prohibit_entry_template = {
317 .__refcnt = ATOMIC_INIT(1),
319 .obsolete = DST_OBSOLETE_FORCE_CHK,
321 .input = ip6_pkt_prohibit,
322 .output = ip6_pkt_prohibit_out,
324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
325 .rt6i_protocol = RTPROT_KERNEL,
326 .rt6i_metric = ~(u32) 0,
327 .rt6i_ref = ATOMIC_INIT(1),
330 static const struct rt6_info ip6_blk_hole_entry_template = {
332 .__refcnt = ATOMIC_INIT(1),
334 .obsolete = DST_OBSOLETE_FORCE_CHK,
336 .input = dst_discard,
337 .output = dst_discard_out,
339 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
340 .rt6i_protocol = RTPROT_KERNEL,
341 .rt6i_metric = ~(u32) 0,
342 .rt6i_ref = ATOMIC_INIT(1),
347 static void rt6_info_init(struct rt6_info *rt)
349 struct dst_entry *dst = &rt->dst;
351 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 INIT_LIST_HEAD(&rt->rt6i_siblings);
353 INIT_LIST_HEAD(&rt->rt6i_uncached);
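/*
 * The memset above relies on dst being the first member of rt6_info: it
 * zeroes only the rt6_info-specific tail while leaving the embedded
 * dst_entry header untouched.  A minimal userspace sketch of the same
 * "zero everything after the embedded header" idiom, with made-up struct
 * names used purely for illustration:
 */
#if 0	/* standalone sketch, not part of the kernel build */
#include <assert.h>
#include <string.h>

struct header {
	int refcnt;
};

struct object {
	struct header hdr;	/* must stay first, like dst in rt6_info */
	int a;
	int b;
};

int main(void)
{
	struct object obj = { .hdr = { .refcnt = 1 }, .a = 5, .b = 7 };
	struct header *hdr = &obj.hdr;

	/* zero the tail: everything that follows the embedded header */
	memset(hdr + 1, 0, sizeof(obj) - sizeof(*hdr));

	assert(obj.hdr.refcnt == 1 && obj.a == 0 && obj.b == 0);
	return 0;
}
#endif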
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 struct net_device *dev,
361 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 1, DST_OBSOLETE_FORCE_CHK, flags);
366 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 struct net_device *dev,
376 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
379 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 if (!rt->rt6i_pcpu) {
381 dst_release_immediate(&rt->dst);
388 EXPORT_SYMBOL(ip6_dst_alloc);
390 static void ip6_dst_destroy(struct dst_entry *dst)
392 struct rt6_info *rt = (struct rt6_info *)dst;
393 struct rt6_exception_bucket *bucket;
394 struct rt6_info *from = rt->from;
395 struct inet6_dev *idev;
397 dst_destroy_metrics_generic(dst);
398 free_percpu(rt->rt6i_pcpu);
399 rt6_uncached_list_del(rt);
401 idev = rt->rt6i_idev;
403 rt->rt6i_idev = NULL;
406 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
408 rt->rt6i_exception_bucket = NULL;
413 dst_release(&from->dst);
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
419 struct rt6_info *rt = (struct rt6_info *)dst;
420 struct inet6_dev *idev = rt->rt6i_idev;
421 struct net_device *loopback_dev =
422 dev_net(dev)->loopback_dev;
424 if (idev && idev->dev != loopback_dev) {
425 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
427 rt->rt6i_idev = loopback_idev;
433 static bool __rt6_check_expired(const struct rt6_info *rt)
435 if (rt->rt6i_flags & RTF_EXPIRES)
436 return time_after(jiffies, rt->dst.expires);
441 static bool rt6_check_expired(const struct rt6_info *rt)
443 if (rt->rt6i_flags & RTF_EXPIRES) {
444 if (time_after(jiffies, rt->dst.expires))
446 } else if (rt->from) {
447 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 rt6_check_expired(rt->from);
453 static struct rt6_info *rt6_multipath_select(const struct net *net,
454 struct rt6_info *match,
455 struct flowi6 *fl6, int oif,
456 const struct sk_buff *skb,
459 struct rt6_info *sibling, *next_sibling;
461 /* We might have already computed the hash for ICMPv6 errors. In such
462 * case it will always be non-zero. Otherwise now is the time to do it.
465 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
467 if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
470 list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
472 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
474 if (rt6_score_route(sibling, oif, strict) < 0)
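/*
 * The sibling walk above implements hash-threshold multipath: every next hop
 * owns a slice of the 32-bit flow-hash space delimited by its
 * rt6i_nh_upper_bound, and the first sibling whose bound is not exceeded by
 * fl6->mp_hash is chosen.  A minimal userspace model of that selection, with
 * illustrative bounds standing in for the per-nexthop atomics:
 */
#if 0	/* standalone sketch, not part of the kernel build */
#include <stdio.h>

/* Pick the first next hop whose upper bound covers the flow hash. */
static int select_nexthop(unsigned int hash,
			  const unsigned int *upper_bound, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (hash <= upper_bound[i])
			return i;
	}
	return nr - 1;		/* model fallback: keep the last next hop */
}

int main(void)
{
	/* two next hops splitting the hash space roughly 1:3 */
	const unsigned int bounds[] = { 0x3fffffffu, 0xffffffffu };

	printf("0x20000000 -> nh %d\n", select_nexthop(0x20000000u, bounds, 2));
	printf("0x80000000 -> nh %d\n", select_nexthop(0x80000000u, bounds, 2));
	return 0;
}
#endif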
484 * Route lookup. rcu_read_lock() should be held.
487 static inline struct rt6_info *rt6_device_match(struct net *net,
489 const struct in6_addr *saddr,
493 struct rt6_info *local = NULL;
494 struct rt6_info *sprt;
496 if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
499 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
500 struct net_device *dev = sprt->dst.dev;
502 if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
506 if (dev->ifindex == oif)
508 if (dev->flags & IFF_LOOPBACK) {
509 if (!sprt->rt6i_idev ||
510 sprt->rt6i_idev->dev->ifindex != oif) {
511 if (flags & RT6_LOOKUP_F_IFACE)
514 local->rt6i_idev->dev->ifindex == oif)
520 if (ipv6_chk_addr(net, saddr, dev,
521 flags & RT6_LOOKUP_F_IFACE))
530 if (flags & RT6_LOOKUP_F_IFACE)
531 return net->ipv6.ip6_null_entry;
534 return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
537 #ifdef CONFIG_IPV6_ROUTER_PREF
538 struct __rt6_probe_work {
539 struct work_struct work;
540 struct in6_addr target;
541 struct net_device *dev;
544 static void rt6_probe_deferred(struct work_struct *w)
546 struct in6_addr mcaddr;
547 struct __rt6_probe_work *work =
548 container_of(w, struct __rt6_probe_work, work);
550 addrconf_addr_solict_mult(&work->target, &mcaddr);
551 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
556 static void rt6_probe(struct rt6_info *rt)
558 struct __rt6_probe_work *work;
559 struct neighbour *neigh;
561 * Okay, this does not seem to be appropriate
562 * for now, however, we need to check if it
563 * is really so; aka Router Reachability Probing.
565 * Router Reachability Probe MUST be rate-limited
566 * to no more than one per minute.
568 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
571 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
573 if (neigh->nud_state & NUD_VALID)
577 write_lock(&neigh->lock);
578 if (!(neigh->nud_state & NUD_VALID) &&
581 rt->rt6i_idev->cnf.rtr_probe_interval)) {
582 work = kmalloc(sizeof(*work), GFP_ATOMIC);
584 __neigh_set_probe_once(neigh);
586 write_unlock(&neigh->lock);
588 work = kmalloc(sizeof(*work), GFP_ATOMIC);
592 INIT_WORK(&work->work, rt6_probe_deferred);
593 work->target = rt->rt6i_gateway;
594 dev_hold(rt->dst.dev);
595 work->dev = rt->dst.dev;
596 schedule_work(&work->work);
600 rcu_read_unlock_bh();
603 static inline void rt6_probe(struct rt6_info *rt)
609 * Default Router Selection (RFC 2461 6.3.6)
611 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
613 struct net_device *dev = rt->dst.dev;
614 if (!oif || dev->ifindex == oif)
616 if ((dev->flags & IFF_LOOPBACK) &&
617 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
622 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
624 struct neighbour *neigh;
625 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
627 if (rt->rt6i_flags & RTF_NONEXTHOP ||
628 !(rt->rt6i_flags & RTF_GATEWAY))
629 return RT6_NUD_SUCCEED;
632 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
634 read_lock(&neigh->lock);
635 if (neigh->nud_state & NUD_VALID)
636 ret = RT6_NUD_SUCCEED;
637 #ifdef CONFIG_IPV6_ROUTER_PREF
638 else if (!(neigh->nud_state & NUD_FAILED))
639 ret = RT6_NUD_SUCCEED;
641 ret = RT6_NUD_FAIL_PROBE;
643 read_unlock(&neigh->lock);
645 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
646 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
648 rcu_read_unlock_bh();
653 static int rt6_score_route(struct rt6_info *rt, int oif,
658 m = rt6_check_dev(rt, oif);
659 if (!m && (strict & RT6_LOOKUP_F_IFACE))
660 return RT6_NUD_FAIL_HARD;
661 #ifdef CONFIG_IPV6_ROUTER_PREF
662 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
664 if (strict & RT6_LOOKUP_F_REACHABLE) {
665 int n = rt6_check_neigh(rt);
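/*
 * rt6_score_route() folds its inputs into one small integer so that
 * find_match() below can simply prefer the larger score: the low bits come
 * from the interface check in rt6_check_dev() and, when
 * CONFIG_IPV6_ROUTER_PREF is set, the decoded 2-bit router preference is
 * OR-ed in two bits higher.  A sketch of that composition with plain ints
 * (the concrete values produced by the device check and the preference
 * decoding are abstracted away here):
 */
#if 0	/* standalone sketch, not part of the kernel build */
#include <stdio.h>

/*
 * dev_match: small value from the device check, larger for a better
 *            interface match (0 means no match at all)
 * pref:      decoded router preference, 0..3
 */
static int score(int dev_match, int pref)
{
	return dev_match | (pref << 2);
}

int main(void)
{
	/* same interface match, higher router preference wins */
	printf("%d vs %d\n", score(2, 3), score(2, 1));	/* prints "14 vs 6" */
	return 0;
}
#endif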
672 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
673 int *mpri, struct rt6_info *match,
677 bool match_do_rr = false;
678 struct inet6_dev *idev = rt->rt6i_idev;
680 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
683 if (idev->cnf.ignore_routes_with_linkdown &&
684 rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
685 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
688 if (rt6_check_expired(rt))
691 m = rt6_score_route(rt, oif, strict);
692 if (m == RT6_NUD_FAIL_DO_RR) {
694 m = 0; /* lowest valid score */
695 } else if (m == RT6_NUD_FAIL_HARD) {
699 if (strict & RT6_LOOKUP_F_REACHABLE)
702 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
704 *do_rr = match_do_rr;
712 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
713 struct rt6_info *leaf,
714 struct rt6_info *rr_head,
715 u32 metric, int oif, int strict,
718 struct rt6_info *rt, *match, *cont;
723 for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
724 if (rt->rt6i_metric != metric) {
729 match = find_match(rt, oif, strict, &mpri, match, do_rr);
732 for (rt = leaf; rt && rt != rr_head;
733 rt = rcu_dereference(rt->rt6_next)) {
734 if (rt->rt6i_metric != metric) {
739 match = find_match(rt, oif, strict, &mpri, match, do_rr);
745 for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
746 match = find_match(rt, oif, strict, &mpri, match, do_rr);
751 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
754 struct rt6_info *leaf = rcu_dereference(fn->leaf);
755 struct rt6_info *match, *rt0;
759 if (!leaf || leaf == net->ipv6.ip6_null_entry)
760 return net->ipv6.ip6_null_entry;
762 rt0 = rcu_dereference(fn->rr_ptr);
766 /* Double check to make sure fn is not an intermediate node
767 * and fn->leaf does not point to its child's leaf
768 * (This might happen if all routes under fn are deleted from
769 * the tree and fib6_repair_tree() is called on the node.)
771 key_plen = rt0->rt6i_dst.plen;
772 #ifdef CONFIG_IPV6_SUBTREES
773 if (rt0->rt6i_src.plen)
774 key_plen = rt0->rt6i_src.plen;
776 if (fn->fn_bit != key_plen)
777 return net->ipv6.ip6_null_entry;
779 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
783 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
785 /* no entries matched; do round-robin */
786 if (!next || next->rt6i_metric != rt0->rt6i_metric)
790 spin_lock_bh(&leaf->rt6i_table->tb6_lock);
791 /* make sure next is not being deleted from the tree */
793 rcu_assign_pointer(fn->rr_ptr, next);
794 spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
798 return match ? match : net->ipv6.ip6_null_entry;
801 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
803 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
806 #ifdef CONFIG_IPV6_ROUTE_INFO
807 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
808 const struct in6_addr *gwaddr)
810 struct net *net = dev_net(dev);
811 struct route_info *rinfo = (struct route_info *) opt;
812 struct in6_addr prefix_buf, *prefix;
814 unsigned long lifetime;
817 if (len < sizeof(struct route_info)) {
821 /* Sanity check for prefix_len and length */
822 if (rinfo->length > 3) {
824 } else if (rinfo->prefix_len > 128) {
826 } else if (rinfo->prefix_len > 64) {
827 if (rinfo->length < 2) {
830 } else if (rinfo->prefix_len > 0) {
831 if (rinfo->length < 1) {
836 pref = rinfo->route_pref;
837 if (pref == ICMPV6_ROUTER_PREF_INVALID)
840 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
842 if (rinfo->length == 3)
843 prefix = (struct in6_addr *)rinfo->prefix;
845 /* this function is safe */
846 ipv6_addr_prefix(&prefix_buf,
847 (struct in6_addr *)rinfo->prefix,
849 prefix = &prefix_buf;
852 if (rinfo->prefix_len == 0)
853 rt = rt6_get_dflt_router(gwaddr, dev);
855 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
858 if (rt && !lifetime) {
864 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
867 rt->rt6i_flags = RTF_ROUTEINFO |
868 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
871 if (!addrconf_finite_timeout(lifetime))
872 rt6_clean_expires(rt);
874 rt6_set_expires(rt, jiffies + HZ * lifetime);
882 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
883 struct in6_addr *saddr)
885 struct fib6_node *pn, *sn;
887 if (fn->fn_flags & RTN_TL_ROOT)
889 pn = rcu_dereference(fn->parent);
890 sn = FIB6_SUBTREE(pn);
892 fn = fib6_lookup(sn, NULL, saddr);
895 if (fn->fn_flags & RTN_RTINFO)
900 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
903 struct rt6_info *rt = *prt;
905 if (dst_hold_safe(&rt->dst))
908 rt = net->ipv6.ip6_null_entry;
917 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
918 struct fib6_table *table,
920 const struct sk_buff *skb,
923 struct rt6_info *rt, *rt_cache;
924 struct fib6_node *fn;
926 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
927 flags &= ~RT6_LOOKUP_F_IFACE;
930 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
932 rt = rcu_dereference(fn->leaf);
934 rt = net->ipv6.ip6_null_entry;
936 rt = rt6_device_match(net, rt, &fl6->saddr,
937 fl6->flowi6_oif, flags);
938 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
939 rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
942 if (rt == net->ipv6.ip6_null_entry) {
943 fn = fib6_backtrack(fn, &fl6->saddr);
947 /* Search through exception table */
948 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
952 if (ip6_hold_safe(net, &rt, true))
953 dst_use_noref(&rt->dst, jiffies);
957 trace_fib6_table_lookup(net, rt, table, fl6);
963 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
964 const struct sk_buff *skb, int flags)
966 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
968 EXPORT_SYMBOL_GPL(ip6_route_lookup);
970 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
971 const struct in6_addr *saddr, int oif,
972 const struct sk_buff *skb, int strict)
974 struct flowi6 fl6 = {
978 struct dst_entry *dst;
979 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
982 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
983 flags |= RT6_LOOKUP_F_HAS_SADDR;
986 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
988 return (struct rt6_info *) dst;
994 EXPORT_SYMBOL(rt6_lookup);
996 /* ip6_ins_rt is called with FREE table->tb6_lock.
997 * It takes a new route entry; if the addition fails for any reason, the
999 * route is freed. Caller must hold dst before calling it.
1002 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1003 struct mx6_config *mxc,
1004 struct netlink_ext_ack *extack)
1007 struct fib6_table *table;
1009 table = rt->rt6i_table;
1010 spin_lock_bh(&table->tb6_lock);
1011 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1012 spin_unlock_bh(&table->tb6_lock);
1017 int ip6_ins_rt(struct rt6_info *rt)
1019 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1020 struct mx6_config mxc = { .mx = NULL, };
1022 /* Hold dst to account for the reference from the fib6 tree */
1024 return __ip6_ins_rt(rt, &info, &mxc, NULL);
1027 /* called with rcu_lock held */
1028 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1030 struct net_device *dev = rt->dst.dev;
1032 if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1033 /* for copies of local routes, dst->dev needs to be the
1034 * device if it is a master device, the master device if
1035 * device is enslaved, and the loopback as the default
1037 if (netif_is_l3_slave(dev) &&
1038 !rt6_need_strict(&rt->rt6i_dst.addr))
1039 dev = l3mdev_master_dev_rcu(dev);
1040 else if (!netif_is_l3_master(dev))
1041 dev = dev_net(dev)->loopback_dev;
1042 /* the last case is netif_is_l3_master(dev) being true, in which
1043 * case we want the returned dev to be dev itself
1050 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1051 const struct in6_addr *daddr,
1052 const struct in6_addr *saddr)
1054 struct net_device *dev;
1055 struct rt6_info *rt;
1061 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1065 dev = ip6_rt_get_dev_rcu(ort);
1066 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1071 ip6_rt_copy_init(rt, ort);
1072 rt->rt6i_flags |= RTF_CACHE;
1073 rt->rt6i_metric = 0;
1074 rt->dst.flags |= DST_HOST;
1075 rt->rt6i_dst.addr = *daddr;
1076 rt->rt6i_dst.plen = 128;
1078 if (!rt6_is_gw_or_nonexthop(ort)) {
1079 if (ort->rt6i_dst.plen != 128 &&
1080 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1081 rt->rt6i_flags |= RTF_ANYCAST;
1082 #ifdef CONFIG_IPV6_SUBTREES
1083 if (rt->rt6i_src.plen && saddr) {
1084 rt->rt6i_src.addr = *saddr;
1085 rt->rt6i_src.plen = 128;
1093 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1095 struct net_device *dev;
1096 struct rt6_info *pcpu_rt;
1099 dev = ip6_rt_get_dev_rcu(rt);
1100 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1104 ip6_rt_copy_init(pcpu_rt, rt);
1105 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1106 pcpu_rt->rt6i_flags |= RTF_PCPU;
1110 /* It should be called with rcu_read_lock() acquired */
1111 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1113 struct rt6_info *pcpu_rt, **p;
1115 p = this_cpu_ptr(rt->rt6i_pcpu);
1118 if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1119 rt6_dst_from_metrics_check(pcpu_rt);
1124 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1126 struct rt6_info *pcpu_rt, *prev, **p;
1128 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1130 struct net *net = dev_net(rt->dst.dev);
1132 dst_hold(&net->ipv6.ip6_null_entry->dst);
1133 return net->ipv6.ip6_null_entry;
1136 dst_hold(&pcpu_rt->dst);
1137 p = this_cpu_ptr(rt->rt6i_pcpu);
1138 prev = cmpxchg(p, NULL, pcpu_rt);
1141 rt6_dst_from_metrics_check(pcpu_rt);
1145 /* exception hash table implementation
1147 static DEFINE_SPINLOCK(rt6_exception_lock);
1149 /* Remove rt6_ex from hash table and free the memory
1150 * Caller must hold rt6_exception_lock
1152 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1153 struct rt6_exception *rt6_ex)
1157 if (!bucket || !rt6_ex)
1160 net = dev_net(rt6_ex->rt6i->dst.dev);
1161 rt6_ex->rt6i->rt6i_node = NULL;
1162 hlist_del_rcu(&rt6_ex->hlist);
1163 rt6_release(rt6_ex->rt6i);
1164 kfree_rcu(rt6_ex, rcu);
1165 WARN_ON_ONCE(!bucket->depth);
1167 net->ipv6.rt6_stats->fib_rt_cache--;
1170 /* Remove oldest rt6_ex in bucket and free the memory
1171 * Caller must hold rt6_exception_lock
1173 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1175 struct rt6_exception *rt6_ex, *oldest = NULL;
1180 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1181 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1184 rt6_remove_exception(bucket, oldest);
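/*
 * When a bucket grows past FIB6_MAX_DEPTH (see rt6_insert_exception() below)
 * the exception with the oldest stamp is evicted.  time_before() compares
 * jiffies values through a signed difference, so the comparison stays
 * correct across counter wrap-around.  A minimal userspace model of that
 * selection:
 */
#if 0	/* standalone sketch, not part of the kernel build */
#include <stdio.h>

/* wrap-safe "a is earlier than b", in the spirit of time_before() */
static int before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;
}

static int oldest_index(const unsigned long *stamp, int nr)
{
	int i, oldest = 0;

	for (i = 1; i < nr; i++) {
		if (before(stamp[i], stamp[oldest]))
			oldest = i;
	}
	return oldest;
}

int main(void)
{
	/* the middle entry is the least recently refreshed one */
	unsigned long stamps[] = { 1000, 400, 1200 };

	printf("evict entry %d\n", oldest_index(stamps, 3));	/* entry 1 */
	return 0;
}
#endif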
1187 static u32 rt6_exception_hash(const struct in6_addr *dst,
1188 const struct in6_addr *src)
1190 static u32 seed __read_mostly;
1193 net_get_random_once(&seed, sizeof(seed));
1194 val = jhash(dst, sizeof(*dst), seed);
1196 #ifdef CONFIG_IPV6_SUBTREES
1198 val = jhash(src, sizeof(*src), val);
1200 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
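/*
 * The bucket index is a jhash of the destination (folded with the source
 * under CONFIG_IPV6_SUBTREES) reduced to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT
 * bits by hash_32().  A minimal userspace model of that reduction, using a
 * trivial stand-in mix instead of jhash() and an assumed 7-bit bucket space
 * purely for illustration:
 */
#if 0	/* standalone sketch, not part of the kernel build */
#include <stdio.h>

#define BUCKET_SHIFT	7	/* assumption for illustration only */

/* stand-in for jhash(): fold the 16 address bytes into 32 bits */
static unsigned int mix(const unsigned char *addr, unsigned int seed)
{
	unsigned int v = seed;
	int i;

	for (i = 0; i < 16; i++)
		v = v * 31 + addr[i];
	return v;
}

static unsigned int bucket_index(const unsigned char *daddr,
				 const unsigned char *saddr)
{
	unsigned int val = mix(daddr, 0x12345678u);

	if (saddr)			/* subtree case: fold in the source */
		val = mix(saddr, val);
	/* multiply and keep the top BUCKET_SHIFT bits, like hash_32() */
	return (val * 0x61C88647u) >> (32 - BUCKET_SHIFT);
}

int main(void)
{
	unsigned char dst[16] = { 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0,
				  0, 0, 0, 0, 0, 0, 0, 1 };

	printf("bucket %u\n", bucket_index(dst, NULL));
	return 0;
}
#endif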
1203 /* Helper function to find the cached rt in the hash table
1204 * and update bucket pointer to point to the bucket for this
1205 * (daddr, saddr) pair
1206 * Caller must hold rt6_exception_lock
1208 static struct rt6_exception *
1209 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1210 const struct in6_addr *daddr,
1211 const struct in6_addr *saddr)
1213 struct rt6_exception *rt6_ex;
1216 if (!(*bucket) || !daddr)
1219 hval = rt6_exception_hash(daddr, saddr);
1222 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1223 struct rt6_info *rt6 = rt6_ex->rt6i;
1224 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1226 #ifdef CONFIG_IPV6_SUBTREES
1227 if (matched && saddr)
1228 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1236 /* Helper function to find the cached rt in the hash table
1237 * and update bucket pointer to point to the bucket for this
1238 * (daddr, saddr) pair
1239 * Caller must hold rcu_read_lock()
1241 static struct rt6_exception *
1242 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1243 const struct in6_addr *daddr,
1244 const struct in6_addr *saddr)
1246 struct rt6_exception *rt6_ex;
1249 WARN_ON_ONCE(!rcu_read_lock_held());
1251 if (!(*bucket) || !daddr)
1254 hval = rt6_exception_hash(daddr, saddr);
1257 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1258 struct rt6_info *rt6 = rt6_ex->rt6i;
1259 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1261 #ifdef CONFIG_IPV6_SUBTREES
1262 if (matched && saddr)
1263 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1271 static int rt6_insert_exception(struct rt6_info *nrt,
1272 struct rt6_info *ort)
1274 struct net *net = dev_net(ort->dst.dev);
1275 struct rt6_exception_bucket *bucket;
1276 struct in6_addr *src_key = NULL;
1277 struct rt6_exception *rt6_ex;
1280 /* ort can't be a cache or pcpu route */
1281 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1283 WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1285 spin_lock_bh(&rt6_exception_lock);
1287 if (ort->exception_bucket_flushed) {
1292 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1293 lockdep_is_held(&rt6_exception_lock));
1295 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1301 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1304 #ifdef CONFIG_IPV6_SUBTREES
1305 /* rt6i_src.plen != 0 indicates ort is in subtree
1306 * and exception table is indexed by a hash of
1307 * both rt6i_dst and rt6i_src.
1308 * Otherwise, the exception table is indexed by
1309 * a hash of only rt6i_dst.
1311 if (ort->rt6i_src.plen)
1312 src_key = &nrt->rt6i_src.addr;
1315 /* Update rt6i_prefsrc as it could be changed
1316 * in rt6_remove_prefsrc()
1318 nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1319 /* rt6_mtu_change() might lower mtu on ort.
1320 * Only insert this exception route if its mtu
1321 * is less than ort's mtu value.
1323 if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1328 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1331 rt6_remove_exception(bucket, rt6_ex);
1333 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1339 rt6_ex->stamp = jiffies;
1340 atomic_inc(&nrt->rt6i_ref);
1341 nrt->rt6i_node = ort->rt6i_node;
1342 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1344 net->ipv6.rt6_stats->fib_rt_cache++;
1346 if (bucket->depth > FIB6_MAX_DEPTH)
1347 rt6_exception_remove_oldest(bucket);
1350 spin_unlock_bh(&rt6_exception_lock);
1352 /* Update fn->fn_sernum to invalidate all cached dst */
1354 spin_lock_bh(&ort->rt6i_table->tb6_lock);
1355 fib6_update_sernum(ort);
1356 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1357 fib6_force_start_gc(net);
1363 void rt6_flush_exceptions(struct rt6_info *rt)
1365 struct rt6_exception_bucket *bucket;
1366 struct rt6_exception *rt6_ex;
1367 struct hlist_node *tmp;
1370 spin_lock_bh(&rt6_exception_lock);
1371 /* Prevent rt6_insert_exception() from recreating the bucket list */
1372 rt->exception_bucket_flushed = 1;
1374 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1375 lockdep_is_held(&rt6_exception_lock));
1379 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1380 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1381 rt6_remove_exception(bucket, rt6_ex);
1382 WARN_ON_ONCE(bucket->depth);
1387 spin_unlock_bh(&rt6_exception_lock);
1390 /* Find cached rt in the hash table inside the passed-in rt
1391 * Caller has to hold rcu_read_lock()
1393 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1394 struct in6_addr *daddr,
1395 struct in6_addr *saddr)
1397 struct rt6_exception_bucket *bucket;
1398 struct in6_addr *src_key = NULL;
1399 struct rt6_exception *rt6_ex;
1400 struct rt6_info *res = NULL;
1402 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1404 #ifdef CONFIG_IPV6_SUBTREES
1405 /* rt6i_src.plen != 0 indicates rt is in subtree
1406 * and exception table is indexed by a hash of
1407 * both rt6i_dst and rt6i_src.
1408 * Otherwise, the exception table is indexed by
1409 * a hash of only rt6i_dst.
1411 if (rt->rt6i_src.plen)
1414 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1416 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1422 /* Remove the passed in cached rt from the hash table that contains it */
1423 int rt6_remove_exception_rt(struct rt6_info *rt)
1425 struct rt6_exception_bucket *bucket;
1426 struct rt6_info *from = rt->from;
1427 struct in6_addr *src_key = NULL;
1428 struct rt6_exception *rt6_ex;
1432 !(rt->rt6i_flags & RTF_CACHE))
1435 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1438 spin_lock_bh(&rt6_exception_lock);
1439 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1440 lockdep_is_held(&rt6_exception_lock));
1441 #ifdef CONFIG_IPV6_SUBTREES
1442 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1443 * and exception table is indexed by a hash of
1444 * both rt6i_dst and rt6i_src.
1445 * Otherwise, the exception table is indexed by
1446 * a hash of only rt6i_dst.
1448 if (from->rt6i_src.plen)
1449 src_key = &rt->rt6i_src.addr;
1451 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1455 rt6_remove_exception(bucket, rt6_ex);
1461 spin_unlock_bh(&rt6_exception_lock);
1465 /* Find rt6_ex which contains the passed in rt cache and
1468 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1470 struct rt6_exception_bucket *bucket;
1471 struct rt6_info *from = rt->from;
1472 struct in6_addr *src_key = NULL;
1473 struct rt6_exception *rt6_ex;
1476 !(rt->rt6i_flags & RTF_CACHE))
1480 bucket = rcu_dereference(from->rt6i_exception_bucket);
1482 #ifdef CONFIG_IPV6_SUBTREES
1483 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1484 * and exception table is indexed by a hash of
1485 * both rt6i_dst and rt6i_src.
1486 * Otherwise, the exception table is indexed by
1487 * a hash of only rt6i_dst.
1489 if (from->rt6i_src.plen)
1490 src_key = &rt->rt6i_src.addr;
1492 rt6_ex = __rt6_find_exception_rcu(&bucket,
1496 rt6_ex->stamp = jiffies;
1501 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1503 struct rt6_exception_bucket *bucket;
1504 struct rt6_exception *rt6_ex;
1507 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1508 lockdep_is_held(&rt6_exception_lock));
1511 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1512 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1513 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1520 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1521 struct rt6_info *rt, int mtu)
1523 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1524 * lowest MTU in the path: always allow updating the route PMTU to
1525 * reflect PMTU decreases.
1527 * If the new MTU is higher, and the route PMTU is equal to the local
1528 * MTU, this means the old MTU is the lowest in the path, so allow
1529 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1533 if (dst_mtu(&rt->dst) >= mtu)
1536 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
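/*
 * A compact restatement of the rule above as a standalone predicate, with a
 * worked call for each branch (route PMTU, new link MTU and the device's
 * current mtu6 are plain numbers here):
 */
#if 0	/* standalone sketch, not part of the kernel build */
#include <stdbool.h>
#include <stdio.h>

static bool pmtu_update_allowed(unsigned int route_pmtu,
				unsigned int new_mtu,
				unsigned int local_mtu6)
{
	if (route_pmtu >= new_mtu)	/* the path only got narrower */
		return true;
	/* raising is only safe when this link was the bottleneck */
	return route_pmtu == local_mtu6;
}

int main(void)
{
	printf("%d\n", pmtu_update_allowed(1500, 1280, 1500)); /* 1: decrease */
	printf("%d\n", pmtu_update_allowed(1280, 1500, 1280)); /* 1: we were lowest */
	printf("%d\n", pmtu_update_allowed(1280, 1500, 1500)); /* 0: keep PMTU */
	return 0;
}
#endif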
1542 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1543 struct rt6_info *rt, int mtu)
1545 struct rt6_exception_bucket *bucket;
1546 struct rt6_exception *rt6_ex;
1549 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1550 lockdep_is_held(&rt6_exception_lock));
1555 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1556 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1557 struct rt6_info *entry = rt6_ex->rt6i;
1559 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1560 * route), the metrics of its rt->dst.from have already
1563 if (entry->rt6i_pmtu &&
1564 rt6_mtu_change_route_allowed(idev, entry, mtu))
1565 entry->rt6i_pmtu = mtu;
1571 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1573 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1574 struct in6_addr *gateway)
1576 struct rt6_exception_bucket *bucket;
1577 struct rt6_exception *rt6_ex;
1578 struct hlist_node *tmp;
1581 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1584 spin_lock_bh(&rt6_exception_lock);
1585 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1586 lockdep_is_held(&rt6_exception_lock));
1589 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1590 hlist_for_each_entry_safe(rt6_ex, tmp,
1591 &bucket->chain, hlist) {
1592 struct rt6_info *entry = rt6_ex->rt6i;
1594 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1595 RTF_CACHE_GATEWAY &&
1596 ipv6_addr_equal(gateway,
1597 &entry->rt6i_gateway)) {
1598 rt6_remove_exception(bucket, rt6_ex);
1605 spin_unlock_bh(&rt6_exception_lock);
1608 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1609 struct rt6_exception *rt6_ex,
1610 struct fib6_gc_args *gc_args,
1613 struct rt6_info *rt = rt6_ex->rt6i;
1615 * we are pruning and obsoleting aged-out and non-gateway exceptions
1616 * even if others still have references to them, so that on the next
1617 * dst_check() such references can be dropped.
1618 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1619 * expired, independently from their aging, as per RFC 8201 section 4
1621 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1622 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1623 RT6_TRACE("aging clone %p\n", rt);
1624 rt6_remove_exception(bucket, rt6_ex);
1627 } else if (time_after(jiffies, rt->dst.expires)) {
1628 RT6_TRACE("purging expired route %p\n", rt);
1629 rt6_remove_exception(bucket, rt6_ex);
1633 if (rt->rt6i_flags & RTF_GATEWAY) {
1634 struct neighbour *neigh;
1635 __u8 neigh_flags = 0;
1637 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1639 neigh_flags = neigh->flags;
1641 if (!(neigh_flags & NTF_ROUTER)) {
1642 RT6_TRACE("purging route %p via non-router but gateway\n",
1644 rt6_remove_exception(bucket, rt6_ex);
1652 void rt6_age_exceptions(struct rt6_info *rt,
1653 struct fib6_gc_args *gc_args,
1656 struct rt6_exception_bucket *bucket;
1657 struct rt6_exception *rt6_ex;
1658 struct hlist_node *tmp;
1661 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1665 spin_lock(&rt6_exception_lock);
1666 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1667 lockdep_is_held(&rt6_exception_lock));
1670 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1671 hlist_for_each_entry_safe(rt6_ex, tmp,
1672 &bucket->chain, hlist) {
1673 rt6_age_examine_exception(bucket, rt6_ex,
1679 spin_unlock(&rt6_exception_lock);
1680 rcu_read_unlock_bh();
1683 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1684 int oif, struct flowi6 *fl6,
1685 const struct sk_buff *skb, int flags)
1687 struct fib6_node *fn, *saved_fn;
1688 struct rt6_info *rt, *rt_cache;
1691 strict |= flags & RT6_LOOKUP_F_IFACE;
1692 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1693 if (net->ipv6.devconf_all->forwarding == 0)
1694 strict |= RT6_LOOKUP_F_REACHABLE;
1698 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1701 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1705 rt = rt6_select(net, fn, oif, strict);
1706 if (rt->rt6i_nsiblings)
1707 rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
1708 if (rt == net->ipv6.ip6_null_entry) {
1709 fn = fib6_backtrack(fn, &fl6->saddr);
1711 goto redo_rt6_select;
1712 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1713 /* also consider unreachable route */
1714 strict &= ~RT6_LOOKUP_F_REACHABLE;
1716 goto redo_rt6_select;
1720 /* Search through exception table */
1721 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1725 if (rt == net->ipv6.ip6_null_entry) {
1728 trace_fib6_table_lookup(net, rt, table, fl6);
1730 } else if (rt->rt6i_flags & RTF_CACHE) {
1731 if (ip6_hold_safe(net, &rt, true)) {
1732 dst_use_noref(&rt->dst, jiffies);
1733 rt6_dst_from_metrics_check(rt);
1736 trace_fib6_table_lookup(net, rt, table, fl6);
1738 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1739 !(rt->rt6i_flags & RTF_GATEWAY))) {
1740 /* Create a RTF_CACHE clone which will not be
1741 * owned by the fib6 tree. It is for the special case where
1742 * the daddr in the skb during the neighbor look-up is different
1743 * from the fl6->daddr used to look up the route here.
1746 struct rt6_info *uncached_rt;
1748 if (ip6_hold_safe(net, &rt, true)) {
1749 dst_use_noref(&rt->dst, jiffies);
1753 goto uncached_rt_out;
1757 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1758 dst_release(&rt->dst);
1761 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1762 * No need for another dst_hold()
1764 rt6_uncached_list_add(uncached_rt);
1765 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1767 uncached_rt = net->ipv6.ip6_null_entry;
1768 dst_hold(&uncached_rt->dst);
1772 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1776 /* Get a percpu copy */
1778 struct rt6_info *pcpu_rt;
1780 dst_use_noref(&rt->dst, jiffies);
1782 pcpu_rt = rt6_get_pcpu_route(rt);
1785 /* atomic_inc_not_zero() is needed when using rcu */
1786 if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1787 /* No dst_hold() on rt is needed because grabbing
1788 * rt->rt6i_ref makes sure rt can't be released.
1790 pcpu_rt = rt6_make_pcpu_route(rt);
1793 /* rt is already removed from tree */
1794 pcpu_rt = net->ipv6.ip6_null_entry;
1795 dst_hold(&pcpu_rt->dst);
1800 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1804 EXPORT_SYMBOL_GPL(ip6_pol_route);
1806 static struct rt6_info *ip6_pol_route_input(struct net *net,
1807 struct fib6_table *table,
1809 const struct sk_buff *skb,
1812 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1815 struct dst_entry *ip6_route_input_lookup(struct net *net,
1816 struct net_device *dev,
1818 const struct sk_buff *skb,
1821 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1822 flags |= RT6_LOOKUP_F_IFACE;
1824 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1826 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1828 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1829 struct flow_keys *keys,
1830 struct flow_keys *flkeys)
1832 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1833 const struct ipv6hdr *key_iph = outer_iph;
1834 struct flow_keys *_flkeys = flkeys;
1835 const struct ipv6hdr *inner_iph;
1836 const struct icmp6hdr *icmph;
1837 struct ipv6hdr _inner_iph;
1839 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1842 icmph = icmp6_hdr(skb);
1843 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1844 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1845 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1846 icmph->icmp6_type != ICMPV6_PARAMPROB)
1849 inner_iph = skb_header_pointer(skb,
1850 skb_transport_offset(skb) + sizeof(*icmph),
1851 sizeof(_inner_iph), &_inner_iph);
1855 key_iph = inner_iph;
1859 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1860 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1861 keys->tags.flow_label = _flkeys->tags.flow_label;
1862 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1864 keys->addrs.v6addrs.src = key_iph->saddr;
1865 keys->addrs.v6addrs.dst = key_iph->daddr;
1866 keys->tags.flow_label = ip6_flowinfo(key_iph);
1867 keys->basic.ip_proto = key_iph->nexthdr;
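/*
 * The net effect of the function above: for ICMPv6 error messages the hash
 * keys are taken from the embedded offending packet, so the error travels
 * the same multipath leg as the flow it refers to; everything else hashes
 * the outer header.  A minimal userspace model of that choice (ICMPv6 type
 * values per RFC 4443):
 */
#if 0	/* standalone sketch, not part of the kernel build */
#include <stdbool.h>
#include <stdio.h>

static bool icmp6_is_error(unsigned char type)
{
	switch (type) {
	case 1:	/* destination unreachable */
	case 2:	/* packet too big */
	case 3:	/* time exceeded */
	case 4:	/* parameter problem */
		return true;
	default:
		return false;
	}
}

struct hdr { const char *src, *dst; };

static const struct hdr *hash_source(const struct hdr *outer,
				     const struct hdr *inner,
				     bool is_icmp6, unsigned char type)
{
	if (is_icmp6 && icmp6_is_error(type) && inner)
		return inner;	/* hash the flow the error talks about */
	return outer;
}

int main(void)
{
	struct hdr outer = { "2001:db8::1", "2001:db8::2" };
	struct hdr inner = { "2001:db8::2", "2001:db8::99" };

	printf("%s\n", hash_source(&outer, &inner, true, 2)->dst);
	return 0;
}
#endif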
1871 /* if skb is set it will be used and fl6 can be NULL */
1872 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1873 const struct sk_buff *skb, struct flow_keys *flkeys)
1875 struct flow_keys hash_keys;
1878 switch (ip6_multipath_hash_policy(net)) {
1880 memset(&hash_keys, 0, sizeof(hash_keys));
1881 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1883 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1885 hash_keys.addrs.v6addrs.src = fl6->saddr;
1886 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1887 hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1888 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1893 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1894 struct flow_keys keys;
1896 /* short-circuit if we already have L4 hash present */
1898 return skb_get_hash_raw(skb) >> 1;
1900 memset(&hash_keys, 0, sizeof(hash_keys));
1903 skb_flow_dissect_flow_keys(skb, &keys, flag);
1906 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1907 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
1908 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
1909 hash_keys.ports.src = flkeys->ports.src;
1910 hash_keys.ports.dst = flkeys->ports.dst;
1911 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1913 memset(&hash_keys, 0, sizeof(hash_keys));
1914 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1915 hash_keys.addrs.v6addrs.src = fl6->saddr;
1916 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1917 hash_keys.ports.src = fl6->fl6_sport;
1918 hash_keys.ports.dst = fl6->fl6_dport;
1919 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1923 mhash = flow_hash_from_keys(&hash_keys);
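/*
 * Both policy branches above end up here: the selected fields are packed
 * into flow_keys and folded into one 32-bit value that later drives the
 * next-hop choice in rt6_multipath_select().  A toy userspace illustration
 * of why including the ports changes the spread (the mixing function is a
 * stand-in, not flow_hash_from_keys()):
 */
#if 0	/* standalone sketch, not part of the kernel build */
#include <stdio.h>

static unsigned int mix(unsigned int h, unsigned int v)
{
	h ^= v + 0x9e3779b9u + (h << 6) + (h >> 2);
	return h;
}

static unsigned int flow_hash(unsigned int saddr, unsigned int daddr,
			      unsigned int proto,
			      unsigned int sport, unsigned int dport)
{
	unsigned int h = 0;

	h = mix(h, saddr);
	h = mix(h, daddr);
	h = mix(h, proto);
	h = mix(h, sport);	/* 0 for an L3-only policy */
	h = mix(h, dport);
	return h;
}

int main(void)
{
	/* same addresses, different TCP source ports: L4 hashing splits them */
	printf("%#x\n", flow_hash(0xfe80001u, 0xfe80002u, 6, 40000, 443));
	printf("%#x\n", flow_hash(0xfe80001u, 0xfe80002u, 6, 40001, 443));
	return 0;
}
#endif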
1928 void ip6_route_input(struct sk_buff *skb)
1930 const struct ipv6hdr *iph = ipv6_hdr(skb);
1931 struct net *net = dev_net(skb->dev);
1932 int flags = RT6_LOOKUP_F_HAS_SADDR;
1933 struct ip_tunnel_info *tun_info;
1934 struct flowi6 fl6 = {
1935 .flowi6_iif = skb->dev->ifindex,
1936 .daddr = iph->daddr,
1937 .saddr = iph->saddr,
1938 .flowlabel = ip6_flowinfo(iph),
1939 .flowi6_mark = skb->mark,
1940 .flowi6_proto = iph->nexthdr,
1942 struct flow_keys *flkeys = NULL, _flkeys;
1944 tun_info = skb_tunnel_info(skb);
1945 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1946 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1948 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1951 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1952 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
1955 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
1958 static struct rt6_info *ip6_pol_route_output(struct net *net,
1959 struct fib6_table *table,
1961 const struct sk_buff *skb,
1964 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
1967 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1968 struct flowi6 *fl6, int flags)
1972 if (rt6_need_strict(&fl6->daddr)) {
1973 struct dst_entry *dst;
1975 dst = l3mdev_link_scope_lookup(net, fl6);
1980 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1982 any_src = ipv6_addr_any(&fl6->saddr);
1983 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1984 (fl6->flowi6_oif && any_src))
1985 flags |= RT6_LOOKUP_F_IFACE;
1988 flags |= RT6_LOOKUP_F_HAS_SADDR;
1990 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1992 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1994 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1996 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1998 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1999 struct net_device *loopback_dev = net->loopback_dev;
2000 struct dst_entry *new = NULL;
2002 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2003 DST_OBSOLETE_DEAD, 0);
2006 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2010 new->input = dst_discard;
2011 new->output = dst_discard_out;
2013 dst_copy_metrics(new, &ort->dst);
2015 rt->rt6i_idev = in6_dev_get(loopback_dev);
2016 rt->rt6i_gateway = ort->rt6i_gateway;
2017 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2018 rt->rt6i_metric = 0;
2020 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2021 #ifdef CONFIG_IPV6_SUBTREES
2022 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2026 dst_release(dst_orig);
2027 return new ? new : ERR_PTR(-ENOMEM);
2031 * Destination cache support functions
2034 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2037 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2038 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
2041 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2045 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
2048 if (rt6_check_expired(rt))
2054 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2056 if (!__rt6_check_expired(rt) &&
2057 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2058 rt6_check(rt->from, cookie))
2064 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2066 struct rt6_info *rt;
2068 rt = (struct rt6_info *) dst;
2070 /* All IPV6 dsts are created with ->obsolete set to the value
2071 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2072 * into this function always.
2075 rt6_dst_from_metrics_check(rt);
2077 if (rt->rt6i_flags & RTF_PCPU ||
2078 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2079 return rt6_dst_from_check(rt, cookie);
2081 return rt6_check(rt, cookie);
2084 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2086 struct rt6_info *rt = (struct rt6_info *) dst;
2089 if (rt->rt6i_flags & RTF_CACHE) {
2090 if (rt6_check_expired(rt)) {
2102 static void ip6_link_failure(struct sk_buff *skb)
2104 struct rt6_info *rt;
2106 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2108 rt = (struct rt6_info *) skb_dst(skb);
2110 if (rt->rt6i_flags & RTF_CACHE) {
2111 if (dst_hold_safe(&rt->dst))
2114 struct fib6_node *fn;
2117 fn = rcu_dereference(rt->rt6i_node);
2118 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2125 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2127 struct net *net = dev_net(rt->dst.dev);
2129 rt->rt6i_flags |= RTF_MODIFIED;
2130 rt->rt6i_pmtu = mtu;
2131 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2134 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2136 return !(rt->rt6i_flags & RTF_CACHE) &&
2137 (rt->rt6i_flags & RTF_PCPU ||
2138 rcu_access_pointer(rt->rt6i_node));
2141 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2142 const struct ipv6hdr *iph, u32 mtu)
2144 const struct in6_addr *daddr, *saddr;
2145 struct rt6_info *rt6 = (struct rt6_info *)dst;
2147 if (rt6->rt6i_flags & RTF_LOCAL)
2150 if (dst_metric_locked(dst, RTAX_MTU))
2154 daddr = &iph->daddr;
2155 saddr = &iph->saddr;
2157 daddr = &sk->sk_v6_daddr;
2158 saddr = &inet6_sk(sk)->saddr;
2163 dst_confirm_neigh(dst, daddr);
2164 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2165 if (mtu >= dst_mtu(dst))
2168 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2169 rt6_do_update_pmtu(rt6, mtu);
2170 /* update rt6_ex->stamp for cache */
2171 if (rt6->rt6i_flags & RTF_CACHE)
2172 rt6_update_exception_stamp_rt(rt6);
2174 struct rt6_info *nrt6;
2176 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2178 rt6_do_update_pmtu(nrt6, mtu);
2179 if (rt6_insert_exception(nrt6, rt6))
2180 dst_release_immediate(&nrt6->dst);
2185 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2186 struct sk_buff *skb, u32 mtu)
2188 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2191 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2192 int oif, u32 mark, kuid_t uid)
2194 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2195 struct dst_entry *dst;
2198 memset(&fl6, 0, sizeof(fl6));
2199 fl6.flowi6_oif = oif;
2200 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2201 fl6.daddr = iph->daddr;
2202 fl6.saddr = iph->saddr;
2203 fl6.flowlabel = ip6_flowinfo(iph);
2204 fl6.flowi6_uid = uid;
2206 dst = ip6_route_output(net, NULL, &fl6);
2208 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2211 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2213 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2215 struct dst_entry *dst;
2217 ip6_update_pmtu(skb, sock_net(sk), mtu,
2218 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2220 dst = __sk_dst_get(sk);
2221 if (!dst || !dst->obsolete ||
2222 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2226 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2227 ip6_datagram_dst_update(sk, false);
2230 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2232 /* Handle redirects */
2233 struct ip6rd_flowi {
2235 struct in6_addr gateway;
2238 static struct rt6_info *__ip6_route_redirect(struct net *net,
2239 struct fib6_table *table,
2241 const struct sk_buff *skb,
2244 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2245 struct rt6_info *rt, *rt_cache;
2246 struct fib6_node *fn;
2248 /* Get the "current" route for this destination and
2249 * check if the redirect has come from appropriate router.
2251 * RFC 4861 specifies that redirects should only be
2252 * accepted if they come from the nexthop to the target.
2253 * Due to the way the routes are chosen, this notion
2254 * is a bit fuzzy and one might need to check all possible
2259 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2261 for_each_fib6_node_rt_rcu(fn) {
2262 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2264 if (rt6_check_expired(rt))
2268 if (!(rt->rt6i_flags & RTF_GATEWAY))
2270 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2272 /* rt_cache's gateway might be different from its 'parent'
2273 * in the case of an ip redirect.
2274 * So we keep searching in the exception table if the gateway
2277 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2278 rt_cache = rt6_find_cached_rt(rt,
2282 ipv6_addr_equal(&rdfl->gateway,
2283 &rt_cache->rt6i_gateway)) {
2293 rt = net->ipv6.ip6_null_entry;
2294 else if (rt->dst.error) {
2295 rt = net->ipv6.ip6_null_entry;
2299 if (rt == net->ipv6.ip6_null_entry) {
2300 fn = fib6_backtrack(fn, &fl6->saddr);
2306 ip6_hold_safe(net, &rt, true);
2310 trace_fib6_table_lookup(net, rt, table, fl6);
2314 static struct dst_entry *ip6_route_redirect(struct net *net,
2315 const struct flowi6 *fl6,
2316 const struct sk_buff *skb,
2317 const struct in6_addr *gateway)
2319 int flags = RT6_LOOKUP_F_HAS_SADDR;
2320 struct ip6rd_flowi rdfl;
2323 rdfl.gateway = *gateway;
2325 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2326 flags, __ip6_route_redirect);
2329 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2332 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2333 struct dst_entry *dst;
2336 memset(&fl6, 0, sizeof(fl6));
2337 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2338 fl6.flowi6_oif = oif;
2339 fl6.flowi6_mark = mark;
2340 fl6.daddr = iph->daddr;
2341 fl6.saddr = iph->saddr;
2342 fl6.flowlabel = ip6_flowinfo(iph);
2343 fl6.flowi6_uid = uid;
2345 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2346 rt6_do_redirect(dst, NULL, skb);
2349 EXPORT_SYMBOL_GPL(ip6_redirect);
2351 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2354 const struct ipv6hdr *iph = ipv6_hdr(skb);
2355 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2356 struct dst_entry *dst;
2359 memset(&fl6, 0, sizeof(fl6));
2360 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2361 fl6.flowi6_oif = oif;
2362 fl6.flowi6_mark = mark;
2363 fl6.daddr = msg->dest;
2364 fl6.saddr = iph->daddr;
2365 fl6.flowi6_uid = sock_net_uid(net, NULL);
2367 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2368 rt6_do_redirect(dst, NULL, skb);
2372 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2374 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2377 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2379 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2381 struct net_device *dev = dst->dev;
2382 unsigned int mtu = dst_mtu(dst);
2383 struct net *net = dev_net(dev);
2385 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2387 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2388 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2391 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2392 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2393 * IPV6_MAXPLEN is also valid and means: "any MSS,
2394 * rely only on pmtu discovery"
2396 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
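/*
 * Worked numbers for the calculation above: the advertised MSS is the dst
 * MTU minus the 40-byte IPv6 header and the 20-byte base TCP header,
 * clamped from below by ip6_rt_min_advmss; when the result would exceed the
 * maximal non-jumbo MSS, IPV6_MAXPLEN is advertised instead, meaning "any
 * MSS, rely only on pmtu discovery".  A standalone sketch (the min-advmss
 * value here is an illustrative assumption):
 */
#if 0	/* standalone sketch, not part of the kernel build */
#include <stdio.h>

#define IPV6_HDR_LEN	40
#define TCP_HDR_LEN	20
#define IPV6_MAXPLEN	65535
#define MIN_ADVMSS	1220	/* assumed sysctl value for illustration */

static unsigned int default_advmss(unsigned int mtu)
{
	unsigned int advmss = mtu - IPV6_HDR_LEN - TCP_HDR_LEN;

	if (advmss < MIN_ADVMSS)
		advmss = MIN_ADVMSS;
	if (advmss > IPV6_MAXPLEN - TCP_HDR_LEN)
		advmss = IPV6_MAXPLEN;	/* "any MSS, rely on PMTU discovery" */
	return advmss;
}

int main(void)
{
	printf("%u\n", default_advmss(1500));	/* 1440 */
	printf("%u\n", default_advmss(1280));	/* 1220 */
	printf("%u\n", default_advmss(70000));	/* 65535 */
	return 0;
}
#endif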
2401 static unsigned int ip6_mtu(const struct dst_entry *dst)
2403 const struct rt6_info *rt = (const struct rt6_info *)dst;
2404 unsigned int mtu = rt->rt6i_pmtu;
2405 struct inet6_dev *idev;
2410 mtu = dst_metric_raw(dst, RTAX_MTU);
2417 idev = __in6_dev_get(dst->dev);
2419 mtu = idev->cnf.mtu6;
2423 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2425 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2428 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2431 struct dst_entry *dst;
2432 struct rt6_info *rt;
2433 struct inet6_dev *idev = in6_dev_get(dev);
2434 struct net *net = dev_net(dev);
2436 if (unlikely(!idev))
2437 return ERR_PTR(-ENODEV);
2439 rt = ip6_dst_alloc(net, dev, 0);
2440 if (unlikely(!rt)) {
2442 dst = ERR_PTR(-ENOMEM);
2446 rt->dst.flags |= DST_HOST;
2447 rt->dst.input = ip6_input;
2448 rt->dst.output = ip6_output;
2449 rt->rt6i_gateway = fl6->daddr;
2450 rt->rt6i_dst.addr = fl6->daddr;
2451 rt->rt6i_dst.plen = 128;
2452 rt->rt6i_idev = idev;
2453 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2455 /* Add this dst into uncached_list so that rt6_disable_ip() can
2456 * do proper release of the net_device
2458 rt6_uncached_list_add(rt);
2459 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2461 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2467 static int ip6_dst_gc(struct dst_ops *ops)
2469 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2470 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2471 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2472 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2473 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2474 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2477 entries = dst_entries_get_fast(ops);
2478 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2479 entries <= rt_max_size)
2482 net->ipv6.ip6_rt_gc_expire++;
2483 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2484 entries = dst_entries_get_slow(ops);
2485 if (entries < ops->gc_thresh)
2486 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2488 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2489 return entries > rt_max_size;
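/*
 * The bookkeeping above adapts how aggressively the fib6 garbage collector
 * runs: ip6_rt_gc_expire creeps up while the table stays crowded, snaps back
 * to half the gc timeout once the entry count drops below gc_thresh, and is
 * otherwise decayed by its own value shifted right by the elasticity.  A
 * standalone model of that feedback (timeout and elasticity values are
 * illustrative):
 */
#if 0	/* standalone sketch, not part of the kernel build */
#include <stdio.h>

#define GC_TIMEOUT	600
#define ELASTICITY	3

static unsigned int adjust_expire(unsigned int expire, int under_thresh)
{
	expire++;			/* every invocation gets more eager */
	if (under_thresh)
		expire = GC_TIMEOUT >> 1;
	expire -= expire >> ELASTICITY;	/* relax again over time */
	return expire;
}

int main(void)
{
	unsigned int expire = GC_TIMEOUT;
	int i;

	for (i = 0; i < 5; i++) {
		expire = adjust_expire(expire, i == 4);
		printf("round %d: expire=%u\n", i, expire);
	}
	return 0;
}
#endif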
2492 static int ip6_convert_metrics(struct mx6_config *mxc,
2493 const struct fib6_config *cfg)
2495 struct net *net = cfg->fc_nlinfo.nl_net;
2496 bool ecn_ca = false;
2504 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2508 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2509 int type = nla_type(nla);
2514 if (unlikely(type > RTAX_MAX))
2517 if (type == RTAX_CC_ALGO) {
2518 char tmp[TCP_CA_NAME_MAX];
2520 nla_strlcpy(tmp, nla, sizeof(tmp));
2521 val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2522 if (val == TCP_CA_UNSPEC)
2525 val = nla_get_u32(nla);
2527 if (type == RTAX_HOPLIMIT && val > 255)
2529 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2533 __set_bit(type - 1, mxc->mx_valid);
2537 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2538 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2548 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2549 struct fib6_config *cfg,
2550 const struct in6_addr *gw_addr,
2551 u32 tbid, int flags)
2553 struct flowi6 fl6 = {
2554 .flowi6_oif = cfg->fc_ifindex,
2556 .saddr = cfg->fc_prefsrc,
2558 struct fib6_table *table;
2559 struct rt6_info *rt;
2561 table = fib6_get_table(net, tbid);
2565 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2566 flags |= RT6_LOOKUP_F_HAS_SADDR;
2568 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2569 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2571 /* if table lookup failed, fall back to full lookup */
2572 if (rt == net->ipv6.ip6_null_entry) {
2580 static int ip6_route_check_nh_onlink(struct net *net,
2581 struct fib6_config *cfg,
2582 const struct net_device *dev,
2583 struct netlink_ext_ack *extack)
2585 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2586 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2587 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2588 struct rt6_info *grt;
2592 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2594 if (!grt->dst.error &&
2595 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2596 NL_SET_ERR_MSG(extack,
2597 "Nexthop has invalid gateway or device mismatch");
2607 static int ip6_route_check_nh(struct net *net,
2608 struct fib6_config *cfg,
2609 struct net_device **_dev,
2610 struct inet6_dev **idev)
2612 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2613 struct net_device *dev = _dev ? *_dev : NULL;
2614 struct rt6_info *grt = NULL;
2615 int err = -EHOSTUNREACH;
2617 if (cfg->fc_table) {
2618 int flags = RT6_LOOKUP_F_IFACE;
2620 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2621 cfg->fc_table, flags);
2623 if (grt->rt6i_flags & RTF_GATEWAY ||
2624 (dev && dev != grt->dst.dev)) {
2632 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2638 if (dev != grt->dst.dev) {
2643 *_dev = dev = grt->dst.dev;
2644 *idev = grt->rt6i_idev;
2646 in6_dev_hold(grt->rt6i_idev);
2649 if (!(grt->rt6i_flags & RTF_GATEWAY))
2658 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2659 struct net_device **_dev, struct inet6_dev **idev,
2660 struct netlink_ext_ack *extack)
2662 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2663 int gwa_type = ipv6_addr_type(gw_addr);
2664 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2665 const struct net_device *dev = *_dev;
2666 bool need_addr_check = !dev;
2669 /* If gw_addr is local we will fail to detect this in case the
2670 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2671 * will return the already-added prefix route via the interface
2672 * the prefix route was assigned to, which might be non-loopback.
2675 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2676 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2680 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2681 /* IPv6 strictly inhibits using non-link-local
2682 * addresses as nexthop addresses.
2683 * Otherwise, a router will not be able to send redirects.
2684 * This is generally desirable, but in some (rare!) circumstances
2685 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2686 * some exceptions. --ANK
2687 * We allow IPv4-mapped nexthops to support RFC4798-style addressing.
2690 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2691 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2695 if (cfg->fc_flags & RTNH_F_ONLINK)
2696 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2698 err = ip6_route_check_nh(net, cfg, _dev, idev);
2704 /* reload in case device was changed */
2709 NL_SET_ERR_MSG(extack, "Egress device not specified");
2711 } else if (dev->flags & IFF_LOOPBACK) {
2712 NL_SET_ERR_MSG(extack,
2713 "Egress device can not be loopback device for this route");
2717 /* if we did not check gw_addr above, do so now that the
2718 * egress device has been resolved.
2720 if (need_addr_check &&
2721 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2722 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
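/* ip6_route_info_create() turns a fib6_config (from netlink, ioctl or RA
 * processing) into a fully initialised but not yet inserted rt6_info: it
 * validates prefix/source lengths and internal-only flags, resolves the
 * egress device and FIB table, picks the dst input/output handlers from
 * the route type, builds any lwtunnel encap state and checks the gateway
 * via ip6_validate_gw(). On error an ERR_PTR() is returned.
 */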
2731 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2732 struct netlink_ext_ack *extack)
2734 struct net *net = cfg->fc_nlinfo.nl_net;
2735 struct rt6_info *rt = NULL;
2736 struct net_device *dev = NULL;
2737 struct inet6_dev *idev = NULL;
2738 struct fib6_table *table;
2742 /* RTF_PCPU is an internal flag; can not be set by userspace */
2743 if (cfg->fc_flags & RTF_PCPU) {
2744 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2748 /* RTF_CACHE is an internal flag; can not be set by userspace */
2749 if (cfg->fc_flags & RTF_CACHE) {
2750 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2754 if (cfg->fc_dst_len > 128) {
2755 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2758 if (cfg->fc_src_len > 128) {
2759 NL_SET_ERR_MSG(extack, "Invalid source address length");
2762 #ifndef CONFIG_IPV6_SUBTREES
2763 if (cfg->fc_src_len) {
2764 NL_SET_ERR_MSG(extack,
2765 "Specifying source address requires IPV6_SUBTREES to be enabled");
2769 if (cfg->fc_ifindex) {
2771 dev = dev_get_by_index(net, cfg->fc_ifindex);
2774 idev = in6_dev_get(dev);
2779 if (cfg->fc_metric == 0)
2780 cfg->fc_metric = IP6_RT_PRIO_USER;
2782 if (cfg->fc_flags & RTNH_F_ONLINK) {
2784 NL_SET_ERR_MSG(extack,
2785 "Nexthop device required for onlink");
2790 if (!(dev->flags & IFF_UP)) {
2791 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2798 if (cfg->fc_nlinfo.nlh &&
2799 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2800 table = fib6_get_table(net, cfg->fc_table);
2802 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2803 table = fib6_new_table(net, cfg->fc_table);
2806 table = fib6_new_table(net, cfg->fc_table);
2812 rt = ip6_dst_alloc(net, NULL,
2813 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2820 if (cfg->fc_flags & RTF_EXPIRES)
2821 rt6_set_expires(rt, jiffies +
2822 clock_t_to_jiffies(cfg->fc_expires));
2824 rt6_clean_expires(rt);
2826 if (cfg->fc_protocol == RTPROT_UNSPEC)
2827 cfg->fc_protocol = RTPROT_BOOT;
2828 rt->rt6i_protocol = cfg->fc_protocol;
2830 addr_type = ipv6_addr_type(&cfg->fc_dst);
2832 if (addr_type & IPV6_ADDR_MULTICAST)
2833 rt->dst.input = ip6_mc_input;
2834 else if (cfg->fc_flags & RTF_LOCAL)
2835 rt->dst.input = ip6_input;
2837 rt->dst.input = ip6_forward;
2839 rt->dst.output = ip6_output;
2841 if (cfg->fc_encap) {
2842 struct lwtunnel_state *lwtstate;
2844 err = lwtunnel_build_state(cfg->fc_encap_type,
2845 cfg->fc_encap, AF_INET6, cfg,
2849 rt->dst.lwtstate = lwtstate_get(lwtstate);
2850 lwtunnel_set_redirect(&rt->dst);
2853 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2854 rt->rt6i_dst.plen = cfg->fc_dst_len;
2855 if (rt->rt6i_dst.plen == 128)
2856 rt->dst.flags |= DST_HOST;
2858 #ifdef CONFIG_IPV6_SUBTREES
2859 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2860 rt->rt6i_src.plen = cfg->fc_src_len;
2863 rt->rt6i_metric = cfg->fc_metric;
2864 rt->rt6i_nh_weight = 1;
2866 /* We cannot add true routes via loopback here;
2867 they would result in kernel looping. Promote them to reject routes
2869 if ((cfg->fc_flags & RTF_REJECT) ||
2870 (dev && (dev->flags & IFF_LOOPBACK) &&
2871 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2872 !(cfg->fc_flags & RTF_LOCAL))) {
2873 /* hold loopback dev/idev if we haven't done so. */
2874 if (dev != net->loopback_dev) {
2879 dev = net->loopback_dev;
2881 idev = in6_dev_get(dev);
2887 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2888 switch (cfg->fc_type) {
2890 rt->dst.error = -EINVAL;
2891 rt->dst.output = dst_discard_out;
2892 rt->dst.input = dst_discard;
2895 rt->dst.error = -EACCES;
2896 rt->dst.output = ip6_pkt_prohibit_out;
2897 rt->dst.input = ip6_pkt_prohibit;
2900 case RTN_UNREACHABLE:
2902 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2903 : (cfg->fc_type == RTN_UNREACHABLE)
2904 ? -EHOSTUNREACH : -ENETUNREACH;
2905 rt->dst.output = ip6_pkt_discard_out;
2906 rt->dst.input = ip6_pkt_discard;
2912 if (cfg->fc_flags & RTF_GATEWAY) {
2913 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2917 rt->rt6i_gateway = cfg->fc_gateway;
2924 if (idev->cnf.disable_ipv6) {
2925 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2930 if (!(dev->flags & IFF_UP)) {
2931 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2936 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2937 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2938 NL_SET_ERR_MSG(extack, "Invalid source address");
2942 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2943 rt->rt6i_prefsrc.plen = 128;
2945 rt->rt6i_prefsrc.plen = 0;
2947 rt->rt6i_flags = cfg->fc_flags;
2950 if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2951 !netif_carrier_ok(dev))
2952 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2953 rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2955 rt->rt6i_idev = idev;
2956 rt->rt6i_table = table;
2958 cfg->fc_nlinfo.nl_net = dev_net(dev);
2967 dst_release_immediate(&rt->dst);
2969 return ERR_PTR(err);
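/* ip6_route_add() is the common insertion path: create the rt6_info via
 * ip6_route_info_create(), convert any RTA_METRICS attributes with
 * ip6_convert_metrics(), and insert the entry with __ip6_ins_rt(); on
 * failure the dst is released again. For illustration (iproute2 syntax,
 * example addresses only):
 *   ip -6 route add 2001:db8::/64 via fe80::1 dev eth0
 * reaches this function via inet6_rtm_newroute() -> ip6_route_add().
 */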
2972 int ip6_route_add(struct fib6_config *cfg,
2973 struct netlink_ext_ack *extack)
2975 struct mx6_config mxc = { .mx = NULL, };
2976 struct rt6_info *rt;
2979 rt = ip6_route_info_create(cfg, extack);
2986 err = ip6_convert_metrics(&mxc, cfg);
2990 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2997 dst_release_immediate(&rt->dst);
3002 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
3005 struct fib6_table *table;
3006 struct net *net = dev_net(rt->dst.dev);
3008 if (rt == net->ipv6.ip6_null_entry) {
3013 table = rt->rt6i_table;
3014 spin_lock_bh(&table->tb6_lock);
3015 err = fib6_del(rt, info);
3016 spin_unlock_bh(&table->tb6_lock);
3023 int ip6_del_rt(struct rt6_info *rt)
3025 struct nl_info info = {
3026 .nl_net = dev_net(rt->dst.dev),
3028 return __ip6_del_rt(rt, &info);
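/* __ip6_del_rt_siblings() removes a multipath route together with all of
 * its sibling nexthops under the table lock. When fc_delete_all_nh is set
 * it pre-builds a single RTM_DELROUTE notification covering every hop and
 * suppresses the per-entry notifications from fib6_del().
 */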
3031 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3033 struct nl_info *info = &cfg->fc_nlinfo;
3034 struct net *net = info->nl_net;
3035 struct sk_buff *skb = NULL;
3036 struct fib6_table *table;
3039 if (rt == net->ipv6.ip6_null_entry)
3041 table = rt->rt6i_table;
3042 spin_lock_bh(&table->tb6_lock);
3044 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
3045 struct rt6_info *sibling, *next_sibling;
3047 /* prefer to send a single notification with all hops */
3048 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3050 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3052 if (rt6_fill_node(net, skb, rt,
3053 NULL, NULL, 0, RTM_DELROUTE,
3054 info->portid, seq, 0) < 0) {
3058 info->skip_notify = 1;
3061 list_for_each_entry_safe(sibling, next_sibling,
3064 err = fib6_del(sibling, info);
3070 err = fib6_del(rt, info);
3072 spin_unlock_bh(&table->tb6_lock);
3077 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3078 info->nlh, gfp_any());
3083 static int ip6_route_del(struct fib6_config *cfg,
3084 struct netlink_ext_ack *extack)
3086 struct rt6_info *rt, *rt_cache;
3087 struct fib6_table *table;
3088 struct fib6_node *fn;
3091 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3093 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3099 fn = fib6_locate(&table->tb6_root,
3100 &cfg->fc_dst, cfg->fc_dst_len,
3101 &cfg->fc_src, cfg->fc_src_len,
3102 !(cfg->fc_flags & RTF_CACHE));
3105 for_each_fib6_node_rt_rcu(fn) {
3106 if (cfg->fc_flags & RTF_CACHE) {
3107 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3113 if (cfg->fc_ifindex &&
3115 rt->dst.dev->ifindex != cfg->fc_ifindex))
3117 if (cfg->fc_flags & RTF_GATEWAY &&
3118 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3120 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3122 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3124 if (!dst_hold_safe(&rt->dst))
3128 /* if a gateway was specified, only delete the one hop */
3129 if (cfg->fc_flags & RTF_GATEWAY)
3130 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3132 return __ip6_del_rt_siblings(rt, cfg);
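/* rt6_do_redirect() handles a received ICMPv6 redirect: it validates the
 * message and its ND options along the lines of RFC 4861, updates the
 * neighbour entry for the new first hop, clones a RTF_CACHE route with the
 * redirected gateway (RTPROT_REDIRECT), inserts it into the exception
 * table and raises a NETEVENT_REDIRECT notification.
 */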
3140 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3142 struct netevent_redirect netevent;
3143 struct rt6_info *rt, *nrt = NULL;
3144 struct ndisc_options ndopts;
3145 struct inet6_dev *in6_dev;
3146 struct neighbour *neigh;
3148 int optlen, on_link;
3151 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3152 optlen -= sizeof(*msg);
3155 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3159 msg = (struct rd_msg *)icmp6_hdr(skb);
3161 if (ipv6_addr_is_multicast(&msg->dest)) {
3162 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3167 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3169 } else if (ipv6_addr_type(&msg->target) !=
3170 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3171 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3175 in6_dev = __in6_dev_get(skb->dev);
3178 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3182 * The IP source address of the Redirect MUST be the same as the current
3183 * first-hop router for the specified ICMP Destination Address.
3186 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3187 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3192 if (ndopts.nd_opts_tgt_lladdr) {
3193 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3196 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3201 rt = (struct rt6_info *) dst;
3202 if (rt->rt6i_flags & RTF_REJECT) {
3203 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3207 /* Redirect received -> path was valid.
3208 * Look, redirects are sent only in response to data packets,
3209 * so this nexthop is apparently reachable. --ANK
3211 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3213 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3218 * We have finally decided to accept it.
3221 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3222 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3223 NEIGH_UPDATE_F_OVERRIDE|
3224 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3225 NEIGH_UPDATE_F_ISROUTER)),
3226 NDISC_REDIRECT, &ndopts);
3228 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3232 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3234 nrt->rt6i_flags &= ~RTF_GATEWAY;
3236 nrt->rt6i_protocol = RTPROT_REDIRECT;
3237 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3239 /* No need to remove rt from the exception table if rt is
3240 * a cached route, because rt6_insert_exception() will take care of it.
3243 if (rt6_insert_exception(nrt, rt)) {
3244 dst_release_immediate(&nrt->dst);
3248 netevent.old = &rt->dst;
3249 netevent.new = &nrt->dst;
3250 netevent.daddr = &msg->dest;
3251 netevent.neigh = neigh;
3252 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3255 neigh_release(neigh);
3259 * Misc support functions
3262 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3266 rt->rt6i_flags &= ~RTF_EXPIRES;
3267 dst_hold(&from->dst);
3269 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3272 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3274 rt->dst.input = ort->dst.input;
3275 rt->dst.output = ort->dst.output;
3276 rt->rt6i_dst = ort->rt6i_dst;
3277 rt->dst.error = ort->dst.error;
3278 rt->rt6i_idev = ort->rt6i_idev;
3280 in6_dev_hold(rt->rt6i_idev);
3281 rt->dst.lastuse = jiffies;
3282 rt->rt6i_gateway = ort->rt6i_gateway;
3283 rt->rt6i_flags = ort->rt6i_flags;
3284 rt6_set_from(rt, ort);
3285 rt->rt6i_metric = ort->rt6i_metric;
3286 #ifdef CONFIG_IPV6_SUBTREES
3287 rt->rt6i_src = ort->rt6i_src;
3289 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3290 rt->rt6i_table = ort->rt6i_table;
3291 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
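/* The CONFIG_IPV6_ROUTE_INFO helpers below maintain routes learned from
 * Route Information options in Router Advertisements (RFC 4191). They are
 * installed with RTF_ROUTEINFO|RTF_GATEWAY in RT6_TABLE_INFO (or the
 * l3mdev table), typically expiring with the lifetime advertised by the
 * router.
 */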
3294 #ifdef CONFIG_IPV6_ROUTE_INFO
3295 static struct rt6_info *rt6_get_route_info(struct net *net,
3296 const struct in6_addr *prefix, int prefixlen,
3297 const struct in6_addr *gwaddr,
3298 struct net_device *dev)
3300 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3301 int ifindex = dev->ifindex;
3302 struct fib6_node *fn;
3303 struct rt6_info *rt = NULL;
3304 struct fib6_table *table;
3306 table = fib6_get_table(net, tb_id);
3311 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3315 for_each_fib6_node_rt_rcu(fn) {
3316 if (rt->dst.dev->ifindex != ifindex)
3318 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3320 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3322 ip6_hold_safe(NULL, &rt, false);
3330 static struct rt6_info *rt6_add_route_info(struct net *net,
3331 const struct in6_addr *prefix, int prefixlen,
3332 const struct in6_addr *gwaddr,
3333 struct net_device *dev,
3336 struct fib6_config cfg = {
3337 .fc_metric = IP6_RT_PRIO_USER,
3338 .fc_ifindex = dev->ifindex,
3339 .fc_dst_len = prefixlen,
3340 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3341 RTF_UP | RTF_PREF(pref),
3342 .fc_protocol = RTPROT_RA,
3343 .fc_nlinfo.portid = 0,
3344 .fc_nlinfo.nlh = NULL,
3345 .fc_nlinfo.nl_net = net,
3348 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3349 cfg.fc_dst = *prefix;
3350 cfg.fc_gateway = *gwaddr;
3352 /* We should treat it as a default route if prefix length is 0. */
3354 cfg.fc_flags |= RTF_DEFAULT;
3356 ip6_route_add(&cfg, NULL);
3358 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3362 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3364 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3365 struct rt6_info *rt;
3366 struct fib6_table *table;
3368 table = fib6_get_table(dev_net(dev), tb_id);
3373 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3374 if (dev == rt->dst.dev &&
3375 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3376 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3380 ip6_hold_safe(NULL, &rt, false);
3385 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3386 struct net_device *dev,
3389 struct fib6_config cfg = {
3390 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3391 .fc_metric = IP6_RT_PRIO_USER,
3392 .fc_ifindex = dev->ifindex,
3393 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3394 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3395 .fc_protocol = RTPROT_RA,
3396 .fc_nlinfo.portid = 0,
3397 .fc_nlinfo.nlh = NULL,
3398 .fc_nlinfo.nl_net = dev_net(dev),
3401 cfg.fc_gateway = *gwaddr;
3403 if (!ip6_route_add(&cfg, NULL)) {
3404 struct fib6_table *table;
3406 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3408 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3411 return rt6_get_dflt_router(gwaddr, dev);
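/* rt6_purge_dflt_routers() walks every FIB table flagged as holding an
 * RA-learned default router and deletes those RTF_ADDRCONF|RTF_DEFAULT
 * entries, except on interfaces configured with accept_ra == 2 (accept
 * RAs even while forwarding), whose default routers are kept.
 */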
3414 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3416 struct rt6_info *rt;
3420 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3421 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3422 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3423 if (dst_hold_safe(&rt->dst)) {
3434 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3437 void rt6_purge_dflt_routers(struct net *net)
3439 struct fib6_table *table;
3440 struct hlist_head *head;
3445 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3446 head = &net->ipv6.fib_table_hash[h];
3447 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3448 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3449 __rt6_purge_dflt_routers(table);
3456 static void rtmsg_to_fib6_config(struct net *net,
3457 struct in6_rtmsg *rtmsg,
3458 struct fib6_config *cfg)
3460 memset(cfg, 0, sizeof(*cfg));
3462 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3464 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3465 cfg->fc_metric = rtmsg->rtmsg_metric;
3466 cfg->fc_expires = rtmsg->rtmsg_info;
3467 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3468 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3469 cfg->fc_flags = rtmsg->rtmsg_flags;
3471 cfg->fc_nlinfo.nl_net = net;
3473 cfg->fc_dst = rtmsg->rtmsg_dst;
3474 cfg->fc_src = rtmsg->rtmsg_src;
3475 cfg->fc_gateway = rtmsg->rtmsg_gateway;
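/* ipv6_route_ioctl() is the legacy SIOCADDRT/SIOCDELRT interface: it
 * requires CAP_NET_ADMIN, copies the in6_rtmsg from userspace, converts it
 * with rtmsg_to_fib6_config() and dispatches to ip6_route_add() or
 * ip6_route_del().
 */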
3478 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3480 struct fib6_config cfg;
3481 struct in6_rtmsg rtmsg;
3485 case SIOCADDRT: /* Add a route */
3486 case SIOCDELRT: /* Delete a route */
3487 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3489 err = copy_from_user(&rtmsg, arg,
3490 sizeof(struct in6_rtmsg));
3494 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3499 err = ip6_route_add(&cfg, NULL);
3502 err = ip6_route_del(&cfg, NULL);
3516 * Drop the packet on the floor
3519 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3522 struct dst_entry *dst = skb_dst(skb);
3523 switch (ipstats_mib_noroutes) {
3524 case IPSTATS_MIB_INNOROUTES:
3525 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3526 if (type == IPV6_ADDR_ANY) {
3527 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3528 IPSTATS_MIB_INADDRERRORS);
3532 case IPSTATS_MIB_OUTNOROUTES:
3533 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3534 ipstats_mib_noroutes);
3537 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3542 static int ip6_pkt_discard(struct sk_buff *skb)
3544 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3547 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3549 skb->dev = skb_dst(skb)->dev;
3550 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3553 static int ip6_pkt_prohibit(struct sk_buff *skb)
3555 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3558 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3560 skb->dev = skb_dst(skb)->dev;
3561 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3565 * Allocate a dst for local (unicast / anycast) address.
3568 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3569 const struct in6_addr *addr,
3573 struct net *net = dev_net(idev->dev);
3574 struct net_device *dev = idev->dev;
3575 struct rt6_info *rt;
3577 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3579 return ERR_PTR(-ENOMEM);
3583 rt->dst.flags |= DST_HOST;
3584 rt->dst.input = ip6_input;
3585 rt->dst.output = ip6_output;
3586 rt->rt6i_idev = idev;
3588 rt->rt6i_protocol = RTPROT_KERNEL;
3589 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3591 rt->rt6i_flags |= RTF_ANYCAST;
3593 rt->rt6i_flags |= RTF_LOCAL;
3595 rt->rt6i_gateway = *addr;
3596 rt->rt6i_dst.addr = *addr;
3597 rt->rt6i_dst.plen = 128;
3598 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3599 rt->rt6i_table = fib6_get_table(net, tb_id);
3604 /* remove deleted ip from prefsrc entries */
3605 struct arg_dev_net_ip {
3606 struct net_device *dev;
3608 struct in6_addr *addr;
3611 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3613 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3614 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3615 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3617 if (((void *)rt->dst.dev == dev || !dev) &&
3618 rt != net->ipv6.ip6_null_entry &&
3619 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3620 spin_lock_bh(&rt6_exception_lock);
3621 /* remove prefsrc entry */
3622 rt->rt6i_prefsrc.plen = 0;
3623 /* need to update cache as well */
3624 rt6_exceptions_remove_prefsrc(rt);
3625 spin_unlock_bh(&rt6_exception_lock);
3630 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3632 struct net *net = dev_net(ifp->idev->dev);
3633 struct arg_dev_net_ip adni = {
3634 .dev = ifp->idev->dev,
3638 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3641 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3643 /* Remove routers and update dst entries when a gateway turns into a host. */
3644 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3646 struct in6_addr *gateway = (struct in6_addr *)arg;
3648 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3649 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3653 /* Further clean up cached routes in the exception table.
3654 * This is needed because a cached route may have a different
3655 * gateway than its 'parent' in the case of an IP redirect.
3657 rt6_exceptions_clean_tohost(rt, gateway);
3662 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3664 fib6_clean_all(net, fib6_clean_tohost, gateway);
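/* The helpers below keep multipath (ECMP) state consistent with netdev
 * events: fib6_ifup()/fib6_ifdown() walk the FIB on device up/down/
 * unregister, set or clear the per-nexthop RTNH_F_DEAD/RTNH_F_LINKDOWN
 * flags, flush a multipath route once all of its siblings use the downed
 * device, and rebalance the weighted nexthop upper bounds used for
 * hash-threshold nexthop selection.
 */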
3667 struct arg_netdev_event {
3668 const struct net_device *dev;
3670 unsigned int nh_flags;
3671 unsigned long event;
3675 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3677 struct rt6_info *iter;
3678 struct fib6_node *fn;
3680 fn = rcu_dereference_protected(rt->rt6i_node,
3681 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3682 iter = rcu_dereference_protected(fn->leaf,
3683 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3685 if (iter->rt6i_metric == rt->rt6i_metric &&
3686 rt6_qualify_for_ecmp(iter))
3688 iter = rcu_dereference_protected(iter->rt6_next,
3689 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3695 static bool rt6_is_dead(const struct rt6_info *rt)
3697 if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3698 (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3699 rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3705 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3707 struct rt6_info *iter;
3710 if (!rt6_is_dead(rt))
3711 total += rt->rt6i_nh_weight;
3713 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3714 if (!rt6_is_dead(iter))
3715 total += iter->rt6i_nh_weight;
3721 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3723 int upper_bound = -1;
3725 if (!rt6_is_dead(rt)) {
3726 *weight += rt->rt6i_nh_weight;
3727 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3730 atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
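/* Illustration of the computation above: each live nexthop is assigned
 * DIV_ROUND_CLOSEST_ULL(cumulative_weight << 31, total) - 1, and a dead
 * nexthop gets -1. With two siblings of weight 1 and 2 (total 3) the
 * thresholds become 715827882 (roughly 1/3 of the 31-bit hash space) and
 * 2147483647, so that in the lookup path a flow hash at or below the first
 * threshold selects the first nexthop.
 */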
3733 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3735 struct rt6_info *iter;
3738 rt6_upper_bound_set(rt, &weight, total);
3740 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3741 rt6_upper_bound_set(iter, &weight, total);
3744 void rt6_multipath_rebalance(struct rt6_info *rt)
3746 struct rt6_info *first;
3749 /* If the entire multipath route was marked for flushing,
3750 * there is no need to rebalance upon the removal of every sibling route.
3753 if (!rt->rt6i_nsiblings || rt->should_flush)
3756 /* During lookup, routes are evaluated in order, so we need to
3757 * make sure upper bounds are assigned from the first sibling onwards.
3760 first = rt6_multipath_first_sibling(rt);
3761 if (WARN_ON_ONCE(!first))
3764 total = rt6_multipath_total_weight(first);
3765 rt6_multipath_upper_bound_set(first, total);
3768 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3770 const struct arg_netdev_event *arg = p_arg;
3771 const struct net *net = dev_net(arg->dev);
3773 if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3774 rt->rt6i_nh_flags &= ~arg->nh_flags;
3775 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3776 rt6_multipath_rebalance(rt);
3782 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3784 struct arg_netdev_event arg = {
3787 .nh_flags = nh_flags,
3791 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3792 arg.nh_flags |= RTNH_F_LINKDOWN;
3794 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3797 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3798 const struct net_device *dev)
3800 struct rt6_info *iter;
3802 if (rt->dst.dev == dev)
3804 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3805 if (iter->dst.dev == dev)
3811 static void rt6_multipath_flush(struct rt6_info *rt)
3813 struct rt6_info *iter;
3815 rt->should_flush = 1;
3816 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3817 iter->should_flush = 1;
3820 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3821 const struct net_device *down_dev)
3823 struct rt6_info *iter;
3824 unsigned int dead = 0;
3826 if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3828 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3829 if (iter->dst.dev == down_dev ||
3830 iter->rt6i_nh_flags & RTNH_F_DEAD)
3836 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3837 const struct net_device *dev,
3838 unsigned int nh_flags)
3840 struct rt6_info *iter;
3842 if (rt->dst.dev == dev)
3843 rt->rt6i_nh_flags |= nh_flags;
3844 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3845 if (iter->dst.dev == dev)
3846 iter->rt6i_nh_flags |= nh_flags;
3849 /* called with write lock held for table with rt */
3850 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3852 const struct arg_netdev_event *arg = p_arg;
3853 const struct net_device *dev = arg->dev;
3854 const struct net *net = dev_net(dev);
3856 if (rt == net->ipv6.ip6_null_entry)
3859 switch (arg->event) {
3860 case NETDEV_UNREGISTER:
3861 return rt->dst.dev == dev ? -1 : 0;
3863 if (rt->should_flush)
3865 if (!rt->rt6i_nsiblings)
3866 return rt->dst.dev == dev ? -1 : 0;
3867 if (rt6_multipath_uses_dev(rt, dev)) {
3870 count = rt6_multipath_dead_count(rt, dev);
3871 if (rt->rt6i_nsiblings + 1 == count) {
3872 rt6_multipath_flush(rt);
3875 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3877 fib6_update_sernum(rt);
3878 rt6_multipath_rebalance(rt);
3882 if (rt->dst.dev != dev ||
3883 rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3885 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3886 rt6_multipath_rebalance(rt);
3893 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3895 struct arg_netdev_event arg = {
3902 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3905 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3907 rt6_sync_down_dev(dev, event);
3908 rt6_uncached_list_flush_dev(dev_net(dev), dev);
3909 neigh_ifdown(&nd_tbl, dev);
3912 struct rt6_mtu_change_arg {
3913 struct net_device *dev;
3917 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3919 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3920 struct inet6_dev *idev;
3922 /* In IPv6, PMTU discovery is not optional,
3923 so the RTAX_MTU lock cannot disable it.
3924 We still use this lock to block changes
3925 caused by addrconf/ndisc.
3928 idev = __in6_dev_get(arg->dev);
3932 /* For an administrative MTU increase, there is no way to discover
3933 the corresponding IPv6 PMTU increase, so the PMTU must be updated here.
3934 Since RFC 1981 does not cover administrative MTU increases,
3935 updating the PMTU on such an increase is a MUST (e.g. jumbo frames).
3937 if (rt->dst.dev == arg->dev &&
3938 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3939 spin_lock_bh(&rt6_exception_lock);
3940 if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3941 rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3942 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3943 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3944 spin_unlock_bh(&rt6_exception_lock);
3949 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3951 struct rt6_mtu_change_arg arg = {
3956 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
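/* Netlink front end: rtm_ipv6_policy describes the accepted RTA_* route
 * attributes, and rtm_to_fib6_config() converts an RTM_NEWROUTE /
 * RTM_DELROUTE request into a fib6_config, mapping the blackhole /
 * prohibit / throw / unreachable route types to RTF_REJECT and pulling in
 * gateway, metrics, multipath, encap and expiry attributes.
 */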
3959 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3960 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
3961 [RTA_OIF] = { .type = NLA_U32 },
3962 [RTA_IIF] = { .type = NLA_U32 },
3963 [RTA_PRIORITY] = { .type = NLA_U32 },
3964 [RTA_METRICS] = { .type = NLA_NESTED },
3965 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
3966 [RTA_PREF] = { .type = NLA_U8 },
3967 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3968 [RTA_ENCAP] = { .type = NLA_NESTED },
3969 [RTA_EXPIRES] = { .type = NLA_U32 },
3970 [RTA_UID] = { .type = NLA_U32 },
3971 [RTA_MARK] = { .type = NLA_U32 },
3974 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3975 struct fib6_config *cfg,
3976 struct netlink_ext_ack *extack)
3979 struct nlattr *tb[RTA_MAX+1];
3983 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3989 rtm = nlmsg_data(nlh);
3990 memset(cfg, 0, sizeof(*cfg));
3992 cfg->fc_table = rtm->rtm_table;
3993 cfg->fc_dst_len = rtm->rtm_dst_len;
3994 cfg->fc_src_len = rtm->rtm_src_len;
3995 cfg->fc_flags = RTF_UP;
3996 cfg->fc_protocol = rtm->rtm_protocol;
3997 cfg->fc_type = rtm->rtm_type;
3999 if (rtm->rtm_type == RTN_UNREACHABLE ||
4000 rtm->rtm_type == RTN_BLACKHOLE ||
4001 rtm->rtm_type == RTN_PROHIBIT ||
4002 rtm->rtm_type == RTN_THROW)
4003 cfg->fc_flags |= RTF_REJECT;
4005 if (rtm->rtm_type == RTN_LOCAL)
4006 cfg->fc_flags |= RTF_LOCAL;
4008 if (rtm->rtm_flags & RTM_F_CLONED)
4009 cfg->fc_flags |= RTF_CACHE;
4011 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4013 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4014 cfg->fc_nlinfo.nlh = nlh;
4015 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4017 if (tb[RTA_GATEWAY]) {
4018 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4019 cfg->fc_flags |= RTF_GATEWAY;
4023 int plen = (rtm->rtm_dst_len + 7) >> 3;
4025 if (nla_len(tb[RTA_DST]) < plen)
4028 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4032 int plen = (rtm->rtm_src_len + 7) >> 3;
4034 if (nla_len(tb[RTA_SRC]) < plen)
4037 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4040 if (tb[RTA_PREFSRC])
4041 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4044 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4046 if (tb[RTA_PRIORITY])
4047 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4049 if (tb[RTA_METRICS]) {
4050 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4051 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4055 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4057 if (tb[RTA_MULTIPATH]) {
4058 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4059 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4061 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4062 cfg->fc_mp_len, extack);
4068 pref = nla_get_u8(tb[RTA_PREF]);
4069 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4070 pref != ICMPV6_ROUTER_PREF_HIGH)
4071 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4072 cfg->fc_flags |= RTF_PREF(pref);
4076 cfg->fc_encap = tb[RTA_ENCAP];
4078 if (tb[RTA_ENCAP_TYPE]) {
4079 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4081 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4086 if (tb[RTA_EXPIRES]) {
4087 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4089 if (addrconf_finite_timeout(timeout)) {
4090 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4091 cfg->fc_flags |= RTF_EXPIRES;
4101 struct rt6_info *rt6_info;
4102 struct fib6_config r_cfg;
4103 struct mx6_config mxc;
4104 struct list_head next;
4107 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4111 list_for_each_entry(nh, rt6_nh_list, next) {
4112 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4113 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4114 nh->r_cfg.fc_ifindex);
4118 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4119 struct rt6_info *rt, struct fib6_config *r_cfg)
4124 list_for_each_entry(nh, rt6_nh_list, next) {
4125 /* check if rt6_info already exists */
4126 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4130 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4134 err = ip6_convert_metrics(&nh->mxc, r_cfg);
4139 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4140 list_add_tail(&nh->next, rt6_nh_list);
4145 static void ip6_route_mpath_notify(struct rt6_info *rt,
4146 struct rt6_info *rt_last,
4147 struct nl_info *info,
4150 /* if this is an APPEND route, then rt points to the first route
4151 * inserted and rt_last points to the last route inserted. Userspace
4152 * wants a consistent dump of the route that starts at the first
4153 * nexthop. Since sibling routes are always added at the end of
4154 * the list, find the first sibling of the last route appended
4156 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4157 rt = list_first_entry(&rt_last->rt6i_siblings,
4163 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4166 static int ip6_route_multipath_add(struct fib6_config *cfg,
4167 struct netlink_ext_ack *extack)
4169 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4170 struct nl_info *info = &cfg->fc_nlinfo;
4171 struct fib6_config r_cfg;
4172 struct rtnexthop *rtnh;
4173 struct rt6_info *rt;
4174 struct rt6_nh *err_nh;
4175 struct rt6_nh *nh, *nh_safe;
4181 int replace = (cfg->fc_nlinfo.nlh &&
4182 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4183 LIST_HEAD(rt6_nh_list);
4185 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4186 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4187 nlflags |= NLM_F_APPEND;
4189 remaining = cfg->fc_mp_len;
4190 rtnh = (struct rtnexthop *)cfg->fc_mp;
4192 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4193 * rt6_info structs per nexthop
4195 while (rtnh_ok(rtnh, remaining)) {
4196 memcpy(&r_cfg, cfg, sizeof(*cfg));
4197 if (rtnh->rtnh_ifindex)
4198 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4200 attrlen = rtnh_attrlen(rtnh);
4202 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4204 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4206 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4207 r_cfg.fc_flags |= RTF_GATEWAY;
4209 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4210 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4212 r_cfg.fc_encap_type = nla_get_u16(nla);
4215 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4216 rt = ip6_route_info_create(&r_cfg, extack);
4223 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4225 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4227 dst_release_immediate(&rt->dst);
4231 rtnh = rtnh_next(rtnh, &remaining);
4234 /* for add and replace send one notification with all nexthops.
4235 * Skip the notification in fib6_add_rt2node and send one with
4236 * the full route when done
4238 info->skip_notify = 1;
4241 list_for_each_entry(nh, &rt6_nh_list, next) {
4242 rt_last = nh->rt6_info;
4243 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4244 /* save reference to first route for notification */
4245 if (!rt_notif && !err)
4246 rt_notif = nh->rt6_info;
4249 /* nh->rt6_info is used or freed at this point, reset to NULL */
4249 nh->rt6_info = NULL;
4252 ip6_print_replace_route_err(&rt6_nh_list);
4257 /* Because each route is added like a single route, we remove
4258 * these flags after the first nexthop: if there is a collision,
4259 * we have already failed to add the first nexthop:
4260 * fib6_add_rt2node() has rejected it; when replacing, old
4261 * nexthops have been replaced by the first new one, and the rest should be added to it.
4264 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4269 /* success ... tell user about new route */
4270 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4274 /* send notification for routes that were added so that
4275 * the delete notifications sent by ip6_route_del are coherent
4279 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4281 /* Delete routes that were already added */
4282 list_for_each_entry(nh, &rt6_nh_list, next) {
4285 ip6_route_del(&nh->r_cfg, extack);
4289 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4291 dst_release_immediate(&nh->rt6_info->dst);
4293 list_del(&nh->next);
4300 static int ip6_route_multipath_del(struct fib6_config *cfg,
4301 struct netlink_ext_ack *extack)
4303 struct fib6_config r_cfg;
4304 struct rtnexthop *rtnh;
4307 int err = 1, last_err = 0;
4309 remaining = cfg->fc_mp_len;
4310 rtnh = (struct rtnexthop *)cfg->fc_mp;
4312 /* Parse a Multipath Entry */
4313 while (rtnh_ok(rtnh, remaining)) {
4314 memcpy(&r_cfg, cfg, sizeof(*cfg));
4315 if (rtnh->rtnh_ifindex)
4316 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4318 attrlen = rtnh_attrlen(rtnh);
4320 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4322 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4324 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4325 r_cfg.fc_flags |= RTF_GATEWAY;
4328 err = ip6_route_del(&r_cfg, extack);
4332 rtnh = rtnh_next(rtnh, &remaining);
4338 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4339 struct netlink_ext_ack *extack)
4341 struct fib6_config cfg;
4344 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4349 return ip6_route_multipath_del(&cfg, extack);
4351 cfg.fc_delete_all_nh = 1;
4352 return ip6_route_del(&cfg, extack);
4356 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4357 struct netlink_ext_ack *extack)
4359 struct fib6_config cfg;
4362 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4367 return ip6_route_multipath_add(&cfg, extack);
4369 return ip6_route_add(&cfg, extack);
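/* rt6_nlmsg_size() computes the worst-case message size for a route
 * notification; every sibling of a multipath route adds one RTA_MULTIPATH
 * nexthop (rtnexthop header, RTA_GATEWAY and encap). For illustration, a
 * route added with iproute2 as
 *   ip -6 route add 2001:db8::/64 nexthop via fe80::1 dev eth0 \
 *                                 nexthop via fe80::2 dev eth1
 * is stored as sibling rt6_info entries and dumped as one RTA_MULTIPATH.
 */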
4372 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4374 int nexthop_len = 0;
4376 if (rt->rt6i_nsiblings) {
4377 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4378 + NLA_ALIGN(sizeof(struct rtnexthop))
4379 + nla_total_size(16) /* RTA_GATEWAY */
4380 + lwtunnel_get_encap_size(rt->dst.lwtstate);
4382 nexthop_len *= rt->rt6i_nsiblings;
4385 return NLMSG_ALIGN(sizeof(struct rtmsg))
4386 + nla_total_size(16) /* RTA_SRC */
4387 + nla_total_size(16) /* RTA_DST */
4388 + nla_total_size(16) /* RTA_GATEWAY */
4389 + nla_total_size(16) /* RTA_PREFSRC */
4390 + nla_total_size(4) /* RTA_TABLE */
4391 + nla_total_size(4) /* RTA_IIF */
4392 + nla_total_size(4) /* RTA_OIF */
4393 + nla_total_size(4) /* RTA_PRIORITY */
4394 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4395 + nla_total_size(sizeof(struct rta_cacheinfo))
4396 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4397 + nla_total_size(1) /* RTA_PREF */
4398 + lwtunnel_get_encap_size(rt->dst.lwtstate)
4402 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4403 unsigned int *flags, bool skip_oif)
4405 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4406 *flags |= RTNH_F_DEAD;
4408 if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4409 *flags |= RTNH_F_LINKDOWN;
4410 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4411 *flags |= RTNH_F_DEAD;
4414 if (rt->rt6i_flags & RTF_GATEWAY) {
4415 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4416 goto nla_put_failure;
4419 *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4420 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4421 *flags |= RTNH_F_OFFLOAD;
4423 /* not needed for multipath encoding because it has a rtnexthop struct */
4424 if (!skip_oif && rt->dst.dev &&
4425 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4426 goto nla_put_failure;
4428 if (rt->dst.lwtstate &&
4429 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4430 goto nla_put_failure;
4438 /* add multipath next hop */
4439 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4441 struct rtnexthop *rtnh;
4442 unsigned int flags = 0;
4444 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4446 goto nla_put_failure;
4448 rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4449 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4451 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4452 goto nla_put_failure;
4454 rtnh->rtnh_flags = flags;
4456 /* length of rtnetlink header + attributes */
4457 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4465 static int rt6_fill_node(struct net *net,
4466 struct sk_buff *skb, struct rt6_info *rt,
4467 struct in6_addr *dst, struct in6_addr *src,
4468 int iif, int type, u32 portid, u32 seq,
4471 u32 metrics[RTAX_MAX];
4473 struct nlmsghdr *nlh;
4477 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4481 rtm = nlmsg_data(nlh);
4482 rtm->rtm_family = AF_INET6;
4483 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4484 rtm->rtm_src_len = rt->rt6i_src.plen;
4487 table = rt->rt6i_table->tb6_id;
4489 table = RT6_TABLE_UNSPEC;
4490 rtm->rtm_table = table;
4491 if (nla_put_u32(skb, RTA_TABLE, table))
4492 goto nla_put_failure;
4493 if (rt->rt6i_flags & RTF_REJECT) {
4494 switch (rt->dst.error) {
4496 rtm->rtm_type = RTN_BLACKHOLE;
4499 rtm->rtm_type = RTN_PROHIBIT;
4502 rtm->rtm_type = RTN_THROW;
4505 rtm->rtm_type = RTN_UNREACHABLE;
4509 else if (rt->rt6i_flags & RTF_LOCAL)
4510 rtm->rtm_type = RTN_LOCAL;
4511 else if (rt->rt6i_flags & RTF_ANYCAST)
4512 rtm->rtm_type = RTN_ANYCAST;
4513 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4514 rtm->rtm_type = RTN_LOCAL;
4516 rtm->rtm_type = RTN_UNICAST;
4518 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4519 rtm->rtm_protocol = rt->rt6i_protocol;
4521 if (rt->rt6i_flags & RTF_CACHE)
4522 rtm->rtm_flags |= RTM_F_CLONED;
4525 if (nla_put_in6_addr(skb, RTA_DST, dst))
4526 goto nla_put_failure;
4527 rtm->rtm_dst_len = 128;
4528 } else if (rtm->rtm_dst_len)
4529 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4530 goto nla_put_failure;
4531 #ifdef CONFIG_IPV6_SUBTREES
4533 if (nla_put_in6_addr(skb, RTA_SRC, src))
4534 goto nla_put_failure;
4535 rtm->rtm_src_len = 128;
4536 } else if (rtm->rtm_src_len &&
4537 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4538 goto nla_put_failure;
4541 #ifdef CONFIG_IPV6_MROUTE
4542 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4543 int err = ip6mr_get_route(net, skb, rtm, portid);
4548 goto nla_put_failure;
4551 if (nla_put_u32(skb, RTA_IIF, iif))
4552 goto nla_put_failure;
4554 struct in6_addr saddr_buf;
4555 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4556 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4557 goto nla_put_failure;
4560 if (rt->rt6i_prefsrc.plen) {
4561 struct in6_addr saddr_buf;
4562 saddr_buf = rt->rt6i_prefsrc.addr;
4563 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4564 goto nla_put_failure;
4567 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4569 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4570 if (rtnetlink_put_metrics(skb, metrics) < 0)
4571 goto nla_put_failure;
4573 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4574 goto nla_put_failure;
4576 /* For multipath routes, walk the siblings list and add
4577 * each as a nexthop within RTA_MULTIPATH.
4579 if (rt->rt6i_nsiblings) {
4580 struct rt6_info *sibling, *next_sibling;
4583 mp = nla_nest_start(skb, RTA_MULTIPATH);
4585 goto nla_put_failure;
4587 if (rt6_add_nexthop(skb, rt) < 0)
4588 goto nla_put_failure;
4590 list_for_each_entry_safe(sibling, next_sibling,
4591 &rt->rt6i_siblings, rt6i_siblings) {
4592 if (rt6_add_nexthop(skb, sibling) < 0)
4593 goto nla_put_failure;
4596 nla_nest_end(skb, mp);
4598 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4599 goto nla_put_failure;
4602 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4604 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4605 goto nla_put_failure;
4607 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4608 goto nla_put_failure;
4611 nlmsg_end(skb, nlh);
4615 nlmsg_cancel(skb, nlh);
4619 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4621 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4622 struct net *net = arg->net;
4624 if (rt == net->ipv6.ip6_null_entry)
4627 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4628 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4630 /* user wants prefix routes only */
4631 if (rtm->rtm_flags & RTM_F_PREFIX &&
4632 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4633 /* success since this is not a prefix route */
4638 return rt6_fill_node(net,
4639 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4640 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4644 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4645 struct netlink_ext_ack *extack)
4647 struct net *net = sock_net(in_skb->sk);
4648 struct nlattr *tb[RTA_MAX+1];
4649 int err, iif = 0, oif = 0;
4650 struct dst_entry *dst;
4651 struct rt6_info *rt;
4652 struct sk_buff *skb;
4657 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4663 memset(&fl6, 0, sizeof(fl6));
4664 rtm = nlmsg_data(nlh);
4665 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4666 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4669 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4672 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4676 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4679 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4683 iif = nla_get_u32(tb[RTA_IIF]);
4686 oif = nla_get_u32(tb[RTA_OIF]);
4689 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4692 fl6.flowi6_uid = make_kuid(current_user_ns(),
4693 nla_get_u32(tb[RTA_UID]));
4695 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4698 struct net_device *dev;
4703 dev = dev_get_by_index_rcu(net, iif);
4710 fl6.flowi6_iif = iif;
4712 if (!ipv6_addr_any(&fl6.saddr))
4713 flags |= RT6_LOOKUP_F_HAS_SADDR;
4715 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4719 fl6.flowi6_oif = oif;
4721 dst = ip6_route_output(net, NULL, &fl6);
4725 rt = container_of(dst, struct rt6_info, dst);
4726 if (rt->dst.error) {
4727 err = rt->dst.error;
4732 if (rt == net->ipv6.ip6_null_entry) {
4733 err = rt->dst.error;
4738 if (fibmatch && rt->from) {
4739 struct rt6_info *ort = rt->from;
4741 dst_hold(&ort->dst);
4746 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4753 skb_dst_set(skb, &rt->dst);
4755 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4756 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4759 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4760 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4767 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4772 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4773 unsigned int nlm_flags)
4775 struct sk_buff *skb;
4776 struct net *net = info->nl_net;
4781 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4783 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4787 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4788 event, info->portid, seq, nlm_flags);
4790 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4791 WARN_ON(err == -EMSGSIZE);
4795 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4796 info->nlh, gfp_any());
4800 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4803 static int ip6_route_dev_notify(struct notifier_block *this,
4804 unsigned long event, void *ptr)
4806 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4807 struct net *net = dev_net(dev);
4809 if (!(dev->flags & IFF_LOOPBACK))
4812 if (event == NETDEV_REGISTER) {
4813 net->ipv6.ip6_null_entry->dst.dev = dev;
4814 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4815 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4816 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4817 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4818 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4819 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4821 } else if (event == NETDEV_UNREGISTER &&
4822 dev->reg_state != NETREG_UNREGISTERED) {
4823 /* NETDEV_UNREGISTER can be fired multiple times by
4824 * netdev_wait_allrefs(). Make sure we only call this once.
4826 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4827 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4828 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4829 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4840 #ifdef CONFIG_PROC_FS
4842 static const struct file_operations ipv6_route_proc_fops = {
4843 .open = ipv6_route_open,
4845 .llseek = seq_lseek,
4846 .release = seq_release_net,
4849 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4851 struct net *net = (struct net *)seq->private;
4852 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4853 net->ipv6.rt6_stats->fib_nodes,
4854 net->ipv6.rt6_stats->fib_route_nodes,
4855 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4856 net->ipv6.rt6_stats->fib_rt_entries,
4857 net->ipv6.rt6_stats->fib_rt_cache,
4858 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4859 net->ipv6.rt6_stats->fib_discarded_routes);
4864 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4866 return single_open_net(inode, file, rt6_stats_seq_show);
4869 static const struct file_operations rt6_stats_seq_fops = {
4870 .open = rt6_stats_seq_open,
4872 .llseek = seq_lseek,
4873 .release = single_release_net,
4875 #endif /* CONFIG_PROC_FS */
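/* The sysctl table below is normally exposed per network namespace under
 * /proc/sys/net/ipv6/route/ (flush, gc_thresh, max_size, gc_min_interval,
 * gc_timeout, gc_interval, gc_elasticity, mtu_expires, min_adv_mss,
 * gc_min_interval_ms). Writing to "flush", e.g.
 *   echo 1 > /proc/sys/net/ipv6/route/flush
 * goes through ipv6_sysctl_rtcache_flush() and triggers fib6_run_gc().
 */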
4877 #ifdef CONFIG_SYSCTL
4880 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4881 void __user *buffer, size_t *lenp, loff_t *ppos)
4888 net = (struct net *)ctl->extra1;
4889 delay = net->ipv6.sysctl.flush_delay;
4890 proc_dointvec(ctl, write, buffer, lenp, ppos);
4891 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4895 struct ctl_table ipv6_route_table_template[] = {
4897 .procname = "flush",
4898 .data = &init_net.ipv6.sysctl.flush_delay,
4899 .maxlen = sizeof(int),
4901 .proc_handler = ipv6_sysctl_rtcache_flush
4904 .procname = "gc_thresh",
4905 .data = &ip6_dst_ops_template.gc_thresh,
4906 .maxlen = sizeof(int),
4908 .proc_handler = proc_dointvec,
4911 .procname = "max_size",
4912 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4913 .maxlen = sizeof(int),
4915 .proc_handler = proc_dointvec,
4918 .procname = "gc_min_interval",
4919 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4920 .maxlen = sizeof(int),
4922 .proc_handler = proc_dointvec_jiffies,
4925 .procname = "gc_timeout",
4926 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4927 .maxlen = sizeof(int),
4929 .proc_handler = proc_dointvec_jiffies,
4932 .procname = "gc_interval",
4933 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4934 .maxlen = sizeof(int),
4936 .proc_handler = proc_dointvec_jiffies,
4939 .procname = "gc_elasticity",
4940 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4941 .maxlen = sizeof(int),
4943 .proc_handler = proc_dointvec,
4946 .procname = "mtu_expires",
4947 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4948 .maxlen = sizeof(int),
4950 .proc_handler = proc_dointvec_jiffies,
4953 .procname = "min_adv_mss",
4954 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4955 .maxlen = sizeof(int),
4957 .proc_handler = proc_dointvec,
4960 .procname = "gc_min_interval_ms",
4961 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4962 .maxlen = sizeof(int),
4964 .proc_handler = proc_dointvec_ms_jiffies,
4969 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4971 struct ctl_table *table;
4973 table = kmemdup(ipv6_route_table_template,
4974 sizeof(ipv6_route_table_template),
4978 table[0].data = &net->ipv6.sysctl.flush_delay;
4979 table[0].extra1 = net;
4980 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4981 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4982 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4983 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4984 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4985 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4986 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4987 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4988 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4990 /* Don't export sysctls to unprivileged users */
4991 if (net->user_ns != &init_user_ns)
4992 table[0].procname = NULL;
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
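/* Per-network-namespace init/exit hooks for the routing state set up above;
 * they are wired into (un)register_pernet_subsys() from ip6_route_init()
 * and ip6_route_cleanup() below.
 */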
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* Registration of the loopback device happens before this code runs,
	 * so the loopback reference in rt6_info is not taken automatically;
	 * take it manually for init_net.
	 */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
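/* Module-level init: create the rt6_info slab cache, register the pernet
 * subsystems, FIB and policy-rule infrastructure, the rtnetlink handlers
 * and the netdev notifier.  On failure, the labels at the bottom unwind
 * the already-completed steps in reverse order.
 */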
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;
	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;
	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;
	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;
	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;
	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;
	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;
	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}