Merge branches 'pm-cpuidle', 'pm-cpufreq' and 'pm-sleep'
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
/* Forward declaration: maps a fib6 route type to the error value stored
 * for it in the fib6_prop table.
 */
static int ip6_rt_type_to_error(u8 fib6_type);

/* CREATE_TRACE_POINTS is defined only around this include, and the
 * fib6_table_lookup tracepoint is exported (GPL-only) from this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
79
/*
 * Outcome of the neighbour/route scoring helpers.  All failure codes are
 * negative so callers can test "< 0"; non-negative values are usable scores.
 */
enum rt6_nud_state {
        /* route must not be used: find_match() skips it */
        RT6_NUD_FAIL_HARD       = -3,
        /* gateway neighbour is in NUD_FAILED (CONFIG_IPV6_ROUTER_PREF only) */
        RT6_NUD_FAIL_PROBE      = -2,
        /* no neighbour entry: find_match() scores it 0 and asks for round-robin */
        RT6_NUD_FAIL_DO_RR      = -1,
        /* neighbour state is acceptable, or no neighbour check is needed */
        RT6_NUD_SUCCEED         = 1,
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Helpers used by rt6_route_rcv() to look up or install a route-info route */
static struct fib6_info *rt6_add_route_info(struct net *net,
                                            const struct in6_addr *prefix,
                                            int prefixlen,
                                            const struct in6_addr *gwaddr,
                                            struct net_device *dev,
                                            unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
                                            const struct in6_addr *prefix,
                                            int prefixlen,
                                            const struct in6_addr *gwaddr,
                                            struct net_device *dev);
#endif
127
/* List of rt6_info entries (linked through rt6i_uncached); one instance
 * exists per CPU, see rt6_uncached_list below.
 */
struct uncached_list {
        spinlock_t              lock;   /* taken with BHs disabled around list changes */
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148         if (!list_empty(&rt->rt6i_uncached)) {
149                 struct uncached_list *ul = rt->rt6i_uncached_list;
150                 struct net *net = dev_net(rt->dst.dev);
151
152                 spin_lock_bh(&ul->lock);
153                 list_del(&rt->rt6i_uncached);
154                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155                 spin_unlock_bh(&ul->lock);
156         }
157 }
158
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161         struct net_device *loopback_dev = net->loopback_dev;
162         int cpu;
163
164         if (dev == loopback_dev)
165                 return;
166
167         for_each_possible_cpu(cpu) {
168                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169                 struct rt6_info *rt;
170
171                 spin_lock_bh(&ul->lock);
172                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173                         struct inet6_dev *rt_idev = rt->rt6i_idev;
174                         struct net_device *rt_dev = rt->dst.dev;
175
176                         if (rt_idev->dev == dev) {
177                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
178                                 in6_dev_put(rt_idev);
179                         }
180
181                         if (rt_dev == dev) {
182                                 rt->dst.dev = loopback_dev;
183                                 dev_hold(rt->dst.dev);
184                                 dev_put(rt_dev);
185                         }
186                 }
187                 spin_unlock_bh(&ul->lock);
188         }
189 }
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213
214         n = neigh_create(&nd_tbl, daddr, dev);
215         return IS_ERR(n) ? NULL : n;
216 }
217
218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
219                                               struct sk_buff *skb,
220                                               const void *daddr)
221 {
222         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
223
224         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
225 }
226
227 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
228 {
229         struct net_device *dev = dst->dev;
230         struct rt6_info *rt = (struct rt6_info *)dst;
231
232         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
233         if (!daddr)
234                 return;
235         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
236                 return;
237         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
238                 return;
239         __ipv6_confirm_neigh(dev, daddr);
240 }
241
242 static struct dst_ops ip6_dst_ops_template = {
243         .family                 =       AF_INET6,
244         .gc                     =       ip6_dst_gc,
245         .gc_thresh              =       1024,
246         .check                  =       ip6_dst_check,
247         .default_advmss         =       ip6_default_advmss,
248         .mtu                    =       ip6_mtu,
249         .cow_metrics            =       dst_cow_metrics_generic,
250         .destroy                =       ip6_dst_destroy,
251         .ifdown                 =       ip6_dst_ifdown,
252         .negative_advice        =       ip6_negative_advice,
253         .link_failure           =       ip6_link_failure,
254         .update_pmtu            =       ip6_rt_update_pmtu,
255         .redirect               =       rt6_do_redirect,
256         .local_out              =       __ip6_local_out,
257         .neigh_lookup           =       ip6_dst_neigh_lookup,
258         .confirm_neigh          =       ip6_confirm_neigh,
259 };
260
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262 {
263         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
264
265         return mtu ? : dst->dev->mtu;
266 }
267
/* No-op update_pmtu handler installed in ip6_dst_blackhole_ops. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                         struct sk_buff *skb, u32 mtu)
{
}
272
/* No-op redirect handler installed in ip6_dst_blackhole_ops. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                      struct sk_buff *skb)
{
}
277
278 static struct dst_ops ip6_dst_blackhole_ops = {
279         .family                 =       AF_INET6,
280         .destroy                =       ip6_dst_destroy,
281         .check                  =       ip6_dst_check,
282         .mtu                    =       ip6_blackhole_mtu,
283         .default_advmss         =       ip6_default_advmss,
284         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
285         .redirect               =       ip6_rt_blackhole_redirect,
286         .cow_metrics            =       dst_cow_metrics_generic,
287         .neigh_lookup           =       ip6_dst_neigh_lookup,
288 };
289
/* Metrics template: the hop-limit slot is named explicitly and set to 0;
 * every other slot is zero as well through default initialisation.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 0,
};
293
294 static const struct fib6_info fib6_null_entry_template = {
295         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
296         .fib6_protocol  = RTPROT_KERNEL,
297         .fib6_metric    = ~(u32)0,
298         .fib6_ref       = ATOMIC_INIT(1),
299         .fib6_type      = RTN_UNREACHABLE,
300         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
301 };
302
303 static const struct rt6_info ip6_null_entry_template = {
304         .dst = {
305                 .__refcnt       = ATOMIC_INIT(1),
306                 .__use          = 1,
307                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
308                 .error          = -ENETUNREACH,
309                 .input          = ip6_pkt_discard,
310                 .output         = ip6_pkt_discard_out,
311         },
312         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
313 };
314
315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
316
317 static const struct rt6_info ip6_prohibit_entry_template = {
318         .dst = {
319                 .__refcnt       = ATOMIC_INIT(1),
320                 .__use          = 1,
321                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
322                 .error          = -EACCES,
323                 .input          = ip6_pkt_prohibit,
324                 .output         = ip6_pkt_prohibit_out,
325         },
326         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
327 };
328
329 static const struct rt6_info ip6_blk_hole_entry_template = {
330         .dst = {
331                 .__refcnt       = ATOMIC_INIT(1),
332                 .__use          = 1,
333                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
334                 .error          = -EINVAL,
335                 .input          = dst_discard,
336                 .output         = dst_discard_out,
337         },
338         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
339 };
340
341 #endif
342
343 static void rt6_info_init(struct rt6_info *rt)
344 {
345         struct dst_entry *dst = &rt->dst;
346
347         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
348         INIT_LIST_HEAD(&rt->rt6i_uncached);
349 }
350
351 /* allocate dst with ip6_dst_ops */
352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
353                                int flags)
354 {
355         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356                                         1, DST_OBSOLETE_FORCE_CHK, flags);
357
358         if (rt) {
359                 rt6_info_init(rt);
360                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
361         }
362
363         return rt;
364 }
365 EXPORT_SYMBOL(ip6_dst_alloc);
366
367 static void ip6_dst_destroy(struct dst_entry *dst)
368 {
369         struct rt6_info *rt = (struct rt6_info *)dst;
370         struct fib6_info *from;
371         struct inet6_dev *idev;
372
373         ip_dst_metrics_put(dst);
374         rt6_uncached_list_del(rt);
375
376         idev = rt->rt6i_idev;
377         if (idev) {
378                 rt->rt6i_idev = NULL;
379                 in6_dev_put(idev);
380         }
381
382         rcu_read_lock();
383         from = rcu_dereference(rt->from);
384         rcu_assign_pointer(rt->from, NULL);
385         fib6_info_release(from);
386         rcu_read_unlock();
387 }
388
389 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
390                            int how)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct inet6_dev *idev = rt->rt6i_idev;
394         struct net_device *loopback_dev =
395                 dev_net(dev)->loopback_dev;
396
397         if (idev && idev->dev != loopback_dev) {
398                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
399                 if (loopback_idev) {
400                         rt->rt6i_idev = loopback_idev;
401                         in6_dev_put(idev);
402                 }
403         }
404 }
405
406 static bool __rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES)
409                 return time_after(jiffies, rt->dst.expires);
410         else
411                 return false;
412 }
413
414 static bool rt6_check_expired(const struct rt6_info *rt)
415 {
416         struct fib6_info *from;
417
418         from = rcu_dereference(rt->from);
419
420         if (rt->rt6i_flags & RTF_EXPIRES) {
421                 if (time_after(jiffies, rt->dst.expires))
422                         return true;
423         } else if (from) {
424                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
425                         fib6_check_expired(from);
426         }
427         return false;
428 }
429
430 struct fib6_info *fib6_multipath_select(const struct net *net,
431                                         struct fib6_info *match,
432                                         struct flowi6 *fl6, int oif,
433                                         const struct sk_buff *skb,
434                                         int strict)
435 {
436         struct fib6_info *sibling, *next_sibling;
437
438         /* We might have already computed the hash for ICMPv6 errors. In such
439          * case it will always be non-zero. Otherwise now is the time to do it.
440          */
441         if (!fl6->mp_hash)
442                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
443
444         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
445                 return match;
446
447         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
448                                  fib6_siblings) {
449                 int nh_upper_bound;
450
451                 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
452                 if (fl6->mp_hash > nh_upper_bound)
453                         continue;
454                 if (rt6_score_route(sibling, oif, strict) < 0)
455                         break;
456                 match = sibling;
457                 break;
458         }
459
460         return match;
461 }
462
463 /*
464  *      Route lookup. rcu_read_lock() should be held.
465  */
466
467 static inline struct fib6_info *rt6_device_match(struct net *net,
468                                                  struct fib6_info *rt,
469                                                     const struct in6_addr *saddr,
470                                                     int oif,
471                                                     int flags)
472 {
473         struct fib6_info *sprt;
474
475         if (!oif && ipv6_addr_any(saddr) &&
476             !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
477                 return rt;
478
479         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
480                 const struct net_device *dev = sprt->fib6_nh.nh_dev;
481
482                 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
483                         continue;
484
485                 if (oif) {
486                         if (dev->ifindex == oif)
487                                 return sprt;
488                 } else {
489                         if (ipv6_chk_addr(net, saddr, dev,
490                                           flags & RT6_LOOKUP_F_IFACE))
491                                 return sprt;
492                 }
493         }
494
495         if (oif && flags & RT6_LOOKUP_F_IFACE)
496                 return net->ipv6.fib6_null_entry;
497
498         return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
499 }
500
501 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Work item queued by rt6_probe() and consumed (and freed) by
 * rt6_probe_deferred().
 */
struct __rt6_probe_work {
        struct work_struct work;
        struct in6_addr target;         /* address to solicit */
        struct net_device *dev;         /* held by rt6_probe(), put by the worker */
};
507
508 static void rt6_probe_deferred(struct work_struct *w)
509 {
510         struct in6_addr mcaddr;
511         struct __rt6_probe_work *work =
512                 container_of(w, struct __rt6_probe_work, work);
513
514         addrconf_addr_solict_mult(&work->target, &mcaddr);
515         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
516         dev_put(work->dev);
517         kfree(work);
518 }
519
520 static void rt6_probe(struct fib6_info *rt)
521 {
522         struct __rt6_probe_work *work = NULL;
523         const struct in6_addr *nh_gw;
524         struct neighbour *neigh;
525         struct net_device *dev;
526         struct inet6_dev *idev;
527
528         /*
529          * Okay, this does not seem to be appropriate
530          * for now, however, we need to check if it
531          * is really so; aka Router Reachability Probing.
532          *
533          * Router Reachability Probe MUST be rate-limited
534          * to no more than one per minute.
535          */
536         if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
537                 return;
538
539         nh_gw = &rt->fib6_nh.nh_gw;
540         dev = rt->fib6_nh.nh_dev;
541         rcu_read_lock_bh();
542         idev = __in6_dev_get(dev);
543         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
544         if (neigh) {
545                 if (neigh->nud_state & NUD_VALID)
546                         goto out;
547
548                 write_lock(&neigh->lock);
549                 if (!(neigh->nud_state & NUD_VALID) &&
550                     time_after(jiffies,
551                                neigh->updated + idev->cnf.rtr_probe_interval)) {
552                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
553                         if (work)
554                                 __neigh_set_probe_once(neigh);
555                 }
556                 write_unlock(&neigh->lock);
557         } else if (time_after(jiffies, rt->last_probe +
558                                        idev->cnf.rtr_probe_interval)) {
559                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
560         }
561
562         if (work) {
563                 rt->last_probe = jiffies;
564                 INIT_WORK(&work->work, rt6_probe_deferred);
565                 work->target = *nh_gw;
566                 dev_hold(dev);
567                 work->dev = dev;
568                 schedule_work(&work->work);
569         }
570
571 out:
572         rcu_read_unlock_bh();
573 }
574 #else
/* No-op stand-in used when CONFIG_IPV6_ROUTER_PREF is not set. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
578 #endif
579
580 /*
581  * Default Router Selection (RFC 2461 6.3.6)
582  */
583 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
584 {
585         const struct net_device *dev = rt->fib6_nh.nh_dev;
586
587         if (!oif || dev->ifindex == oif)
588                 return 2;
589         return 0;
590 }
591
592 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
593 {
594         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
595         struct neighbour *neigh;
596
597         if (rt->fib6_flags & RTF_NONEXTHOP ||
598             !(rt->fib6_flags & RTF_GATEWAY))
599                 return RT6_NUD_SUCCEED;
600
601         rcu_read_lock_bh();
602         neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
603                                           &rt->fib6_nh.nh_gw);
604         if (neigh) {
605                 read_lock(&neigh->lock);
606                 if (neigh->nud_state & NUD_VALID)
607                         ret = RT6_NUD_SUCCEED;
608 #ifdef CONFIG_IPV6_ROUTER_PREF
609                 else if (!(neigh->nud_state & NUD_FAILED))
610                         ret = RT6_NUD_SUCCEED;
611                 else
612                         ret = RT6_NUD_FAIL_PROBE;
613 #endif
614                 read_unlock(&neigh->lock);
615         } else {
616                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
617                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
618         }
619         rcu_read_unlock_bh();
620
621         return ret;
622 }
623
624 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
625 {
626         int m;
627
628         m = rt6_check_dev(rt, oif);
629         if (!m && (strict & RT6_LOOKUP_F_IFACE))
630                 return RT6_NUD_FAIL_HARD;
631 #ifdef CONFIG_IPV6_ROUTER_PREF
632         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
633 #endif
634         if (strict & RT6_LOOKUP_F_REACHABLE) {
635                 int n = rt6_check_neigh(rt);
636                 if (n < 0)
637                         return n;
638         }
639         return m;
640 }
641
642 /* called with rc_read_lock held */
643 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
644 {
645         const struct net_device *dev = fib6_info_nh_dev(f6i);
646         bool rc = false;
647
648         if (dev) {
649                 const struct inet6_dev *idev = __in6_dev_get(dev);
650
651                 rc = !!idev->cnf.ignore_routes_with_linkdown;
652         }
653
654         return rc;
655 }
656
657 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
658                                    int *mpri, struct fib6_info *match,
659                                    bool *do_rr)
660 {
661         int m;
662         bool match_do_rr = false;
663
664         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
665                 goto out;
666
667         if (fib6_ignore_linkdown(rt) &&
668             rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
669             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
670                 goto out;
671
672         if (fib6_check_expired(rt))
673                 goto out;
674
675         m = rt6_score_route(rt, oif, strict);
676         if (m == RT6_NUD_FAIL_DO_RR) {
677                 match_do_rr = true;
678                 m = 0; /* lowest valid score */
679         } else if (m == RT6_NUD_FAIL_HARD) {
680                 goto out;
681         }
682
683         if (strict & RT6_LOOKUP_F_REACHABLE)
684                 rt6_probe(rt);
685
686         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
687         if (m > *mpri) {
688                 *do_rr = match_do_rr;
689                 *mpri = m;
690                 match = rt;
691         }
692 out:
693         return match;
694 }
695
696 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
697                                      struct fib6_info *leaf,
698                                      struct fib6_info *rr_head,
699                                      u32 metric, int oif, int strict,
700                                      bool *do_rr)
701 {
702         struct fib6_info *rt, *match, *cont;
703         int mpri = -1;
704
705         match = NULL;
706         cont = NULL;
707         for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
708                 if (rt->fib6_metric != metric) {
709                         cont = rt;
710                         break;
711                 }
712
713                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
714         }
715
716         for (rt = leaf; rt && rt != rr_head;
717              rt = rcu_dereference(rt->fib6_next)) {
718                 if (rt->fib6_metric != metric) {
719                         cont = rt;
720                         break;
721                 }
722
723                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
724         }
725
726         if (match || !cont)
727                 return match;
728
729         for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
730                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
731
732         return match;
733 }
734
735 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
736                                    int oif, int strict)
737 {
738         struct fib6_info *leaf = rcu_dereference(fn->leaf);
739         struct fib6_info *match, *rt0;
740         bool do_rr = false;
741         int key_plen;
742
743         if (!leaf || leaf == net->ipv6.fib6_null_entry)
744                 return net->ipv6.fib6_null_entry;
745
746         rt0 = rcu_dereference(fn->rr_ptr);
747         if (!rt0)
748                 rt0 = leaf;
749
750         /* Double check to make sure fn is not an intermediate node
751          * and fn->leaf does not points to its child's leaf
752          * (This might happen if all routes under fn are deleted from
753          * the tree and fib6_repair_tree() is called on the node.)
754          */
755         key_plen = rt0->fib6_dst.plen;
756 #ifdef CONFIG_IPV6_SUBTREES
757         if (rt0->fib6_src.plen)
758                 key_plen = rt0->fib6_src.plen;
759 #endif
760         if (fn->fn_bit != key_plen)
761                 return net->ipv6.fib6_null_entry;
762
763         match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
764                              &do_rr);
765
766         if (do_rr) {
767                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
768
769                 /* no entries matched; do round-robin */
770                 if (!next || next->fib6_metric != rt0->fib6_metric)
771                         next = leaf;
772
773                 if (next != rt0) {
774                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
775                         /* make sure next is not being deleted from the tree */
776                         if (next->fib6_node)
777                                 rcu_assign_pointer(fn->rr_ptr, next);
778                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
779                 }
780         }
781
782         return match ? match : net->ipv6.fib6_null_entry;
783 }
784
785 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
786 {
787         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
788 }
789
790 #ifdef CONFIG_IPV6_ROUTE_INFO
791 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
792                   const struct in6_addr *gwaddr)
793 {
794         struct net *net = dev_net(dev);
795         struct route_info *rinfo = (struct route_info *) opt;
796         struct in6_addr prefix_buf, *prefix;
797         unsigned int pref;
798         unsigned long lifetime;
799         struct fib6_info *rt;
800
801         if (len < sizeof(struct route_info)) {
802                 return -EINVAL;
803         }
804
805         /* Sanity check for prefix_len and length */
806         if (rinfo->length > 3) {
807                 return -EINVAL;
808         } else if (rinfo->prefix_len > 128) {
809                 return -EINVAL;
810         } else if (rinfo->prefix_len > 64) {
811                 if (rinfo->length < 2) {
812                         return -EINVAL;
813                 }
814         } else if (rinfo->prefix_len > 0) {
815                 if (rinfo->length < 1) {
816                         return -EINVAL;
817                 }
818         }
819
820         pref = rinfo->route_pref;
821         if (pref == ICMPV6_ROUTER_PREF_INVALID)
822                 return -EINVAL;
823
824         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
825
826         if (rinfo->length == 3)
827                 prefix = (struct in6_addr *)rinfo->prefix;
828         else {
829                 /* this function is safe */
830                 ipv6_addr_prefix(&prefix_buf,
831                                  (struct in6_addr *)rinfo->prefix,
832                                  rinfo->prefix_len);
833                 prefix = &prefix_buf;
834         }
835
836         if (rinfo->prefix_len == 0)
837                 rt = rt6_get_dflt_router(net, gwaddr, dev);
838         else
839                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
840                                         gwaddr, dev);
841
842         if (rt && !lifetime) {
843                 ip6_del_rt(net, rt);
844                 rt = NULL;
845         }
846
847         if (!rt && lifetime)
848                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
849                                         dev, pref);
850         else if (rt)
851                 rt->fib6_flags = RTF_ROUTEINFO |
852                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
853
854         if (rt) {
855                 if (!addrconf_finite_timeout(lifetime))
856                         fib6_clean_expires(rt);
857                 else
858                         fib6_set_expires(rt, jiffies + HZ * lifetime);
859
860                 fib6_info_release(rt);
861         }
862         return 0;
863 }
864 #endif
865
866 /*
867  *      Misc support functions
868  */
869
870 /* called with rcu_lock held */
871 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
872 {
873         struct net_device *dev = rt->fib6_nh.nh_dev;
874
875         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
876                 /* for copies of local routes, dst->dev needs to be the
877                  * device if it is a master device, the master device if
878                  * device is enslaved, and the loopback as the default
879                  */
880                 if (netif_is_l3_slave(dev) &&
881                     !rt6_need_strict(&rt->fib6_dst.addr))
882                         dev = l3mdev_master_dev_rcu(dev);
883                 else if (!netif_is_l3_master(dev))
884                         dev = dev_net(dev)->loopback_dev;
885                 /* last case is netif_is_l3_master(dev) is true in which
886                  * case we want dev returned to be dev
887                  */
888         }
889
890         return dev;
891 }
892
893 static const int fib6_prop[RTN_MAX + 1] = {
894         [RTN_UNSPEC]    = 0,
895         [RTN_UNICAST]   = 0,
896         [RTN_LOCAL]     = 0,
897         [RTN_BROADCAST] = 0,
898         [RTN_ANYCAST]   = 0,
899         [RTN_MULTICAST] = 0,
900         [RTN_BLACKHOLE] = -EINVAL,
901         [RTN_UNREACHABLE] = -EHOSTUNREACH,
902         [RTN_PROHIBIT]  = -EACCES,
903         [RTN_THROW]     = -EAGAIN,
904         [RTN_NAT]       = -EINVAL,
905         [RTN_XRESOLVE]  = -EINVAL,
906 };
907
908 static int ip6_rt_type_to_error(u8 fib6_type)
909 {
910         return fib6_prop[fib6_type];
911 }
912
913 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
914 {
915         unsigned short flags = 0;
916
917         if (rt->dst_nocount)
918                 flags |= DST_NOCOUNT;
919         if (rt->dst_nopolicy)
920                 flags |= DST_NOPOLICY;
921         if (rt->dst_host)
922                 flags |= DST_HOST;
923
924         return flags;
925 }
926
927 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
928 {
929         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
930
931         switch (ort->fib6_type) {
932         case RTN_BLACKHOLE:
933                 rt->dst.output = dst_discard_out;
934                 rt->dst.input = dst_discard;
935                 break;
936         case RTN_PROHIBIT:
937                 rt->dst.output = ip6_pkt_prohibit_out;
938                 rt->dst.input = ip6_pkt_prohibit;
939                 break;
940         case RTN_THROW:
941         case RTN_UNREACHABLE:
942         default:
943                 rt->dst.output = ip6_pkt_discard_out;
944                 rt->dst.input = ip6_pkt_discard;
945                 break;
946         }
947 }
948
949 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
950 {
951         if (ort->fib6_flags & RTF_REJECT) {
952                 ip6_rt_init_dst_reject(rt, ort);
953                 return;
954         }
955
956         rt->dst.error = 0;
957         rt->dst.output = ip6_output;
958
959         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
960                 rt->dst.input = ip6_input;
961         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
962                 rt->dst.input = ip6_mc_input;
963         } else {
964                 rt->dst.input = ip6_forward;
965         }
966
967         if (ort->fib6_nh.nh_lwtstate) {
968                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
969                 lwtunnel_set_redirect(&rt->dst);
970         }
971
972         rt->dst.lastuse = jiffies;
973 }
974
975 /* Caller must already hold reference to @from */
976 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
977 {
978         rt->rt6i_flags &= ~RTF_EXPIRES;
979         rcu_assign_pointer(rt->from, from);
980         ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
981 }
982
983 /* Caller must already hold reference to @ort */
984 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
985 {
986         struct net_device *dev = fib6_info_nh_dev(ort);
987
988         ip6_rt_init_dst(rt, ort);
989
990         rt->rt6i_dst = ort->fib6_dst;
991         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
992         rt->rt6i_gateway = ort->fib6_nh.nh_gw;
993         rt->rt6i_flags = ort->fib6_flags;
994         rt6_set_from(rt, ort);
995 #ifdef CONFIG_IPV6_SUBTREES
996         rt->rt6i_src = ort->fib6_src;
997 #endif
998 }
999
1000 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1001                                         struct in6_addr *saddr)
1002 {
1003         struct fib6_node *pn, *sn;
1004         while (1) {
1005                 if (fn->fn_flags & RTN_TL_ROOT)
1006                         return NULL;
1007                 pn = rcu_dereference(fn->parent);
1008                 sn = FIB6_SUBTREE(pn);
1009                 if (sn && sn != fn)
1010                         fn = fib6_node_lookup(sn, NULL, saddr);
1011                 else
1012                         fn = pn;
1013                 if (fn->fn_flags & RTN_RTINFO)
1014                         return fn;
1015         }
1016 }
1017
1018 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1019                           bool null_fallback)
1020 {
1021         struct rt6_info *rt = *prt;
1022
1023         if (dst_hold_safe(&rt->dst))
1024                 return true;
1025         if (null_fallback) {
1026                 rt = net->ipv6.ip6_null_entry;
1027                 dst_hold(&rt->dst);
1028         } else {
1029                 rt = NULL;
1030         }
1031         *prt = rt;
1032         return false;
1033 }
1034
1035 /* called with rcu_lock held */
1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1037 {
1038         unsigned short flags = fib6_info_dst_flags(rt);
1039         struct net_device *dev = rt->fib6_nh.nh_dev;
1040         struct rt6_info *nrt;
1041
1042         if (!fib6_info_hold_safe(rt))
1043                 return NULL;
1044
1045         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1046         if (nrt)
1047                 ip6_rt_copy_init(nrt, rt);
1048         else
1049                 fib6_info_release(rt);
1050
1051         return nrt;
1052 }
1053
1054 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1055                                              struct fib6_table *table,
1056                                              struct flowi6 *fl6,
1057                                              const struct sk_buff *skb,
1058                                              int flags)
1059 {
1060         struct fib6_info *f6i;
1061         struct fib6_node *fn;
1062         struct rt6_info *rt;
1063
1064         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1065                 flags &= ~RT6_LOOKUP_F_IFACE;
1066
1067         rcu_read_lock();
1068         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1069 restart:
1070         f6i = rcu_dereference(fn->leaf);
1071         if (!f6i) {
1072                 f6i = net->ipv6.fib6_null_entry;
1073         } else {
1074                 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1075                                       fl6->flowi6_oif, flags);
1076                 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1077                         f6i = fib6_multipath_select(net, f6i, fl6,
1078                                                     fl6->flowi6_oif, skb,
1079                                                     flags);
1080         }
1081         if (f6i == net->ipv6.fib6_null_entry) {
1082                 fn = fib6_backtrack(fn, &fl6->saddr);
1083                 if (fn)
1084                         goto restart;
1085         }
1086
1087         trace_fib6_table_lookup(net, f6i, table, fl6);
1088
1089         /* Search through exception table */
1090         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1091         if (rt) {
1092                 if (ip6_hold_safe(net, &rt, true))
1093                         dst_use_noref(&rt->dst, jiffies);
1094         } else if (f6i == net->ipv6.fib6_null_entry) {
1095                 rt = net->ipv6.ip6_null_entry;
1096                 dst_hold(&rt->dst);
1097         } else {
1098                 rt = ip6_create_rt_rcu(f6i);
1099                 if (!rt) {
1100                         rt = net->ipv6.ip6_null_entry;
1101                         dst_hold(&rt->dst);
1102                 }
1103         }
1104
1105         rcu_read_unlock();
1106
1107         return rt;
1108 }
1109
1110 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1111                                    const struct sk_buff *skb, int flags)
1112 {
1113         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1114 }
1115 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1116
1117 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1118                             const struct in6_addr *saddr, int oif,
1119                             const struct sk_buff *skb, int strict)
1120 {
1121         struct flowi6 fl6 = {
1122                 .flowi6_oif = oif,
1123                 .daddr = *daddr,
1124         };
1125         struct dst_entry *dst;
1126         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1127
1128         if (saddr) {
1129                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1130                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1131         }
1132
1133         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1134         if (dst->error == 0)
1135                 return (struct rt6_info *) dst;
1136
1137         dst_release(dst);
1138
1139         return NULL;
1140 }
1141 EXPORT_SYMBOL(rt6_lookup);
1142
1143 /* ip6_ins_rt is called with FREE table->tb6_lock.
1144  * It takes new route entry, the addition fails by any reason the
1145  * route is released.
1146  * Caller must hold dst before calling it.
1147  */
1148
1149 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1150                         struct netlink_ext_ack *extack)
1151 {
1152         int err;
1153         struct fib6_table *table;
1154
1155         table = rt->fib6_table;
1156         spin_lock_bh(&table->tb6_lock);
1157         err = fib6_add(&table->tb6_root, rt, info, extack);
1158         spin_unlock_bh(&table->tb6_lock);
1159
1160         return err;
1161 }
1162
1163 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1164 {
1165         struct nl_info info = { .nl_net = net, };
1166
1167         return __ip6_ins_rt(rt, &info, NULL);
1168 }
1169
1170 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1171                                            const struct in6_addr *daddr,
1172                                            const struct in6_addr *saddr)
1173 {
1174         struct net_device *dev;
1175         struct rt6_info *rt;
1176
1177         /*
1178          *      Clone the route.
1179          */
1180
1181         if (!fib6_info_hold_safe(ort))
1182                 return NULL;
1183
1184         dev = ip6_rt_get_dev_rcu(ort);
1185         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1186         if (!rt) {
1187                 fib6_info_release(ort);
1188                 return NULL;
1189         }
1190
1191         ip6_rt_copy_init(rt, ort);
1192         rt->rt6i_flags |= RTF_CACHE;
1193         rt->dst.flags |= DST_HOST;
1194         rt->rt6i_dst.addr = *daddr;
1195         rt->rt6i_dst.plen = 128;
1196
1197         if (!rt6_is_gw_or_nonexthop(ort)) {
1198                 if (ort->fib6_dst.plen != 128 &&
1199                     ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1200                         rt->rt6i_flags |= RTF_ANYCAST;
1201 #ifdef CONFIG_IPV6_SUBTREES
1202                 if (rt->rt6i_src.plen && saddr) {
1203                         rt->rt6i_src.addr = *saddr;
1204                         rt->rt6i_src.plen = 128;
1205                 }
1206 #endif
1207         }
1208
1209         return rt;
1210 }
1211
1212 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1213 {
1214         unsigned short flags = fib6_info_dst_flags(rt);
1215         struct net_device *dev;
1216         struct rt6_info *pcpu_rt;
1217
1218         if (!fib6_info_hold_safe(rt))
1219                 return NULL;
1220
1221         rcu_read_lock();
1222         dev = ip6_rt_get_dev_rcu(rt);
1223         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1224         rcu_read_unlock();
1225         if (!pcpu_rt) {
1226                 fib6_info_release(rt);
1227                 return NULL;
1228         }
1229         ip6_rt_copy_init(pcpu_rt, rt);
1230         pcpu_rt->rt6i_flags |= RTF_PCPU;
1231         return pcpu_rt;
1232 }
1233
1234 /* It should be called with rcu_read_lock() acquired */
1235 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1236 {
1237         struct rt6_info *pcpu_rt, **p;
1238
1239         p = this_cpu_ptr(rt->rt6i_pcpu);
1240         pcpu_rt = *p;
1241
1242         if (pcpu_rt)
1243                 ip6_hold_safe(NULL, &pcpu_rt, false);
1244
1245         return pcpu_rt;
1246 }
1247
1248 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1249                                             struct fib6_info *rt)
1250 {
1251         struct rt6_info *pcpu_rt, *prev, **p;
1252
1253         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1254         if (!pcpu_rt) {
1255                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1256                 return net->ipv6.ip6_null_entry;
1257         }
1258
1259         dst_hold(&pcpu_rt->dst);
1260         p = this_cpu_ptr(rt->rt6i_pcpu);
1261         prev = cmpxchg(p, NULL, pcpu_rt);
1262         BUG_ON(prev);
1263
1264         return pcpu_rt;
1265 }
1266
/* Exception hash table implementation.
 *
 * rt6_exception_lock is the single spinlock taken by the functions in
 * this file that insert into, remove from or flush the per-route
 * exception buckets.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1270
1271 /* Remove rt6_ex from hash table and free the memory
1272  * Caller must hold rt6_exception_lock
1273  */
1274 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1275                                  struct rt6_exception *rt6_ex)
1276 {
1277         struct net *net;
1278
1279         if (!bucket || !rt6_ex)
1280                 return;
1281
1282         net = dev_net(rt6_ex->rt6i->dst.dev);
1283         hlist_del_rcu(&rt6_ex->hlist);
1284         dst_release(&rt6_ex->rt6i->dst);
1285         kfree_rcu(rt6_ex, rcu);
1286         WARN_ON_ONCE(!bucket->depth);
1287         bucket->depth--;
1288         net->ipv6.rt6_stats->fib_rt_cache--;
1289 }
1290
1291 /* Remove oldest rt6_ex in bucket and free the memory
1292  * Caller must hold rt6_exception_lock
1293  */
1294 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1295 {
1296         struct rt6_exception *rt6_ex, *oldest = NULL;
1297
1298         if (!bucket)
1299                 return;
1300
1301         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1302                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1303                         oldest = rt6_ex;
1304         }
1305         rt6_remove_exception(bucket, oldest);
1306 }
1307
1308 static u32 rt6_exception_hash(const struct in6_addr *dst,
1309                               const struct in6_addr *src)
1310 {
1311         static u32 seed __read_mostly;
1312         u32 val;
1313
1314         net_get_random_once(&seed, sizeof(seed));
1315         val = jhash(dst, sizeof(*dst), seed);
1316
1317 #ifdef CONFIG_IPV6_SUBTREES
1318         if (src)
1319                 val = jhash(src, sizeof(*src), val);
1320 #endif
1321         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1322 }
1323
1324 /* Helper function to find the cached rt in the hash table
1325  * and update bucket pointer to point to the bucket for this
1326  * (daddr, saddr) pair
1327  * Caller must hold rt6_exception_lock
1328  */
1329 static struct rt6_exception *
1330 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1331                               const struct in6_addr *daddr,
1332                               const struct in6_addr *saddr)
1333 {
1334         struct rt6_exception *rt6_ex;
1335         u32 hval;
1336
1337         if (!(*bucket) || !daddr)
1338                 return NULL;
1339
1340         hval = rt6_exception_hash(daddr, saddr);
1341         *bucket += hval;
1342
1343         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1344                 struct rt6_info *rt6 = rt6_ex->rt6i;
1345                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1346
1347 #ifdef CONFIG_IPV6_SUBTREES
1348                 if (matched && saddr)
1349                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1350 #endif
1351                 if (matched)
1352                         return rt6_ex;
1353         }
1354         return NULL;
1355 }
1356
1357 /* Helper function to find the cached rt in the hash table
1358  * and update bucket pointer to point to the bucket for this
1359  * (daddr, saddr) pair
1360  * Caller must hold rcu_read_lock()
1361  */
1362 static struct rt6_exception *
1363 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1364                          const struct in6_addr *daddr,
1365                          const struct in6_addr *saddr)
1366 {
1367         struct rt6_exception *rt6_ex;
1368         u32 hval;
1369
1370         WARN_ON_ONCE(!rcu_read_lock_held());
1371
1372         if (!(*bucket) || !daddr)
1373                 return NULL;
1374
1375         hval = rt6_exception_hash(daddr, saddr);
1376         *bucket += hval;
1377
1378         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1379                 struct rt6_info *rt6 = rt6_ex->rt6i;
1380                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1381
1382 #ifdef CONFIG_IPV6_SUBTREES
1383                 if (matched && saddr)
1384                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1385 #endif
1386                 if (matched)
1387                         return rt6_ex;
1388         }
1389         return NULL;
1390 }
1391
1392 static unsigned int fib6_mtu(const struct fib6_info *rt)
1393 {
1394         unsigned int mtu;
1395
1396         if (rt->fib6_pmtu) {
1397                 mtu = rt->fib6_pmtu;
1398         } else {
1399                 struct net_device *dev = fib6_info_nh_dev(rt);
1400                 struct inet6_dev *idev;
1401
1402                 rcu_read_lock();
1403                 idev = __in6_dev_get(dev);
1404                 mtu = idev->cnf.mtu6;
1405                 rcu_read_unlock();
1406         }
1407
1408         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1409
1410         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1411 }
1412
1413 static int rt6_insert_exception(struct rt6_info *nrt,
1414                                 struct fib6_info *ort)
1415 {
1416         struct net *net = dev_net(nrt->dst.dev);
1417         struct rt6_exception_bucket *bucket;
1418         struct in6_addr *src_key = NULL;
1419         struct rt6_exception *rt6_ex;
1420         int err = 0;
1421
1422         spin_lock_bh(&rt6_exception_lock);
1423
1424         if (ort->exception_bucket_flushed) {
1425                 err = -EINVAL;
1426                 goto out;
1427         }
1428
1429         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1430                                         lockdep_is_held(&rt6_exception_lock));
1431         if (!bucket) {
1432                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1433                                  GFP_ATOMIC);
1434                 if (!bucket) {
1435                         err = -ENOMEM;
1436                         goto out;
1437                 }
1438                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1439         }
1440
1441 #ifdef CONFIG_IPV6_SUBTREES
1442         /* rt6i_src.plen != 0 indicates ort is in subtree
1443          * and exception table is indexed by a hash of
1444          * both rt6i_dst and rt6i_src.
1445          * Otherwise, the exception table is indexed by
1446          * a hash of only rt6i_dst.
1447          */
1448         if (ort->fib6_src.plen)
1449                 src_key = &nrt->rt6i_src.addr;
1450 #endif
1451         /* rt6_mtu_change() might lower mtu on ort.
1452          * Only insert this exception route if its mtu
1453          * is less than ort's mtu value.
1454          */
1455         if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1456                 err = -EINVAL;
1457                 goto out;
1458         }
1459
1460         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1461                                                src_key);
1462         if (rt6_ex)
1463                 rt6_remove_exception(bucket, rt6_ex);
1464
1465         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1466         if (!rt6_ex) {
1467                 err = -ENOMEM;
1468                 goto out;
1469         }
1470         rt6_ex->rt6i = nrt;
1471         rt6_ex->stamp = jiffies;
1472         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1473         bucket->depth++;
1474         net->ipv6.rt6_stats->fib_rt_cache++;
1475
1476         if (bucket->depth > FIB6_MAX_DEPTH)
1477                 rt6_exception_remove_oldest(bucket);
1478
1479 out:
1480         spin_unlock_bh(&rt6_exception_lock);
1481
1482         /* Update fn->fn_sernum to invalidate all cached dst */
1483         if (!err) {
1484                 spin_lock_bh(&ort->fib6_table->tb6_lock);
1485                 fib6_update_sernum(net, ort);
1486                 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1487                 fib6_force_start_gc(net);
1488         }
1489
1490         return err;
1491 }
1492
1493 void rt6_flush_exceptions(struct fib6_info *rt)
1494 {
1495         struct rt6_exception_bucket *bucket;
1496         struct rt6_exception *rt6_ex;
1497         struct hlist_node *tmp;
1498         int i;
1499
1500         spin_lock_bh(&rt6_exception_lock);
1501         /* Prevent rt6_insert_exception() to recreate the bucket list */
1502         rt->exception_bucket_flushed = 1;
1503
1504         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1505                                     lockdep_is_held(&rt6_exception_lock));
1506         if (!bucket)
1507                 goto out;
1508
1509         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1510                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1511                         rt6_remove_exception(bucket, rt6_ex);
1512                 WARN_ON_ONCE(bucket->depth);
1513                 bucket++;
1514         }
1515
1516 out:
1517         spin_unlock_bh(&rt6_exception_lock);
1518 }
1519
1520 /* Find cached rt in the hash table inside passed in rt
1521  * Caller has to hold rcu_read_lock()
1522  */
1523 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1524                                            struct in6_addr *daddr,
1525                                            struct in6_addr *saddr)
1526 {
1527         struct rt6_exception_bucket *bucket;
1528         struct in6_addr *src_key = NULL;
1529         struct rt6_exception *rt6_ex;
1530         struct rt6_info *res = NULL;
1531
1532         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1533
1534 #ifdef CONFIG_IPV6_SUBTREES
1535         /* rt6i_src.plen != 0 indicates rt is in subtree
1536          * and exception table is indexed by a hash of
1537          * both rt6i_dst and rt6i_src.
1538          * Otherwise, the exception table is indexed by
1539          * a hash of only rt6i_dst.
1540          */
1541         if (rt->fib6_src.plen)
1542                 src_key = saddr;
1543 #endif
1544         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1545
1546         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1547                 res = rt6_ex->rt6i;
1548
1549         return res;
1550 }
1551
1552 /* Remove the passed in cached rt from the hash table that contains it */
1553 static int rt6_remove_exception_rt(struct rt6_info *rt)
1554 {
1555         struct rt6_exception_bucket *bucket;
1556         struct in6_addr *src_key = NULL;
1557         struct rt6_exception *rt6_ex;
1558         struct fib6_info *from;
1559         int err;
1560
1561         from = rcu_dereference(rt->from);
1562         if (!from ||
1563             !(rt->rt6i_flags & RTF_CACHE))
1564                 return -EINVAL;
1565
1566         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1567                 return -ENOENT;
1568
1569         spin_lock_bh(&rt6_exception_lock);
1570         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1571                                     lockdep_is_held(&rt6_exception_lock));
1572 #ifdef CONFIG_IPV6_SUBTREES
1573         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1574          * and exception table is indexed by a hash of
1575          * both rt6i_dst and rt6i_src.
1576          * Otherwise, the exception table is indexed by
1577          * a hash of only rt6i_dst.
1578          */
1579         if (from->fib6_src.plen)
1580                 src_key = &rt->rt6i_src.addr;
1581 #endif
1582         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1583                                                &rt->rt6i_dst.addr,
1584                                                src_key);
1585         if (rt6_ex) {
1586                 rt6_remove_exception(bucket, rt6_ex);
1587                 err = 0;
1588         } else {
1589                 err = -ENOENT;
1590         }
1591
1592         spin_unlock_bh(&rt6_exception_lock);
1593         return err;
1594 }
1595
1596 /* Find rt6_ex which contains the passed in rt cache and
1597  * refresh its stamp
1598  */
1599 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1600 {
1601         struct rt6_exception_bucket *bucket;
1602         struct fib6_info *from = rt->from;
1603         struct in6_addr *src_key = NULL;
1604         struct rt6_exception *rt6_ex;
1605
1606         if (!from ||
1607             !(rt->rt6i_flags & RTF_CACHE))
1608                 return;
1609
1610         rcu_read_lock();
1611         bucket = rcu_dereference(from->rt6i_exception_bucket);
1612
1613 #ifdef CONFIG_IPV6_SUBTREES
1614         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1615          * and exception table is indexed by a hash of
1616          * both rt6i_dst and rt6i_src.
1617          * Otherwise, the exception table is indexed by
1618          * a hash of only rt6i_dst.
1619          */
1620         if (from->fib6_src.plen)
1621                 src_key = &rt->rt6i_src.addr;
1622 #endif
1623         rt6_ex = __rt6_find_exception_rcu(&bucket,
1624                                           &rt->rt6i_dst.addr,
1625                                           src_key);
1626         if (rt6_ex)
1627                 rt6_ex->stamp = jiffies;
1628
1629         rcu_read_unlock();
1630 }
1631
1632 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1633                                          struct rt6_info *rt, int mtu)
1634 {
1635         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1636          * lowest MTU in the path: always allow updating the route PMTU to
1637          * reflect PMTU decreases.
1638          *
1639          * If the new MTU is higher, and the route PMTU is equal to the local
1640          * MTU, this means the old MTU is the lowest in the path, so allow
1641          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1642          * handle this.
1643          */
1644
1645         if (dst_mtu(&rt->dst) >= mtu)
1646                 return true;
1647
1648         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1649                 return true;
1650
1651         return false;
1652 }
1653
1654 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1655                                        struct fib6_info *rt, int mtu)
1656 {
1657         struct rt6_exception_bucket *bucket;
1658         struct rt6_exception *rt6_ex;
1659         int i;
1660
1661         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1662                                         lockdep_is_held(&rt6_exception_lock));
1663
1664         if (!bucket)
1665                 return;
1666
1667         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1668                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1669                         struct rt6_info *entry = rt6_ex->rt6i;
1670
1671                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1672                          * route), the metrics of its rt->from have already
1673                          * been updated.
1674                          */
1675                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1676                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1677                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1678                 }
1679                 bucket++;
1680         }
1681 }
1682
1683 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1684
1685 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1686                                         struct in6_addr *gateway)
1687 {
1688         struct rt6_exception_bucket *bucket;
1689         struct rt6_exception *rt6_ex;
1690         struct hlist_node *tmp;
1691         int i;
1692
1693         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1694                 return;
1695
1696         spin_lock_bh(&rt6_exception_lock);
1697         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1698                                      lockdep_is_held(&rt6_exception_lock));
1699
1700         if (bucket) {
1701                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1702                         hlist_for_each_entry_safe(rt6_ex, tmp,
1703                                                   &bucket->chain, hlist) {
1704                                 struct rt6_info *entry = rt6_ex->rt6i;
1705
1706                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1707                                     RTF_CACHE_GATEWAY &&
1708                                     ipv6_addr_equal(gateway,
1709                                                     &entry->rt6i_gateway)) {
1710                                         rt6_remove_exception(bucket, rt6_ex);
1711                                 }
1712                         }
1713                         bucket++;
1714                 }
1715         }
1716
1717         spin_unlock_bh(&rt6_exception_lock);
1718 }
1719
1720 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1721                                       struct rt6_exception *rt6_ex,
1722                                       struct fib6_gc_args *gc_args,
1723                                       unsigned long now)
1724 {
1725         struct rt6_info *rt = rt6_ex->rt6i;
1726
1727         /* we are pruning and obsoleting aged-out and non gateway exceptions
1728          * even if others have still references to them, so that on next
1729          * dst_check() such references can be dropped.
1730          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1731          * expired, independently from their aging, as per RFC 8201 section 4
1732          */
1733         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1734                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1735                         RT6_TRACE("aging clone %p\n", rt);
1736                         rt6_remove_exception(bucket, rt6_ex);
1737                         return;
1738                 }
1739         } else if (time_after(jiffies, rt->dst.expires)) {
1740                 RT6_TRACE("purging expired route %p\n", rt);
1741                 rt6_remove_exception(bucket, rt6_ex);
1742                 return;
1743         }
1744
1745         if (rt->rt6i_flags & RTF_GATEWAY) {
1746                 struct neighbour *neigh;
1747                 __u8 neigh_flags = 0;
1748
1749                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1750                 if (neigh)
1751                         neigh_flags = neigh->flags;
1752
1753                 if (!(neigh_flags & NTF_ROUTER)) {
1754                         RT6_TRACE("purging route %p via non-router but gateway\n",
1755                                   rt);
1756                         rt6_remove_exception(bucket, rt6_ex);
1757                         return;
1758                 }
1759         }
1760
1761         gc_args->more++;
1762 }
1763
1764 void rt6_age_exceptions(struct fib6_info *rt,
1765                         struct fib6_gc_args *gc_args,
1766                         unsigned long now)
1767 {
1768         struct rt6_exception_bucket *bucket;
1769         struct rt6_exception *rt6_ex;
1770         struct hlist_node *tmp;
1771         int i;
1772
1773         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1774                 return;
1775
1776         rcu_read_lock_bh();
1777         spin_lock(&rt6_exception_lock);
1778         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1779                                     lockdep_is_held(&rt6_exception_lock));
1780
1781         if (bucket) {
1782                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1783                         hlist_for_each_entry_safe(rt6_ex, tmp,
1784                                                   &bucket->chain, hlist) {
1785                                 rt6_age_examine_exception(bucket, rt6_ex,
1786                                                           gc_args, now);
1787                         }
1788                         bucket++;
1789                 }
1790         }
1791         spin_unlock(&rt6_exception_lock);
1792         rcu_read_unlock_bh();
1793 }
1794
1795 /* must be called with rcu lock held */
1796 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1797                                     int oif, struct flowi6 *fl6, int strict)
1798 {
1799         struct fib6_node *fn, *saved_fn;
1800         struct fib6_info *f6i;
1801
1802         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1803         saved_fn = fn;
1804
1805         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1806                 oif = 0;
1807
1808 redo_rt6_select:
1809         f6i = rt6_select(net, fn, oif, strict);
1810         if (f6i == net->ipv6.fib6_null_entry) {
1811                 fn = fib6_backtrack(fn, &fl6->saddr);
1812                 if (fn)
1813                         goto redo_rt6_select;
1814                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1815                         /* also consider unreachable route */
1816                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1817                         fn = saved_fn;
1818                         goto redo_rt6_select;
1819                 }
1820         }
1821
1822         trace_fib6_table_lookup(net, f6i, table, fl6);
1823
1824         return f6i;
1825 }
1826
1827 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1828                                int oif, struct flowi6 *fl6,
1829                                const struct sk_buff *skb, int flags)
1830 {
1831         struct fib6_info *f6i;
1832         struct rt6_info *rt;
1833         int strict = 0;
1834
1835         strict |= flags & RT6_LOOKUP_F_IFACE;
1836         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1837         if (net->ipv6.devconf_all->forwarding == 0)
1838                 strict |= RT6_LOOKUP_F_REACHABLE;
1839
1840         rcu_read_lock();
1841
1842         f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1843         if (f6i->fib6_nsiblings)
1844                 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1845
1846         if (f6i == net->ipv6.fib6_null_entry) {
1847                 rt = net->ipv6.ip6_null_entry;
1848                 rcu_read_unlock();
1849                 dst_hold(&rt->dst);
1850                 return rt;
1851         }
1852
1853         /*Search through exception table */
1854         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1855         if (rt) {
1856                 if (ip6_hold_safe(net, &rt, true))
1857                         dst_use_noref(&rt->dst, jiffies);
1858
1859                 rcu_read_unlock();
1860                 return rt;
1861         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1862                             !(f6i->fib6_flags & RTF_GATEWAY))) {
1863                 /* Create a RTF_CACHE clone which will not be
1864                  * owned by the fib6 tree.  It is for the special case where
1865                  * the daddr in the skb during the neighbor look-up is different
1866                  * from the fl6->daddr used to look-up route here.
1867                  */
1868                 struct rt6_info *uncached_rt;
1869
1870                 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1871
1872                 rcu_read_unlock();
1873
1874                 if (uncached_rt) {
1875                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1876                          * No need for another dst_hold()
1877                          */
1878                         rt6_uncached_list_add(uncached_rt);
1879                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1880                 } else {
1881                         uncached_rt = net->ipv6.ip6_null_entry;
1882                         dst_hold(&uncached_rt->dst);
1883                 }
1884
1885                 return uncached_rt;
1886         } else {
1887                 /* Get a percpu copy */
1888
1889                 struct rt6_info *pcpu_rt;
1890
1891                 local_bh_disable();
1892                 pcpu_rt = rt6_get_pcpu_route(f6i);
1893
1894                 if (!pcpu_rt)
1895                         pcpu_rt = rt6_make_pcpu_route(net, f6i);
1896
1897                 local_bh_enable();
1898                 rcu_read_unlock();
1899
1900                 return pcpu_rt;
1901         }
1902 }
1903 EXPORT_SYMBOL_GPL(ip6_pol_route);
1904
1905 static struct rt6_info *ip6_pol_route_input(struct net *net,
1906                                             struct fib6_table *table,
1907                                             struct flowi6 *fl6,
1908                                             const struct sk_buff *skb,
1909                                             int flags)
1910 {
1911         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1912 }
1913
1914 struct dst_entry *ip6_route_input_lookup(struct net *net,
1915                                          struct net_device *dev,
1916                                          struct flowi6 *fl6,
1917                                          const struct sk_buff *skb,
1918                                          int flags)
1919 {
1920         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1921                 flags |= RT6_LOOKUP_F_IFACE;
1922
1923         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1924 }
1925 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1926
1927 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1928                                   struct flow_keys *keys,
1929                                   struct flow_keys *flkeys)
1930 {
1931         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1932         const struct ipv6hdr *key_iph = outer_iph;
1933         struct flow_keys *_flkeys = flkeys;
1934         const struct ipv6hdr *inner_iph;
1935         const struct icmp6hdr *icmph;
1936         struct ipv6hdr _inner_iph;
1937         struct icmp6hdr _icmph;
1938
1939         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1940                 goto out;
1941
1942         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1943                                    sizeof(_icmph), &_icmph);
1944         if (!icmph)
1945                 goto out;
1946
1947         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1948             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1949             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1950             icmph->icmp6_type != ICMPV6_PARAMPROB)
1951                 goto out;
1952
1953         inner_iph = skb_header_pointer(skb,
1954                                        skb_transport_offset(skb) + sizeof(*icmph),
1955                                        sizeof(_inner_iph), &_inner_iph);
1956         if (!inner_iph)
1957                 goto out;
1958
1959         key_iph = inner_iph;
1960         _flkeys = NULL;
1961 out:
1962         if (_flkeys) {
1963                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1964                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1965                 keys->tags.flow_label = _flkeys->tags.flow_label;
1966                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1967         } else {
1968                 keys->addrs.v6addrs.src = key_iph->saddr;
1969                 keys->addrs.v6addrs.dst = key_iph->daddr;
1970                 keys->tags.flow_label = ip6_flowlabel(key_iph);
1971                 keys->basic.ip_proto = key_iph->nexthdr;
1972         }
1973 }
1974
1975 /* if skb is set it will be used and fl6 can be NULL */
1976 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1977                        const struct sk_buff *skb, struct flow_keys *flkeys)
1978 {
1979         struct flow_keys hash_keys;
1980         u32 mhash;
1981
1982         switch (ip6_multipath_hash_policy(net)) {
1983         case 0:
1984                 memset(&hash_keys, 0, sizeof(hash_keys));
1985                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1986                 if (skb) {
1987                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1988                 } else {
1989                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1990                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1991                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1992                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1993                 }
1994                 break;
1995         case 1:
1996                 if (skb) {
1997                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1998                         struct flow_keys keys;
1999
2000                         /* short-circuit if we already have L4 hash present */
2001                         if (skb->l4_hash)
2002                                 return skb_get_hash_raw(skb) >> 1;
2003
2004                         memset(&hash_keys, 0, sizeof(hash_keys));
2005
2006                         if (!flkeys) {
2007                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2008                                 flkeys = &keys;
2009                         }
2010                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2011                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2012                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2013                         hash_keys.ports.src = flkeys->ports.src;
2014                         hash_keys.ports.dst = flkeys->ports.dst;
2015                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2016                 } else {
2017                         memset(&hash_keys, 0, sizeof(hash_keys));
2018                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2019                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2020                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2021                         hash_keys.ports.src = fl6->fl6_sport;
2022                         hash_keys.ports.dst = fl6->fl6_dport;
2023                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2024                 }
2025                 break;
2026         }
2027         mhash = flow_hash_from_keys(&hash_keys);
2028
2029         return mhash >> 1;
2030 }
2031
2032 void ip6_route_input(struct sk_buff *skb)
2033 {
2034         const struct ipv6hdr *iph = ipv6_hdr(skb);
2035         struct net *net = dev_net(skb->dev);
2036         int flags = RT6_LOOKUP_F_HAS_SADDR;
2037         struct ip_tunnel_info *tun_info;
2038         struct flowi6 fl6 = {
2039                 .flowi6_iif = skb->dev->ifindex,
2040                 .daddr = iph->daddr,
2041                 .saddr = iph->saddr,
2042                 .flowlabel = ip6_flowinfo(iph),
2043                 .flowi6_mark = skb->mark,
2044                 .flowi6_proto = iph->nexthdr,
2045         };
2046         struct flow_keys *flkeys = NULL, _flkeys;
2047
2048         tun_info = skb_tunnel_info(skb);
2049         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2050                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2051
2052         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2053                 flkeys = &_flkeys;
2054
2055         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2056                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2057         skb_dst_drop(skb);
2058         skb_dst_set(skb,
2059                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2060 }
2061
2062 static struct rt6_info *ip6_pol_route_output(struct net *net,
2063                                              struct fib6_table *table,
2064                                              struct flowi6 *fl6,
2065                                              const struct sk_buff *skb,
2066                                              int flags)
2067 {
2068         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2069 }
2070
2071 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2072                                          struct flowi6 *fl6, int flags)
2073 {
2074         bool any_src;
2075
2076         if (ipv6_addr_type(&fl6->daddr) &
2077             (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2078                 struct dst_entry *dst;
2079
2080                 dst = l3mdev_link_scope_lookup(net, fl6);
2081                 if (dst)
2082                         return dst;
2083         }
2084
2085         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2086
2087         any_src = ipv6_addr_any(&fl6->saddr);
2088         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2089             (fl6->flowi6_oif && any_src))
2090                 flags |= RT6_LOOKUP_F_IFACE;
2091
2092         if (!any_src)
2093                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2094         else if (sk)
2095                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2096
2097         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2098 }
2099 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2100
2101 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2102 {
2103         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2104         struct net_device *loopback_dev = net->loopback_dev;
2105         struct dst_entry *new = NULL;
2106
2107         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2108                        DST_OBSOLETE_DEAD, 0);
2109         if (rt) {
2110                 rt6_info_init(rt);
2111                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2112
2113                 new = &rt->dst;
2114                 new->__use = 1;
2115                 new->input = dst_discard;
2116                 new->output = dst_discard_out;
2117
2118                 dst_copy_metrics(new, &ort->dst);
2119
2120                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2121                 rt->rt6i_gateway = ort->rt6i_gateway;
2122                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2123
2124                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2125 #ifdef CONFIG_IPV6_SUBTREES
2126                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2127 #endif
2128         }
2129
2130         dst_release(dst_orig);
2131         return new ? new : ERR_PTR(-ENOMEM);
2132 }
2133
2134 /*
2135  *      Destination cache support functions
2136  */
2137
2138 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2139 {
2140         u32 rt_cookie = 0;
2141
2142         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2143                 return false;
2144
2145         if (fib6_check_expired(f6i))
2146                 return false;
2147
2148         return true;
2149 }
2150
2151 static struct dst_entry *rt6_check(struct rt6_info *rt,
2152                                    struct fib6_info *from,
2153                                    u32 cookie)
2154 {
2155         u32 rt_cookie = 0;
2156
2157         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2158             rt_cookie != cookie)
2159                 return NULL;
2160
2161         if (rt6_check_expired(rt))
2162                 return NULL;
2163
2164         return &rt->dst;
2165 }
2166
2167 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2168                                             struct fib6_info *from,
2169                                             u32 cookie)
2170 {
2171         if (!__rt6_check_expired(rt) &&
2172             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2173             fib6_check(from, cookie))
2174                 return &rt->dst;
2175         else
2176                 return NULL;
2177 }
2178
2179 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2180 {
2181         struct dst_entry *dst_ret;
2182         struct fib6_info *from;
2183         struct rt6_info *rt;
2184
2185         rt = container_of(dst, struct rt6_info, dst);
2186
2187         rcu_read_lock();
2188
2189         /* All IPV6 dsts are created with ->obsolete set to the value
2190          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2191          * into this function always.
2192          */
2193
2194         from = rcu_dereference(rt->from);
2195
2196         if (from && (rt->rt6i_flags & RTF_PCPU ||
2197             unlikely(!list_empty(&rt->rt6i_uncached))))
2198                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2199         else
2200                 dst_ret = rt6_check(rt, from, cookie);
2201
2202         rcu_read_unlock();
2203
2204         return dst_ret;
2205 }
2206
2207 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2208 {
2209         struct rt6_info *rt = (struct rt6_info *) dst;
2210
2211         if (rt) {
2212                 if (rt->rt6i_flags & RTF_CACHE) {
2213                         rcu_read_lock();
2214                         if (rt6_check_expired(rt)) {
2215                                 rt6_remove_exception_rt(rt);
2216                                 dst = NULL;
2217                         }
2218                         rcu_read_unlock();
2219                 } else {
2220                         dst_release(dst);
2221                         dst = NULL;
2222                 }
2223         }
2224         return dst;
2225 }
2226
2227 static void ip6_link_failure(struct sk_buff *skb)
2228 {
2229         struct rt6_info *rt;
2230
2231         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2232
2233         rt = (struct rt6_info *) skb_dst(skb);
2234         if (rt) {
2235                 rcu_read_lock();
2236                 if (rt->rt6i_flags & RTF_CACHE) {
2237                         rt6_remove_exception_rt(rt);
2238                 } else {
2239                         struct fib6_info *from;
2240                         struct fib6_node *fn;
2241
2242                         from = rcu_dereference(rt->from);
2243                         if (from) {
2244                                 fn = rcu_dereference(from->fib6_node);
2245                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2246                                         fn->fn_sernum = -1;
2247                         }
2248                 }
2249                 rcu_read_unlock();
2250         }
2251 }
2252
2253 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2254 {
2255         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2256                 struct fib6_info *from;
2257
2258                 rcu_read_lock();
2259                 from = rcu_dereference(rt0->from);
2260                 if (from)
2261                         rt0->dst.expires = from->expires;
2262                 rcu_read_unlock();
2263         }
2264
2265         dst_set_expires(&rt0->dst, timeout);
2266         rt0->rt6i_flags |= RTF_EXPIRES;
2267 }
2268
2269 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2270 {
2271         struct net *net = dev_net(rt->dst.dev);
2272
2273         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2274         rt->rt6i_flags |= RTF_MODIFIED;
2275         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2276 }
2277
2278 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2279 {
2280         bool from_set;
2281
2282         rcu_read_lock();
2283         from_set = !!rcu_dereference(rt->from);
2284         rcu_read_unlock();
2285
2286         return !(rt->rt6i_flags & RTF_CACHE) &&
2287                 (rt->rt6i_flags & RTF_PCPU || from_set);
2288 }
2289
2290 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2291                                  const struct ipv6hdr *iph, u32 mtu)
2292 {
2293         const struct in6_addr *daddr, *saddr;
2294         struct rt6_info *rt6 = (struct rt6_info *)dst;
2295
2296         if (dst_metric_locked(dst, RTAX_MTU))
2297                 return;
2298
2299         if (iph) {
2300                 daddr = &iph->daddr;
2301                 saddr = &iph->saddr;
2302         } else if (sk) {
2303                 daddr = &sk->sk_v6_daddr;
2304                 saddr = &inet6_sk(sk)->saddr;
2305         } else {
2306                 daddr = NULL;
2307                 saddr = NULL;
2308         }
2309         dst_confirm_neigh(dst, daddr);
2310         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2311         if (mtu >= dst_mtu(dst))
2312                 return;
2313
2314         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2315                 rt6_do_update_pmtu(rt6, mtu);
2316                 /* update rt6_ex->stamp for cache */
2317                 if (rt6->rt6i_flags & RTF_CACHE)
2318                         rt6_update_exception_stamp_rt(rt6);
2319         } else if (daddr) {
2320                 struct fib6_info *from;
2321                 struct rt6_info *nrt6;
2322
2323                 rcu_read_lock();
2324                 from = rcu_dereference(rt6->from);
2325                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2326                 if (nrt6) {
2327                         rt6_do_update_pmtu(nrt6, mtu);
2328                         if (rt6_insert_exception(nrt6, from))
2329                                 dst_release_immediate(&nrt6->dst);
2330                 }
2331                 rcu_read_unlock();
2332         }
2333 }
2334
2335 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2336                                struct sk_buff *skb, u32 mtu)
2337 {
2338         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2339 }
2340
2341 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2342                      int oif, u32 mark, kuid_t uid)
2343 {
2344         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2345         struct dst_entry *dst;
2346         struct flowi6 fl6 = {
2347                 .flowi6_oif = oif,
2348                 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2349                 .daddr = iph->daddr,
2350                 .saddr = iph->saddr,
2351                 .flowlabel = ip6_flowinfo(iph),
2352                 .flowi6_uid = uid,
2353         };
2354
2355         dst = ip6_route_output(net, NULL, &fl6);
2356         if (!dst->error)
2357                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2358         dst_release(dst);
2359 }
2360 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2361
2362 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2363 {
2364         int oif = sk->sk_bound_dev_if;
2365         struct dst_entry *dst;
2366
2367         if (!oif && skb->dev)
2368                 oif = l3mdev_master_ifindex(skb->dev);
2369
2370         ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2371
2372         dst = __sk_dst_get(sk);
2373         if (!dst || !dst->obsolete ||
2374             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2375                 return;
2376
2377         bh_lock_sock(sk);
2378         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2379                 ip6_datagram_dst_update(sk, false);
2380         bh_unlock_sock(sk);
2381 }
2382 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2383
2384 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2385                            const struct flowi6 *fl6)
2386 {
2387 #ifdef CONFIG_IPV6_SUBTREES
2388         struct ipv6_pinfo *np = inet6_sk(sk);
2389 #endif
2390
2391         ip6_dst_store(sk, dst,
2392                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2393                       &sk->sk_v6_daddr : NULL,
2394 #ifdef CONFIG_IPV6_SUBTREES
2395                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2396                       &np->saddr :
2397 #endif
2398                       NULL);
2399 }
2400
2401 /* Handle redirects
 *
 * Lookup key for redirect processing: a flowi6 extended with the gateway
 * address the redirect is matched against.  fl6 has to stay the first
 * member: __ip6_route_redirect() is handed a pointer to fl6 and casts it
 * back to struct ip6rd_flowi in order to reach gateway.
 */
2402 struct ip6rd_flowi {
2403         struct flowi6 fl6;
2404         struct in6_addr gateway;
2405 };
2406
2407 static struct rt6_info *__ip6_route_redirect(struct net *net,
2408                                              struct fib6_table *table,
2409                                              struct flowi6 *fl6,
2410                                              const struct sk_buff *skb,
2411                                              int flags)
2412 {
2413         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2414         struct rt6_info *ret = NULL, *rt_cache;
2415         struct fib6_info *rt;
2416         struct fib6_node *fn;
2417
2418         /* Get the "current" route for this destination and
2419          * check if the redirect has come from appropriate router.
2420          *
2421          * RFC 4861 specifies that redirects should only be
2422          * accepted if they come from the nexthop to the target.
2423          * Due to the way the routes are chosen, this notion
2424          * is a bit fuzzy and one might need to check all possible
2425          * routes.
2426          */
2427
2428         rcu_read_lock();
2429         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2430 restart:
2431         for_each_fib6_node_rt_rcu(fn) {
2432                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2433                         continue;
2434                 if (fib6_check_expired(rt))
2435                         continue;
2436                 if (rt->fib6_flags & RTF_REJECT)
2437                         break;
2438                 if (!(rt->fib6_flags & RTF_GATEWAY))
2439                         continue;
2440                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2441                         continue;
2442                 /* rt_cache's gateway might be different from its 'parent'
2443                  * in the case of an ip redirect.
2444                  * So we keep searching in the exception table if the gateway
2445                  * is different.
2446                  */
2447                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2448                         rt_cache = rt6_find_cached_rt(rt,
2449                                                       &fl6->daddr,
2450                                                       &fl6->saddr);
2451                         if (rt_cache &&
2452                             ipv6_addr_equal(&rdfl->gateway,
2453                                             &rt_cache->rt6i_gateway)) {
2454                                 ret = rt_cache;
2455                                 break;
2456                         }
2457                         continue;
2458                 }
2459                 break;
2460         }
2461
2462         if (!rt)
2463                 rt = net->ipv6.fib6_null_entry;
2464         else if (rt->fib6_flags & RTF_REJECT) {
2465                 ret = net->ipv6.ip6_null_entry;
2466                 goto out;
2467         }
2468
2469         if (rt == net->ipv6.fib6_null_entry) {
2470                 fn = fib6_backtrack(fn, &fl6->saddr);
2471                 if (fn)
2472                         goto restart;
2473         }
2474
2475 out:
2476         if (ret)
2477                 ip6_hold_safe(net, &ret, true);
2478         else
2479                 ret = ip6_create_rt_rcu(rt);
2480
2481         rcu_read_unlock();
2482
2483         trace_fib6_table_lookup(net, rt, table, fl6);
2484         return ret;
2485 };
2486
2487 static struct dst_entry *ip6_route_redirect(struct net *net,
2488                                             const struct flowi6 *fl6,
2489                                             const struct sk_buff *skb,
2490                                             const struct in6_addr *gateway)
2491 {
2492         int flags = RT6_LOOKUP_F_HAS_SADDR;
2493         struct ip6rd_flowi rdfl;
2494
2495         rdfl.fl6 = *fl6;
2496         rdfl.gateway = *gateway;
2497
2498         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2499                                 flags, __ip6_route_redirect);
2500 }
2501
2502 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2503                   kuid_t uid)
2504 {
2505         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2506         struct dst_entry *dst;
2507         struct flowi6 fl6 = {
2508                 .flowi6_iif = LOOPBACK_IFINDEX,
2509                 .flowi6_oif = oif,
2510                 .flowi6_mark = mark,
2511                 .daddr = iph->daddr,
2512                 .saddr = iph->saddr,
2513                 .flowlabel = ip6_flowinfo(iph),
2514                 .flowi6_uid = uid,
2515         };
2516
2517         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2518         rt6_do_redirect(dst, NULL, skb);
2519         dst_release(dst);
2520 }
2521 EXPORT_SYMBOL_GPL(ip6_redirect);
2522
2523 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2524 {
2525         const struct ipv6hdr *iph = ipv6_hdr(skb);
2526         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2527         struct dst_entry *dst;
2528         struct flowi6 fl6 = {
2529                 .flowi6_iif = LOOPBACK_IFINDEX,
2530                 .flowi6_oif = oif,
2531                 .daddr = msg->dest,
2532                 .saddr = iph->daddr,
2533                 .flowi6_uid = sock_net_uid(net, NULL),
2534         };
2535
2536         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2537         rt6_do_redirect(dst, NULL, skb);
2538         dst_release(dst);
2539 }
2540
2541 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2542 {
2543         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2544                      sk->sk_uid);
2545 }
2546 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2547
2548 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2549 {
2550         struct net_device *dev = dst->dev;
2551         unsigned int mtu = dst_mtu(dst);
2552         struct net *net = dev_net(dev);
2553
2554         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2555
2556         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2557                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2558
2559         /*
2560          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2561          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2562          * IPV6_MAXPLEN is also valid and means: "any MSS,
2563          * rely only on pmtu discovery"
2564          */
2565         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2566                 mtu = IPV6_MAXPLEN;
2567         return mtu;
2568 }
2569
2570 static unsigned int ip6_mtu(const struct dst_entry *dst)
2571 {
2572         struct inet6_dev *idev;
2573         unsigned int mtu;
2574
2575         mtu = dst_metric_raw(dst, RTAX_MTU);
2576         if (mtu)
2577                 goto out;
2578
2579         mtu = IPV6_MIN_MTU;
2580
2581         rcu_read_lock();
2582         idev = __in6_dev_get(dst->dev);
2583         if (idev)
2584                 mtu = idev->cnf.mtu6;
2585         rcu_read_unlock();
2586
2587 out:
2588         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2589
2590         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2591 }
2592
2593 /* MTU selection:
2594  * 1. mtu on route is locked - use it
2595  * 2. mtu from nexthop exception
2596  * 3. mtu from egress device
2597  *
2598  * based on ip6_dst_mtu_forward and exception logic of
2599  * rt6_find_cached_rt; called with rcu_read_lock
2600  */
2601 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2602                       struct in6_addr *saddr)
2603 {
2604         struct rt6_exception_bucket *bucket;
2605         struct rt6_exception *rt6_ex;
2606         struct in6_addr *src_key;
2607         struct inet6_dev *idev;
2608         u32 mtu = 0;
2609
2610         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2611                 mtu = f6i->fib6_pmtu;
2612                 if (mtu)
2613                         goto out;
2614         }
2615
2616         src_key = NULL;
2617 #ifdef CONFIG_IPV6_SUBTREES
2618         if (f6i->fib6_src.plen)
2619                 src_key = saddr;
2620 #endif
2621
2622         bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2623         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2624         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2625                 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2626
2627         if (likely(!mtu)) {
2628                 struct net_device *dev = fib6_info_nh_dev(f6i);
2629
2630                 mtu = IPV6_MIN_MTU;
2631                 idev = __in6_dev_get(dev);
2632                 if (idev && idev->cnf.mtu6 > mtu)
2633                         mtu = idev->cnf.mtu6;
2634         }
2635
2636         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2637 out:
2638         return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2639 }
2640
2641 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2642                                   struct flowi6 *fl6)
2643 {
2644         struct dst_entry *dst;
2645         struct rt6_info *rt;
2646         struct inet6_dev *idev = in6_dev_get(dev);
2647         struct net *net = dev_net(dev);
2648
2649         if (unlikely(!idev))
2650                 return ERR_PTR(-ENODEV);
2651
2652         rt = ip6_dst_alloc(net, dev, 0);
2653         if (unlikely(!rt)) {
2654                 in6_dev_put(idev);
2655                 dst = ERR_PTR(-ENOMEM);
2656                 goto out;
2657         }
2658
2659         rt->dst.flags |= DST_HOST;
2660         rt->dst.input = ip6_input;
2661         rt->dst.output  = ip6_output;
2662         rt->rt6i_gateway  = fl6->daddr;
2663         rt->rt6i_dst.addr = fl6->daddr;
2664         rt->rt6i_dst.plen = 128;
2665         rt->rt6i_idev     = idev;
2666         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2667
2668         /* Add this dst into uncached_list so that rt6_disable_ip() can
2669          * do proper release of the net_device
2670          */
2671         rt6_uncached_list_add(rt);
2672         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2673
2674         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2675
2676 out:
2677         return dst;
2678 }
2679
2680 static int ip6_dst_gc(struct dst_ops *ops)
2681 {
2682         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2683         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2684         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2685         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2686         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2687         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2688         int entries;
2689
2690         entries = dst_entries_get_fast(ops);
2691         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2692             entries <= rt_max_size)
2693                 goto out;
2694
2695         net->ipv6.ip6_rt_gc_expire++;
2696         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2697         entries = dst_entries_get_slow(ops);
2698         if (entries < ops->gc_thresh)
2699                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2700 out:
2701         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2702         return entries > rt_max_size;
2703 }
2704
2705 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2706                                             struct fib6_config *cfg,
2707                                             const struct in6_addr *gw_addr,
2708                                             u32 tbid, int flags)
2709 {
2710         struct flowi6 fl6 = {
2711                 .flowi6_oif = cfg->fc_ifindex,
2712                 .daddr = *gw_addr,
2713                 .saddr = cfg->fc_prefsrc,
2714         };
2715         struct fib6_table *table;
2716         struct rt6_info *rt;
2717
2718         table = fib6_get_table(net, tbid);
2719         if (!table)
2720                 return NULL;
2721
2722         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2723                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2724
2725         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2726         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2727
2728         /* if table lookup failed, fall back to full lookup */
2729         if (rt == net->ipv6.ip6_null_entry) {
2730                 ip6_rt_put(rt);
2731                 rt = NULL;
2732         }
2733
2734         return rt;
2735 }
2736
2737 static int ip6_route_check_nh_onlink(struct net *net,
2738                                      struct fib6_config *cfg,
2739                                      const struct net_device *dev,
2740                                      struct netlink_ext_ack *extack)
2741 {
2742         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2743         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2744         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2745         struct rt6_info *grt;
2746         int err;
2747
2748         err = 0;
2749         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2750         if (grt) {
2751                 if (!grt->dst.error &&
2752                     /* ignore match if it is the default route */
2753                     grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) &&
2754                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2755                         NL_SET_ERR_MSG(extack,
2756                                        "Nexthop has invalid gateway or device mismatch");
2757                         err = -EINVAL;
2758                 }
2759
2760                 ip6_rt_put(grt);
2761         }
2762
2763         return err;
2764 }
2765
2766 static int ip6_route_check_nh(struct net *net,
2767                               struct fib6_config *cfg,
2768                               struct net_device **_dev,
2769                               struct inet6_dev **idev)
2770 {
2771         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2772         struct net_device *dev = _dev ? *_dev : NULL;
2773         struct rt6_info *grt = NULL;
2774         int err = -EHOSTUNREACH;
2775
2776         if (cfg->fc_table) {
2777                 int flags = RT6_LOOKUP_F_IFACE;
2778
2779                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2780                                           cfg->fc_table, flags);
2781                 if (grt) {
2782                         if (grt->rt6i_flags & RTF_GATEWAY ||
2783                             (dev && dev != grt->dst.dev)) {
2784                                 ip6_rt_put(grt);
2785                                 grt = NULL;
2786                         }
2787                 }
2788         }
2789
2790         if (!grt)
2791                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2792
2793         if (!grt)
2794                 goto out;
2795
2796         if (dev) {
2797                 if (dev != grt->dst.dev) {
2798                         ip6_rt_put(grt);
2799                         goto out;
2800                 }
2801         } else {
2802                 *_dev = dev = grt->dst.dev;
2803                 *idev = grt->rt6i_idev;
2804                 dev_hold(dev);
2805                 in6_dev_hold(grt->rt6i_idev);
2806         }
2807
2808         if (!(grt->rt6i_flags & RTF_GATEWAY))
2809                 err = 0;
2810
2811         ip6_rt_put(grt);
2812
2813 out:
2814         return err;
2815 }
2816
2817 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2818                            struct net_device **_dev, struct inet6_dev **idev,
2819                            struct netlink_ext_ack *extack)
2820 {
2821         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2822         int gwa_type = ipv6_addr_type(gw_addr);
2823         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2824         const struct net_device *dev = *_dev;
2825         bool need_addr_check = !dev;
2826         int err = -EINVAL;
2827
2828         /* if gw_addr is local we will fail to detect this in case
2829          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2830          * will return already-added prefix route via interface that
2831          * prefix route was assigned to, which might be non-loopback.
2832          */
2833         if (dev &&
2834             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2835                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2836                 goto out;
2837         }
2838
2839         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2840                 /* IPv6 strictly inhibits using not link-local
2841                  * addresses as nexthop address.
2842                  * Otherwise, router will not able to send redirects.
2843                  * It is very good, but in some (rare!) circumstances
2844                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2845                  * some exceptions. --ANK
2846                  * We allow IPv4-mapped nexthops to support RFC4798-type
2847                  * addressing
2848                  */
2849                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2850                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2851                         goto out;
2852                 }
2853
2854                 if (cfg->fc_flags & RTNH_F_ONLINK)
2855                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2856                 else
2857                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2858
2859                 if (err)
2860                         goto out;
2861         }
2862
2863         /* reload in case device was changed */
2864         dev = *_dev;
2865
2866         err = -EINVAL;
2867         if (!dev) {
2868                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2869                 goto out;
2870         } else if (dev->flags & IFF_LOOPBACK) {
2871                 NL_SET_ERR_MSG(extack,
2872                                "Egress device can not be loopback device for this route");
2873                 goto out;
2874         }
2875
2876         /* if we did not check gw_addr above, do so now that the
2877          * egress device has been resolved.
2878          */
2879         if (need_addr_check &&
2880             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2881                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2882                 goto out;
2883         }
2884
2885         err = 0;
2886 out:
2887         return err;
2888 }
2889
/*
 * Build a fib6_info from a route configuration without inserting it.
 *
 * @cfg:       route request; fc_metric and fc_protocol are overwritten with
 *             their defaults (IP6_RT_PRIO_USER, RTPROT_BOOT) when unset
 * @gfp_flags: allocation flags passed to fib6_info_alloc()
 * @extack:    receives a message for most validation failures
 *
 * The request is validated (internal flags, route type, prefix lengths,
 * ONLINK requirements), the FIB table is looked up or created, and a new
 * fib6_info is allocated and filled in from @cfg.  Reject routes, and
 * non-local routes through a loopback device, are turned into reject routes
 * bound to the namespace's loopback device.  Gateway routes are validated
 * with ip6_validate_gw(), which may also select the egress device.
 *
 * On success the new fib6_info is returned; the reference held on the
 * egress device is kept and stored in fib6_nh.nh_dev, while the inet6_dev
 * reference is dropped.  On failure every reference taken here is dropped,
 * the partially built fib6_info is released and ERR_PTR(err) is returned.
 */
2890 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2891                                               gfp_t gfp_flags,
2892                                               struct netlink_ext_ack *extack)
2893 {
2894         struct net *net = cfg->fc_nlinfo.nl_net;
2895         struct fib6_info *rt = NULL;
2896         struct net_device *dev = NULL;
2897         struct inet6_dev *idev = NULL;
2898         struct fib6_table *table;
2899         int addr_type;
2900         int err = -EINVAL;
2901
2902         /* RTF_PCPU is an internal flag; can not be set by userspace */
2903         if (cfg->fc_flags & RTF_PCPU) {
2904                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2905                 goto out;
2906         }
2907
2908         /* RTF_CACHE is an internal flag; can not be set by userspace */
2909         if (cfg->fc_flags & RTF_CACHE) {
2910                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2911                 goto out;
2912         }
2913
2914         if (cfg->fc_type > RTN_MAX) {
2915                 NL_SET_ERR_MSG(extack, "Invalid route type");
2916                 goto out;
2917         }
2918
2919         if (cfg->fc_dst_len > 128) {
2920                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2921                 goto out;
2922         }
2923         if (cfg->fc_src_len > 128) {
2924                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2925                 goto out;
2926         }
2927 #ifndef CONFIG_IPV6_SUBTREES
2928         if (cfg->fc_src_len) {
2929                 NL_SET_ERR_MSG(extack,
2930                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2931                 goto out;
2932         }
2933 #endif
             /* An explicit ifindex pins the egress device: take a reference
              * on both the device and its inet6_dev (both dropped at "out"
              * on failure).
              */
2934         if (cfg->fc_ifindex) {
2935                 err = -ENODEV;
2936                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2937                 if (!dev)
2938                         goto out;
2939                 idev = in6_dev_get(dev);
2940                 if (!idev)
2941                         goto out;
2942         }
2943
2944         if (cfg->fc_metric == 0)
2945                 cfg->fc_metric = IP6_RT_PRIO_USER;
2946
2947         if (cfg->fc_flags & RTNH_F_ONLINK) {
2948                 if (!dev) {
2949                         NL_SET_ERR_MSG(extack,
2950                                        "Nexthop device required for onlink");
2951                         err = -ENODEV;
2952                         goto out;
2953                 }
2954
2955                 if (!(dev->flags & IFF_UP)) {
2956                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2957                         err = -ENETDOWN;
2958                         goto out;
2959                 }
2960         }
2961
             /* -ENOBUFS is what gets reported if no table can be obtained.
              * A request without NLM_F_CREATE first tries an existing table
              * but, after a warning, still creates it when it is missing.
              */
2962         err = -ENOBUFS;
2963         if (cfg->fc_nlinfo.nlh &&
2964             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2965                 table = fib6_get_table(net, cfg->fc_table);
2966                 if (!table) {
2967                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2968                         table = fib6_new_table(net, cfg->fc_table);
2969                 }
2970         } else {
2971                 table = fib6_new_table(net, cfg->fc_table);
2972         }
2973
2974         if (!table)
2975                 goto out;
2976
2977         err = -ENOMEM;
2978         rt = fib6_info_alloc(gfp_flags);
2979         if (!rt)
2980                 goto out;
2981
2982         rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
2983                                                extack);
2984         if (IS_ERR(rt->fib6_metrics)) {
2985                 err = PTR_ERR(rt->fib6_metrics);
2986                 /* Do not leave garbage there. */
2987                 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
2988                 goto out;
2989         }
2990
2991         if (cfg->fc_flags & RTF_ADDRCONF)
2992                 rt->dst_nocount = true;
2993
2994         if (cfg->fc_flags & RTF_EXPIRES)
2995                 fib6_set_expires(rt, jiffies +
2996                                 clock_t_to_jiffies(cfg->fc_expires));
2997         else
2998                 fib6_clean_expires(rt);
2999
3000         if (cfg->fc_protocol == RTPROT_UNSPEC)
3001                 cfg->fc_protocol = RTPROT_BOOT;
3002         rt->fib6_protocol = cfg->fc_protocol;
3003
3004         addr_type = ipv6_addr_type(&cfg->fc_dst);
3005
3006         if (cfg->fc_encap) {
3007                 struct lwtunnel_state *lwtstate;
3008
3009                 err = lwtunnel_build_state(cfg->fc_encap_type,
3010                                            cfg->fc_encap, AF_INET6, cfg,
3011                                            &lwtstate, extack);
3012                 if (err)
3013                         goto out;
3014                 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3015         }
3016
3017         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3018         rt->fib6_dst.plen = cfg->fc_dst_len;
3019         if (rt->fib6_dst.plen == 128)
3020                 rt->dst_host = true;
3021
3022 #ifdef CONFIG_IPV6_SUBTREES
3023         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3024         rt->fib6_src.plen = cfg->fc_src_len;
3025 #endif
3026
3027         rt->fib6_metric = cfg->fc_metric;
3028         rt->fib6_nh.nh_weight = 1;
3029
3030         rt->fib6_type = cfg->fc_type;
3031
3032         /* We cannot add true routes via loopback here,
3033            they would result in kernel looping; promote them to reject routes
3034          */
3035         if ((cfg->fc_flags & RTF_REJECT) ||
3036             (dev && (dev->flags & IFF_LOOPBACK) &&
3037              !(addr_type & IPV6_ADDR_LOOPBACK) &&
3038              !(cfg->fc_flags & RTF_LOCAL))) {
3039                 /* hold loopback dev/idev if we haven't done so. */
3040                 if (dev != net->loopback_dev) {
3041                         if (dev) {
3042                                 dev_put(dev);
3043                                 in6_dev_put(idev);
3044                         }
3045                         dev = net->loopback_dev;
3046                         dev_hold(dev);
3047                         idev = in6_dev_get(dev);
3048                         if (!idev) {
3049                                 err = -ENODEV;
3050                                 goto out;
3051                         }
3052                 }
3053                 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3054                 goto install_route;
3055         }
3056
3057         if (cfg->fc_flags & RTF_GATEWAY) {
                     /* dev/idev are passed by address: the validation may
                      * fill them in when no ifindex was given.
                      */
3058                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3059                 if (err)
3060                         goto out;
3061
3062                 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3063         }
3064
3065         err = -ENODEV;
3066         if (!dev)
3067                 goto out;
3068
3069         if (idev->cnf.disable_ipv6) {
3070                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3071                 err = -EACCES;
3072                 goto out;
3073         }
3074
3075         if (!(dev->flags & IFF_UP)) {
3076                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3077                 err = -ENETDOWN;
3078                 goto out;
3079         }
3080
3081         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3082                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3083                         NL_SET_ERR_MSG(extack, "Invalid source address");
3084                         err = -EINVAL;
3085                         goto out;
3086                 }
3087                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3088                 rt->fib6_prefsrc.plen = 128;
3089         } else
3090                 rt->fib6_prefsrc.plen = 0;
3091
3092         rt->fib6_flags = cfg->fc_flags;
3093
             /* Success path: the device reference is not dropped here, the
              * pointer is stored in nh_dev.  Only the inet6_dev reference is
              * released.
              */
3094 install_route:
3095         if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3096             !netif_carrier_ok(dev))
3097                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3098         rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3099         rt->fib6_nh.nh_dev = dev;
3100         rt->fib6_table = table;
3101
3102         if (idev)
3103                 in6_dev_put(idev);
3104
3105         return rt;
             /* Failure path: rt may still be NULL here when an early check
              * failed before the allocation.
              */
3106 out:
3107         if (dev)
3108                 dev_put(dev);
3109         if (idev)
3110                 in6_dev_put(idev);
3111
3112         fib6_info_release(rt);
3113         return ERR_PTR(err);
3114 }
3115
3116 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3117                   struct netlink_ext_ack *extack)
3118 {
3119         struct fib6_info *rt;
3120         int err;
3121
3122         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3123         if (IS_ERR(rt))
3124                 return PTR_ERR(rt);
3125
3126         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3127         fib6_info_release(rt);
3128
3129         return err;
3130 }
3131
3132 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3133 {
3134         struct net *net = info->nl_net;
3135         struct fib6_table *table;
3136         int err;
3137
3138         if (rt == net->ipv6.fib6_null_entry) {
3139                 err = -ENOENT;
3140                 goto out;
3141         }
3142
3143         table = rt->fib6_table;
3144         spin_lock_bh(&table->tb6_lock);
3145         err = fib6_del(rt, info);
3146         spin_unlock_bh(&table->tb6_lock);
3147
3148 out:
3149         fib6_info_release(rt);
3150         return err;
3151 }
3152
3153 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3154 {
3155         struct nl_info info = { .nl_net = net };
3156
3157         return __ip6_del_rt(rt, &info);
3158 }
3159
3160 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3161 {
3162         struct nl_info *info = &cfg->fc_nlinfo;
3163         struct net *net = info->nl_net;
3164         struct sk_buff *skb = NULL;
3165         struct fib6_table *table;
3166         int err = -ENOENT;
3167
3168         if (rt == net->ipv6.fib6_null_entry)
3169                 goto out_put;
3170         table = rt->fib6_table;
3171         spin_lock_bh(&table->tb6_lock);
3172
3173         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3174                 struct fib6_info *sibling, *next_sibling;
3175
3176                 /* prefer to send a single notification with all hops */
3177                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3178                 if (skb) {
3179                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3180
3181                         if (rt6_fill_node(net, skb, rt, NULL,
3182                                           NULL, NULL, 0, RTM_DELROUTE,
3183                                           info->portid, seq, 0) < 0) {
3184                                 kfree_skb(skb);
3185                                 skb = NULL;
3186                         } else
3187                                 info->skip_notify = 1;
3188                 }
3189
3190                 list_for_each_entry_safe(sibling, next_sibling,
3191                                          &rt->fib6_siblings,
3192                                          fib6_siblings) {
3193                         err = fib6_del(sibling, info);
3194                         if (err)
3195                                 goto out_unlock;
3196                 }
3197         }
3198
3199         err = fib6_del(rt, info);
3200 out_unlock:
3201         spin_unlock_bh(&table->tb6_lock);
3202 out_put:
3203         fib6_info_release(rt);
3204
3205         if (skb) {
3206                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3207                             info->nlh, gfp_any());
3208         }
3209         return err;
3210 }
3211
3212 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3213 {
3214         int rc = -ESRCH;
3215
3216         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3217                 goto out;
3218
3219         if (cfg->fc_flags & RTF_GATEWAY &&
3220             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3221                 goto out;
3222
3223         rc = rt6_remove_exception_rt(rt);
3224 out:
3225         return rc;
3226 }
3227
3228 static int ip6_route_del(struct fib6_config *cfg,
3229                          struct netlink_ext_ack *extack)
3230 {
3231         struct rt6_info *rt_cache;
3232         struct fib6_table *table;
3233         struct fib6_info *rt;
3234         struct fib6_node *fn;
3235         int err = -ESRCH;
3236
3237         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3238         if (!table) {
3239                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3240                 return err;
3241         }
3242
3243         rcu_read_lock();
3244
3245         fn = fib6_locate(&table->tb6_root,
3246                          &cfg->fc_dst, cfg->fc_dst_len,
3247                          &cfg->fc_src, cfg->fc_src_len,
3248                          !(cfg->fc_flags & RTF_CACHE));
3249
3250         if (fn) {
3251                 for_each_fib6_node_rt_rcu(fn) {
3252                         if (cfg->fc_flags & RTF_CACHE) {
3253                                 int rc;
3254
3255                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3256                                                               &cfg->fc_src);
3257                                 if (rt_cache) {
3258                                         rc = ip6_del_cached_rt(rt_cache, cfg);
3259                                         if (rc != -ESRCH) {
3260                                                 rcu_read_unlock();
3261                                                 return rc;
3262                                         }
3263                                 }
3264                                 continue;
3265                         }
3266                         if (cfg->fc_ifindex &&
3267                             (!rt->fib6_nh.nh_dev ||
3268                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3269                                 continue;
3270                         if (cfg->fc_flags & RTF_GATEWAY &&
3271                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3272                                 continue;
3273                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3274                                 continue;
3275                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3276                                 continue;
3277                         if (!fib6_info_hold_safe(rt))
3278                                 continue;
3279                         rcu_read_unlock();
3280
3281                         /* if gateway was specified only delete the one hop */
3282                         if (cfg->fc_flags & RTF_GATEWAY)
3283                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3284
3285                         return __ip6_del_rt_siblings(rt, cfg);
3286                 }
3287         }
3288         rcu_read_unlock();
3289
3290         return err;
3291 }
3292
3293 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3294 {
3295         struct netevent_redirect netevent;
3296         struct rt6_info *rt, *nrt = NULL;
3297         struct ndisc_options ndopts;
3298         struct inet6_dev *in6_dev;
3299         struct neighbour *neigh;
3300         struct fib6_info *from;
3301         struct rd_msg *msg;
3302         int optlen, on_link;
3303         u8 *lladdr;
3304
3305         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3306         optlen -= sizeof(*msg);
3307
3308         if (optlen < 0) {
3309                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3310                 return;
3311         }
3312
3313         msg = (struct rd_msg *)icmp6_hdr(skb);
3314
3315         if (ipv6_addr_is_multicast(&msg->dest)) {
3316                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3317                 return;
3318         }
3319
3320         on_link = 0;
3321         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3322                 on_link = 1;
3323         } else if (ipv6_addr_type(&msg->target) !=
3324                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3325                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3326                 return;
3327         }
3328
3329         in6_dev = __in6_dev_get(skb->dev);
3330         if (!in6_dev)
3331                 return;
3332         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3333                 return;
3334
3335         /* RFC2461 8.1:
3336          *      The IP source address of the Redirect MUST be the same as the current
3337          *      first-hop router for the specified ICMP Destination Address.
3338          */
3339
3340         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3341                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3342                 return;
3343         }
3344
3345         lladdr = NULL;
3346         if (ndopts.nd_opts_tgt_lladdr) {
3347                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3348                                              skb->dev);
3349                 if (!lladdr) {
3350                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3351                         return;
3352                 }
3353         }
3354
3355         rt = (struct rt6_info *) dst;
3356         if (rt->rt6i_flags & RTF_REJECT) {
3357                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3358                 return;
3359         }
3360
3361         /* Redirect received -> path was valid.
3362          * Look, redirects are sent only in response to data packets,
3363          * so that this nexthop apparently is reachable. --ANK
3364          */
3365         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3366
3367         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3368         if (!neigh)
3369                 return;
3370
3371         /*
3372          *      We have finally decided to accept it.
3373          */
3374
3375         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3376                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3377                      NEIGH_UPDATE_F_OVERRIDE|
3378                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3379                                      NEIGH_UPDATE_F_ISROUTER)),
3380                      NDISC_REDIRECT, &ndopts);
3381
3382         rcu_read_lock();
3383         from = rcu_dereference(rt->from);
3384         /* This fib6_info_hold() is safe here because we hold reference to rt
3385          * and rt already holds reference to fib6_info.
3386          */
3387         fib6_info_hold(from);
3388         rcu_read_unlock();
3389
3390         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3391         if (!nrt)
3392                 goto out;
3393
3394         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3395         if (on_link)
3396                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3397
3398         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3399
3400         /* No need to remove rt from the exception table if rt is
3401          * a cached route because rt6_insert_exception() will
3402          * takes care of it
3403          */
3404         if (rt6_insert_exception(nrt, from)) {
3405                 dst_release_immediate(&nrt->dst);
3406                 goto out;
3407         }
3408
3409         netevent.old = &rt->dst;
3410         netevent.new = &nrt->dst;
3411         netevent.daddr = &msg->dest;
3412         netevent.neigh = neigh;
3413         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3414
3415 out:
3416         fib6_info_release(from);
3417         neigh_release(neigh);
3418 }
3419
3420 #ifdef CONFIG_IPV6_ROUTE_INFO
3421 static struct fib6_info *rt6_get_route_info(struct net *net,
3422                                            const struct in6_addr *prefix, int prefixlen,
3423                                            const struct in6_addr *gwaddr,
3424                                            struct net_device *dev)
3425 {
3426         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3427         int ifindex = dev->ifindex;
3428         struct fib6_node *fn;
3429         struct fib6_info *rt = NULL;
3430         struct fib6_table *table;
3431
3432         table = fib6_get_table(net, tb_id);
3433         if (!table)
3434                 return NULL;
3435
3436         rcu_read_lock();
3437         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3438         if (!fn)
3439                 goto out;
3440
3441         for_each_fib6_node_rt_rcu(fn) {
3442                 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3443                         continue;
3444                 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3445                         continue;
3446                 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3447                         continue;
3448                 if (!fib6_info_hold_safe(rt))
3449                         continue;
3450                 break;
3451         }
3452 out:
3453         rcu_read_unlock();
3454         return rt;
3455 }
3456
3457 static struct fib6_info *rt6_add_route_info(struct net *net,
3458                                            const struct in6_addr *prefix, int prefixlen,
3459                                            const struct in6_addr *gwaddr,
3460                                            struct net_device *dev,
3461                                            unsigned int pref)
3462 {
3463         struct fib6_config cfg = {
3464                 .fc_metric      = IP6_RT_PRIO_USER,
3465                 .fc_ifindex     = dev->ifindex,
3466                 .fc_dst_len     = prefixlen,
3467                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3468                                   RTF_UP | RTF_PREF(pref),
3469                 .fc_protocol = RTPROT_RA,
3470                 .fc_type = RTN_UNICAST,
3471                 .fc_nlinfo.portid = 0,
3472                 .fc_nlinfo.nlh = NULL,
3473                 .fc_nlinfo.nl_net = net,
3474         };
3475
3476         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3477         cfg.fc_dst = *prefix;
3478         cfg.fc_gateway = *gwaddr;
3479
3480         /* We should treat it as a default route if prefix length is 0. */
3481         if (!prefixlen)
3482                 cfg.fc_flags |= RTF_DEFAULT;
3483
3484         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3485
3486         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3487 }
3488 #endif
3489
3490 struct fib6_info *rt6_get_dflt_router(struct net *net,
3491                                      const struct in6_addr *addr,
3492                                      struct net_device *dev)
3493 {
3494         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3495         struct fib6_info *rt;
3496         struct fib6_table *table;
3497
3498         table = fib6_get_table(net, tb_id);
3499         if (!table)
3500                 return NULL;
3501
3502         rcu_read_lock();
3503         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3504                 if (dev == rt->fib6_nh.nh_dev &&
3505                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3506                     ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3507                         break;
3508         }
3509         if (rt && !fib6_info_hold_safe(rt))
3510                 rt = NULL;
3511         rcu_read_unlock();
3512         return rt;
3513 }
3514
3515 struct fib6_info *rt6_add_dflt_router(struct net *net,
3516                                      const struct in6_addr *gwaddr,
3517                                      struct net_device *dev,
3518                                      unsigned int pref)
3519 {
3520         struct fib6_config cfg = {
3521                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3522                 .fc_metric      = IP6_RT_PRIO_USER,
3523                 .fc_ifindex     = dev->ifindex,
3524                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3525                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3526                 .fc_protocol = RTPROT_RA,
3527                 .fc_type = RTN_UNICAST,
3528                 .fc_nlinfo.portid = 0,
3529                 .fc_nlinfo.nlh = NULL,
3530                 .fc_nlinfo.nl_net = net,
3531         };
3532
3533         cfg.fc_gateway = *gwaddr;
3534
3535         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3536                 struct fib6_table *table;
3537
3538                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3539                 if (table)
3540                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3541         }
3542
3543         return rt6_get_dflt_router(net, gwaddr, dev);
3544 }
3545
3546 static void __rt6_purge_dflt_routers(struct net *net,
3547                                      struct fib6_table *table)
3548 {
3549         struct fib6_info *rt;
3550
3551 restart:
3552         rcu_read_lock();
3553         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3554                 struct net_device *dev = fib6_info_nh_dev(rt);
3555                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3556
3557                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3558                     (!idev || idev->cnf.accept_ra != 2) &&
3559                     fib6_info_hold_safe(rt)) {
3560                         rcu_read_unlock();
3561                         ip6_del_rt(net, rt);
3562                         goto restart;
3563                 }
3564         }
3565         rcu_read_unlock();
3566
3567         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3568 }
3569
3570 void rt6_purge_dflt_routers(struct net *net)
3571 {
3572         struct fib6_table *table;
3573         struct hlist_head *head;
3574         unsigned int h;
3575
3576         rcu_read_lock();
3577
3578         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3579                 head = &net->ipv6.fib_table_hash[h];
3580                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3581                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3582                                 __rt6_purge_dflt_routers(net, table);
3583                 }
3584         }
3585
3586         rcu_read_unlock();
3587 }
3588
3589 static void rtmsg_to_fib6_config(struct net *net,
3590                                  struct in6_rtmsg *rtmsg,
3591                                  struct fib6_config *cfg)
3592 {
3593         *cfg = (struct fib6_config){
3594                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3595                          : RT6_TABLE_MAIN,
3596                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3597                 .fc_metric = rtmsg->rtmsg_metric,
3598                 .fc_expires = rtmsg->rtmsg_info,
3599                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3600                 .fc_src_len = rtmsg->rtmsg_src_len,
3601                 .fc_flags = rtmsg->rtmsg_flags,
3602                 .fc_type = rtmsg->rtmsg_type,
3603
3604                 .fc_nlinfo.nl_net = net,
3605
3606                 .fc_dst = rtmsg->rtmsg_dst,
3607                 .fc_src = rtmsg->rtmsg_src,
3608                 .fc_gateway = rtmsg->rtmsg_gateway,
3609         };
3610 }
3611
3612 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3613 {
3614         struct fib6_config cfg;
3615         struct in6_rtmsg rtmsg;
3616         int err;
3617
3618         switch (cmd) {
3619         case SIOCADDRT:         /* Add a route */
3620         case SIOCDELRT:         /* Delete a route */
3621                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3622                         return -EPERM;
3623                 err = copy_from_user(&rtmsg, arg,
3624                                      sizeof(struct in6_rtmsg));
3625                 if (err)
3626                         return -EFAULT;
3627
3628                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3629
3630                 rtnl_lock();
3631                 switch (cmd) {
3632                 case SIOCADDRT:
3633                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3634                         break;
3635                 case SIOCDELRT:
3636                         err = ip6_route_del(&cfg, NULL);
3637                         break;
3638                 default:
3639                         err = -EINVAL;
3640                 }
3641                 rtnl_unlock();
3642
3643                 return err;
3644         }
3645
3646         return -EINVAL;
3647 }
3648
3649 /*
3650  *      Drop the packet on the floor
3651  */
3652
3653 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3654 {
3655         int type;
3656         struct dst_entry *dst = skb_dst(skb);
3657         switch (ipstats_mib_noroutes) {
3658         case IPSTATS_MIB_INNOROUTES:
3659                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3660                 if (type == IPV6_ADDR_ANY) {
3661                         IP6_INC_STATS(dev_net(dst->dev),
3662                                       __in6_dev_get_safely(skb->dev),
3663                                       IPSTATS_MIB_INADDRERRORS);
3664                         break;
3665                 }
3666                 /* FALLTHROUGH */
3667         case IPSTATS_MIB_OUTNOROUTES:
3668                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3669                               ipstats_mib_noroutes);
3670                 break;
3671         }
3672         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3673         kfree_skb(skb);
3674         return 0;
3675 }
3676
3677 static int ip6_pkt_discard(struct sk_buff *skb)
3678 {
3679         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3680 }
3681
3682 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3683 {
3684         skb->dev = skb_dst(skb)->dev;
3685         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3686 }
3687
3688 static int ip6_pkt_prohibit(struct sk_buff *skb)
3689 {
3690         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3691 }
3692
3693 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3694 {
3695         skb->dev = skb_dst(skb)->dev;
3696         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3697 }
3698
3699 /*
3700  *      Allocate a dst for local (unicast / anycast) address.
3701  */
3702
3703 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3704                                      struct inet6_dev *idev,
3705                                      const struct in6_addr *addr,
3706                                      bool anycast, gfp_t gfp_flags)
3707 {
3708         u32 tb_id;
3709         struct net_device *dev = idev->dev;
3710         struct fib6_info *f6i;
3711
3712         f6i = fib6_info_alloc(gfp_flags);
3713         if (!f6i)
3714                 return ERR_PTR(-ENOMEM);
3715
3716         f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3717         f6i->dst_nocount = true;
3718         f6i->dst_host = true;
3719         f6i->fib6_protocol = RTPROT_KERNEL;
3720         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3721         if (anycast) {
3722                 f6i->fib6_type = RTN_ANYCAST;
3723                 f6i->fib6_flags |= RTF_ANYCAST;
3724         } else {
3725                 f6i->fib6_type = RTN_LOCAL;
3726                 f6i->fib6_flags |= RTF_LOCAL;
3727         }
3728
3729         f6i->fib6_nh.nh_gw = *addr;
3730         dev_hold(dev);
3731         f6i->fib6_nh.nh_dev = dev;
3732         f6i->fib6_dst.addr = *addr;
3733         f6i->fib6_dst.plen = 128;
3734         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3735         f6i->fib6_table = fib6_get_table(net, tb_id);
3736
3737         return f6i;
3738 }
3739
/* Remove a deleted IP from prefsrc entries: argument bundle that
 * rt6_remove_prefsrc() passes to fib6_remove_prefsrc().
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* nexthop device to match; NULL matches any */
	struct net *net;		/* namespace whose fib6_null_entry is skipped */
	struct in6_addr *addr;		/* address compared against fib6_prefsrc */
};
3746
3747 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3748 {
3749         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3750         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3751         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3752
3753         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3754             rt != net->ipv6.fib6_null_entry &&
3755             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3756                 spin_lock_bh(&rt6_exception_lock);
3757                 /* remove prefsrc entry */
3758                 rt->fib6_prefsrc.plen = 0;
3759                 spin_unlock_bh(&rt6_exception_lock);
3760         }
3761         return 0;
3762 }
3763
3764 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3765 {
3766         struct net *net = dev_net(ifp->idev->dev);
3767         struct arg_dev_net_ip adni = {
3768                 .dev = ifp->idev->dev,
3769                 .net = net,
3770                 .addr = &ifp->addr,
3771         };
3772         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3773 }
3774
3775 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3776
3777 /* Remove routers and update dst entries when gateway turn into host. */
3778 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3779 {
3780         struct in6_addr *gateway = (struct in6_addr *)arg;
3781
3782         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3783             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3784                 return -1;
3785         }
3786
3787         /* Further clean up cached routes in exception table.
3788          * This is needed because cached route may have a different
3789          * gateway than its 'parent' in the case of an ip redirect.
3790          */
3791         rt6_exceptions_clean_tohost(rt, gateway);
3792
3793         return 0;
3794 }
3795
3796 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3797 {
3798         fib6_clean_all(net, fib6_clean_tohost, gateway);
3799 }
3800
/* Argument passed to the fib6_ifup() / fib6_ifdown() route walkers. */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event is about */
	union {
		unsigned int nh_flags;	/* fib6_ifup(): nexthop flags to clear */
		unsigned long event;	/* fib6_ifdown(): NETDEV_* event */
	};
};
3808
3809 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3810 {
3811         struct fib6_info *iter;
3812         struct fib6_node *fn;
3813
3814         fn = rcu_dereference_protected(rt->fib6_node,
3815                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3816         iter = rcu_dereference_protected(fn->leaf,
3817                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3818         while (iter) {
3819                 if (iter->fib6_metric == rt->fib6_metric &&
3820                     rt6_qualify_for_ecmp(iter))
3821                         return iter;
3822                 iter = rcu_dereference_protected(iter->fib6_next,
3823                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3824         }
3825
3826         return NULL;
3827 }
3828
3829 static bool rt6_is_dead(const struct fib6_info *rt)
3830 {
3831         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3832             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3833              fib6_ignore_linkdown(rt)))
3834                 return true;
3835
3836         return false;
3837 }
3838
3839 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3840 {
3841         struct fib6_info *iter;
3842         int total = 0;
3843
3844         if (!rt6_is_dead(rt))
3845                 total += rt->fib6_nh.nh_weight;
3846
3847         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3848                 if (!rt6_is_dead(iter))
3849                         total += iter->fib6_nh.nh_weight;
3850         }
3851
3852         return total;
3853 }
3854
3855 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3856 {
3857         int upper_bound = -1;
3858
3859         if (!rt6_is_dead(rt)) {
3860                 *weight += rt->fib6_nh.nh_weight;
3861                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3862                                                     total) - 1;
3863         }
3864         atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3865 }
3866
3867 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3868 {
3869         struct fib6_info *iter;
3870         int weight = 0;
3871
3872         rt6_upper_bound_set(rt, &weight, total);
3873
3874         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3875                 rt6_upper_bound_set(iter, &weight, total);
3876 }
3877
3878 void rt6_multipath_rebalance(struct fib6_info *rt)
3879 {
3880         struct fib6_info *first;
3881         int total;
3882
3883         /* In case the entire multipath route was marked for flushing,
3884          * then there is no need to rebalance upon the removal of every
3885          * sibling route.
3886          */
3887         if (!rt->fib6_nsiblings || rt->should_flush)
3888                 return;
3889
3890         /* During lookup routes are evaluated in order, so we need to
3891          * make sure upper bounds are assigned from the first sibling
3892          * onwards.
3893          */
3894         first = rt6_multipath_first_sibling(rt);
3895         if (WARN_ON_ONCE(!first))
3896                 return;
3897
3898         total = rt6_multipath_total_weight(first);
3899         rt6_multipath_upper_bound_set(first, total);
3900 }
3901
3902 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3903 {
3904         const struct arg_netdev_event *arg = p_arg;
3905         struct net *net = dev_net(arg->dev);
3906
3907         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3908                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3909                 fib6_update_sernum_upto_root(net, rt);
3910                 rt6_multipath_rebalance(rt);
3911         }
3912
3913         return 0;
3914 }
3915
3916 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3917 {
3918         struct arg_netdev_event arg = {
3919                 .dev = dev,
3920                 {
3921                         .nh_flags = nh_flags,
3922                 },
3923         };
3924
3925         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3926                 arg.nh_flags |= RTNH_F_LINKDOWN;
3927
3928         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3929 }
3930
3931 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3932                                    const struct net_device *dev)
3933 {
3934         struct fib6_info *iter;
3935
3936         if (rt->fib6_nh.nh_dev == dev)
3937                 return true;
3938         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3939                 if (iter->fib6_nh.nh_dev == dev)
3940                         return true;
3941
3942         return false;
3943 }
3944
3945 static void rt6_multipath_flush(struct fib6_info *rt)
3946 {
3947         struct fib6_info *iter;
3948
3949         rt->should_flush = 1;
3950         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3951                 iter->should_flush = 1;
3952 }
3953
3954 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3955                                              const struct net_device *down_dev)
3956 {
3957         struct fib6_info *iter;
3958         unsigned int dead = 0;
3959
3960         if (rt->fib6_nh.nh_dev == down_dev ||
3961             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3962                 dead++;
3963         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3964                 if (iter->fib6_nh.nh_dev == down_dev ||
3965                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3966                         dead++;
3967
3968         return dead;
3969 }
3970
3971 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3972                                        const struct net_device *dev,
3973                                        unsigned int nh_flags)
3974 {
3975         struct fib6_info *iter;
3976
3977         if (rt->fib6_nh.nh_dev == dev)
3978                 rt->fib6_nh.nh_flags |= nh_flags;
3979         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3980                 if (iter->fib6_nh.nh_dev == dev)
3981                         iter->fib6_nh.nh_flags |= nh_flags;
3982 }
3983
3984 /* called with write lock held for table with rt */
3985 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3986 {
3987         const struct arg_netdev_event *arg = p_arg;
3988         const struct net_device *dev = arg->dev;
3989         struct net *net = dev_net(dev);
3990
3991         if (rt == net->ipv6.fib6_null_entry)
3992                 return 0;
3993
3994         switch (arg->event) {
3995         case NETDEV_UNREGISTER:
3996                 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3997         case NETDEV_DOWN:
3998                 if (rt->should_flush)
3999                         return -1;
4000                 if (!rt->fib6_nsiblings)
4001                         return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4002                 if (rt6_multipath_uses_dev(rt, dev)) {
4003                         unsigned int count;
4004
4005                         count = rt6_multipath_dead_count(rt, dev);
4006                         if (rt->fib6_nsiblings + 1 == count) {
4007                                 rt6_multipath_flush(rt);
4008                                 return -1;
4009                         }
4010                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4011                                                    RTNH_F_LINKDOWN);
4012                         fib6_update_sernum(net, rt);
4013                         rt6_multipath_rebalance(rt);
4014                 }
4015                 return -2;
4016         case NETDEV_CHANGE:
4017                 if (rt->fib6_nh.nh_dev != dev ||
4018                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4019                         break;
4020                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4021                 rt6_multipath_rebalance(rt);
4022                 break;
4023         }
4024
4025         return 0;
4026 }
4027
4028 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4029 {
4030         struct arg_netdev_event arg = {
4031                 .dev = dev,
4032                 {
4033                         .event = event,
4034                 },
4035         };
4036         struct net *net = dev_net(dev);
4037
4038         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4039                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4040         else
4041                 fib6_clean_all(net, fib6_ifdown, &arg);
4042 }
4043
4044 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4045 {
4046         rt6_sync_down_dev(dev, event);
4047         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4048         neigh_ifdown(&nd_tbl, dev);
4049 }
4050
/* Argument passed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
4055
4056 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4057 {
4058         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4059         struct inet6_dev *idev;
4060
4061         /* In IPv6 pmtu discovery is not optional,
4062            so that RTAX_MTU lock cannot disable it.
4063            We still use this lock to block changes
4064            caused by addrconf/ndisc.
4065         */
4066
4067         idev = __in6_dev_get(arg->dev);
4068         if (!idev)
4069                 return 0;
4070
4071         /* For administrative MTU increase, there is no way to discover
4072            IPv6 PMTU increase, so PMTU increase should be updated here.
4073            Since RFC 1981 doesn't include administrative MTU increase
4074            update PMTU increase is a MUST. (i.e. jumbo frame)
4075          */
4076         if (rt->fib6_nh.nh_dev == arg->dev &&
4077             !fib6_metric_locked(rt, RTAX_MTU)) {
4078                 u32 mtu = rt->fib6_pmtu;
4079
4080                 if (mtu >= arg->mtu ||
4081                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4082                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4083
4084                 spin_lock_bh(&rt6_exception_lock);
4085                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4086                 spin_unlock_bh(&rt6_exception_lock);
4087         }
4088         return 0;
4089 }
4090
4091 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4092 {
4093         struct rt6_mtu_change_arg arg = {
4094                 .dev = dev,
4095                 .mtu = mtu,
4096         };
4097
4098         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4099 }
4100
4101 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4102         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4103         [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
4104         [RTA_OIF]               = { .type = NLA_U32 },
4105         [RTA_IIF]               = { .type = NLA_U32 },
4106         [RTA_PRIORITY]          = { .type = NLA_U32 },
4107         [RTA_METRICS]           = { .type = NLA_NESTED },
4108         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
4109         [RTA_PREF]              = { .type = NLA_U8 },
4110         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
4111         [RTA_ENCAP]             = { .type = NLA_NESTED },
4112         [RTA_EXPIRES]           = { .type = NLA_U32 },
4113         [RTA_UID]               = { .type = NLA_U32 },
4114         [RTA_MARK]              = { .type = NLA_U32 },
4115         [RTA_TABLE]             = { .type = NLA_U32 },
4116         [RTA_IP_PROTO]          = { .type = NLA_U8 },
4117         [RTA_SPORT]             = { .type = NLA_U16 },
4118         [RTA_DPORT]             = { .type = NLA_U16 },
4119 };
4120
4121 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4122                               struct fib6_config *cfg,
4123                               struct netlink_ext_ack *extack)
4124 {
4125         struct rtmsg *rtm;
4126         struct nlattr *tb[RTA_MAX+1];
4127         unsigned int pref;
4128         int err;
4129
4130         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4131                           extack);
4132         if (err < 0)
4133                 goto errout;
4134
4135         err = -EINVAL;
4136         rtm = nlmsg_data(nlh);
4137
4138         *cfg = (struct fib6_config){
4139                 .fc_table = rtm->rtm_table,
4140                 .fc_dst_len = rtm->rtm_dst_len,
4141                 .fc_src_len = rtm->rtm_src_len,
4142                 .fc_flags = RTF_UP,
4143                 .fc_protocol = rtm->rtm_protocol,
4144                 .fc_type = rtm->rtm_type,
4145
4146                 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4147                 .fc_nlinfo.nlh = nlh,
4148                 .fc_nlinfo.nl_net = sock_net(skb->sk),
4149         };
4150
4151         if (rtm->rtm_type == RTN_UNREACHABLE ||
4152             rtm->rtm_type == RTN_BLACKHOLE ||
4153             rtm->rtm_type == RTN_PROHIBIT ||
4154             rtm->rtm_type == RTN_THROW)
4155                 cfg->fc_flags |= RTF_REJECT;
4156
4157         if (rtm->rtm_type == RTN_LOCAL)
4158                 cfg->fc_flags |= RTF_LOCAL;
4159
4160         if (rtm->rtm_flags & RTM_F_CLONED)
4161                 cfg->fc_flags |= RTF_CACHE;
4162
4163         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4164
4165         if (tb[RTA_GATEWAY]) {
4166                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4167                 cfg->fc_flags |= RTF_GATEWAY;
4168         }
4169
4170         if (tb[RTA_DST]) {
4171                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4172
4173                 if (nla_len(tb[RTA_DST]) < plen)
4174                         goto errout;
4175
4176                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4177         }
4178
4179         if (tb[RTA_SRC]) {
4180                 int plen = (rtm->rtm_src_len + 7) >> 3;
4181
4182                 if (nla_len(tb[RTA_SRC]) < plen)
4183                         goto errout;
4184
4185                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4186         }
4187
4188         if (tb[RTA_PREFSRC])
4189                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4190
4191         if (tb[RTA_OIF])
4192                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4193
4194         if (tb[RTA_PRIORITY])
4195                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4196
4197         if (tb[RTA_METRICS]) {
4198                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4199                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4200         }
4201
4202         if (tb[RTA_TABLE])
4203                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4204
4205         if (tb[RTA_MULTIPATH]) {
4206                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4207                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4208
4209                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4210                                                      cfg->fc_mp_len, extack);
4211                 if (err < 0)
4212                         goto errout;
4213         }
4214
4215         if (tb[RTA_PREF]) {
4216                 pref = nla_get_u8(tb[RTA_PREF]);
4217                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4218                     pref != ICMPV6_ROUTER_PREF_HIGH)
4219                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4220                 cfg->fc_flags |= RTF_PREF(pref);
4221         }
4222
4223         if (tb[RTA_ENCAP])
4224                 cfg->fc_encap = tb[RTA_ENCAP];
4225
4226         if (tb[RTA_ENCAP_TYPE]) {
4227                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4228
4229                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4230                 if (err < 0)
4231                         goto errout;
4232         }
4233
4234         if (tb[RTA_EXPIRES]) {
4235                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4236
4237                 if (addrconf_finite_timeout(timeout)) {
4238                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4239                         cfg->fc_flags |= RTF_EXPIRES;
4240                 }
4241         }
4242
4243         err = 0;
4244 errout:
4245         return err;
4246 }
4247
/* One nexthop of a multipath request, queued on a local list while the
 * multipath route is being assembled.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route entry built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request config */
	struct list_head next;		/* linkage in the rt6_nh list */
};
4253
4254 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4255 {
4256         struct rt6_nh *nh;
4257
4258         list_for_each_entry(nh, rt6_nh_list, next) {
4259                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4260                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4261                         nh->r_cfg.fc_ifindex);
4262         }
4263 }
4264
4265 static int ip6_route_info_append(struct net *net,
4266                                  struct list_head *rt6_nh_list,
4267                                  struct fib6_info *rt,
4268                                  struct fib6_config *r_cfg)
4269 {
4270         struct rt6_nh *nh;
4271         int err = -EEXIST;
4272
4273         list_for_each_entry(nh, rt6_nh_list, next) {
4274                 /* check if fib6_info already exists */
4275                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4276                         return err;
4277         }
4278
4279         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4280         if (!nh)
4281                 return -ENOMEM;
4282         nh->fib6_info = rt;
4283         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4284         list_add_tail(&nh->next, rt6_nh_list);
4285
4286         return 0;
4287 }
4288
4289 static void ip6_route_mpath_notify(struct fib6_info *rt,
4290                                    struct fib6_info *rt_last,
4291                                    struct nl_info *info,
4292                                    __u16 nlflags)
4293 {
4294         /* if this is an APPEND route, then rt points to the first route
4295          * inserted and rt_last points to last route inserted. Userspace
4296          * wants a consistent dump of the route which starts at the first
4297          * nexthop. Since sibling routes are always added at the end of
4298          * the list, find the first sibling of the last route appended
4299          */
4300         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4301                 rt = list_first_entry(&rt_last->fib6_siblings,
4302                                       struct fib6_info,
4303                                       fib6_siblings);
4304         }
4305
4306         if (rt)
4307                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4308 }
4309
4310 static int ip6_route_multipath_add(struct fib6_config *cfg,
4311                                    struct netlink_ext_ack *extack)
4312 {
4313         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4314         struct nl_info *info = &cfg->fc_nlinfo;
4315         struct fib6_config r_cfg;
4316         struct rtnexthop *rtnh;
4317         struct fib6_info *rt;
4318         struct rt6_nh *err_nh;
4319         struct rt6_nh *nh, *nh_safe;
4320         __u16 nlflags;
4321         int remaining;
4322         int attrlen;
4323         int err = 1;
4324         int nhn = 0;
4325         int replace = (cfg->fc_nlinfo.nlh &&
4326                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4327         LIST_HEAD(rt6_nh_list);
4328
4329         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4330         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4331                 nlflags |= NLM_F_APPEND;
4332
4333         remaining = cfg->fc_mp_len;
4334         rtnh = (struct rtnexthop *)cfg->fc_mp;
4335
4336         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4337          * fib6_info structs per nexthop
4338          */
4339         while (rtnh_ok(rtnh, remaining)) {
4340                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4341                 if (rtnh->rtnh_ifindex)
4342                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4343
4344                 attrlen = rtnh_attrlen(rtnh);
4345                 if (attrlen > 0) {
4346                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4347
4348                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4349                         if (nla) {
4350                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4351                                 r_cfg.fc_flags |= RTF_GATEWAY;
4352                         }
4353                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4354                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4355                         if (nla)
4356                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4357                 }
4358
4359                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4360                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4361                 if (IS_ERR(rt)) {
4362                         err = PTR_ERR(rt);
4363                         rt = NULL;
4364                         goto cleanup;
4365                 }
4366                 if (!rt6_qualify_for_ecmp(rt)) {
4367                         err = -EINVAL;
4368                         NL_SET_ERR_MSG(extack,
4369                                        "Device only routes can not be added for IPv6 using the multipath API.");
4370                         fib6_info_release(rt);
4371                         goto cleanup;
4372                 }
4373
4374                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4375
4376                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4377                                             rt, &r_cfg);
4378                 if (err) {
4379                         fib6_info_release(rt);
4380                         goto cleanup;
4381                 }
4382
4383                 rtnh = rtnh_next(rtnh, &remaining);
4384         }
4385
4386         /* for add and replace send one notification with all nexthops.
4387          * Skip the notification in fib6_add_rt2node and send one with
4388          * the full route when done
4389          */
4390         info->skip_notify = 1;
4391
4392         err_nh = NULL;
4393         list_for_each_entry(nh, &rt6_nh_list, next) {
4394                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4395                 fib6_info_release(nh->fib6_info);
4396
4397                 if (!err) {
4398                         /* save reference to last route successfully inserted */
4399                         rt_last = nh->fib6_info;
4400
4401                         /* save reference to first route for notification */
4402                         if (!rt_notif)
4403                                 rt_notif = nh->fib6_info;
4404                 }
4405
4406                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4407                 nh->fib6_info = NULL;
4408                 if (err) {
4409                         if (replace && nhn)
4410                                 ip6_print_replace_route_err(&rt6_nh_list);
4411                         err_nh = nh;
4412                         goto add_errout;
4413                 }
4414
4415                 /* Because each route is added like a single route we remove
4416                  * these flags after the first nexthop: if there is a collision,
4417                  * we have already failed to add the first nexthop:
4418                  * fib6_add_rt2node() has rejected it; when replacing, old
4419                  * nexthops have been replaced by first new, the rest should
4420                  * be added to it.
4421                  */
4422                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4423                                                      NLM_F_REPLACE);
4424                 nhn++;
4425         }
4426
4427         /* success ... tell user about new route */
4428         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4429         goto cleanup;
4430
4431 add_errout:
4432         /* send notification for routes that were added so that
4433          * the delete notifications sent by ip6_route_del are
4434          * coherent
4435          */
4436         if (rt_notif)
4437                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4438
4439         /* Delete routes that were already added */
4440         list_for_each_entry(nh, &rt6_nh_list, next) {
4441                 if (err_nh == nh)
4442                         break;
4443                 ip6_route_del(&nh->r_cfg, extack);
4444         }
4445
4446 cleanup:
4447         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4448                 if (nh->fib6_info)
4449                         fib6_info_release(nh->fib6_info);
4450                 list_del(&nh->next);
4451                 kfree(nh);
4452         }
4453
4454         return err;
4455 }
4456
4457 static int ip6_route_multipath_del(struct fib6_config *cfg,
4458                                    struct netlink_ext_ack *extack)
4459 {
4460         struct fib6_config r_cfg;
4461         struct rtnexthop *rtnh;
4462         int remaining;
4463         int attrlen;
4464         int err = 1, last_err = 0;
4465
4466         remaining = cfg->fc_mp_len;
4467         rtnh = (struct rtnexthop *)cfg->fc_mp;
4468
4469         /* Parse a Multipath Entry */
4470         while (rtnh_ok(rtnh, remaining)) {
4471                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4472                 if (rtnh->rtnh_ifindex)
4473                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4474
4475                 attrlen = rtnh_attrlen(rtnh);
4476                 if (attrlen > 0) {
4477                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4478
4479                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4480                         if (nla) {
4481                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4482                                 r_cfg.fc_flags |= RTF_GATEWAY;
4483                         }
4484                 }
4485                 err = ip6_route_del(&r_cfg, extack);
4486                 if (err)
4487                         last_err = err;
4488
4489                 rtnh = rtnh_next(rtnh, &remaining);
4490         }
4491
4492         return last_err;
4493 }
4494
4495 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4496                               struct netlink_ext_ack *extack)
4497 {
4498         struct fib6_config cfg;
4499         int err;
4500
4501         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4502         if (err < 0)
4503                 return err;
4504
4505         if (cfg.fc_mp)
4506                 return ip6_route_multipath_del(&cfg, extack);
4507         else {
4508                 cfg.fc_delete_all_nh = 1;
4509                 return ip6_route_del(&cfg, extack);
4510         }
4511 }
4512
4513 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4514                               struct netlink_ext_ack *extack)
4515 {
4516         struct fib6_config cfg;
4517         int err;
4518
4519         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4520         if (err < 0)
4521                 return err;
4522
4523         if (cfg.fc_mp)
4524                 return ip6_route_multipath_add(&cfg, extack);
4525         else
4526                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4527 }
4528
4529 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4530 {
4531         int nexthop_len = 0;
4532
4533         if (rt->fib6_nsiblings) {
4534                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4535                             + NLA_ALIGN(sizeof(struct rtnexthop))
4536                             + nla_total_size(16) /* RTA_GATEWAY */
4537                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4538
4539                 nexthop_len *= rt->fib6_nsiblings;
4540         }
4541
4542         return NLMSG_ALIGN(sizeof(struct rtmsg))
4543                + nla_total_size(16) /* RTA_SRC */
4544                + nla_total_size(16) /* RTA_DST */
4545                + nla_total_size(16) /* RTA_GATEWAY */
4546                + nla_total_size(16) /* RTA_PREFSRC */
4547                + nla_total_size(4) /* RTA_TABLE */
4548                + nla_total_size(4) /* RTA_IIF */
4549                + nla_total_size(4) /* RTA_OIF */
4550                + nla_total_size(4) /* RTA_PRIORITY */
4551                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4552                + nla_total_size(sizeof(struct rta_cacheinfo))
4553                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4554                + nla_total_size(1) /* RTA_PREF */
4555                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4556                + nexthop_len;
4557 }
4558
4559 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4560                             unsigned int *flags, bool skip_oif)
4561 {
4562         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4563                 *flags |= RTNH_F_DEAD;
4564
4565         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4566                 *flags |= RTNH_F_LINKDOWN;
4567
4568                 rcu_read_lock();
4569                 if (fib6_ignore_linkdown(rt))
4570                         *flags |= RTNH_F_DEAD;
4571                 rcu_read_unlock();
4572         }
4573
4574         if (rt->fib6_flags & RTF_GATEWAY) {
4575                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4576                         goto nla_put_failure;
4577         }
4578
4579         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4580         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4581                 *flags |= RTNH_F_OFFLOAD;
4582
4583         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4584         if (!skip_oif && rt->fib6_nh.nh_dev &&
4585             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4586                 goto nla_put_failure;
4587
4588         if (rt->fib6_nh.nh_lwtstate &&
4589             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4590                 goto nla_put_failure;
4591
4592         return 0;
4593
4594 nla_put_failure:
4595         return -EMSGSIZE;
4596 }
4597
4598 /* add multipath next hop */
4599 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4600 {
4601         const struct net_device *dev = rt->fib6_nh.nh_dev;
4602         struct rtnexthop *rtnh;
4603         unsigned int flags = 0;
4604
4605         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4606         if (!rtnh)
4607                 goto nla_put_failure;
4608
4609         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4610         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4611
4612         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4613                 goto nla_put_failure;
4614
4615         rtnh->rtnh_flags = flags;
4616
4617         /* length of rtnetlink header + attributes */
4618         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4619
4620         return 0;
4621
4622 nla_put_failure:
4623         return -EMSGSIZE;
4624 }
4625
4626 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4627                          struct fib6_info *rt, struct dst_entry *dst,
4628                          struct in6_addr *dest, struct in6_addr *src,
4629                          int iif, int type, u32 portid, u32 seq,
4630                          unsigned int flags)
4631 {
4632         struct rt6_info *rt6 = (struct rt6_info *)dst;
4633         struct rt6key *rt6_dst, *rt6_src;
4634         u32 *pmetrics, table, rt6_flags;
4635         struct nlmsghdr *nlh;
4636         struct rtmsg *rtm;
4637         long expires = 0;
4638
4639         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4640         if (!nlh)
4641                 return -EMSGSIZE;
4642
4643         if (rt6) {
4644                 rt6_dst = &rt6->rt6i_dst;
4645                 rt6_src = &rt6->rt6i_src;
4646                 rt6_flags = rt6->rt6i_flags;
4647         } else {
4648                 rt6_dst = &rt->fib6_dst;
4649                 rt6_src = &rt->fib6_src;
4650                 rt6_flags = rt->fib6_flags;
4651         }
4652
4653         rtm = nlmsg_data(nlh);
4654         rtm->rtm_family = AF_INET6;
4655         rtm->rtm_dst_len = rt6_dst->plen;
4656         rtm->rtm_src_len = rt6_src->plen;
4657         rtm->rtm_tos = 0;
4658         if (rt->fib6_table)
4659                 table = rt->fib6_table->tb6_id;
4660         else
4661                 table = RT6_TABLE_UNSPEC;
4662         rtm->rtm_table = table;
4663         if (nla_put_u32(skb, RTA_TABLE, table))
4664                 goto nla_put_failure;
4665
4666         rtm->rtm_type = rt->fib6_type;
4667         rtm->rtm_flags = 0;
4668         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4669         rtm->rtm_protocol = rt->fib6_protocol;
4670
4671         if (rt6_flags & RTF_CACHE)
4672                 rtm->rtm_flags |= RTM_F_CLONED;
4673
4674         if (dest) {
4675                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4676                         goto nla_put_failure;
4677                 rtm->rtm_dst_len = 128;
4678         } else if (rtm->rtm_dst_len)
4679                 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4680                         goto nla_put_failure;
4681 #ifdef CONFIG_IPV6_SUBTREES
4682         if (src) {
4683                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4684                         goto nla_put_failure;
4685                 rtm->rtm_src_len = 128;
4686         } else if (rtm->rtm_src_len &&
4687                    nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4688                 goto nla_put_failure;
4689 #endif
4690         if (iif) {
4691 #ifdef CONFIG_IPV6_MROUTE
4692                 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4693                         int err = ip6mr_get_route(net, skb, rtm, portid);
4694
4695                         if (err == 0)
4696                                 return 0;
4697                         if (err < 0)
4698                                 goto nla_put_failure;
4699                 } else
4700 #endif
4701                         if (nla_put_u32(skb, RTA_IIF, iif))
4702                                 goto nla_put_failure;
4703         } else if (dest) {
4704                 struct in6_addr saddr_buf;
4705                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4706                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4707                         goto nla_put_failure;
4708         }
4709
4710         if (rt->fib6_prefsrc.plen) {
4711                 struct in6_addr saddr_buf;
4712                 saddr_buf = rt->fib6_prefsrc.addr;
4713                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4714                         goto nla_put_failure;
4715         }
4716
4717         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4718         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4719                 goto nla_put_failure;
4720
4721         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4722                 goto nla_put_failure;
4723
4724         /* For multipath routes, walk the siblings list and add
4725          * each as a nexthop within RTA_MULTIPATH.
4726          */
4727         if (rt6) {
4728                 if (rt6_flags & RTF_GATEWAY &&
4729                     nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4730                         goto nla_put_failure;
4731
4732                 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4733                         goto nla_put_failure;
4734         } else if (rt->fib6_nsiblings) {
4735                 struct fib6_info *sibling, *next_sibling;
4736                 struct nlattr *mp;
4737
4738                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4739                 if (!mp)
4740                         goto nla_put_failure;
4741
4742                 if (rt6_add_nexthop(skb, rt) < 0)
4743                         goto nla_put_failure;
4744
4745                 list_for_each_entry_safe(sibling, next_sibling,
4746                                          &rt->fib6_siblings, fib6_siblings) {
4747                         if (rt6_add_nexthop(skb, sibling) < 0)
4748                                 goto nla_put_failure;
4749                 }
4750
4751                 nla_nest_end(skb, mp);
4752         } else {
4753                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4754                         goto nla_put_failure;
4755         }
4756
4757         if (rt6_flags & RTF_EXPIRES) {
4758                 expires = dst ? dst->expires : rt->expires;
4759                 expires -= jiffies;
4760         }
4761
4762         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4763                 goto nla_put_failure;
4764
4765         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4766                 goto nla_put_failure;
4767
4768
4769         nlmsg_end(skb, nlh);
4770         return 0;
4771
4772 nla_put_failure:
4773         nlmsg_cancel(skb, nlh);
4774         return -EMSGSIZE;
4775 }
4776
4777 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4778                                const struct net_device *dev)
4779 {
4780         if (f6i->fib6_nh.nh_dev == dev)
4781                 return true;
4782
4783         if (f6i->fib6_nsiblings) {
4784                 struct fib6_info *sibling, *next_sibling;
4785
4786                 list_for_each_entry_safe(sibling, next_sibling,
4787                                          &f6i->fib6_siblings, fib6_siblings) {
4788                         if (sibling->fib6_nh.nh_dev == dev)
4789                                 return true;
4790                 }
4791         }
4792
4793         return false;
4794 }
4795
4796 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4797 {
4798         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4799         struct fib_dump_filter *filter = &arg->filter;
4800         unsigned int flags = NLM_F_MULTI;
4801         struct net *net = arg->net;
4802
4803         if (rt == net->ipv6.fib6_null_entry)
4804                 return 0;
4805
4806         if ((filter->flags & RTM_F_PREFIX) &&
4807             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4808                 /* success since this is not a prefix route */
4809                 return 1;
4810         }
4811         if (filter->filter_set) {
4812                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4813                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4814                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4815                         return 1;
4816                 }
4817                 flags |= NLM_F_DUMP_FILTERED;
4818         }
4819
4820         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4821                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4822                              arg->cb->nlh->nlmsg_seq, flags);
4823 }
4824
4825 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4826                               struct netlink_ext_ack *extack)
4827 {
4828         struct net *net = sock_net(in_skb->sk);
4829         struct nlattr *tb[RTA_MAX+1];
4830         int err, iif = 0, oif = 0;
4831         struct fib6_info *from;
4832         struct dst_entry *dst;
4833         struct rt6_info *rt;
4834         struct sk_buff *skb;
4835         struct rtmsg *rtm;
4836         struct flowi6 fl6 = {};
4837         bool fibmatch;
4838
4839         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4840                           extack);
4841         if (err < 0)
4842                 goto errout;
4843
4844         err = -EINVAL;
4845         rtm = nlmsg_data(nlh);
4846         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4847         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4848
4849         if (tb[RTA_SRC]) {
4850                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4851                         goto errout;
4852
4853                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4854         }
4855
4856         if (tb[RTA_DST]) {
4857                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4858                         goto errout;
4859
4860                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4861         }
4862
4863         if (tb[RTA_IIF])
4864                 iif = nla_get_u32(tb[RTA_IIF]);
4865
4866         if (tb[RTA_OIF])
4867                 oif = nla_get_u32(tb[RTA_OIF]);
4868
4869         if (tb[RTA_MARK])
4870                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4871
4872         if (tb[RTA_UID])
4873                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4874                                            nla_get_u32(tb[RTA_UID]));
4875         else
4876                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4877
4878         if (tb[RTA_SPORT])
4879                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4880
4881         if (tb[RTA_DPORT])
4882                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4883
4884         if (tb[RTA_IP_PROTO]) {
4885                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4886                                                   &fl6.flowi6_proto, extack);
4887                 if (err)
4888                         goto errout;
4889         }
4890
4891         if (iif) {
4892                 struct net_device *dev;
4893                 int flags = 0;
4894
4895                 rcu_read_lock();
4896
4897                 dev = dev_get_by_index_rcu(net, iif);
4898                 if (!dev) {
4899                         rcu_read_unlock();
4900                         err = -ENODEV;
4901                         goto errout;
4902                 }
4903
4904                 fl6.flowi6_iif = iif;
4905
4906                 if (!ipv6_addr_any(&fl6.saddr))
4907                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4908
4909                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4910
4911                 rcu_read_unlock();
4912         } else {
4913                 fl6.flowi6_oif = oif;
4914
4915                 dst = ip6_route_output(net, NULL, &fl6);
4916         }
4917
4918
4919         rt = container_of(dst, struct rt6_info, dst);
4920         if (rt->dst.error) {
4921                 err = rt->dst.error;
4922                 ip6_rt_put(rt);
4923                 goto errout;
4924         }
4925
4926         if (rt == net->ipv6.ip6_null_entry) {
4927                 err = rt->dst.error;
4928                 ip6_rt_put(rt);
4929                 goto errout;
4930         }
4931
4932         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4933         if (!skb) {
4934                 ip6_rt_put(rt);
4935                 err = -ENOBUFS;
4936                 goto errout;
4937         }
4938
4939         skb_dst_set(skb, &rt->dst);
4940
4941         rcu_read_lock();
4942         from = rcu_dereference(rt->from);
4943
4944         if (fibmatch)
4945                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4946                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4947                                     nlh->nlmsg_seq, 0);
4948         else
4949                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4950                                     &fl6.saddr, iif, RTM_NEWROUTE,
4951                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4952                                     0);
4953         rcu_read_unlock();
4954
4955         if (err < 0) {
4956                 kfree_skb(skb);
4957                 goto errout;
4958         }
4959
4960         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4961 errout:
4962         return err;
4963 }
4964
4965 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4966                      unsigned int nlm_flags)
4967 {
4968         struct sk_buff *skb;
4969         struct net *net = info->nl_net;
4970         u32 seq;
4971         int err;
4972
4973         err = -ENOBUFS;
4974         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4975
4976         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4977         if (!skb)
4978                 goto errout;
4979
4980         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4981                             event, info->portid, seq, nlm_flags);
4982         if (err < 0) {
4983                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4984                 WARN_ON(err == -EMSGSIZE);
4985                 kfree_skb(skb);
4986                 goto errout;
4987         }
4988         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4989                     info->nlh, gfp_any());
4990         return;
4991 errout:
4992         if (err < 0)
4993                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4994 }
4995
4996 static int ip6_route_dev_notify(struct notifier_block *this,
4997                                 unsigned long event, void *ptr)
4998 {
4999         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5000         struct net *net = dev_net(dev);
5001
5002         if (!(dev->flags & IFF_LOOPBACK))
5003                 return NOTIFY_OK;
5004
5005         if (event == NETDEV_REGISTER) {
5006                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5007                 net->ipv6.ip6_null_entry->dst.dev = dev;
5008                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5009 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5010                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5011                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5012                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5013                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5014 #endif
5015          } else if (event == NETDEV_UNREGISTER &&
5016                     dev->reg_state != NETREG_UNREGISTERED) {
5017                 /* NETDEV_UNREGISTER could be fired for multiple times by
5018                  * netdev_wait_allrefs(). Make sure we only call this once.
5019                  */
5020                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5021 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5022                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5023                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5024 #endif
5025         }
5026
5027         return NOTIFY_OK;
5028 }
5029
5030 /*
5031  *      /proc
5032  */
5033
5034 #ifdef CONFIG_PROC_FS
5035 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5036 {
5037         struct net *net = (struct net *)seq->private;
5038         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5039                    net->ipv6.rt6_stats->fib_nodes,
5040                    net->ipv6.rt6_stats->fib_route_nodes,
5041                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5042                    net->ipv6.rt6_stats->fib_rt_entries,
5043                    net->ipv6.rt6_stats->fib_rt_cache,
5044                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5045                    net->ipv6.rt6_stats->fib_discarded_routes);
5046
5047         return 0;
5048 }
5049 #endif  /* CONFIG_PROC_FS */
5050
5051 #ifdef CONFIG_SYSCTL
5052
5053 static
5054 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5055                               void __user *buffer, size_t *lenp, loff_t *ppos)
5056 {
5057         struct net *net;
5058         int delay;
5059         int ret;
5060         if (!write)
5061                 return -EINVAL;
5062
5063         net = (struct net *)ctl->extra1;
5064         delay = net->ipv6.sysctl.flush_delay;
5065         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5066         if (ret)
5067                 return ret;
5068
5069         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5070         return 0;
5071 }
5072
5073 static int zero;
5074 static int one = 1;
5075
5076 static struct ctl_table ipv6_route_table_template[] = {
5077         {
5078                 .procname       =       "flush",
5079                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5080                 .maxlen         =       sizeof(int),
5081                 .mode           =       0200,
5082                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5083         },
5084         {
5085                 .procname       =       "gc_thresh",
5086                 .data           =       &ip6_dst_ops_template.gc_thresh,
5087                 .maxlen         =       sizeof(int),
5088                 .mode           =       0644,
5089                 .proc_handler   =       proc_dointvec,
5090         },
5091         {
5092                 .procname       =       "max_size",
5093                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5094                 .maxlen         =       sizeof(int),
5095                 .mode           =       0644,
5096                 .proc_handler   =       proc_dointvec,
5097         },
5098         {
5099                 .procname       =       "gc_min_interval",
5100                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5101                 .maxlen         =       sizeof(int),
5102                 .mode           =       0644,
5103                 .proc_handler   =       proc_dointvec_jiffies,
5104         },
5105         {
5106                 .procname       =       "gc_timeout",
5107                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5108                 .maxlen         =       sizeof(int),
5109                 .mode           =       0644,
5110                 .proc_handler   =       proc_dointvec_jiffies,
5111         },
5112         {
5113                 .procname       =       "gc_interval",
5114                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5115                 .maxlen         =       sizeof(int),
5116                 .mode           =       0644,
5117                 .proc_handler   =       proc_dointvec_jiffies,
5118         },
5119         {
5120                 .procname       =       "gc_elasticity",
5121                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5122                 .maxlen         =       sizeof(int),
5123                 .mode           =       0644,
5124                 .proc_handler   =       proc_dointvec,
5125         },
5126         {
5127                 .procname       =       "mtu_expires",
5128                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5129                 .maxlen         =       sizeof(int),
5130                 .mode           =       0644,
5131                 .proc_handler   =       proc_dointvec_jiffies,
5132         },
5133         {
5134                 .procname       =       "min_adv_mss",
5135                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5136                 .maxlen         =       sizeof(int),
5137                 .mode           =       0644,
5138                 .proc_handler   =       proc_dointvec,
5139         },
5140         {
5141                 .procname       =       "gc_min_interval_ms",
5142                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5143                 .maxlen         =       sizeof(int),
5144                 .mode           =       0644,
5145                 .proc_handler   =       proc_dointvec_ms_jiffies,
5146         },
5147         {
5148                 .procname       =       "skip_notify_on_dev_down",
5149                 .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5150                 .maxlen         =       sizeof(int),
5151                 .mode           =       0644,
5152                 .proc_handler   =       proc_dointvec,
5153                 .extra1         =       &zero,
5154                 .extra2         =       &one,
5155         },
5156         { }
5157 };
5158
5159 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5160 {
5161         struct ctl_table *table;
5162
5163         table = kmemdup(ipv6_route_table_template,
5164                         sizeof(ipv6_route_table_template),
5165                         GFP_KERNEL);
5166
5167         if (table) {
5168                 table[0].data = &net->ipv6.sysctl.flush_delay;
5169                 table[0].extra1 = net;
5170                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5171                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5172                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5173                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5174                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5175                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5176                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5177                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5178                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5179                 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5180
5181                 /* Don't export sysctls to unprivileged users */
5182                 if (net->user_ns != &init_user_ns)
5183                         table[0].procname = NULL;
5184         }
5185
5186         return table;
5187 }
5188 #endif
5189
5190 static int __net_init ip6_route_net_init(struct net *net)
5191 {
5192         int ret = -ENOMEM;
5193
5194         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5195                sizeof(net->ipv6.ip6_dst_ops));
5196
5197         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5198                 goto out_ip6_dst_ops;
5199
5200         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5201                                             sizeof(*net->ipv6.fib6_null_entry),
5202                                             GFP_KERNEL);
5203         if (!net->ipv6.fib6_null_entry)
5204                 goto out_ip6_dst_entries;
5205
5206         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5207                                            sizeof(*net->ipv6.ip6_null_entry),
5208                                            GFP_KERNEL);
5209         if (!net->ipv6.ip6_null_entry)
5210                 goto out_fib6_null_entry;
5211         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5212         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5213                          ip6_template_metrics, true);
5214
5215 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5216         net->ipv6.fib6_has_custom_rules = false;
5217         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5218                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5219                                                GFP_KERNEL);
5220         if (!net->ipv6.ip6_prohibit_entry)
5221                 goto out_ip6_null_entry;
5222         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5223         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5224                          ip6_template_metrics, true);
5225
5226         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5227                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5228                                                GFP_KERNEL);
5229         if (!net->ipv6.ip6_blk_hole_entry)
5230                 goto out_ip6_prohibit_entry;
5231         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5232         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5233                          ip6_template_metrics, true);
5234 #endif
5235
5236         net->ipv6.sysctl.flush_delay = 0;
5237         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5238         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5239         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5240         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5241         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5242         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5243         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5244         net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5245
5246         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5247
5248         ret = 0;
5249 out:
5250         return ret;
5251
5252 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5253 out_ip6_prohibit_entry:
5254         kfree(net->ipv6.ip6_prohibit_entry);
5255 out_ip6_null_entry:
5256         kfree(net->ipv6.ip6_null_entry);
5257 #endif
5258 out_fib6_null_entry:
5259         kfree(net->ipv6.fib6_null_entry);
5260 out_ip6_dst_entries:
5261         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5262 out_ip6_dst_ops:
5263         goto out;
5264 }
5265
5266 static void __net_exit ip6_route_net_exit(struct net *net)
5267 {
5268         kfree(net->ipv6.fib6_null_entry);
5269         kfree(net->ipv6.ip6_null_entry);
5270 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5271         kfree(net->ipv6.ip6_prohibit_entry);
5272         kfree(net->ipv6.ip6_blk_hole_entry);
5273 #endif
5274         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5275 }
5276
5277 static int __net_init ip6_route_net_init_late(struct net *net)
5278 {
5279 #ifdef CONFIG_PROC_FS
5280         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5281                         sizeof(struct ipv6_route_iter));
5282         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5283                         rt6_stats_seq_show, NULL);
5284 #endif
5285         return 0;
5286 }
5287
5288 static void __net_exit ip6_route_net_exit_late(struct net *net)
5289 {
5290 #ifdef CONFIG_PROC_FS
5291         remove_proc_entry("ipv6_route", net->proc_net);
5292         remove_proc_entry("rt6_stats", net->proc_net);
5293 #endif
5294 }
5295
5296 static struct pernet_operations ip6_route_net_ops = {
5297         .init = ip6_route_net_init,
5298         .exit = ip6_route_net_exit,
5299 };
5300
5301 static int __net_init ipv6_inetpeer_init(struct net *net)
5302 {
5303         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5304
5305         if (!bp)
5306                 return -ENOMEM;
5307         inet_peer_base_init(bp);
5308         net->ipv6.peers = bp;
5309         return 0;
5310 }
5311
5312 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5313 {
5314         struct inet_peer_base *bp = net->ipv6.peers;
5315
5316         net->ipv6.peers = NULL;
5317         inetpeer_invalidate_tree(bp);
5318         kfree(bp);
5319 }
5320
5321 static struct pernet_operations ipv6_inetpeer_ops = {
5322         .init   =       ipv6_inetpeer_init,
5323         .exit   =       ipv6_inetpeer_exit,
5324 };
5325
5326 static struct pernet_operations ip6_route_net_late_ops = {
5327         .init = ip6_route_net_init_late,
5328         .exit = ip6_route_net_exit_late,
5329 };
5330
5331 static struct notifier_block ip6_route_dev_notifier = {
5332         .notifier_call = ip6_route_dev_notify,
5333         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5334 };
5335
5336 void __init ip6_route_init_special_entries(void)
5337 {
5338         /* Registering of the loopback is done before this portion of code,
5339          * the loopback reference in rt6_info will not be taken, do it
5340          * manually for init_net */
5341         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5342         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5343         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5344   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5345         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5346         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5347         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5348         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5349   #endif
5350 }
5351
5352 int __init ip6_route_init(void)
5353 {
5354         int ret;
5355         int cpu;
5356
5357         ret = -ENOMEM;
5358         ip6_dst_ops_template.kmem_cachep =
5359                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5360                                   SLAB_HWCACHE_ALIGN, NULL);
5361         if (!ip6_dst_ops_template.kmem_cachep)
5362                 goto out;
5363
5364         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5365         if (ret)
5366                 goto out_kmem_cache;
5367
5368         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5369         if (ret)
5370                 goto out_dst_entries;
5371
5372         ret = register_pernet_subsys(&ip6_route_net_ops);
5373         if (ret)
5374                 goto out_register_inetpeer;
5375
5376         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5377
5378         ret = fib6_init();
5379         if (ret)
5380                 goto out_register_subsys;
5381
5382         ret = xfrm6_init();
5383         if (ret)
5384                 goto out_fib6_init;
5385
5386         ret = fib6_rules_init();
5387         if (ret)
5388                 goto xfrm6_init;
5389
5390         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5391         if (ret)
5392                 goto fib6_rules_init;
5393
5394         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5395                                    inet6_rtm_newroute, NULL, 0);
5396         if (ret < 0)
5397                 goto out_register_late_subsys;
5398
5399         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5400                                    inet6_rtm_delroute, NULL, 0);
5401         if (ret < 0)
5402                 goto out_register_late_subsys;
5403
5404         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5405                                    inet6_rtm_getroute, NULL,
5406                                    RTNL_FLAG_DOIT_UNLOCKED);
5407         if (ret < 0)
5408                 goto out_register_late_subsys;
5409
5410         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5411         if (ret)
5412                 goto out_register_late_subsys;
5413
5414         for_each_possible_cpu(cpu) {
5415                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5416
5417                 INIT_LIST_HEAD(&ul->head);
5418                 spin_lock_init(&ul->lock);
5419         }
5420
5421 out:
5422         return ret;
5423
5424 out_register_late_subsys:
5425         rtnl_unregister_all(PF_INET6);
5426         unregister_pernet_subsys(&ip6_route_net_late_ops);
5427 fib6_rules_init:
5428         fib6_rules_cleanup();
5429 xfrm6_init:
5430         xfrm6_fini();
5431 out_fib6_init:
5432         fib6_gc_cleanup();
5433 out_register_subsys:
5434         unregister_pernet_subsys(&ip6_route_net_ops);
5435 out_register_inetpeer:
5436         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5437 out_dst_entries:
5438         dst_entries_destroy(&ip6_dst_blackhole_ops);
5439 out_kmem_cache:
5440         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5441         goto out;
5442 }
5443
5444 void ip6_route_cleanup(void)
5445 {
5446         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5447         unregister_pernet_subsys(&ip6_route_net_late_ops);
5448         fib6_rules_cleanup();
5449         xfrm6_fini();
5450         fib6_gc_cleanup();
5451         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5452         unregister_pernet_subsys(&ip6_route_net_ops);
5453         dst_entries_destroy(&ip6_dst_blackhole_ops);
5454         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5455 }