ipv6: ip6_route_output does not modify sk parameter, so make it const
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  Raising RT6_DEBUG to 3 (or more) turns
 * RDBG() and RT6_TRACE() into printk calls; below that they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Route Information option helpers; only declared when CONFIG_IPV6_ROUTE_INFO is set. */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   struct in6_addr *prefix,
					   int prefixlen,
					   struct in6_addr *gwaddr,
					   int ifindex,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   struct in6_addr *prefix,
					   int prefixlen,
					   struct in6_addr *gwaddr,
					   int ifindex);
#endif /* CONFIG_IPV6_ROUTE_INFO */
99
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
101 {
102         struct rt6_info *rt = (struct rt6_info *) dst;
103         struct inet_peer *peer;
104         u32 *p = NULL;
105
106         if (!rt->rt6i_peer)
107                 rt6_bind_peer(rt, 1);
108
109         peer = rt->rt6i_peer;
110         if (peer) {
111                 u32 *old_p = __DST_METRICS_PTR(old);
112                 unsigned long prev, new;
113
114                 p = peer->metrics;
115                 if (inet_metrics_new(peer))
116                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
117
118                 new = (unsigned long) p;
119                 prev = cmpxchg(&dst->_metrics, old, new);
120
121                 if (prev != old) {
122                         p = __DST_METRICS_PTR(prev);
123                         if (prev & DST_METRICS_READ_ONLY)
124                                 p = NULL;
125                 }
126         }
127         return p;
128 }
129
130 static struct dst_ops ip6_dst_ops_template = {
131         .family                 =       AF_INET6,
132         .protocol               =       cpu_to_be16(ETH_P_IPV6),
133         .gc                     =       ip6_dst_gc,
134         .gc_thresh              =       1024,
135         .check                  =       ip6_dst_check,
136         .default_advmss         =       ip6_default_advmss,
137         .default_mtu            =       ip6_default_mtu,
138         .cow_metrics            =       ipv6_cow_metrics,
139         .destroy                =       ip6_dst_destroy,
140         .ifdown                 =       ip6_dst_ifdown,
141         .negative_advice        =       ip6_negative_advice,
142         .link_failure           =       ip6_link_failure,
143         .update_pmtu            =       ip6_rt_update_pmtu,
144         .local_out              =       __ip6_local_out,
145 };
146
/* default_mtu callback for blackhole dsts: always reports 0, @dst is ignored. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	const unsigned int blackhole_mtu = 0;

	return blackhole_mtu;
}
151
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
153 {
154 }
155
156 static struct dst_ops ip6_dst_blackhole_ops = {
157         .family                 =       AF_INET6,
158         .protocol               =       cpu_to_be16(ETH_P_IPV6),
159         .destroy                =       ip6_dst_destroy,
160         .check                  =       ip6_dst_check,
161         .default_mtu            =       ip6_blackhole_default_mtu,
162         .default_advmss         =       ip6_default_advmss,
163         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
164 };
165
166 static const u32 ip6_template_metrics[RTAX_MAX] = {
167         [RTAX_HOPLIMIT - 1] = 255,
168 };
169
170 static struct rt6_info ip6_null_entry_template = {
171         .dst = {
172                 .__refcnt       = ATOMIC_INIT(1),
173                 .__use          = 1,
174                 .obsolete       = -1,
175                 .error          = -ENETUNREACH,
176                 .input          = ip6_pkt_discard,
177                 .output         = ip6_pkt_discard_out,
178         },
179         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
180         .rt6i_protocol  = RTPROT_KERNEL,
181         .rt6i_metric    = ~(u32) 0,
182         .rt6i_ref       = ATOMIC_INIT(1),
183 };
184
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/* Template for the "prohibit" reject route: dst error is -EACCES. */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt = ATOMIC_INIT(1),
		.__use	  = 1,
		.obsolete = -1,
		.error	  = -EACCES,
		.input	  = ip6_pkt_prohibit,
		.output	  = ip6_pkt_prohibit_out,
	},
	.rt6i_flags    = RTF_REJECT | RTF_NONEXTHOP,
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_metric   = ~(u32)0,
	.rt6i_ref      = ATOMIC_INIT(1),
};

/* Template for the "blackhole" reject route: dst error is -EINVAL, packets go to dst_discard. */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt = ATOMIC_INIT(1),
		.__use	  = 1,
		.obsolete = -1,
		.error	  = -EINVAL,
		.input	  = dst_discard,
		.output	  = dst_discard,
	},
	.rt6i_flags    = RTF_REJECT | RTF_NONEXTHOP,
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_metric   = ~(u32)0,
	.rt6i_ref      = ATOMIC_INIT(1),
};

#endif /* CONFIG_IPV6_MULTIPLE_TABLES */
221
/*
 * Allocate a dst entry through dst_alloc() using @ops and hand it back
 * typed as a struct rt6_info.  The second dst_alloc() argument is 0.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt = (struct rt6_info *)dst_alloc(ops, 0);

	return rt;
}
227
228 static void ip6_dst_destroy(struct dst_entry *dst)
229 {
230         struct rt6_info *rt = (struct rt6_info *)dst;
231         struct inet6_dev *idev = rt->rt6i_idev;
232         struct inet_peer *peer = rt->rt6i_peer;
233
234         if (idev != NULL) {
235                 rt->rt6i_idev = NULL;
236                 in6_dev_put(idev);
237         }
238         if (peer) {
239                 rt->rt6i_peer = NULL;
240                 inet_putpeer(peer);
241         }
242 }
243
244 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
245
246 static u32 rt6_peer_genid(void)
247 {
248         return atomic_read(&__rt6_peer_genid);
249 }
250
251 void rt6_bind_peer(struct rt6_info *rt, int create)
252 {
253         struct inet_peer *peer;
254
255         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
256         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
257                 inet_putpeer(peer);
258         else
259                 rt->rt6i_peer_genid = rt6_peer_genid();
260 }
261
262 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
263                            int how)
264 {
265         struct rt6_info *rt = (struct rt6_info *)dst;
266         struct inet6_dev *idev = rt->rt6i_idev;
267         struct net_device *loopback_dev =
268                 dev_net(dev)->loopback_dev;
269
270         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
271                 struct inet6_dev *loopback_idev =
272                         in6_dev_get(loopback_dev);
273                 if (loopback_idev != NULL) {
274                         rt->rt6i_idev = loopback_idev;
275                         in6_dev_put(idev);
276                 }
277         }
278 }
279
280 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
281 {
282         return (rt->rt6i_flags & RTF_EXPIRES) &&
283                 time_after(jiffies, rt->rt6i_expires);
284 }
285
286 static inline int rt6_need_strict(struct in6_addr *daddr)
287 {
288         return ipv6_addr_type(daddr) &
289                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
290 }
291
292 /*
293  *      Route lookup. Any table->tb6_lock is implied.
294  */
295
296 static inline struct rt6_info *rt6_device_match(struct net *net,
297                                                     struct rt6_info *rt,
298                                                     struct in6_addr *saddr,
299                                                     int oif,
300                                                     int flags)
301 {
302         struct rt6_info *local = NULL;
303         struct rt6_info *sprt;
304
305         if (!oif && ipv6_addr_any(saddr))
306                 goto out;
307
308         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
309                 struct net_device *dev = sprt->rt6i_dev;
310
311                 if (oif) {
312                         if (dev->ifindex == oif)
313                                 return sprt;
314                         if (dev->flags & IFF_LOOPBACK) {
315                                 if (sprt->rt6i_idev == NULL ||
316                                     sprt->rt6i_idev->dev->ifindex != oif) {
317                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
318                                                 continue;
319                                         if (local && (!oif ||
320                                                       local->rt6i_idev->dev->ifindex == oif))
321                                                 continue;
322                                 }
323                                 local = sprt;
324                         }
325                 } else {
326                         if (ipv6_chk_addr(net, saddr, dev,
327                                           flags & RT6_LOOKUP_F_IFACE))
328                                 return sprt;
329                 }
330         }
331
332         if (oif) {
333                 if (local)
334                         return local;
335
336                 if (flags & RT6_LOOKUP_F_IFACE)
337                         return net->ipv6.ip6_null_entry;
338         }
339 out:
340         return rt;
341 }
342
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation towards the
 * next hop of @rt when its neighbour entry is not in a NUD_VALID state and
 * the per-device rtr_probe_interval has passed since the entry was last
 * updated.  A NULL @rt, or a route without next hop, is ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		read_unlock_bh(&neigh->lock);
		return;
	}

	/* Record the probe time before dropping the lock. */
	neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
377
378 /*
379  * Default Router Selection (RFC 2461 6.3.6)
380  */
381 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
382 {
383         struct net_device *dev = rt->rt6i_dev;
384         if (!oif || dev->ifindex == oif)
385                 return 2;
386         if ((dev->flags & IFF_LOOPBACK) &&
387             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
388                 return 1;
389         return 0;
390 }
391
392 static inline int rt6_check_neigh(struct rt6_info *rt)
393 {
394         struct neighbour *neigh = rt->rt6i_nexthop;
395         int m;
396         if (rt->rt6i_flags & RTF_NONEXTHOP ||
397             !(rt->rt6i_flags & RTF_GATEWAY))
398                 m = 1;
399         else if (neigh) {
400                 read_lock_bh(&neigh->lock);
401                 if (neigh->nud_state & NUD_VALID)
402                         m = 2;
403 #ifdef CONFIG_IPV6_ROUTER_PREF
404                 else if (neigh->nud_state & NUD_FAILED)
405                         m = 0;
406 #endif
407                 else
408                         m = 1;
409                 read_unlock_bh(&neigh->lock);
410         } else
411                 m = 0;
412         return m;
413 }
414
415 static int rt6_score_route(struct rt6_info *rt, int oif,
416                            int strict)
417 {
418         int m, n;
419
420         m = rt6_check_dev(rt, oif);
421         if (!m && (strict & RT6_LOOKUP_F_IFACE))
422                 return -1;
423 #ifdef CONFIG_IPV6_ROUTER_PREF
424         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
425 #endif
426         n = rt6_check_neigh(rt);
427         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
428                 return -1;
429         return m;
430 }
431
432 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
433                                    int *mpri, struct rt6_info *match)
434 {
435         int m;
436
437         if (rt6_check_expired(rt))
438                 goto out;
439
440         m = rt6_score_route(rt, oif, strict);
441         if (m < 0)
442                 goto out;
443
444         if (m > *mpri) {
445                 if (strict & RT6_LOOKUP_F_REACHABLE)
446                         rt6_probe(match);
447                 *mpri = m;
448                 match = rt;
449         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
450                 rt6_probe(rt);
451         }
452
453 out:
454         return match;
455 }
456
457 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
458                                      struct rt6_info *rr_head,
459                                      u32 metric, int oif, int strict)
460 {
461         struct rt6_info *rt, *match;
462         int mpri = -1;
463
464         match = NULL;
465         for (rt = rr_head; rt && rt->rt6i_metric == metric;
466              rt = rt->dst.rt6_next)
467                 match = find_match(rt, oif, strict, &mpri, match);
468         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
469              rt = rt->dst.rt6_next)
470                 match = find_match(rt, oif, strict, &mpri, match);
471
472         return match;
473 }
474
475 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
476 {
477         struct rt6_info *match, *rt0;
478         struct net *net;
479
480         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
481                   __func__, fn->leaf, oif);
482
483         rt0 = fn->rr_ptr;
484         if (!rt0)
485                 fn->rr_ptr = rt0 = fn->leaf;
486
487         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
488
489         if (!match &&
490             (strict & RT6_LOOKUP_F_REACHABLE)) {
491                 struct rt6_info *next = rt0->dst.rt6_next;
492
493                 /* no entries matched; do round-robin */
494                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
495                         next = fn->leaf;
496
497                 if (next != rt0)
498                         fn->rr_ptr = next;
499         }
500
501         RT6_TRACE("%s() => %p\n",
502                   __func__, match);
503
504         net = dev_net(rt0->rt6i_dev);
505         return match ? match : net->ipv6.ip6_null_entry;
506 }
507
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received Route Information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address for the route
 *
 * Validates the option, then deletes, adds or refreshes the matching
 * route-info route depending on the advertised lifetime.
 *
 * Returns 0 on success, -EINVAL when the option fails validation.
 *
 * NOTE(review): @len is a signed int compared against sizeof(); a negative
 * value would pass this check -- confirm callers never pass one.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;
	int min_length;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;

	if (rinfo->prefix_len > 64)
		min_length = 2;
	else if (rinfo->prefix_len > 0)
		min_length = 1;
	else
		min_length = 0;

	if (rinfo->length < min_length)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
581
/*
 * BACKTRACK() - climb the fib6 tree when the current node yielded only the
 * null entry.
 *
 * Expects the enclosing function to provide the locals `rt` and `fn` and
 * the labels `out` and `restart`.  Starting from `fn`, it moves to the
 * parent (or, if the parent has a subtree that is not `fn`, to the result
 * of looking up @saddr in that subtree).  It jumps to `restart` at the first
 * node flagged RTN_RTINFO and to `out` once a node flagged RTN_TL_ROOT is
 * reached.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == __net->ipv6.ip6_null_entry) { \
		struct fib6_node *parent; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			parent = fn->parent; \
			if (FIB6_SUBTREE(parent) && FIB6_SUBTREE(parent) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(parent), NULL, saddr); \
			else \
				fn = parent; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
599
600 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
601                                              struct fib6_table *table,
602                                              struct flowi6 *fl6, int flags)
603 {
604         struct fib6_node *fn;
605         struct rt6_info *rt;
606
607         read_lock_bh(&table->tb6_lock);
608         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
609 restart:
610         rt = fn->leaf;
611         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
612         BACKTRACK(net, &fl6->saddr);
613 out:
614         dst_use(&rt->dst, jiffies);
615         read_unlock_bh(&table->tb6_lock);
616         return rt;
617
618 }
619
620 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
621                             const struct in6_addr *saddr, int oif, int strict)
622 {
623         struct flowi6 fl6 = {
624                 .flowi6_oif = oif,
625                 .daddr = *daddr,
626         };
627         struct dst_entry *dst;
628         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
629
630         if (saddr) {
631                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
632                 flags |= RT6_LOOKUP_F_HAS_SADDR;
633         }
634
635         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
636         if (dst->error == 0)
637                 return (struct rt6_info *) dst;
638
639         dst_release(dst);
640
641         return NULL;
642 }
643
644 EXPORT_SYMBOL(rt6_lookup);
645
/*
 * ip6_ins_rt is called with table->tb6_lock NOT held.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is freed. In any case, if the caller does not hold the route,
 * it may be destroyed.
 */
651
652 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
653 {
654         int err;
655         struct fib6_table *table;
656
657         table = rt->rt6i_table;
658         write_lock_bh(&table->tb6_lock);
659         err = fib6_add(&table->tb6_root, rt, info);
660         write_unlock_bh(&table->tb6_lock);
661
662         return err;
663 }
664
665 int ip6_ins_rt(struct rt6_info *rt)
666 {
667         struct nl_info info = {
668                 .nl_net = dev_net(rt->rt6i_dev),
669         };
670         return __ip6_ins_rt(rt, &info);
671 }
672
673 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
674                                       struct in6_addr *saddr)
675 {
676         struct rt6_info *rt;
677
678         /*
679          *      Clone the route.
680          */
681
682         rt = ip6_rt_copy(ort);
683
684         if (rt) {
685                 struct neighbour *neigh;
686                 int attempts = !in_softirq();
687
688                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
689                         if (rt->rt6i_dst.plen != 128 &&
690                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
691                                 rt->rt6i_flags |= RTF_ANYCAST;
692                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
693                 }
694
695                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
696                 rt->rt6i_dst.plen = 128;
697                 rt->rt6i_flags |= RTF_CACHE;
698                 rt->dst.flags |= DST_HOST;
699
700 #ifdef CONFIG_IPV6_SUBTREES
701                 if (rt->rt6i_src.plen && saddr) {
702                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
703                         rt->rt6i_src.plen = 128;
704                 }
705 #endif
706
707         retry:
708                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
709                 if (IS_ERR(neigh)) {
710                         struct net *net = dev_net(rt->rt6i_dev);
711                         int saved_rt_min_interval =
712                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
713                         int saved_rt_elasticity =
714                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
715
716                         if (attempts-- > 0) {
717                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
718                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
719
720                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
721
722                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
723                                         saved_rt_elasticity;
724                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
725                                         saved_rt_min_interval;
726                                 goto retry;
727                         }
728
729                         if (net_ratelimit())
730                                 printk(KERN_WARNING
731                                        "ipv6: Neighbour table overflow.\n");
732                         dst_free(&rt->dst);
733                         return NULL;
734                 }
735                 rt->rt6i_nexthop = neigh;
736
737         }
738
739         return rt;
740 }
741
742 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
743 {
744         struct rt6_info *rt = ip6_rt_copy(ort);
745         if (rt) {
746                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
747                 rt->rt6i_dst.plen = 128;
748                 rt->rt6i_flags |= RTF_CACHE;
749                 rt->dst.flags |= DST_HOST;
750                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
751         }
752         return rt;
753 }
754
755 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
756                                       struct flowi6 *fl6, int flags)
757 {
758         struct fib6_node *fn;
759         struct rt6_info *rt, *nrt;
760         int strict = 0;
761         int attempts = 3;
762         int err;
763         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
764
765         strict |= flags & RT6_LOOKUP_F_IFACE;
766
767 relookup:
768         read_lock_bh(&table->tb6_lock);
769
770 restart_2:
771         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
772
773 restart:
774         rt = rt6_select(fn, oif, strict | reachable);
775
776         BACKTRACK(net, &fl6->saddr);
777         if (rt == net->ipv6.ip6_null_entry ||
778             rt->rt6i_flags & RTF_CACHE)
779                 goto out;
780
781         dst_hold(&rt->dst);
782         read_unlock_bh(&table->tb6_lock);
783
784         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
785                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
786         else if (!(rt->dst.flags & DST_HOST))
787                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
788         else
789                 goto out2;
790
791         dst_release(&rt->dst);
792         rt = nrt ? : net->ipv6.ip6_null_entry;
793
794         dst_hold(&rt->dst);
795         if (nrt) {
796                 err = ip6_ins_rt(nrt);
797                 if (!err)
798                         goto out2;
799         }
800
801         if (--attempts <= 0)
802                 goto out2;
803
804         /*
805          * Race condition! In the gap, when table->tb6_lock was
806          * released someone could insert this route.  Relookup.
807          */
808         dst_release(&rt->dst);
809         goto relookup;
810
811 out:
812         if (reachable) {
813                 reachable = 0;
814                 goto restart_2;
815         }
816         dst_hold(&rt->dst);
817         read_unlock_bh(&table->tb6_lock);
818 out2:
819         rt->dst.lastuse = jiffies;
820         rt->dst.__use++;
821
822         return rt;
823 }
824
825 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
826                                             struct flowi6 *fl6, int flags)
827 {
828         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
829 }
830
831 void ip6_route_input(struct sk_buff *skb)
832 {
833         struct ipv6hdr *iph = ipv6_hdr(skb);
834         struct net *net = dev_net(skb->dev);
835         int flags = RT6_LOOKUP_F_HAS_SADDR;
836         struct flowi6 fl6 = {
837                 .flowi6_iif = skb->dev->ifindex,
838                 .daddr = iph->daddr,
839                 .saddr = iph->saddr,
840                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
841                 .flowi6_mark = skb->mark,
842                 .flowi6_proto = iph->nexthdr,
843         };
844
845         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
846                 flags |= RT6_LOOKUP_F_IFACE;
847
848         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
849 }
850
851 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
852                                              struct flowi6 *fl6, int flags)
853 {
854         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
855 }
856
857 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
858                                     struct flowi6 *fl6)
859 {
860         int flags = 0;
861
862         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
863                 flags |= RT6_LOOKUP_F_IFACE;
864
865         if (!ipv6_addr_any(&fl6->saddr))
866                 flags |= RT6_LOOKUP_F_HAS_SADDR;
867         else if (sk)
868                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
869
870         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
871 }
872
873 EXPORT_SYMBOL(ip6_route_output);
874
875 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
876 {
877         struct rt6_info *rt = dst_alloc(&ip6_dst_blackhole_ops, 1);
878         struct rt6_info *ort = (struct rt6_info *) dst_orig;
879         struct dst_entry *new = NULL;
880
881         if (rt) {
882                 new = &rt->dst;
883
884                 new->__use = 1;
885                 new->input = dst_discard;
886                 new->output = dst_discard;
887
888                 dst_copy_metrics(new, &ort->dst);
889                 new->dev = ort->dst.dev;
890                 if (new->dev)
891                         dev_hold(new->dev);
892                 rt->rt6i_idev = ort->rt6i_idev;
893                 if (rt->rt6i_idev)
894                         in6_dev_hold(rt->rt6i_idev);
895                 rt->rt6i_expires = 0;
896
897                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
898                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
899                 rt->rt6i_metric = 0;
900
901                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
902 #ifdef CONFIG_IPV6_SUBTREES
903                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
904 #endif
905
906                 dst_free(new);
907         }
908
909         dst_release(dst_orig);
910         return new ? new : ERR_PTR(-ENOMEM);
911 }
912
913 /*
914  *      Destination cache support functions
915  */
916
917 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
918 {
919         struct rt6_info *rt;
920
921         rt = (struct rt6_info *) dst;
922
923         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
924                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
925                         if (!rt->rt6i_peer)
926                                 rt6_bind_peer(rt, 0);
927                         rt->rt6i_peer_genid = rt6_peer_genid();
928                 }
929                 return dst;
930         }
931         return NULL;
932 }
933
934 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
935 {
936         struct rt6_info *rt = (struct rt6_info *) dst;
937
938         if (rt) {
939                 if (rt->rt6i_flags & RTF_CACHE) {
940                         if (rt6_check_expired(rt)) {
941                                 ip6_del_rt(rt);
942                                 dst = NULL;
943                         }
944                 } else {
945                         dst_release(dst);
946                         dst = NULL;
947                 }
948         }
949         return dst;
950 }
951
952 static void ip6_link_failure(struct sk_buff *skb)
953 {
954         struct rt6_info *rt;
955
956         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
957
958         rt = (struct rt6_info *) skb_dst(skb);
959         if (rt) {
960                 if (rt->rt6i_flags&RTF_CACHE) {
961                         dst_set_expires(&rt->dst, 0);
962                         rt->rt6i_flags |= RTF_EXPIRES;
963                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
964                         rt->rt6i_node->fn_sernum = -1;
965         }
966 }
967
968 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
969 {
970         struct rt6_info *rt6 = (struct rt6_info*)dst;
971
972         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
973                 rt6->rt6i_flags |= RTF_MODIFIED;
974                 if (mtu < IPV6_MIN_MTU) {
975                         u32 features = dst_metric(dst, RTAX_FEATURES);
976                         mtu = IPV6_MIN_MTU;
977                         features |= RTAX_FEATURE_ALLFRAG;
978                         dst_metric_set(dst, RTAX_FEATURES, features);
979                 }
980                 dst_metric_set(dst, RTAX_MTU, mtu);
981         }
982 }
983
984 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
985 {
986         struct net_device *dev = dst->dev;
987         unsigned int mtu = dst_mtu(dst);
988         struct net *net = dev_net(dev);
989
990         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
991
992         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
993                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
994
995         /*
996          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
997          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
998          * IPV6_MAXPLEN is also valid and means: "any MSS,
999          * rely only on pmtu discovery"
1000          */
1001         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1002                 mtu = IPV6_MAXPLEN;
1003         return mtu;
1004 }
1005
1006 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1007 {
1008         unsigned int mtu = IPV6_MIN_MTU;
1009         struct inet6_dev *idev;
1010
1011         rcu_read_lock();
1012         idev = __in6_dev_get(dst->dev);
1013         if (idev)
1014                 mtu = idev->cnf.mtu6;
1015         rcu_read_unlock();
1016
1017         return mtu;
1018 }
1019
1020 static struct dst_entry *icmp6_dst_gc_list;
1021 static DEFINE_SPINLOCK(icmp6_dst_lock);
1022
1023 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1024                                   struct neighbour *neigh,
1025                                   const struct in6_addr *addr)
1026 {
1027         struct rt6_info *rt;
1028         struct inet6_dev *idev = in6_dev_get(dev);
1029         struct net *net = dev_net(dev);
1030
1031         if (unlikely(idev == NULL))
1032                 return NULL;
1033
1034         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1035         if (unlikely(rt == NULL)) {
1036                 in6_dev_put(idev);
1037                 goto out;
1038         }
1039
1040         dev_hold(dev);
1041         if (neigh)
1042                 neigh_hold(neigh);
1043         else {
1044                 neigh = ndisc_get_neigh(dev, addr);
1045                 if (IS_ERR(neigh))
1046                         neigh = NULL;
1047         }
1048
1049         rt->rt6i_dev      = dev;
1050         rt->rt6i_idev     = idev;
1051         rt->rt6i_nexthop  = neigh;
1052         atomic_set(&rt->dst.__refcnt, 1);
1053         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1054         rt->dst.output  = ip6_output;
1055
1056 #if 0   /* there's no chance to use these for ndisc */
1057         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1058                                 ? DST_HOST
1059                                 : 0;
1060         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1061         rt->rt6i_dst.plen = 128;
1062 #endif
1063
1064         spin_lock_bh(&icmp6_dst_lock);
1065         rt->dst.next = icmp6_dst_gc_list;
1066         icmp6_dst_gc_list = &rt->dst;
1067         spin_unlock_bh(&icmp6_dst_lock);
1068
1069         fib6_force_start_gc(net);
1070
1071 out:
1072         return &rt->dst;
1073 }
1074
1075 int icmp6_dst_gc(void)
1076 {
1077         struct dst_entry *dst, **pprev;
1078         int more = 0;
1079
1080         spin_lock_bh(&icmp6_dst_lock);
1081         pprev = &icmp6_dst_gc_list;
1082
1083         while ((dst = *pprev) != NULL) {
1084                 if (!atomic_read(&dst->__refcnt)) {
1085                         *pprev = dst->next;
1086                         dst_free(dst);
1087                 } else {
1088                         pprev = &dst->next;
1089                         ++more;
1090                 }
1091         }
1092
1093         spin_unlock_bh(&icmp6_dst_lock);
1094
1095         return more;
1096 }
1097
1098 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1099                             void *arg)
1100 {
1101         struct dst_entry *dst, **pprev;
1102
1103         spin_lock_bh(&icmp6_dst_lock);
1104         pprev = &icmp6_dst_gc_list;
1105         while ((dst = *pprev) != NULL) {
1106                 struct rt6_info *rt = (struct rt6_info *) dst;
1107                 if (func(rt, arg)) {
1108                         *pprev = dst->next;
1109                         dst_free(dst);
1110                 } else {
1111                         pprev = &dst->next;
1112                 }
1113         }
1114         spin_unlock_bh(&icmp6_dst_lock);
1115 }
1116
1117 static int ip6_dst_gc(struct dst_ops *ops)
1118 {
1119         unsigned long now = jiffies;
1120         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1121         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1122         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1123         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1124         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1125         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1126         int entries;
1127
1128         entries = dst_entries_get_fast(ops);
1129         if (time_after(rt_last_gc + rt_min_interval, now) &&
1130             entries <= rt_max_size)
1131                 goto out;
1132
1133         net->ipv6.ip6_rt_gc_expire++;
1134         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1135         net->ipv6.ip6_rt_last_gc = now;
1136         entries = dst_entries_get_slow(ops);
1137         if (entries < ops->gc_thresh)
1138                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1139 out:
1140         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1141         return entries > rt_max_size;
1142 }
1143
1144 /* Clean host part of a prefix. Not necessary in radix tree,
1145    but results in cleaner routing tables.
1146
1147    Remove it only when all the things will work!
1148  */
1149
1150 int ip6_dst_hoplimit(struct dst_entry *dst)
1151 {
1152         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1153         if (hoplimit == 0) {
1154                 struct net_device *dev = dst->dev;
1155                 struct inet6_dev *idev;
1156
1157                 rcu_read_lock();
1158                 idev = __in6_dev_get(dev);
1159                 if (idev)
1160                         hoplimit = idev->cnf.hop_limit;
1161                 else
1162                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1163                 rcu_read_unlock();
1164         }
1165         return hoplimit;
1166 }
1167 EXPORT_SYMBOL(ip6_dst_hoplimit);
1168
1169 /*
1170  *
1171  */
1172
1173 int ip6_route_add(struct fib6_config *cfg)
1174 {
1175         int err;
1176         struct net *net = cfg->fc_nlinfo.nl_net;
1177         struct rt6_info *rt = NULL;
1178         struct net_device *dev = NULL;
1179         struct inet6_dev *idev = NULL;
1180         struct fib6_table *table;
1181         int addr_type;
1182
1183         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1184                 return -EINVAL;
1185 #ifndef CONFIG_IPV6_SUBTREES
1186         if (cfg->fc_src_len)
1187                 return -EINVAL;
1188 #endif
1189         if (cfg->fc_ifindex) {
1190                 err = -ENODEV;
1191                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1192                 if (!dev)
1193                         goto out;
1194                 idev = in6_dev_get(dev);
1195                 if (!idev)
1196                         goto out;
1197         }
1198
1199         if (cfg->fc_metric == 0)
1200                 cfg->fc_metric = IP6_RT_PRIO_USER;
1201
1202         table = fib6_new_table(net, cfg->fc_table);
1203         if (table == NULL) {
1204                 err = -ENOBUFS;
1205                 goto out;
1206         }
1207
1208         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1209
1210         if (rt == NULL) {
1211                 err = -ENOMEM;
1212                 goto out;
1213         }
1214
1215         rt->dst.obsolete = -1;
1216         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1217                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1218                                 0;
1219
1220         if (cfg->fc_protocol == RTPROT_UNSPEC)
1221                 cfg->fc_protocol = RTPROT_BOOT;
1222         rt->rt6i_protocol = cfg->fc_protocol;
1223
1224         addr_type = ipv6_addr_type(&cfg->fc_dst);
1225
1226         if (addr_type & IPV6_ADDR_MULTICAST)
1227                 rt->dst.input = ip6_mc_input;
1228         else if (cfg->fc_flags & RTF_LOCAL)
1229                 rt->dst.input = ip6_input;
1230         else
1231                 rt->dst.input = ip6_forward;
1232
1233         rt->dst.output = ip6_output;
1234
1235         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1236         rt->rt6i_dst.plen = cfg->fc_dst_len;
1237         if (rt->rt6i_dst.plen == 128)
1238                rt->dst.flags = DST_HOST;
1239
1240 #ifdef CONFIG_IPV6_SUBTREES
1241         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1242         rt->rt6i_src.plen = cfg->fc_src_len;
1243 #endif
1244
1245         rt->rt6i_metric = cfg->fc_metric;
1246
1247         /* We cannot add true routes via loopback here,
1248            they would result in kernel looping; promote them to reject routes
1249          */
1250         if ((cfg->fc_flags & RTF_REJECT) ||
1251             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1252                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1253                 /* hold loopback dev/idev if we haven't done so. */
1254                 if (dev != net->loopback_dev) {
1255                         if (dev) {
1256                                 dev_put(dev);
1257                                 in6_dev_put(idev);
1258                         }
1259                         dev = net->loopback_dev;
1260                         dev_hold(dev);
1261                         idev = in6_dev_get(dev);
1262                         if (!idev) {
1263                                 err = -ENODEV;
1264                                 goto out;
1265                         }
1266                 }
1267                 rt->dst.output = ip6_pkt_discard_out;
1268                 rt->dst.input = ip6_pkt_discard;
1269                 rt->dst.error = -ENETUNREACH;
1270                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1271                 goto install_route;
1272         }
1273
1274         if (cfg->fc_flags & RTF_GATEWAY) {
1275                 struct in6_addr *gw_addr;
1276                 int gwa_type;
1277
1278                 gw_addr = &cfg->fc_gateway;
1279                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1280                 gwa_type = ipv6_addr_type(gw_addr);
1281
1282                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1283                         struct rt6_info *grt;
1284
1285                         /* IPv6 strictly inhibits using not link-local
1286                            addresses as nexthop address.
1287                            Otherwise, router will not able to send redirects.
1288                            It is very good, but in some (rare!) circumstances
1289                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1290                            some exceptions. --ANK
1291                          */
1292                         err = -EINVAL;
1293                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1294                                 goto out;
1295
1296                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1297
1298                         err = -EHOSTUNREACH;
1299                         if (grt == NULL)
1300                                 goto out;
1301                         if (dev) {
1302                                 if (dev != grt->rt6i_dev) {
1303                                         dst_release(&grt->dst);
1304                                         goto out;
1305                                 }
1306                         } else {
1307                                 dev = grt->rt6i_dev;
1308                                 idev = grt->rt6i_idev;
1309                                 dev_hold(dev);
1310                                 in6_dev_hold(grt->rt6i_idev);
1311                         }
1312                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1313                                 err = 0;
1314                         dst_release(&grt->dst);
1315
1316                         if (err)
1317                                 goto out;
1318                 }
1319                 err = -EINVAL;
1320                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1321                         goto out;
1322         }
1323
1324         err = -ENODEV;
1325         if (dev == NULL)
1326                 goto out;
1327
1328         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1329                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1330                 if (IS_ERR(rt->rt6i_nexthop)) {
1331                         err = PTR_ERR(rt->rt6i_nexthop);
1332                         rt->rt6i_nexthop = NULL;
1333                         goto out;
1334                 }
1335         }
1336
1337         rt->rt6i_flags = cfg->fc_flags;
1338
1339 install_route:
1340         if (cfg->fc_mx) {
1341                 struct nlattr *nla;
1342                 int remaining;
1343
1344                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1345                         int type = nla_type(nla);
1346
1347                         if (type) {
1348                                 if (type > RTAX_MAX) {
1349                                         err = -EINVAL;
1350                                         goto out;
1351                                 }
1352
1353                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1354                         }
1355                 }
1356         }
1357
1358         rt->dst.dev = dev;
1359         rt->rt6i_idev = idev;
1360         rt->rt6i_table = table;
1361
1362         cfg->fc_nlinfo.nl_net = dev_net(dev);
1363
1364         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1365
1366 out:
1367         if (dev)
1368                 dev_put(dev);
1369         if (idev)
1370                 in6_dev_put(idev);
1371         if (rt)
1372                 dst_free(&rt->dst);
1373         return err;
1374 }
1375
1376 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1377 {
1378         int err;
1379         struct fib6_table *table;
1380         struct net *net = dev_net(rt->rt6i_dev);
1381
1382         if (rt == net->ipv6.ip6_null_entry)
1383                 return -ENOENT;
1384
1385         table = rt->rt6i_table;
1386         write_lock_bh(&table->tb6_lock);
1387
1388         err = fib6_del(rt, info);
1389         dst_release(&rt->dst);
1390
1391         write_unlock_bh(&table->tb6_lock);
1392
1393         return err;
1394 }
1395
1396 int ip6_del_rt(struct rt6_info *rt)
1397 {
1398         struct nl_info info = {
1399                 .nl_net = dev_net(rt->rt6i_dev),
1400         };
1401         return __ip6_del_rt(rt, &info);
1402 }
1403
1404 static int ip6_route_del(struct fib6_config *cfg)
1405 {
1406         struct fib6_table *table;
1407         struct fib6_node *fn;
1408         struct rt6_info *rt;
1409         int err = -ESRCH;
1410
1411         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1412         if (table == NULL)
1413                 return err;
1414
1415         read_lock_bh(&table->tb6_lock);
1416
1417         fn = fib6_locate(&table->tb6_root,
1418                          &cfg->fc_dst, cfg->fc_dst_len,
1419                          &cfg->fc_src, cfg->fc_src_len);
1420
1421         if (fn) {
1422                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1423                         if (cfg->fc_ifindex &&
1424                             (rt->rt6i_dev == NULL ||
1425                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1426                                 continue;
1427                         if (cfg->fc_flags & RTF_GATEWAY &&
1428                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1429                                 continue;
1430                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1431                                 continue;
1432                         dst_hold(&rt->dst);
1433                         read_unlock_bh(&table->tb6_lock);
1434
1435                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1436                 }
1437         }
1438         read_unlock_bh(&table->tb6_lock);
1439
1440         return err;
1441 }
1442
1443 /*
1444  *      Handle redirects
1445  */
1446 struct ip6rd_flowi {
1447         struct flowi6 fl6;
1448         struct in6_addr gateway;
1449 };
1450
1451 static struct rt6_info *__ip6_route_redirect(struct net *net,
1452                                              struct fib6_table *table,
1453                                              struct flowi6 *fl6,
1454                                              int flags)
1455 {
1456         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1457         struct rt6_info *rt;
1458         struct fib6_node *fn;
1459
1460         /*
1461          * Get the "current" route for this destination and
1462          * check if the redirect has come from approriate router.
1463          *
1464          * RFC 2461 specifies that redirects should only be
1465          * accepted if they come from the nexthop to the target.
1466          * Due to the way the routes are chosen, this notion
1467          * is a bit fuzzy and one might need to check all possible
1468          * routes.
1469          */
1470
1471         read_lock_bh(&table->tb6_lock);
1472         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1473 restart:
1474         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1475                 /*
1476                  * Current route is on-link; redirect is always invalid.
1477                  *
1478                  * Seems, previous statement is not true. It could
1479                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1480                  * But then router serving it might decide, that we should
1481                  * know truth 8)8) --ANK (980726).
1482                  */
1483                 if (rt6_check_expired(rt))
1484                         continue;
1485                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1486                         continue;
1487                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1488                         continue;
1489                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1490                         continue;
1491                 break;
1492         }
1493
1494         if (!rt)
1495                 rt = net->ipv6.ip6_null_entry;
1496         BACKTRACK(net, &fl6->saddr);
1497 out:
1498         dst_hold(&rt->dst);
1499
1500         read_unlock_bh(&table->tb6_lock);
1501
1502         return rt;
1503 };
1504
1505 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1506                                            struct in6_addr *src,
1507                                            struct in6_addr *gateway,
1508                                            struct net_device *dev)
1509 {
1510         int flags = RT6_LOOKUP_F_HAS_SADDR;
1511         struct net *net = dev_net(dev);
1512         struct ip6rd_flowi rdfl = {
1513                 .fl6 = {
1514                         .flowi6_oif = dev->ifindex,
1515                         .daddr = *dest,
1516                         .saddr = *src,
1517                 },
1518         };
1519
1520         ipv6_addr_copy(&rdfl.gateway, gateway);
1521
1522         if (rt6_need_strict(dest))
1523                 flags |= RT6_LOOKUP_F_IFACE;
1524
1525         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1526                                                    flags, __ip6_route_redirect);
1527 }
1528
1529 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1530                   struct in6_addr *saddr,
1531                   struct neighbour *neigh, u8 *lladdr, int on_link)
1532 {
1533         struct rt6_info *rt, *nrt = NULL;
1534         struct netevent_redirect netevent;
1535         struct net *net = dev_net(neigh->dev);
1536
1537         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1538
1539         if (rt == net->ipv6.ip6_null_entry) {
1540                 if (net_ratelimit())
1541                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1542                                "for redirect target\n");
1543                 goto out;
1544         }
1545
1546         /*
1547          *      We have finally decided to accept it.
1548          */
1549
1550         neigh_update(neigh, lladdr, NUD_STALE,
1551                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1552                      NEIGH_UPDATE_F_OVERRIDE|
1553                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1554                                      NEIGH_UPDATE_F_ISROUTER))
1555                      );
1556
1557         /*
1558          * Redirect received -> path was valid.
1559          * Look, redirects are sent only in response to data packets,
1560          * so that this nexthop apparently is reachable. --ANK
1561          */
1562         dst_confirm(&rt->dst);
1563
1564         /* Duplicate redirect: silently ignore. */
1565         if (neigh == rt->dst.neighbour)
1566                 goto out;
1567
1568         nrt = ip6_rt_copy(rt);
1569         if (nrt == NULL)
1570                 goto out;
1571
1572         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1573         if (on_link)
1574                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1575
1576         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1577         nrt->rt6i_dst.plen = 128;
1578         nrt->dst.flags |= DST_HOST;
1579
1580         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1581         nrt->rt6i_nexthop = neigh_clone(neigh);
1582
1583         if (ip6_ins_rt(nrt))
1584                 goto out;
1585
1586         netevent.old = &rt->dst;
1587         netevent.new = &nrt->dst;
1588         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1589
1590         if (rt->rt6i_flags&RTF_CACHE) {
1591                 ip6_del_rt(rt);
1592                 return;
1593         }
1594
1595 out:
1596         dst_release(&rt->dst);
1597 }
1598
1599 /*
1600  *      Handle ICMP "packet too big" messages
1601  *      i.e. Path MTU discovery
1602  */
1603
1604 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1605                              struct net *net, u32 pmtu, int ifindex)
1606 {
1607         struct rt6_info *rt, *nrt;
1608         int allfrag = 0;
1609 again:
1610         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1611         if (rt == NULL)
1612                 return;
1613
1614         if (rt6_check_expired(rt)) {
1615                 ip6_del_rt(rt);
1616                 goto again;
1617         }
1618
1619         if (pmtu >= dst_mtu(&rt->dst))
1620                 goto out;
1621
1622         if (pmtu < IPV6_MIN_MTU) {
1623                 /*
1624                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1625                  * MTU (1280) and a fragment header should always be included
1626                  * after a node receiving Too Big message reporting PMTU is
1627                  * less than the IPv6 Minimum Link MTU.
1628                  */
1629                 pmtu = IPV6_MIN_MTU;
1630                 allfrag = 1;
1631         }
1632
1633         /* New mtu received -> path was valid.
1634            They are sent only in response to data packets,
1635            so that this nexthop apparently is reachable. --ANK
1636          */
1637         dst_confirm(&rt->dst);
1638
1639         /* Host route. If it is static, it would be better
1640            not to override it, but add new one, so that
1641            when cache entry will expire old pmtu
1642            would return automatically.
1643          */
1644         if (rt->rt6i_flags & RTF_CACHE) {
1645                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1646                 if (allfrag) {
1647                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1648                         features |= RTAX_FEATURE_ALLFRAG;
1649                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1650                 }
1651                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1652                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1653                 goto out;
1654         }
1655
1656         /* Network route.
1657            Two cases are possible:
1658            1. It is connected route. Action: COW
1659            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1660          */
1661         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1662                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1663         else
1664                 nrt = rt6_alloc_clone(rt, daddr);
1665
1666         if (nrt) {
1667                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1668                 if (allfrag) {
1669                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1670                         features |= RTAX_FEATURE_ALLFRAG;
1671                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1672                 }
1673
1674                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1675                  * happened within 5 mins, the recommended timer is 10 mins.
1676                  * Here this route expiration time is set to ip6_rt_mtu_expires
1677                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1678                  * and detecting PMTU increase will be automatically happened.
1679                  */
1680                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1681                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1682
1683                 ip6_ins_rt(nrt);
1684         }
1685 out:
1686         dst_release(&rt->dst);
1687 }
1688
1689 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1690                         struct net_device *dev, u32 pmtu)
1691 {
1692         struct net *net = dev_net(dev);
1693
1694         /*
1695          * RFC 1981 states that a node "MUST reduce the size of the packets it
1696          * is sending along the path" that caused the Packet Too Big message.
1697          * Since it's not possible in the general case to determine which
1698          * interface was used to send the original packet, we update the MTU
1699          * on the interface that will be used to send future packets. We also
1700          * update the MTU on the interface that received the Packet Too Big in
1701          * case the original packet was forced out that interface with
1702          * SO_BINDTODEVICE or similar. This is the next best thing to the
1703          * correct behaviour, which would be to update the MTU on all
1704          * interfaces.
1705          */
1706         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1707         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1708 }
1709
1710 /*
1711  *      Misc support functions
1712  */
1713
1714 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1715 {
1716         struct net *net = dev_net(ort->rt6i_dev);
1717         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1718
1719         if (rt) {
1720                 rt->dst.input = ort->dst.input;
1721                 rt->dst.output = ort->dst.output;
1722
1723                 dst_copy_metrics(&rt->dst, &ort->dst);
1724                 rt->dst.error = ort->dst.error;
1725                 rt->dst.dev = ort->dst.dev;
1726                 if (rt->dst.dev)
1727                         dev_hold(rt->dst.dev);
1728                 rt->rt6i_idev = ort->rt6i_idev;
1729                 if (rt->rt6i_idev)
1730                         in6_dev_hold(rt->rt6i_idev);
1731                 rt->dst.lastuse = jiffies;
1732                 rt->rt6i_expires = 0;
1733
1734                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1735                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1736                 rt->rt6i_metric = 0;
1737
1738                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1739 #ifdef CONFIG_IPV6_SUBTREES
1740                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1741 #endif
1742                 rt->rt6i_table = ort->rt6i_table;
1743         }
1744         return rt;
1745 }
1746
1747 #ifdef CONFIG_IPV6_ROUTE_INFO
1748 static struct rt6_info *rt6_get_route_info(struct net *net,
1749                                            struct in6_addr *prefix, int prefixlen,
1750                                            struct in6_addr *gwaddr, int ifindex)
1751 {
1752         struct fib6_node *fn;
1753         struct rt6_info *rt = NULL;
1754         struct fib6_table *table;
1755
1756         table = fib6_get_table(net, RT6_TABLE_INFO);
1757         if (table == NULL)
1758                 return NULL;
1759
1760         write_lock_bh(&table->tb6_lock);
1761         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1762         if (!fn)
1763                 goto out;
1764
1765         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1766                 if (rt->rt6i_dev->ifindex != ifindex)
1767                         continue;
1768                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1769                         continue;
1770                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1771                         continue;
1772                 dst_hold(&rt->dst);
1773                 break;
1774         }
1775 out:
1776         write_unlock_bh(&table->tb6_lock);
1777         return rt;
1778 }
1779
1780 static struct rt6_info *rt6_add_route_info(struct net *net,
1781                                            struct in6_addr *prefix, int prefixlen,
1782                                            struct in6_addr *gwaddr, int ifindex,
1783                                            unsigned pref)
1784 {
1785         struct fib6_config cfg = {
1786                 .fc_table       = RT6_TABLE_INFO,
1787                 .fc_metric      = IP6_RT_PRIO_USER,
1788                 .fc_ifindex     = ifindex,
1789                 .fc_dst_len     = prefixlen,
1790                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1791                                   RTF_UP | RTF_PREF(pref),
1792                 .fc_nlinfo.pid = 0,
1793                 .fc_nlinfo.nlh = NULL,
1794                 .fc_nlinfo.nl_net = net,
1795         };
1796
1797         ipv6_addr_copy(&cfg.fc_dst, prefix);
1798         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1799
1800         /* We should treat it as a default route if prefix length is 0. */
1801         if (!prefixlen)
1802                 cfg.fc_flags |= RTF_DEFAULT;
1803
1804         ip6_route_add(&cfg);
1805
1806         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1807 }
1808 #endif
1809
1810 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1811 {
1812         struct rt6_info *rt;
1813         struct fib6_table *table;
1814
1815         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1816         if (table == NULL)
1817                 return NULL;
1818
1819         write_lock_bh(&table->tb6_lock);
1820         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1821                 if (dev == rt->rt6i_dev &&
1822                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1823                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1824                         break;
1825         }
1826         if (rt)
1827                 dst_hold(&rt->dst);
1828         write_unlock_bh(&table->tb6_lock);
1829         return rt;
1830 }
1831
1832 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1833                                      struct net_device *dev,
1834                                      unsigned int pref)
1835 {
1836         struct fib6_config cfg = {
1837                 .fc_table       = RT6_TABLE_DFLT,
1838                 .fc_metric      = IP6_RT_PRIO_USER,
1839                 .fc_ifindex     = dev->ifindex,
1840                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1841                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1842                 .fc_nlinfo.pid = 0,
1843                 .fc_nlinfo.nlh = NULL,
1844                 .fc_nlinfo.nl_net = dev_net(dev),
1845         };
1846
1847         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1848
1849         ip6_route_add(&cfg);
1850
1851         return rt6_get_dflt_router(gwaddr, dev);
1852 }
1853
1854 void rt6_purge_dflt_routers(struct net *net)
1855 {
1856         struct rt6_info *rt;
1857         struct fib6_table *table;
1858
1859         /* NOTE: Keep consistent with rt6_get_dflt_router */
1860         table = fib6_get_table(net, RT6_TABLE_DFLT);
1861         if (table == NULL)
1862                 return;
1863
1864 restart:
1865         read_lock_bh(&table->tb6_lock);
1866         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1867                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1868                         dst_hold(&rt->dst);
1869                         read_unlock_bh(&table->tb6_lock);
1870                         ip6_del_rt(rt);
1871                         goto restart;
1872                 }
1873         }
1874         read_unlock_bh(&table->tb6_lock);
1875 }
1876
1877 static void rtmsg_to_fib6_config(struct net *net,
1878                                  struct in6_rtmsg *rtmsg,
1879                                  struct fib6_config *cfg)
1880 {
1881         memset(cfg, 0, sizeof(*cfg));
1882
1883         cfg->fc_table = RT6_TABLE_MAIN;
1884         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1885         cfg->fc_metric = rtmsg->rtmsg_metric;
1886         cfg->fc_expires = rtmsg->rtmsg_info;
1887         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1888         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1889         cfg->fc_flags = rtmsg->rtmsg_flags;
1890
1891         cfg->fc_nlinfo.nl_net = net;
1892
1893         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1894         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1895         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1896 }
1897
1898 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1899 {
1900         struct fib6_config cfg;
1901         struct in6_rtmsg rtmsg;
1902         int err;
1903
1904         switch(cmd) {
1905         case SIOCADDRT:         /* Add a route */
1906         case SIOCDELRT:         /* Delete a route */
1907                 if (!capable(CAP_NET_ADMIN))
1908                         return -EPERM;
1909                 err = copy_from_user(&rtmsg, arg,
1910                                      sizeof(struct in6_rtmsg));
1911                 if (err)
1912                         return -EFAULT;
1913
1914                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1915
1916                 rtnl_lock();
1917                 switch (cmd) {
1918                 case SIOCADDRT:
1919                         err = ip6_route_add(&cfg);
1920                         break;
1921                 case SIOCDELRT:
1922                         err = ip6_route_del(&cfg);
1923                         break;
1924                 default:
1925                         err = -EINVAL;
1926                 }
1927                 rtnl_unlock();
1928
1929                 return err;
1930         }
1931
1932         return -EINVAL;
1933 }
1934
1935 /*
1936  *      Drop the packet on the floor
1937  */
1938
1939 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1940 {
1941         int type;
1942         struct dst_entry *dst = skb_dst(skb);
1943         switch (ipstats_mib_noroutes) {
1944         case IPSTATS_MIB_INNOROUTES:
1945                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1946                 if (type == IPV6_ADDR_ANY) {
1947                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1948                                       IPSTATS_MIB_INADDRERRORS);
1949                         break;
1950                 }
1951                 /* FALLTHROUGH */
1952         case IPSTATS_MIB_OUTNOROUTES:
1953                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1954                               ipstats_mib_noroutes);
1955                 break;
1956         }
1957         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1958         kfree_skb(skb);
1959         return 0;
1960 }
1961
1962 static int ip6_pkt_discard(struct sk_buff *skb)
1963 {
1964         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1965 }
1966
1967 static int ip6_pkt_discard_out(struct sk_buff *skb)
1968 {
1969         skb->dev = skb_dst(skb)->dev;
1970         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1971 }
1972
1973 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1974
1975 static int ip6_pkt_prohibit(struct sk_buff *skb)
1976 {
1977         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1978 }
1979
1980 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1981 {
1982         skb->dev = skb_dst(skb)->dev;
1983         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1984 }
1985
1986 #endif
1987
1988 /*
1989  *      Allocate a dst for local (unicast / anycast) address.
1990  */
1991
1992 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1993                                     const struct in6_addr *addr,
1994                                     int anycast)
1995 {
1996         struct net *net = dev_net(idev->dev);
1997         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1998         struct neighbour *neigh;
1999
2000         if (rt == NULL) {
2001                 if (net_ratelimit())
2002                         pr_warning("IPv6:  Maximum number of routes reached,"
2003                                    " consider increasing route/max_size.\n");
2004                 return ERR_PTR(-ENOMEM);
2005         }
2006
2007         dev_hold(net->loopback_dev);
2008         in6_dev_hold(idev);
2009
2010         rt->dst.flags = DST_HOST;
2011         rt->dst.input = ip6_input;
2012         rt->dst.output = ip6_output;
2013         rt->rt6i_dev = net->loopback_dev;
2014         rt->rt6i_idev = idev;
2015         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
2016         rt->dst.obsolete = -1;
2017
2018         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2019         if (anycast)
2020                 rt->rt6i_flags |= RTF_ANYCAST;
2021         else
2022                 rt->rt6i_flags |= RTF_LOCAL;
2023         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2024         if (IS_ERR(neigh)) {
2025                 dst_free(&rt->dst);
2026
2027                 return ERR_CAST(neigh);
2028         }
2029         rt->rt6i_nexthop = neigh;
2030
2031         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2032         rt->rt6i_dst.plen = 128;
2033         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2034
2035         atomic_set(&rt->dst.__refcnt, 1);
2036
2037         return rt;
2038 }
2039
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
        struct net_device *dev;         /* device going down; NULL matches any device */
        struct net *net;                /* namespace whose ip6_null_entry is spared */
};
2044
2045 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2046 {
2047         const struct arg_dev_net *adn = arg;
2048         const struct net_device *dev = adn->dev;
2049
2050         if ((rt->rt6i_dev == dev || dev == NULL) &&
2051             rt != adn->net->ipv6.ip6_null_entry) {
2052                 RT6_TRACE("deleted by ifdown %p\n", rt);
2053                 return -1;
2054         }
2055         return 0;
2056 }
2057
2058 void rt6_ifdown(struct net *net, struct net_device *dev)
2059 {
2060         struct arg_dev_net adn = {
2061                 .dev = dev,
2062                 .net = net,
2063         };
2064
2065         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2066         icmp6_clean_all(fib6_ifdown, &adn);
2067 }
2068
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
        struct net_device *dev;         /* device whose MTU changed */
        unsigned mtu;                   /* the new MTU */
};
2074
2075 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2076 {
2077         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2078         struct inet6_dev *idev;
2079
2080         /* In IPv6 pmtu discovery is not optional,
2081            so that RTAX_MTU lock cannot disable it.
2082            We still use this lock to block changes
2083            caused by addrconf/ndisc.
2084         */
2085
2086         idev = __in6_dev_get(arg->dev);
2087         if (idev == NULL)
2088                 return 0;
2089
2090         /* For administrative MTU increase, there is no way to discover
2091            IPv6 PMTU increase, so PMTU increase should be updated here.
2092            Since RFC 1981 doesn't include administrative MTU increase
2093            update PMTU increase is a MUST. (i.e. jumbo frame)
2094          */
2095         /*
2096            If new MTU is less than route PMTU, this new MTU will be the
2097            lowest MTU in the path, update the route PMTU to reflect PMTU
2098            decreases; if new MTU is greater than route PMTU, and the
2099            old MTU is the lowest MTU in the path, update the route PMTU
2100            to reflect the increase. In this case if the other nodes' MTU
2101            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2102            PMTU discouvery.
2103          */
2104         if (rt->rt6i_dev == arg->dev &&
2105             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2106             (dst_mtu(&rt->dst) >= arg->mtu ||
2107              (dst_mtu(&rt->dst) < arg->mtu &&
2108               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2109                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2110         }
2111         return 0;
2112 }
2113
2114 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2115 {
2116         struct rt6_mtu_change_arg arg = {
2117                 .dev = dev,
2118                 .mtu = mtu,
2119         };
2120
2121         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2122 }
2123
2124 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2125         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2126         [RTA_OIF]               = { .type = NLA_U32 },
2127         [RTA_IIF]               = { .type = NLA_U32 },
2128         [RTA_PRIORITY]          = { .type = NLA_U32 },
2129         [RTA_METRICS]           = { .type = NLA_NESTED },
2130 };
2131
2132 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2133                               struct fib6_config *cfg)
2134 {
2135         struct rtmsg *rtm;
2136         struct nlattr *tb[RTA_MAX+1];
2137         int err;
2138
2139         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2140         if (err < 0)
2141                 goto errout;
2142
2143         err = -EINVAL;
2144         rtm = nlmsg_data(nlh);
2145         memset(cfg, 0, sizeof(*cfg));
2146
2147         cfg->fc_table = rtm->rtm_table;
2148         cfg->fc_dst_len = rtm->rtm_dst_len;
2149         cfg->fc_src_len = rtm->rtm_src_len;
2150         cfg->fc_flags = RTF_UP;
2151         cfg->fc_protocol = rtm->rtm_protocol;
2152
2153         if (rtm->rtm_type == RTN_UNREACHABLE)
2154                 cfg->fc_flags |= RTF_REJECT;
2155
2156         if (rtm->rtm_type == RTN_LOCAL)
2157                 cfg->fc_flags |= RTF_LOCAL;
2158
2159         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2160         cfg->fc_nlinfo.nlh = nlh;
2161         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2162
2163         if (tb[RTA_GATEWAY]) {
2164                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2165                 cfg->fc_flags |= RTF_GATEWAY;
2166         }
2167
2168         if (tb[RTA_DST]) {
2169                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2170
2171                 if (nla_len(tb[RTA_DST]) < plen)
2172                         goto errout;
2173
2174                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2175         }
2176
2177         if (tb[RTA_SRC]) {
2178                 int plen = (rtm->rtm_src_len + 7) >> 3;
2179
2180                 if (nla_len(tb[RTA_SRC]) < plen)
2181                         goto errout;
2182
2183                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2184         }
2185
2186         if (tb[RTA_OIF])
2187                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2188
2189         if (tb[RTA_PRIORITY])
2190                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2191
2192         if (tb[RTA_METRICS]) {
2193                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2194                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2195         }
2196
2197         if (tb[RTA_TABLE])
2198                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2199
2200         err = 0;
2201 errout:
2202         return err;
2203 }
2204
2205 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2206 {
2207         struct fib6_config cfg;
2208         int err;
2209
2210         err = rtm_to_fib6_config(skb, nlh, &cfg);
2211         if (err < 0)
2212                 return err;
2213
2214         return ip6_route_del(&cfg);
2215 }
2216
2217 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2218 {
2219         struct fib6_config cfg;
2220         int err;
2221
2222         err = rtm_to_fib6_config(skb, nlh, &cfg);
2223         if (err < 0)
2224                 return err;
2225
2226         return ip6_route_add(&cfg);
2227 }
2228
2229 static inline size_t rt6_nlmsg_size(void)
2230 {
2231         return NLMSG_ALIGN(sizeof(struct rtmsg))
2232                + nla_total_size(16) /* RTA_SRC */
2233                + nla_total_size(16) /* RTA_DST */
2234                + nla_total_size(16) /* RTA_GATEWAY */
2235                + nla_total_size(16) /* RTA_PREFSRC */
2236                + nla_total_size(4) /* RTA_TABLE */
2237                + nla_total_size(4) /* RTA_IIF */
2238                + nla_total_size(4) /* RTA_OIF */
2239                + nla_total_size(4) /* RTA_PRIORITY */
2240                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2241                + nla_total_size(sizeof(struct rta_cacheinfo));
2242 }
2243
2244 static int rt6_fill_node(struct net *net,
2245                          struct sk_buff *skb, struct rt6_info *rt,
2246                          struct in6_addr *dst, struct in6_addr *src,
2247                          int iif, int type, u32 pid, u32 seq,
2248                          int prefix, int nowait, unsigned int flags)
2249 {
2250         struct rtmsg *rtm;
2251         struct nlmsghdr *nlh;
2252         long expires;
2253         u32 table;
2254
2255         if (prefix) {   /* user wants prefix routes only */
2256                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2257                         /* success since this is not a prefix route */
2258                         return 1;
2259                 }
2260         }
2261
2262         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2263         if (nlh == NULL)
2264                 return -EMSGSIZE;
2265
2266         rtm = nlmsg_data(nlh);
2267         rtm->rtm_family = AF_INET6;
2268         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2269         rtm->rtm_src_len = rt->rt6i_src.plen;
2270         rtm->rtm_tos = 0;
2271         if (rt->rt6i_table)
2272                 table = rt->rt6i_table->tb6_id;
2273         else
2274                 table = RT6_TABLE_UNSPEC;
2275         rtm->rtm_table = table;
2276         NLA_PUT_U32(skb, RTA_TABLE, table);
2277         if (rt->rt6i_flags&RTF_REJECT)
2278                 rtm->rtm_type = RTN_UNREACHABLE;
2279         else if (rt->rt6i_flags&RTF_LOCAL)
2280                 rtm->rtm_type = RTN_LOCAL;
2281         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2282                 rtm->rtm_type = RTN_LOCAL;
2283         else
2284                 rtm->rtm_type = RTN_UNICAST;
2285         rtm->rtm_flags = 0;
2286         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2287         rtm->rtm_protocol = rt->rt6i_protocol;
2288         if (rt->rt6i_flags&RTF_DYNAMIC)
2289                 rtm->rtm_protocol = RTPROT_REDIRECT;
2290         else if (rt->rt6i_flags & RTF_ADDRCONF)
2291                 rtm->rtm_protocol = RTPROT_KERNEL;
2292         else if (rt->rt6i_flags&RTF_DEFAULT)
2293                 rtm->rtm_protocol = RTPROT_RA;
2294
2295         if (rt->rt6i_flags&RTF_CACHE)
2296                 rtm->rtm_flags |= RTM_F_CLONED;
2297
2298         if (dst) {
2299                 NLA_PUT(skb, RTA_DST, 16, dst);
2300                 rtm->rtm_dst_len = 128;
2301         } else if (rtm->rtm_dst_len)
2302                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2303 #ifdef CONFIG_IPV6_SUBTREES
2304         if (src) {
2305                 NLA_PUT(skb, RTA_SRC, 16, src);
2306                 rtm->rtm_src_len = 128;
2307         } else if (rtm->rtm_src_len)
2308                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2309 #endif
2310         if (iif) {
2311 #ifdef CONFIG_IPV6_MROUTE
2312                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2313                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2314                         if (err <= 0) {
2315                                 if (!nowait) {
2316                                         if (err == 0)
2317                                                 return 0;
2318                                         goto nla_put_failure;
2319                                 } else {
2320                                         if (err == -EMSGSIZE)
2321                                                 goto nla_put_failure;
2322                                 }
2323                         }
2324                 } else
2325 #endif
2326                         NLA_PUT_U32(skb, RTA_IIF, iif);
2327         } else if (dst) {
2328                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2329                 struct in6_addr saddr_buf;
2330                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2331                                        dst, 0, &saddr_buf) == 0)
2332                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2333         }
2334
2335         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2336                 goto nla_put_failure;
2337
2338         if (rt->dst.neighbour)
2339                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2340
2341         if (rt->dst.dev)
2342                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2343
2344         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2345
2346         if (!(rt->rt6i_flags & RTF_EXPIRES))
2347                 expires = 0;
2348         else if (rt->rt6i_expires - jiffies < INT_MAX)
2349                 expires = rt->rt6i_expires - jiffies;
2350         else
2351                 expires = INT_MAX;
2352
2353         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2354                                expires, rt->dst.error) < 0)
2355                 goto nla_put_failure;
2356
2357         return nlmsg_end(skb, nlh);
2358
2359 nla_put_failure:
2360         nlmsg_cancel(skb, nlh);
2361         return -EMSGSIZE;
2362 }
2363
2364 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2365 {
2366         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2367         int prefix;
2368
2369         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2370                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2371                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2372         } else
2373                 prefix = 0;
2374
2375         return rt6_fill_node(arg->net,
2376                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2377                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2378                      prefix, 0, NLM_F_MULTI);
2379 }
2380
2381 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2382 {
2383         struct net *net = sock_net(in_skb->sk);
2384         struct nlattr *tb[RTA_MAX+1];
2385         struct rt6_info *rt;
2386         struct sk_buff *skb;
2387         struct rtmsg *rtm;
2388         struct flowi6 fl6;
2389         int err, iif = 0;
2390
2391         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2392         if (err < 0)
2393                 goto errout;
2394
2395         err = -EINVAL;
2396         memset(&fl6, 0, sizeof(fl6));
2397
2398         if (tb[RTA_SRC]) {
2399                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2400                         goto errout;
2401
2402                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2403         }
2404
2405         if (tb[RTA_DST]) {
2406                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2407                         goto errout;
2408
2409                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2410         }
2411
2412         if (tb[RTA_IIF])
2413                 iif = nla_get_u32(tb[RTA_IIF]);
2414
2415         if (tb[RTA_OIF])
2416                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2417
2418         if (iif) {
2419                 struct net_device *dev;
2420                 dev = __dev_get_by_index(net, iif);
2421                 if (!dev) {
2422                         err = -ENODEV;
2423                         goto errout;
2424                 }
2425         }
2426
2427         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2428         if (skb == NULL) {
2429                 err = -ENOBUFS;
2430                 goto errout;
2431         }
2432
2433         /* Reserve room for dummy headers, this skb can pass
2434            through good chunk of routing engine.
2435          */
2436         skb_reset_mac_header(skb);
2437         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2438
2439         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2440         skb_dst_set(skb, &rt->dst);
2441
2442         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2443                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2444                             nlh->nlmsg_seq, 0, 0, 0);
2445         if (err < 0) {
2446                 kfree_skb(skb);
2447                 goto errout;
2448         }
2449
2450         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2451 errout:
2452         return err;
2453 }
2454
2455 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2456 {
2457         struct sk_buff *skb;
2458         struct net *net = info->nl_net;
2459         u32 seq;
2460         int err;
2461
2462         err = -ENOBUFS;
2463         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2464
2465         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2466         if (skb == NULL)
2467                 goto errout;
2468
2469         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2470                                 event, info->pid, seq, 0, 0, 0);
2471         if (err < 0) {
2472                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2473                 WARN_ON(err == -EMSGSIZE);
2474                 kfree_skb(skb);
2475                 goto errout;
2476         }
2477         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2478                     info->nlh, gfp_any());
2479         return;
2480 errout:
2481         if (err < 0)
2482                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2483 }
2484
2485 static int ip6_route_dev_notify(struct notifier_block *this,
2486                                 unsigned long event, void *data)
2487 {
2488         struct net_device *dev = (struct net_device *)data;
2489         struct net *net = dev_net(dev);
2490
2491         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2492                 net->ipv6.ip6_null_entry->dst.dev = dev;
2493                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2494 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2495                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2496                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2497                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2498                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2499 #endif
2500         }
2501
2502         return NOTIFY_OK;
2503 }
2504
2505 /*
2506  *      /proc
2507  */
2508
2509 #ifdef CONFIG_PROC_FS
2510
/*
 * Buffer-cursor bookkeeping for /proc output.
 * NOTE(review): nothing in the visible part of this file references this
 * structure (the seq_file handlers below do not use it); confirm whether
 * it is still needed.
 */
struct rt6_proc_arg
{
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2519
2520 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2521 {
2522         struct seq_file *m = p_arg;
2523
2524         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2525
2526 #ifdef CONFIG_IPV6_SUBTREES
2527         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2528 #else
2529         seq_puts(m, "00000000000000000000000000000000 00 ");
2530 #endif
2531
2532         if (rt->rt6i_nexthop) {
2533                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2534         } else {
2535                 seq_puts(m, "00000000000000000000000000000000");
2536         }
2537         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2538                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2539                    rt->dst.__use, rt->rt6i_flags,
2540                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2541         return 0;
2542 }
2543
2544 static int ipv6_route_show(struct seq_file *m, void *v)
2545 {
2546         struct net *net = (struct net *)m->private;
2547         fib6_clean_all(net, rt6_info_route, 0, m);
2548         return 0;
2549 }
2550
2551 static int ipv6_route_open(struct inode *inode, struct file *file)
2552 {
2553         return single_open_net(inode, file, ipv6_route_show);
2554 }
2555
2556 static const struct file_operations ipv6_route_proc_fops = {
2557         .owner          = THIS_MODULE,
2558         .open           = ipv6_route_open,
2559         .read           = seq_read,
2560         .llseek         = seq_lseek,
2561         .release        = single_release_net,
2562 };
2563
2564 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2565 {
2566         struct net *net = (struct net *)seq->private;
2567         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2568                    net->ipv6.rt6_stats->fib_nodes,
2569                    net->ipv6.rt6_stats->fib_route_nodes,
2570                    net->ipv6.rt6_stats->fib_rt_alloc,
2571                    net->ipv6.rt6_stats->fib_rt_entries,
2572                    net->ipv6.rt6_stats->fib_rt_cache,
2573                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2574                    net->ipv6.rt6_stats->fib_discarded_routes);
2575
2576         return 0;
2577 }
2578
2579 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2580 {
2581         return single_open_net(inode, file, rt6_stats_seq_show);
2582 }
2583
2584 static const struct file_operations rt6_stats_seq_fops = {
2585         .owner   = THIS_MODULE,
2586         .open    = rt6_stats_seq_open,
2587         .read    = seq_read,
2588         .llseek  = seq_lseek,
2589         .release = single_release_net,
2590 };
2591 #endif  /* CONFIG_PROC_FS */
2592
2593 #ifdef CONFIG_SYSCTL
2594
2595 static
2596 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2597                               void __user *buffer, size_t *lenp, loff_t *ppos)
2598 {
2599         struct net *net;
2600         int delay;
2601         if (!write)
2602                 return -EINVAL;
2603
2604         net = (struct net *)ctl->extra1;
2605         delay = net->ipv6.sysctl.flush_delay;
2606         proc_dointvec(ctl, write, buffer, lenp, ppos);
2607         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2608         return 0;
2609 }
2610
/*
 * Template for the per-namespace IPv6 route sysctl table.
 *
 * ipv6_route_sysctl_init() duplicates this array and re-points the .data
 * member of entries 0..9 at the corresponding fields of the target
 * namespace, addressing the entries BY POSITION.  Adding, removing or
 * reordering entries here therefore requires updating the indices used in
 * ipv6_route_sysctl_init() as well.
 */
ctl_table ipv6_route_table_template[] = {
        {
                /* [0] write-only (0200); a write triggers a GC run */
                .procname       =       "flush",
                .data           =       &init_net.ipv6.sysctl.flush_delay,
                .maxlen         =       sizeof(int),
                .mode           =       0200,
                .proc_handler   =       ipv6_sysctl_rtcache_flush
        },
        {
                /* [1] */
                .procname       =       "gc_thresh",
                .data           =       &ip6_dst_ops_template.gc_thresh,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [2] */
                .procname       =       "max_size",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [3] same variable as [9], handled by proc_dointvec_jiffies */
                .procname       =       "gc_min_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [4] */
                .procname       =       "gc_timeout",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [5] */
                .procname       =       "gc_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [6] */
                .procname       =       "gc_elasticity",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [7] */
                .procname       =       "mtu_expires",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [8] */
                .procname       =       "min_adv_mss",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [9] same variable as [3], handled by proc_dointvec_ms_jiffies */
                .procname       =       "gc_min_interval_ms",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
        { }     /* empty terminating entry */
};
2684
2685 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2686 {
2687         struct ctl_table *table;
2688
2689         table = kmemdup(ipv6_route_table_template,
2690                         sizeof(ipv6_route_table_template),
2691                         GFP_KERNEL);
2692
2693         if (table) {
2694                 table[0].data = &net->ipv6.sysctl.flush_delay;
2695                 table[0].extra1 = net;
2696                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2697                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2698                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2699                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2700                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2701                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2702                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2703                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2704                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2705         }
2706
2707         return table;
2708 }
2709 #endif
2710
2711 static int __net_init ip6_route_net_init(struct net *net)
2712 {
2713         int ret = -ENOMEM;
2714
2715         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2716                sizeof(net->ipv6.ip6_dst_ops));
2717
2718         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2719                 goto out_ip6_dst_ops;
2720
2721         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2722                                            sizeof(*net->ipv6.ip6_null_entry),
2723                                            GFP_KERNEL);
2724         if (!net->ipv6.ip6_null_entry)
2725                 goto out_ip6_dst_entries;
2726         net->ipv6.ip6_null_entry->dst.path =
2727                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2728         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2729         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2730                          ip6_template_metrics, true);
2731
2732 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2733         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2734                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2735                                                GFP_KERNEL);
2736         if (!net->ipv6.ip6_prohibit_entry)
2737                 goto out_ip6_null_entry;
2738         net->ipv6.ip6_prohibit_entry->dst.path =
2739                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2740         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2741         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2742                          ip6_template_metrics, true);
2743
2744         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2745                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2746                                                GFP_KERNEL);
2747         if (!net->ipv6.ip6_blk_hole_entry)
2748                 goto out_ip6_prohibit_entry;
2749         net->ipv6.ip6_blk_hole_entry->dst.path =
2750                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2751         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2752         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2753                          ip6_template_metrics, true);
2754 #endif
2755
2756         net->ipv6.sysctl.flush_delay = 0;
2757         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2758         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2759         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2760         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2761         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2762         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2763         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2764
2765 #ifdef CONFIG_PROC_FS
2766         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2767         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2768 #endif
2769         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2770
2771         ret = 0;
2772 out:
2773         return ret;
2774
2775 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2776 out_ip6_prohibit_entry:
2777         kfree(net->ipv6.ip6_prohibit_entry);
2778 out_ip6_null_entry:
2779         kfree(net->ipv6.ip6_null_entry);
2780 #endif
2781 out_ip6_dst_entries:
2782         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2783 out_ip6_dst_ops:
2784         goto out;
2785 }
2786
/*
 * Per-namespace teardown, counterpart of ip6_route_net_init(): removes the
 * "ipv6_route" and "rt6_stats" proc entries, frees the route entries that
 * were duplicated from the templates, and calls dst_entries_destroy() on the
 * namespace's ip6_dst_ops.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
        proc_net_remove(net, "ipv6_route");
        proc_net_remove(net, "rt6_stats");
#endif
        kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        kfree(net->ipv6.ip6_prohibit_entry);
        kfree(net->ipv6.ip6_blk_hole_entry);
#endif
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
2800
/*
 * Per-network-namespace init/exit hooks for IPv6 routing; registered by
 * ip6_route_init() and unregistered by ip6_route_cleanup().
 */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};
2805
/*
 * Netdevice notifier block whose callback is ip6_route_dev_notify();
 * registered by ip6_route_init() and unregistered by ip6_route_cleanup().
 */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = 0,
};
2810
2811 int __init ip6_route_init(void)
2812 {
2813         int ret;
2814
2815         ret = -ENOMEM;
2816         ip6_dst_ops_template.kmem_cachep =
2817                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2818                                   SLAB_HWCACHE_ALIGN, NULL);
2819         if (!ip6_dst_ops_template.kmem_cachep)
2820                 goto out;
2821
2822         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2823         if (ret)
2824                 goto out_kmem_cache;
2825
2826         ret = register_pernet_subsys(&ip6_route_net_ops);
2827         if (ret)
2828                 goto out_dst_entries;
2829
2830         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2831
2832         /* Registering of the loopback is done before this portion of code,
2833          * the loopback reference in rt6_info will not be taken, do it
2834          * manually for init_net */
2835         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2836         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2837   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2838         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2839         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2840         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2841         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2842   #endif
2843         ret = fib6_init();
2844         if (ret)
2845                 goto out_register_subsys;
2846
2847         ret = xfrm6_init();
2848         if (ret)
2849                 goto out_fib6_init;
2850
2851         ret = fib6_rules_init();
2852         if (ret)
2853                 goto xfrm6_init;
2854
2855         ret = -ENOBUFS;
2856         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2857             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2858             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2859                 goto fib6_rules_init;
2860
2861         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2862         if (ret)
2863                 goto fib6_rules_init;
2864
2865 out:
2866         return ret;
2867
2868 fib6_rules_init:
2869         fib6_rules_cleanup();
2870 xfrm6_init:
2871         xfrm6_fini();
2872 out_fib6_init:
2873         fib6_gc_cleanup();
2874 out_register_subsys:
2875         unregister_pernet_subsys(&ip6_route_net_ops);
2876 out_dst_entries:
2877         dst_entries_destroy(&ip6_dst_blackhole_ops);
2878 out_kmem_cache:
2879         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2880         goto out;
2881 }
2882
/*
 * Tear down the IPv6 routing subsystem.
 *
 * Undoes the steps of ip6_route_init() in reverse order: netdevice
 * notifier, fib6 rules, xfrm6, fib6, per-namespace operations, blackhole
 * dst entry accounting and finally the dst slab cache.  The rtnetlink
 * handlers registered by ip6_route_init() are not unregistered here.
 */
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        /* destroyed last: the steps above may still release cached entries */
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}