Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  RDBG() and RT6_TRACE() only produce
 * output when RT6_DEBUG is 3 or higher; below that they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, ip6_pol_route() builds a cached clone with
 * rt6_alloc_clone() for routes that do not take the rt6_alloc_cow() path;
 * when zero, such routes are returned as they are.
 */
#define CLONE_OFFLINK_ROUTE 0
76
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static struct dst_ops ip6_dst_ops_template = {
101         .family                 =       AF_INET6,
102         .protocol               =       cpu_to_be16(ETH_P_IPV6),
103         .gc                     =       ip6_dst_gc,
104         .gc_thresh              =       1024,
105         .check                  =       ip6_dst_check,
106         .destroy                =       ip6_dst_destroy,
107         .ifdown                 =       ip6_dst_ifdown,
108         .negative_advice        =       ip6_negative_advice,
109         .link_failure           =       ip6_link_failure,
110         .update_pmtu            =       ip6_rt_update_pmtu,
111         .local_out              =       __ip6_local_out,
112         .entries                =       ATOMIC_INIT(0),
113 };
114
115 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
116 {
117 }
118
119 static struct dst_ops ip6_dst_blackhole_ops = {
120         .family                 =       AF_INET6,
121         .protocol               =       cpu_to_be16(ETH_P_IPV6),
122         .destroy                =       ip6_dst_destroy,
123         .check                  =       ip6_dst_check,
124         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
125         .entries                =       ATOMIC_INIT(0),
126 };
127
128 static struct rt6_info ip6_null_entry_template = {
129         .dst = {
130                 .__refcnt       = ATOMIC_INIT(1),
131                 .__use          = 1,
132                 .obsolete       = -1,
133                 .error          = -ENETUNREACH,
134                 .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
135                 .input          = ip6_pkt_discard,
136                 .output         = ip6_pkt_discard_out,
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_protocol  = RTPROT_KERNEL,
140         .rt6i_metric    = ~(u32) 0,
141         .rt6i_ref       = ATOMIC_INIT(1),
142 };
143
144 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
145
146 static int ip6_pkt_prohibit(struct sk_buff *skb);
147 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
148
149 static struct rt6_info ip6_prohibit_entry_template = {
150         .dst = {
151                 .__refcnt       = ATOMIC_INIT(1),
152                 .__use          = 1,
153                 .obsolete       = -1,
154                 .error          = -EACCES,
155                 .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
156                 .input          = ip6_pkt_prohibit,
157                 .output         = ip6_pkt_prohibit_out,
158         },
159         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
160         .rt6i_protocol  = RTPROT_KERNEL,
161         .rt6i_metric    = ~(u32) 0,
162         .rt6i_ref       = ATOMIC_INIT(1),
163 };
164
165 static struct rt6_info ip6_blk_hole_entry_template = {
166         .dst = {
167                 .__refcnt       = ATOMIC_INIT(1),
168                 .__use          = 1,
169                 .obsolete       = -1,
170                 .error          = -EINVAL,
171                 .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
172                 .input          = dst_discard,
173                 .output         = dst_discard,
174         },
175         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
176         .rt6i_protocol  = RTPROT_KERNEL,
177         .rt6i_metric    = ~(u32) 0,
178         .rt6i_ref       = ATOMIC_INIT(1),
179 };
180
181 #endif
182
/*
 * Allocate a dst entry from @ops via dst_alloc() and hand it back typed as
 * a struct rt6_info.  The result of dst_alloc() is passed through
 * unchecked, so callers must handle NULL.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct dst_entry *entry = dst_alloc(ops);

	return (struct rt6_info *)entry;
}
188
189 static void ip6_dst_destroy(struct dst_entry *dst)
190 {
191         struct rt6_info *rt = (struct rt6_info *)dst;
192         struct inet6_dev *idev = rt->rt6i_idev;
193
194         if (idev != NULL) {
195                 rt->rt6i_idev = NULL;
196                 in6_dev_put(idev);
197         }
198 }
199
200 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
201                            int how)
202 {
203         struct rt6_info *rt = (struct rt6_info *)dst;
204         struct inet6_dev *idev = rt->rt6i_idev;
205         struct net_device *loopback_dev =
206                 dev_net(dev)->loopback_dev;
207
208         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
209                 struct inet6_dev *loopback_idev =
210                         in6_dev_get(loopback_dev);
211                 if (loopback_idev != NULL) {
212                         rt->rt6i_idev = loopback_idev;
213                         in6_dev_put(idev);
214                 }
215         }
216 }
217
218 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
219 {
220         return (rt->rt6i_flags & RTF_EXPIRES &&
221                 time_after(jiffies, rt->rt6i_expires));
222 }
223
224 static inline int rt6_need_strict(struct in6_addr *daddr)
225 {
226         return (ipv6_addr_type(daddr) &
227                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
228 }
229
230 /*
231  *      Route lookup. Any table->tb6_lock is implied.
232  */
233
234 static inline struct rt6_info *rt6_device_match(struct net *net,
235                                                     struct rt6_info *rt,
236                                                     struct in6_addr *saddr,
237                                                     int oif,
238                                                     int flags)
239 {
240         struct rt6_info *local = NULL;
241         struct rt6_info *sprt;
242
243         if (!oif && ipv6_addr_any(saddr))
244                 goto out;
245
246         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
247                 struct net_device *dev = sprt->rt6i_dev;
248
249                 if (oif) {
250                         if (dev->ifindex == oif)
251                                 return sprt;
252                         if (dev->flags & IFF_LOOPBACK) {
253                                 if (sprt->rt6i_idev == NULL ||
254                                     sprt->rt6i_idev->dev->ifindex != oif) {
255                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
256                                                 continue;
257                                         if (local && (!oif ||
258                                                       local->rt6i_idev->dev->ifindex == oif))
259                                                 continue;
260                                 }
261                                 local = sprt;
262                         }
263                 } else {
264                         if (ipv6_chk_addr(net, saddr, dev,
265                                           flags & RT6_LOOKUP_F_IFACE))
266                                 return sprt;
267                 }
268         }
269
270         if (oif) {
271                 if (local)
272                         return local;
273
274                 if (flags & RT6_LOOKUP_F_IFACE)
275                         return net->ipv6.ip6_null_entry;
276         }
277 out:
278         return rt;
279 }
280
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if @rt's next-hop neighbour is not in a
 * NUD_VALID state and more than rtr_probe_interval jiffies have passed
 * since neigh->updated, send a Neighbour Solicitation for it to its
 * solicited-node multicast address.
 *
 * Does nothing when @rt is NULL, has no next hop, or the neighbour is
 * already valid.
 *
 * neigh->updated is modified here, so the neighbour lock is taken for
 * writing; the solicitation itself is sent after the lock is dropped.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int send_probe = 0;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	write_lock_bh(&neigh->lock);
	/* Re-check the state under the lock before touching 'updated'. */
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		neigh->updated = jiffies;
		send_probe = 1;
	}
	write_unlock_bh(&neigh->lock);

	if (!send_probe)
		return;

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
315
316 /*
317  * Default Router Selection (RFC 2461 6.3.6)
318  */
319 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
320 {
321         struct net_device *dev = rt->rt6i_dev;
322         if (!oif || dev->ifindex == oif)
323                 return 2;
324         if ((dev->flags & IFF_LOOPBACK) &&
325             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
326                 return 1;
327         return 0;
328 }
329
330 static inline int rt6_check_neigh(struct rt6_info *rt)
331 {
332         struct neighbour *neigh = rt->rt6i_nexthop;
333         int m;
334         if (rt->rt6i_flags & RTF_NONEXTHOP ||
335             !(rt->rt6i_flags & RTF_GATEWAY))
336                 m = 1;
337         else if (neigh) {
338                 read_lock_bh(&neigh->lock);
339                 if (neigh->nud_state & NUD_VALID)
340                         m = 2;
341 #ifdef CONFIG_IPV6_ROUTER_PREF
342                 else if (neigh->nud_state & NUD_FAILED)
343                         m = 0;
344 #endif
345                 else
346                         m = 1;
347                 read_unlock_bh(&neigh->lock);
348         } else
349                 m = 0;
350         return m;
351 }
352
353 static int rt6_score_route(struct rt6_info *rt, int oif,
354                            int strict)
355 {
356         int m, n;
357
358         m = rt6_check_dev(rt, oif);
359         if (!m && (strict & RT6_LOOKUP_F_IFACE))
360                 return -1;
361 #ifdef CONFIG_IPV6_ROUTER_PREF
362         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
363 #endif
364         n = rt6_check_neigh(rt);
365         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
366                 return -1;
367         return m;
368 }
369
370 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
371                                    int *mpri, struct rt6_info *match)
372 {
373         int m;
374
375         if (rt6_check_expired(rt))
376                 goto out;
377
378         m = rt6_score_route(rt, oif, strict);
379         if (m < 0)
380                 goto out;
381
382         if (m > *mpri) {
383                 if (strict & RT6_LOOKUP_F_REACHABLE)
384                         rt6_probe(match);
385                 *mpri = m;
386                 match = rt;
387         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
388                 rt6_probe(rt);
389         }
390
391 out:
392         return match;
393 }
394
395 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
396                                      struct rt6_info *rr_head,
397                                      u32 metric, int oif, int strict)
398 {
399         struct rt6_info *rt, *match;
400         int mpri = -1;
401
402         match = NULL;
403         for (rt = rr_head; rt && rt->rt6i_metric == metric;
404              rt = rt->dst.rt6_next)
405                 match = find_match(rt, oif, strict, &mpri, match);
406         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
407              rt = rt->dst.rt6_next)
408                 match = find_match(rt, oif, strict, &mpri, match);
409
410         return match;
411 }
412
413 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
414 {
415         struct rt6_info *match, *rt0;
416         struct net *net;
417
418         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
419                   __func__, fn->leaf, oif);
420
421         rt0 = fn->rr_ptr;
422         if (!rt0)
423                 fn->rr_ptr = rt0 = fn->leaf;
424
425         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
426
427         if (!match &&
428             (strict & RT6_LOOKUP_F_REACHABLE)) {
429                 struct rt6_info *next = rt0->dst.rt6_next;
430
431                 /* no entries matched; do round-robin */
432                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
433                         next = fn->leaf;
434
435                 if (next != rt0)
436                         fn->rr_ptr = next;
437         }
438
439         RT6_TRACE("%s() => %p\n",
440                   __func__, match);
441
442         net = dev_net(rt0->rt6i_dev);
443         return (match ? match : net->ipv6.ip6_null_entry);
444 }
445
446 #ifdef CONFIG_IPV6_ROUTE_INFO
447 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
448                   struct in6_addr *gwaddr)
449 {
450         struct net *net = dev_net(dev);
451         struct route_info *rinfo = (struct route_info *) opt;
452         struct in6_addr prefix_buf, *prefix;
453         unsigned int pref;
454         unsigned long lifetime;
455         struct rt6_info *rt;
456
457         if (len < sizeof(struct route_info)) {
458                 return -EINVAL;
459         }
460
461         /* Sanity check for prefix_len and length */
462         if (rinfo->length > 3) {
463                 return -EINVAL;
464         } else if (rinfo->prefix_len > 128) {
465                 return -EINVAL;
466         } else if (rinfo->prefix_len > 64) {
467                 if (rinfo->length < 2) {
468                         return -EINVAL;
469                 }
470         } else if (rinfo->prefix_len > 0) {
471                 if (rinfo->length < 1) {
472                         return -EINVAL;
473                 }
474         }
475
476         pref = rinfo->route_pref;
477         if (pref == ICMPV6_ROUTER_PREF_INVALID)
478                 return -EINVAL;
479
480         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
481
482         if (rinfo->length == 3)
483                 prefix = (struct in6_addr *)rinfo->prefix;
484         else {
485                 /* this function is safe */
486                 ipv6_addr_prefix(&prefix_buf,
487                                  (struct in6_addr *)rinfo->prefix,
488                                  rinfo->prefix_len);
489                 prefix = &prefix_buf;
490         }
491
492         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
493                                 dev->ifindex);
494
495         if (rt && !lifetime) {
496                 ip6_del_rt(rt);
497                 rt = NULL;
498         }
499
500         if (!rt && lifetime)
501                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
502                                         pref);
503         else if (rt)
504                 rt->rt6i_flags = RTF_ROUTEINFO |
505                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
506
507         if (rt) {
508                 if (!addrconf_finite_timeout(lifetime)) {
509                         rt->rt6i_flags &= ~RTF_EXPIRES;
510                 } else {
511                         rt->rt6i_expires = jiffies + HZ * lifetime;
512                         rt->rt6i_flags |= RTF_EXPIRES;
513                 }
514                 dst_release(&rt->dst);
515         }
516         return 0;
517 }
518 #endif
519
/*
 * BACKTRACK(__net, saddr) - retry a lookup further up the tree.
 *
 * Must be expanded inside a function that has local variables 'rt'
 * (struct rt6_info *) and 'fn' (struct fib6_node *) and the labels
 * 'restart' and 'out'.
 *
 * If 'rt' is the null entry of namespace __net, walk from 'fn' towards the
 * root: step to the parent, or into the parent's subtree (looked up with
 * saddr) when it has one that 'fn' is not already part of.  Jump to
 * 'restart' at the first node flagged RTN_RTINFO, or to 'out' once a node
 * flagged RTN_TL_ROOT is reached.  If 'rt' is anything else, do nothing.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
537
538 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
539                                              struct fib6_table *table,
540                                              struct flowi *fl, int flags)
541 {
542         struct fib6_node *fn;
543         struct rt6_info *rt;
544
545         read_lock_bh(&table->tb6_lock);
546         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
547 restart:
548         rt = fn->leaf;
549         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
550         BACKTRACK(net, &fl->fl6_src);
551 out:
552         dst_use(&rt->dst, jiffies);
553         read_unlock_bh(&table->tb6_lock);
554         return rt;
555
556 }
557
558 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
559                             const struct in6_addr *saddr, int oif, int strict)
560 {
561         struct flowi fl = {
562                 .oif = oif,
563                 .nl_u = {
564                         .ip6_u = {
565                                 .daddr = *daddr,
566                         },
567                 },
568         };
569         struct dst_entry *dst;
570         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
571
572         if (saddr) {
573                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
574                 flags |= RT6_LOOKUP_F_HAS_SADDR;
575         }
576
577         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
578         if (dst->error == 0)
579                 return (struct rt6_info *) dst;
580
581         dst_release(dst);
582
583         return NULL;
584 }
585
586 EXPORT_SYMBOL(rt6_lookup);
587
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason, the
   route is freed. In any case, if the caller does not hold a reference
   to it, it may be destroyed.
 */
593
594 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
595 {
596         int err;
597         struct fib6_table *table;
598
599         table = rt->rt6i_table;
600         write_lock_bh(&table->tb6_lock);
601         err = fib6_add(&table->tb6_root, rt, info);
602         write_unlock_bh(&table->tb6_lock);
603
604         return err;
605 }
606
607 int ip6_ins_rt(struct rt6_info *rt)
608 {
609         struct nl_info info = {
610                 .nl_net = dev_net(rt->rt6i_dev),
611         };
612         return __ip6_ins_rt(rt, &info);
613 }
614
615 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
616                                       struct in6_addr *saddr)
617 {
618         struct rt6_info *rt;
619
620         /*
621          *      Clone the route.
622          */
623
624         rt = ip6_rt_copy(ort);
625
626         if (rt) {
627                 struct neighbour *neigh;
628                 int attempts = !in_softirq();
629
630                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
631                         if (rt->rt6i_dst.plen != 128 &&
632                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
633                                 rt->rt6i_flags |= RTF_ANYCAST;
634                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
635                 }
636
637                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
638                 rt->rt6i_dst.plen = 128;
639                 rt->rt6i_flags |= RTF_CACHE;
640                 rt->dst.flags |= DST_HOST;
641
642 #ifdef CONFIG_IPV6_SUBTREES
643                 if (rt->rt6i_src.plen && saddr) {
644                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
645                         rt->rt6i_src.plen = 128;
646                 }
647 #endif
648
649         retry:
650                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
651                 if (IS_ERR(neigh)) {
652                         struct net *net = dev_net(rt->rt6i_dev);
653                         int saved_rt_min_interval =
654                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
655                         int saved_rt_elasticity =
656                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
657
658                         if (attempts-- > 0) {
659                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
660                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
661
662                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
663
664                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
665                                         saved_rt_elasticity;
666                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
667                                         saved_rt_min_interval;
668                                 goto retry;
669                         }
670
671                         if (net_ratelimit())
672                                 printk(KERN_WARNING
673                                        "Neighbour table overflow.\n");
674                         dst_free(&rt->dst);
675                         return NULL;
676                 }
677                 rt->rt6i_nexthop = neigh;
678
679         }
680
681         return rt;
682 }
683
684 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
685 {
686         struct rt6_info *rt = ip6_rt_copy(ort);
687         if (rt) {
688                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
689                 rt->rt6i_dst.plen = 128;
690                 rt->rt6i_flags |= RTF_CACHE;
691                 rt->dst.flags |= DST_HOST;
692                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
693         }
694         return rt;
695 }
696
/*
 * Core policy-routing lookup shared by the input and output paths.
 *
 * @net:   namespace
 * @table: FIB table to search
 * @oif:   interface index to match (callers pass fl->iif or fl->oif)
 * @fl:    flow being routed
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is carried over
 *         into the selection
 *
 * Returns a route with a reference taken by dst_hold() on every path;
 * the route's lastuse and __use counters are updated before returning.
 * The result may be the namespace's ip6_null_entry.
 *
 * The locals 'rt' and 'fn' and the labels 'restart' and 'out' are
 * required by BACKTRACK().
 */
697 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
698                                       struct flowi *fl, int flags)
699 {
700         struct fib6_node *fn;
701         struct rt6_info *rt, *nrt;
702         int strict = 0;
/* Upper bound on passes through the relookup loop below. */
703         int attempts = 3;
704         int err;
/* The first selection pass asks for reachable routers unless forwarding is on. */
705         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
706
707         strict |= flags & RT6_LOOKUP_F_IFACE;
708
709 relookup:
710         read_lock_bh(&table->tb6_lock);
711
712 restart_2:
713         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
714
715 restart:
716         rt = rt6_select(fn, oif, strict | reachable);
717
718         BACKTRACK(net, &fl->fl6_src);
/* The null entry and already-cached routes are finished at 'out', still under the lock. */
719         if (rt == net->ipv6.ip6_null_entry ||
720             rt->rt6i_flags & RTF_CACHE)
721                 goto out;
722
/* Pin the selected route before dropping the table lock. */
723         dst_hold(&rt->dst);
724         read_unlock_bh(&table->tb6_lock);
725
/* No next hop yet and one is needed: build a per-destination copy. */
726         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
727                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
728         else {
729 #if CLONE_OFFLINK_ROUTE
730                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
731 #else
/* Cloning disabled: return the selected route, keeping the reference taken above. */
732                 goto out2;
733 #endif
734         }
735
/* Swap the reference over to the new copy, or to the null entry if allocation failed. */
736         dst_release(&rt->dst);
737         rt = nrt ? : net->ipv6.ip6_null_entry;
738
739         dst_hold(&rt->dst);
740         if (nrt) {
741                 err = ip6_ins_rt(nrt);
742                 if (!err)
743                         goto out2;
744         }
745
/* Insert failed or nothing was allocated: give up once attempts are used up. */
746         if (--attempts <= 0)
747                 goto out2;
748
749         /*
750          * Race condition! In the gap, when table->tb6_lock was
751          * released someone could insert this route.  Relookup.
752          */
753         dst_release(&rt->dst);
754         goto relookup;
755
756 out:
/* Reached with the table lock held; if this pass required reachability, redo it once without. */
757         if (reachable) {
758                 reachable = 0;
759                 goto restart_2;
760         }
761         dst_hold(&rt->dst);
762         read_unlock_bh(&table->tb6_lock);
763 out2:
764         rt->dst.lastuse = jiffies;
765         rt->dst.__use++;
766
767         return rt;
768 }
769
770 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
771                                             struct flowi *fl, int flags)
772 {
773         return ip6_pol_route(net, table, fl->iif, fl, flags);
774 }
775
776 void ip6_route_input(struct sk_buff *skb)
777 {
778         struct ipv6hdr *iph = ipv6_hdr(skb);
779         struct net *net = dev_net(skb->dev);
780         int flags = RT6_LOOKUP_F_HAS_SADDR;
781         struct flowi fl = {
782                 .iif = skb->dev->ifindex,
783                 .nl_u = {
784                         .ip6_u = {
785                                 .daddr = iph->daddr,
786                                 .saddr = iph->saddr,
787                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
788                         },
789                 },
790                 .mark = skb->mark,
791                 .proto = iph->nexthdr,
792         };
793
794         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
795                 flags |= RT6_LOOKUP_F_IFACE;
796
797         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
798 }
799
800 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
801                                              struct flowi *fl, int flags)
802 {
803         return ip6_pol_route(net, table, fl->oif, fl, flags);
804 }
805
806 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
807                                     struct flowi *fl)
808 {
809         int flags = 0;
810
811         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
812                 flags |= RT6_LOOKUP_F_IFACE;
813
814         if (!ipv6_addr_any(&fl->fl6_src))
815                 flags |= RT6_LOOKUP_F_HAS_SADDR;
816         else if (sk)
817                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
818
819         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
820 }
821
822 EXPORT_SYMBOL(ip6_route_output);
823
824 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
825 {
826         struct rt6_info *ort = (struct rt6_info *) *dstp;
827         struct rt6_info *rt = (struct rt6_info *)
828                 dst_alloc(&ip6_dst_blackhole_ops);
829         struct dst_entry *new = NULL;
830
831         if (rt) {
832                 new = &rt->dst;
833
834                 atomic_set(&new->__refcnt, 1);
835                 new->__use = 1;
836                 new->input = dst_discard;
837                 new->output = dst_discard;
838
839                 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
840                 new->dev = ort->dst.dev;
841                 if (new->dev)
842                         dev_hold(new->dev);
843                 rt->rt6i_idev = ort->rt6i_idev;
844                 if (rt->rt6i_idev)
845                         in6_dev_hold(rt->rt6i_idev);
846                 rt->rt6i_expires = 0;
847
848                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
849                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
850                 rt->rt6i_metric = 0;
851
852                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
853 #ifdef CONFIG_IPV6_SUBTREES
854                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
855 #endif
856
857                 dst_free(new);
858         }
859
860         dst_release(*dstp);
861         *dstp = new;
862         return (new ? 0 : -ENOMEM);
863 }
864 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
865
866 /*
867  *      Destination cache support functions
868  */
869
870 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
871 {
872         struct rt6_info *rt;
873
874         rt = (struct rt6_info *) dst;
875
876         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
877                 return dst;
878
879         return NULL;
880 }
881
882 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
883 {
884         struct rt6_info *rt = (struct rt6_info *) dst;
885
886         if (rt) {
887                 if (rt->rt6i_flags & RTF_CACHE) {
888                         if (rt6_check_expired(rt)) {
889                                 ip6_del_rt(rt);
890                                 dst = NULL;
891                         }
892                 } else {
893                         dst_release(dst);
894                         dst = NULL;
895                 }
896         }
897         return dst;
898 }
899
900 static void ip6_link_failure(struct sk_buff *skb)
901 {
902         struct rt6_info *rt;
903
904         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
905
906         rt = (struct rt6_info *) skb_dst(skb);
907         if (rt) {
908                 if (rt->rt6i_flags&RTF_CACHE) {
909                         dst_set_expires(&rt->dst, 0);
910                         rt->rt6i_flags |= RTF_EXPIRES;
911                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
912                         rt->rt6i_node->fn_sernum = -1;
913         }
914 }
915
916 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
917 {
918         struct rt6_info *rt6 = (struct rt6_info*)dst;
919
920         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
921                 rt6->rt6i_flags |= RTF_MODIFIED;
922                 if (mtu < IPV6_MIN_MTU) {
923                         mtu = IPV6_MIN_MTU;
924                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
925                 }
926                 dst->metrics[RTAX_MTU-1] = mtu;
927                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
928         }
929 }
930
931 static int ipv6_get_mtu(struct net_device *dev);
932
933 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
934 {
935         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
936
937         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
938                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
939
940         /*
941          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
942          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
943          * IPV6_MAXPLEN is also valid and means: "any MSS,
944          * rely only on pmtu discovery"
945          */
946         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
947                 mtu = IPV6_MAXPLEN;
948         return mtu;
949 }
950
951 static struct dst_entry *icmp6_dst_gc_list;
952 static DEFINE_SPINLOCK(icmp6_dst_lock);
953
954 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
955                                   struct neighbour *neigh,
956                                   const struct in6_addr *addr)
957 {
958         struct rt6_info *rt;
959         struct inet6_dev *idev = in6_dev_get(dev);
960         struct net *net = dev_net(dev);
961
962         if (unlikely(idev == NULL))
963                 return NULL;
964
965         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
966         if (unlikely(rt == NULL)) {
967                 in6_dev_put(idev);
968                 goto out;
969         }
970
971         dev_hold(dev);
972         if (neigh)
973                 neigh_hold(neigh);
974         else {
975                 neigh = ndisc_get_neigh(dev, addr);
976                 if (IS_ERR(neigh))
977                         neigh = NULL;
978         }
979
980         rt->rt6i_dev      = dev;
981         rt->rt6i_idev     = idev;
982         rt->rt6i_nexthop  = neigh;
983         atomic_set(&rt->dst.__refcnt, 1);
984         rt->dst.metrics[RTAX_HOPLIMIT-1] = 255;
985         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
986         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
987         rt->dst.output  = ip6_output;
988
989 #if 0   /* there's no chance to use these for ndisc */
990         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
991                                 ? DST_HOST
992                                 : 0;
993         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
994         rt->rt6i_dst.plen = 128;
995 #endif
996
997         spin_lock_bh(&icmp6_dst_lock);
998         rt->dst.next = icmp6_dst_gc_list;
999         icmp6_dst_gc_list = &rt->dst;
1000         spin_unlock_bh(&icmp6_dst_lock);
1001
1002         fib6_force_start_gc(net);
1003
1004 out:
1005         return &rt->dst;
1006 }
1007
1008 int icmp6_dst_gc(void)
1009 {
1010         struct dst_entry *dst, *next, **pprev;
1011         int more = 0;
1012
1013         next = NULL;
1014
1015         spin_lock_bh(&icmp6_dst_lock);
1016         pprev = &icmp6_dst_gc_list;
1017
1018         while ((dst = *pprev) != NULL) {
1019                 if (!atomic_read(&dst->__refcnt)) {
1020                         *pprev = dst->next;
1021                         dst_free(dst);
1022                 } else {
1023                         pprev = &dst->next;
1024                         ++more;
1025                 }
1026         }
1027
1028         spin_unlock_bh(&icmp6_dst_lock);
1029
1030         return more;
1031 }
1032
1033 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1034                             void *arg)
1035 {
1036         struct dst_entry *dst, **pprev;
1037
1038         spin_lock_bh(&icmp6_dst_lock);
1039         pprev = &icmp6_dst_gc_list;
1040         while ((dst = *pprev) != NULL) {
1041                 struct rt6_info *rt = (struct rt6_info *) dst;
1042                 if (func(rt, arg)) {
1043                         *pprev = dst->next;
1044                         dst_free(dst);
1045                 } else {
1046                         pprev = &dst->next;
1047                 }
1048         }
1049         spin_unlock_bh(&icmp6_dst_lock);
1050 }
1051
1052 static int ip6_dst_gc(struct dst_ops *ops)
1053 {
1054         unsigned long now = jiffies;
1055         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1056         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1057         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1058         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1059         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1060         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1061
1062         if (time_after(rt_last_gc + rt_min_interval, now) &&
1063             atomic_read(&ops->entries) <= rt_max_size)
1064                 goto out;
1065
1066         net->ipv6.ip6_rt_gc_expire++;
1067         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1068         net->ipv6.ip6_rt_last_gc = now;
1069         if (atomic_read(&ops->entries) < ops->gc_thresh)
1070                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1071 out:
1072         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1073         return (atomic_read(&ops->entries) > rt_max_size);
1074 }
1075
1076 /* Clean host part of a prefix. Not necessary in radix tree,
1077    but results in cleaner routing tables.
1078
1079    Remove it only when all the things will work!
1080  */
1081
1082 static int ipv6_get_mtu(struct net_device *dev)
1083 {
1084         int mtu = IPV6_MIN_MTU;
1085         struct inet6_dev *idev;
1086
1087         rcu_read_lock();
1088         idev = __in6_dev_get(dev);
1089         if (idev)
1090                 mtu = idev->cnf.mtu6;
1091         rcu_read_unlock();
1092         return mtu;
1093 }
1094
1095 int ip6_dst_hoplimit(struct dst_entry *dst)
1096 {
1097         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1098         if (hoplimit < 0) {
1099                 struct net_device *dev = dst->dev;
1100                 struct inet6_dev *idev;
1101
1102                 rcu_read_lock();
1103                 idev = __in6_dev_get(dev);
1104                 if (idev)
1105                         hoplimit = idev->cnf.hop_limit;
1106                 else
1107                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1108                 rcu_read_unlock();
1109         }
1110         return hoplimit;
1111 }
1112
1113 /*
1114  *
1115  */
1116
1117 int ip6_route_add(struct fib6_config *cfg)
1118 {
1119         int err;
1120         struct net *net = cfg->fc_nlinfo.nl_net;
1121         struct rt6_info *rt = NULL;
1122         struct net_device *dev = NULL;
1123         struct inet6_dev *idev = NULL;
1124         struct fib6_table *table;
1125         int addr_type;
1126
1127         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1128                 return -EINVAL;
1129 #ifndef CONFIG_IPV6_SUBTREES
1130         if (cfg->fc_src_len)
1131                 return -EINVAL;
1132 #endif
1133         if (cfg->fc_ifindex) {
1134                 err = -ENODEV;
1135                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1136                 if (!dev)
1137                         goto out;
1138                 idev = in6_dev_get(dev);
1139                 if (!idev)
1140                         goto out;
1141         }
1142
1143         if (cfg->fc_metric == 0)
1144                 cfg->fc_metric = IP6_RT_PRIO_USER;
1145
1146         table = fib6_new_table(net, cfg->fc_table);
1147         if (table == NULL) {
1148                 err = -ENOBUFS;
1149                 goto out;
1150         }
1151
1152         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1153
1154         if (rt == NULL) {
1155                 err = -ENOMEM;
1156                 goto out;
1157         }
1158
1159         rt->dst.obsolete = -1;
1160         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1161                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1162                                 0;
1163
1164         if (cfg->fc_protocol == RTPROT_UNSPEC)
1165                 cfg->fc_protocol = RTPROT_BOOT;
1166         rt->rt6i_protocol = cfg->fc_protocol;
1167
1168         addr_type = ipv6_addr_type(&cfg->fc_dst);
1169
1170         if (addr_type & IPV6_ADDR_MULTICAST)
1171                 rt->dst.input = ip6_mc_input;
1172         else
1173                 rt->dst.input = ip6_forward;
1174
1175         rt->dst.output = ip6_output;
1176
1177         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1178         rt->rt6i_dst.plen = cfg->fc_dst_len;
1179         if (rt->rt6i_dst.plen == 128)
1180                rt->dst.flags = DST_HOST;
1181
1182 #ifdef CONFIG_IPV6_SUBTREES
1183         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1184         rt->rt6i_src.plen = cfg->fc_src_len;
1185 #endif
1186
1187         rt->rt6i_metric = cfg->fc_metric;
1188
1189         /* We cannot add true routes via loopback here,
1190            they would result in kernel looping; promote them to reject routes
1191          */
1192         if ((cfg->fc_flags & RTF_REJECT) ||
1193             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1194                 /* hold loopback dev/idev if we haven't done so. */
1195                 if (dev != net->loopback_dev) {
1196                         if (dev) {
1197                                 dev_put(dev);
1198                                 in6_dev_put(idev);
1199                         }
1200                         dev = net->loopback_dev;
1201                         dev_hold(dev);
1202                         idev = in6_dev_get(dev);
1203                         if (!idev) {
1204                                 err = -ENODEV;
1205                                 goto out;
1206                         }
1207                 }
1208                 rt->dst.output = ip6_pkt_discard_out;
1209                 rt->dst.input = ip6_pkt_discard;
1210                 rt->dst.error = -ENETUNREACH;
1211                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1212                 goto install_route;
1213         }
1214
1215         if (cfg->fc_flags & RTF_GATEWAY) {
1216                 struct in6_addr *gw_addr;
1217                 int gwa_type;
1218
1219                 gw_addr = &cfg->fc_gateway;
1220                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1221                 gwa_type = ipv6_addr_type(gw_addr);
1222
1223                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1224                         struct rt6_info *grt;
1225
1226                         /* IPv6 strictly inhibits using not link-local
1227                            addresses as nexthop address.
1228                            Otherwise, router will not able to send redirects.
1229                            It is very good, but in some (rare!) circumstances
1230                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1231                            some exceptions. --ANK
1232                          */
1233                         err = -EINVAL;
1234                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1235                                 goto out;
1236
1237                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1238
1239                         err = -EHOSTUNREACH;
1240                         if (grt == NULL)
1241                                 goto out;
1242                         if (dev) {
1243                                 if (dev != grt->rt6i_dev) {
1244                                         dst_release(&grt->dst);
1245                                         goto out;
1246                                 }
1247                         } else {
1248                                 dev = grt->rt6i_dev;
1249                                 idev = grt->rt6i_idev;
1250                                 dev_hold(dev);
1251                                 in6_dev_hold(grt->rt6i_idev);
1252                         }
1253                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1254                                 err = 0;
1255                         dst_release(&grt->dst);
1256
1257                         if (err)
1258                                 goto out;
1259                 }
1260                 err = -EINVAL;
1261                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1262                         goto out;
1263         }
1264
1265         err = -ENODEV;
1266         if (dev == NULL)
1267                 goto out;
1268
1269         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1270                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1271                 if (IS_ERR(rt->rt6i_nexthop)) {
1272                         err = PTR_ERR(rt->rt6i_nexthop);
1273                         rt->rt6i_nexthop = NULL;
1274                         goto out;
1275                 }
1276         }
1277
1278         rt->rt6i_flags = cfg->fc_flags;
1279
1280 install_route:
1281         if (cfg->fc_mx) {
1282                 struct nlattr *nla;
1283                 int remaining;
1284
1285                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1286                         int type = nla_type(nla);
1287
1288                         if (type) {
1289                                 if (type > RTAX_MAX) {
1290                                         err = -EINVAL;
1291                                         goto out;
1292                                 }
1293
1294                                 rt->dst.metrics[type - 1] = nla_get_u32(nla);
1295                         }
1296                 }
1297         }
1298
1299         if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1300                 rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1301         if (!dst_mtu(&rt->dst))
1302                 rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1303         if (!dst_metric(&rt->dst, RTAX_ADVMSS))
1304                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1305         rt->dst.dev = dev;
1306         rt->rt6i_idev = idev;
1307         rt->rt6i_table = table;
1308
1309         cfg->fc_nlinfo.nl_net = dev_net(dev);
1310
1311         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1312
1313 out:
1314         if (dev)
1315                 dev_put(dev);
1316         if (idev)
1317                 in6_dev_put(idev);
1318         if (rt)
1319                 dst_free(&rt->dst);
1320         return err;
1321 }
1322
1323 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1324 {
1325         int err;
1326         struct fib6_table *table;
1327         struct net *net = dev_net(rt->rt6i_dev);
1328
1329         if (rt == net->ipv6.ip6_null_entry)
1330                 return -ENOENT;
1331
1332         table = rt->rt6i_table;
1333         write_lock_bh(&table->tb6_lock);
1334
1335         err = fib6_del(rt, info);
1336         dst_release(&rt->dst);
1337
1338         write_unlock_bh(&table->tb6_lock);
1339
1340         return err;
1341 }
1342
1343 int ip6_del_rt(struct rt6_info *rt)
1344 {
1345         struct nl_info info = {
1346                 .nl_net = dev_net(rt->rt6i_dev),
1347         };
1348         return __ip6_del_rt(rt, &info);
1349 }
1350
1351 static int ip6_route_del(struct fib6_config *cfg)
1352 {
1353         struct fib6_table *table;
1354         struct fib6_node *fn;
1355         struct rt6_info *rt;
1356         int err = -ESRCH;
1357
1358         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1359         if (table == NULL)
1360                 return err;
1361
1362         read_lock_bh(&table->tb6_lock);
1363
1364         fn = fib6_locate(&table->tb6_root,
1365                          &cfg->fc_dst, cfg->fc_dst_len,
1366                          &cfg->fc_src, cfg->fc_src_len);
1367
1368         if (fn) {
1369                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1370                         if (cfg->fc_ifindex &&
1371                             (rt->rt6i_dev == NULL ||
1372                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1373                                 continue;
1374                         if (cfg->fc_flags & RTF_GATEWAY &&
1375                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1376                                 continue;
1377                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1378                                 continue;
1379                         dst_hold(&rt->dst);
1380                         read_unlock_bh(&table->tb6_lock);
1381
1382                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1383                 }
1384         }
1385         read_unlock_bh(&table->tb6_lock);
1386
1387         return err;
1388 }
1389
1390 /*
1391  *      Handle redirects
1392  */
1393 struct ip6rd_flowi {
1394         struct flowi fl;
1395         struct in6_addr gateway;
1396 };
1397
1398 static struct rt6_info *__ip6_route_redirect(struct net *net,
1399                                              struct fib6_table *table,
1400                                              struct flowi *fl,
1401                                              int flags)
1402 {
1403         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1404         struct rt6_info *rt;
1405         struct fib6_node *fn;
1406
1407         /*
1408          * Get the "current" route for this destination and
1409          * check if the redirect has come from approriate router.
1410          *
1411          * RFC 2461 specifies that redirects should only be
1412          * accepted if they come from the nexthop to the target.
1413          * Due to the way the routes are chosen, this notion
1414          * is a bit fuzzy and one might need to check all possible
1415          * routes.
1416          */
1417
1418         read_lock_bh(&table->tb6_lock);
1419         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1420 restart:
1421         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1422                 /*
1423                  * Current route is on-link; redirect is always invalid.
1424                  *
1425                  * Seems, previous statement is not true. It could
1426                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1427                  * But then router serving it might decide, that we should
1428                  * know truth 8)8) --ANK (980726).
1429                  */
1430                 if (rt6_check_expired(rt))
1431                         continue;
1432                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1433                         continue;
1434                 if (fl->oif != rt->rt6i_dev->ifindex)
1435                         continue;
1436                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1437                         continue;
1438                 break;
1439         }
1440
1441         if (!rt)
1442                 rt = net->ipv6.ip6_null_entry;
1443         BACKTRACK(net, &fl->fl6_src);
1444 out:
1445         dst_hold(&rt->dst);
1446
1447         read_unlock_bh(&table->tb6_lock);
1448
1449         return rt;
1450 };
1451
1452 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1453                                            struct in6_addr *src,
1454                                            struct in6_addr *gateway,
1455                                            struct net_device *dev)
1456 {
1457         int flags = RT6_LOOKUP_F_HAS_SADDR;
1458         struct net *net = dev_net(dev);
1459         struct ip6rd_flowi rdfl = {
1460                 .fl = {
1461                         .oif = dev->ifindex,
1462                         .nl_u = {
1463                                 .ip6_u = {
1464                                         .daddr = *dest,
1465                                         .saddr = *src,
1466                                 },
1467                         },
1468                 },
1469         };
1470
1471         ipv6_addr_copy(&rdfl.gateway, gateway);
1472
1473         if (rt6_need_strict(dest))
1474                 flags |= RT6_LOOKUP_F_IFACE;
1475
1476         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1477                                                    flags, __ip6_route_redirect);
1478 }
1479
1480 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1481                   struct in6_addr *saddr,
1482                   struct neighbour *neigh, u8 *lladdr, int on_link)
1483 {
1484         struct rt6_info *rt, *nrt = NULL;
1485         struct netevent_redirect netevent;
1486         struct net *net = dev_net(neigh->dev);
1487
1488         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1489
1490         if (rt == net->ipv6.ip6_null_entry) {
1491                 if (net_ratelimit())
1492                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1493                                "for redirect target\n");
1494                 goto out;
1495         }
1496
1497         /*
1498          *      We have finally decided to accept it.
1499          */
1500
1501         neigh_update(neigh, lladdr, NUD_STALE,
1502                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1503                      NEIGH_UPDATE_F_OVERRIDE|
1504                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1505                                      NEIGH_UPDATE_F_ISROUTER))
1506                      );
1507
1508         /*
1509          * Redirect received -> path was valid.
1510          * Look, redirects are sent only in response to data packets,
1511          * so that this nexthop apparently is reachable. --ANK
1512          */
1513         dst_confirm(&rt->dst);
1514
1515         /* Duplicate redirect: silently ignore. */
1516         if (neigh == rt->dst.neighbour)
1517                 goto out;
1518
1519         nrt = ip6_rt_copy(rt);
1520         if (nrt == NULL)
1521                 goto out;
1522
1523         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1524         if (on_link)
1525                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1526
1527         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1528         nrt->rt6i_dst.plen = 128;
1529         nrt->dst.flags |= DST_HOST;
1530
1531         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1532         nrt->rt6i_nexthop = neigh_clone(neigh);
1533         /* Reset pmtu, it may be better */
1534         nrt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1535         nrt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1536                                                         dst_mtu(&nrt->dst));
1537
1538         if (ip6_ins_rt(nrt))
1539                 goto out;
1540
1541         netevent.old = &rt->dst;
1542         netevent.new = &nrt->dst;
1543         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1544
1545         if (rt->rt6i_flags&RTF_CACHE) {
1546                 ip6_del_rt(rt);
1547                 return;
1548         }
1549
1550 out:
1551         dst_release(&rt->dst);
1552 }
1553
1554 /*
1555  *      Handle ICMP "packet too big" messages
1556  *      i.e. Path MTU discovery
1557  */
1558
1559 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1560                         struct net_device *dev, u32 pmtu)
1561 {
1562         struct rt6_info *rt, *nrt;
1563         struct net *net = dev_net(dev);
1564         int allfrag = 0;
1565
1566         rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1567         if (rt == NULL)
1568                 return;
1569
1570         if (pmtu >= dst_mtu(&rt->dst))
1571                 goto out;
1572
1573         if (pmtu < IPV6_MIN_MTU) {
1574                 /*
1575                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1576                  * MTU (1280) and a fragment header should always be included
1577                  * after a node receiving Too Big message reporting PMTU is
1578                  * less than the IPv6 Minimum Link MTU.
1579                  */
1580                 pmtu = IPV6_MIN_MTU;
1581                 allfrag = 1;
1582         }
1583
1584         /* New mtu received -> path was valid.
1585            They are sent only in response to data packets,
1586            so that this nexthop apparently is reachable. --ANK
1587          */
1588         dst_confirm(&rt->dst);
1589
1590         /* Host route. If it is static, it would be better
1591            not to override it, but add new one, so that
1592            when cache entry will expire old pmtu
1593            would return automatically.
1594          */
1595         if (rt->rt6i_flags & RTF_CACHE) {
1596                 rt->dst.metrics[RTAX_MTU-1] = pmtu;
1597                 if (allfrag)
1598                         rt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1599                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1600                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1601                 goto out;
1602         }
1603
1604         /* Network route.
1605            Two cases are possible:
1606            1. It is connected route. Action: COW
1607            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1608          */
1609         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1610                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1611         else
1612                 nrt = rt6_alloc_clone(rt, daddr);
1613
1614         if (nrt) {
1615                 nrt->dst.metrics[RTAX_MTU-1] = pmtu;
1616                 if (allfrag)
1617                         nrt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1618
1619                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1620                  * happened within 5 mins, the recommended timer is 10 mins.
1621                  * Here this route expiration time is set to ip6_rt_mtu_expires
1622                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1623                  * and detecting PMTU increase will be automatically happened.
1624                  */
1625                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1626                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1627
1628                 ip6_ins_rt(nrt);
1629         }
1630 out:
1631         dst_release(&rt->dst);
1632 }
1633
1634 /*
1635  *      Misc support functions
1636  */
1637
1638 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1639 {
1640         struct net *net = dev_net(ort->rt6i_dev);
1641         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1642
1643         if (rt) {
1644                 rt->dst.input = ort->dst.input;
1645                 rt->dst.output = ort->dst.output;
1646
1647                 memcpy(rt->dst.metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
1648                 rt->dst.error = ort->dst.error;
1649                 rt->dst.dev = ort->dst.dev;
1650                 if (rt->dst.dev)
1651                         dev_hold(rt->dst.dev);
1652                 rt->rt6i_idev = ort->rt6i_idev;
1653                 if (rt->rt6i_idev)
1654                         in6_dev_hold(rt->rt6i_idev);
1655                 rt->dst.lastuse = jiffies;
1656                 rt->rt6i_expires = 0;
1657
1658                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1659                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1660                 rt->rt6i_metric = 0;
1661
1662                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1663 #ifdef CONFIG_IPV6_SUBTREES
1664                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1665 #endif
1666                 rt->rt6i_table = ort->rt6i_table;
1667         }
1668         return rt;
1669 }
1670
1671 #ifdef CONFIG_IPV6_ROUTE_INFO
1672 static struct rt6_info *rt6_get_route_info(struct net *net,
1673                                            struct in6_addr *prefix, int prefixlen,
1674                                            struct in6_addr *gwaddr, int ifindex)
1675 {
1676         struct fib6_node *fn;
1677         struct rt6_info *rt = NULL;
1678         struct fib6_table *table;
1679
1680         table = fib6_get_table(net, RT6_TABLE_INFO);
1681         if (table == NULL)
1682                 return NULL;
1683
1684         write_lock_bh(&table->tb6_lock);
1685         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1686         if (!fn)
1687                 goto out;
1688
1689         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1690                 if (rt->rt6i_dev->ifindex != ifindex)
1691                         continue;
1692                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1693                         continue;
1694                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1695                         continue;
1696                 dst_hold(&rt->dst);
1697                 break;
1698         }
1699 out:
1700         write_unlock_bh(&table->tb6_lock);
1701         return rt;
1702 }
1703
1704 static struct rt6_info *rt6_add_route_info(struct net *net,
1705                                            struct in6_addr *prefix, int prefixlen,
1706                                            struct in6_addr *gwaddr, int ifindex,
1707                                            unsigned pref)
1708 {
1709         struct fib6_config cfg = {
1710                 .fc_table       = RT6_TABLE_INFO,
1711                 .fc_metric      = IP6_RT_PRIO_USER,
1712                 .fc_ifindex     = ifindex,
1713                 .fc_dst_len     = prefixlen,
1714                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1715                                   RTF_UP | RTF_PREF(pref),
1716                 .fc_nlinfo.pid = 0,
1717                 .fc_nlinfo.nlh = NULL,
1718                 .fc_nlinfo.nl_net = net,
1719         };
1720
1721         ipv6_addr_copy(&cfg.fc_dst, prefix);
1722         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1723
1724         /* We should treat it as a default route if prefix length is 0. */
1725         if (!prefixlen)
1726                 cfg.fc_flags |= RTF_DEFAULT;
1727
1728         ip6_route_add(&cfg);
1729
1730         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1731 }
1732 #endif
1733
1734 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1735 {
1736         struct rt6_info *rt;
1737         struct fib6_table *table;
1738
1739         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1740         if (table == NULL)
1741                 return NULL;
1742
1743         write_lock_bh(&table->tb6_lock);
1744         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1745                 if (dev == rt->rt6i_dev &&
1746                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1747                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1748                         break;
1749         }
1750         if (rt)
1751                 dst_hold(&rt->dst);
1752         write_unlock_bh(&table->tb6_lock);
1753         return rt;
1754 }
1755
1756 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1757                                      struct net_device *dev,
1758                                      unsigned int pref)
1759 {
1760         struct fib6_config cfg = {
1761                 .fc_table       = RT6_TABLE_DFLT,
1762                 .fc_metric      = IP6_RT_PRIO_USER,
1763                 .fc_ifindex     = dev->ifindex,
1764                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1765                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1766                 .fc_nlinfo.pid = 0,
1767                 .fc_nlinfo.nlh = NULL,
1768                 .fc_nlinfo.nl_net = dev_net(dev),
1769         };
1770
1771         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1772
1773         ip6_route_add(&cfg);
1774
1775         return rt6_get_dflt_router(gwaddr, dev);
1776 }
1777
1778 void rt6_purge_dflt_routers(struct net *net)
1779 {
1780         struct rt6_info *rt;
1781         struct fib6_table *table;
1782
1783         /* NOTE: Keep consistent with rt6_get_dflt_router */
1784         table = fib6_get_table(net, RT6_TABLE_DFLT);
1785         if (table == NULL)
1786                 return;
1787
1788 restart:
1789         read_lock_bh(&table->tb6_lock);
1790         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1791                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1792                         dst_hold(&rt->dst);
1793                         read_unlock_bh(&table->tb6_lock);
1794                         ip6_del_rt(rt);
1795                         goto restart;
1796                 }
1797         }
1798         read_unlock_bh(&table->tb6_lock);
1799 }
1800
1801 static void rtmsg_to_fib6_config(struct net *net,
1802                                  struct in6_rtmsg *rtmsg,
1803                                  struct fib6_config *cfg)
1804 {
1805         memset(cfg, 0, sizeof(*cfg));
1806
1807         cfg->fc_table = RT6_TABLE_MAIN;
1808         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1809         cfg->fc_metric = rtmsg->rtmsg_metric;
1810         cfg->fc_expires = rtmsg->rtmsg_info;
1811         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1812         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1813         cfg->fc_flags = rtmsg->rtmsg_flags;
1814
1815         cfg->fc_nlinfo.nl_net = net;
1816
1817         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1818         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1819         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1820 }
1821
1822 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1823 {
1824         struct fib6_config cfg;
1825         struct in6_rtmsg rtmsg;
1826         int err;
1827
1828         switch(cmd) {
1829         case SIOCADDRT:         /* Add a route */
1830         case SIOCDELRT:         /* Delete a route */
1831                 if (!capable(CAP_NET_ADMIN))
1832                         return -EPERM;
1833                 err = copy_from_user(&rtmsg, arg,
1834                                      sizeof(struct in6_rtmsg));
1835                 if (err)
1836                         return -EFAULT;
1837
1838                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1839
1840                 rtnl_lock();
1841                 switch (cmd) {
1842                 case SIOCADDRT:
1843                         err = ip6_route_add(&cfg);
1844                         break;
1845                 case SIOCDELRT:
1846                         err = ip6_route_del(&cfg);
1847                         break;
1848                 default:
1849                         err = -EINVAL;
1850                 }
1851                 rtnl_unlock();
1852
1853                 return err;
1854         }
1855
1856         return -EINVAL;
1857 }
1858
1859 /*
1860  *      Drop the packet on the floor
1861  */
1862
1863 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1864 {
1865         int type;
1866         struct dst_entry *dst = skb_dst(skb);
1867         switch (ipstats_mib_noroutes) {
1868         case IPSTATS_MIB_INNOROUTES:
1869                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1870                 if (type == IPV6_ADDR_ANY) {
1871                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1872                                       IPSTATS_MIB_INADDRERRORS);
1873                         break;
1874                 }
1875                 /* FALLTHROUGH */
1876         case IPSTATS_MIB_OUTNOROUTES:
1877                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1878                               ipstats_mib_noroutes);
1879                 break;
1880         }
1881         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1882         kfree_skb(skb);
1883         return 0;
1884 }
1885
1886 static int ip6_pkt_discard(struct sk_buff *skb)
1887 {
1888         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1889 }
1890
1891 static int ip6_pkt_discard_out(struct sk_buff *skb)
1892 {
1893         skb->dev = skb_dst(skb)->dev;
1894         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1895 }
1896
1897 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1898
1899 static int ip6_pkt_prohibit(struct sk_buff *skb)
1900 {
1901         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1902 }
1903
1904 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1905 {
1906         skb->dev = skb_dst(skb)->dev;
1907         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1908 }
1909
1910 #endif
1911
1912 /*
1913  *      Allocate a dst for local (unicast / anycast) address.
1914  */
1915
1916 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1917                                     const struct in6_addr *addr,
1918                                     int anycast)
1919 {
1920         struct net *net = dev_net(idev->dev);
1921         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1922         struct neighbour *neigh;
1923
1924         if (rt == NULL)
1925                 return ERR_PTR(-ENOMEM);
1926
1927         dev_hold(net->loopback_dev);
1928         in6_dev_hold(idev);
1929
1930         rt->dst.flags = DST_HOST;
1931         rt->dst.input = ip6_input;
1932         rt->dst.output = ip6_output;
1933         rt->rt6i_dev = net->loopback_dev;
1934         rt->rt6i_idev = idev;
1935         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1936         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1937         rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1938         rt->dst.obsolete = -1;
1939
1940         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1941         if (anycast)
1942                 rt->rt6i_flags |= RTF_ANYCAST;
1943         else
1944                 rt->rt6i_flags |= RTF_LOCAL;
1945         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1946         if (IS_ERR(neigh)) {
1947                 dst_free(&rt->dst);
1948
1949                 /* We are casting this because that is the return
1950                  * value type.  But an errno encoded pointer is the
1951                  * same regardless of the underlying pointer type,
1952                  * and that's what we are returning.  So this is OK.
1953                  */
1954                 return (struct rt6_info *) neigh;
1955         }
1956         rt->rt6i_nexthop = neigh;
1957
1958         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1959         rt->rt6i_dst.plen = 128;
1960         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1961
1962         atomic_set(&rt->dst.__refcnt, 1);
1963
1964         return rt;
1965 }
1966
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches every device */
	struct net *net;	/* namespace whose ip6_null_entry is spared */
};
1971
1972 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1973 {
1974         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1975         struct net *net = ((struct arg_dev_net *)arg)->net;
1976
1977         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1978             rt != net->ipv6.ip6_null_entry) {
1979                 RT6_TRACE("deleted by ifdown %p\n", rt);
1980                 return -1;
1981         }
1982         return 0;
1983 }
1984
1985 void rt6_ifdown(struct net *net, struct net_device *dev)
1986 {
1987         struct arg_dev_net adn = {
1988                 .dev = dev,
1989                 .net = net,
1990         };
1991
1992         fib6_clean_all(net, fib6_ifdown, 0, &adn);
1993         icmp6_clean_all(fib6_ifdown, &adn);
1994 }
1995
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
2001
2002 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2003 {
2004         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2005         struct inet6_dev *idev;
2006         struct net *net = dev_net(arg->dev);
2007
2008         /* In IPv6 pmtu discovery is not optional,
2009            so that RTAX_MTU lock cannot disable it.
2010            We still use this lock to block changes
2011            caused by addrconf/ndisc.
2012         */
2013
2014         idev = __in6_dev_get(arg->dev);
2015         if (idev == NULL)
2016                 return 0;
2017
2018         /* For administrative MTU increase, there is no way to discover
2019            IPv6 PMTU increase, so PMTU increase should be updated here.
2020            Since RFC 1981 doesn't include administrative MTU increase
2021            update PMTU increase is a MUST. (i.e. jumbo frame)
2022          */
2023         /*
2024            If new MTU is less than route PMTU, this new MTU will be the
2025            lowest MTU in the path, update the route PMTU to reflect PMTU
2026            decreases; if new MTU is greater than route PMTU, and the
2027            old MTU is the lowest MTU in the path, update the route PMTU
2028            to reflect the increase. In this case if the other nodes' MTU
2029            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2030            PMTU discouvery.
2031          */
2032         if (rt->rt6i_dev == arg->dev &&
2033             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2034             (dst_mtu(&rt->dst) >= arg->mtu ||
2035              (dst_mtu(&rt->dst) < arg->mtu &&
2036               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2037                 rt->dst.metrics[RTAX_MTU-1] = arg->mtu;
2038                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2039         }
2040         return 0;
2041 }
2042
2043 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2044 {
2045         struct rt6_mtu_change_arg arg = {
2046                 .dev = dev,
2047                 .mtu = mtu,
2048         };
2049
2050         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2051 }
2052
2053 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2054         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2055         [RTA_OIF]               = { .type = NLA_U32 },
2056         [RTA_IIF]               = { .type = NLA_U32 },
2057         [RTA_PRIORITY]          = { .type = NLA_U32 },
2058         [RTA_METRICS]           = { .type = NLA_NESTED },
2059 };
2060
2061 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2062                               struct fib6_config *cfg)
2063 {
2064         struct rtmsg *rtm;
2065         struct nlattr *tb[RTA_MAX+1];
2066         int err;
2067
2068         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2069         if (err < 0)
2070                 goto errout;
2071
2072         err = -EINVAL;
2073         rtm = nlmsg_data(nlh);
2074         memset(cfg, 0, sizeof(*cfg));
2075
2076         cfg->fc_table = rtm->rtm_table;
2077         cfg->fc_dst_len = rtm->rtm_dst_len;
2078         cfg->fc_src_len = rtm->rtm_src_len;
2079         cfg->fc_flags = RTF_UP;
2080         cfg->fc_protocol = rtm->rtm_protocol;
2081
2082         if (rtm->rtm_type == RTN_UNREACHABLE)
2083                 cfg->fc_flags |= RTF_REJECT;
2084
2085         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2086         cfg->fc_nlinfo.nlh = nlh;
2087         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2088
2089         if (tb[RTA_GATEWAY]) {
2090                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2091                 cfg->fc_flags |= RTF_GATEWAY;
2092         }
2093
2094         if (tb[RTA_DST]) {
2095                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2096
2097                 if (nla_len(tb[RTA_DST]) < plen)
2098                         goto errout;
2099
2100                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2101         }
2102
2103         if (tb[RTA_SRC]) {
2104                 int plen = (rtm->rtm_src_len + 7) >> 3;
2105
2106                 if (nla_len(tb[RTA_SRC]) < plen)
2107                         goto errout;
2108
2109                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2110         }
2111
2112         if (tb[RTA_OIF])
2113                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2114
2115         if (tb[RTA_PRIORITY])
2116                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2117
2118         if (tb[RTA_METRICS]) {
2119                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2120                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2121         }
2122
2123         if (tb[RTA_TABLE])
2124                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2125
2126         err = 0;
2127 errout:
2128         return err;
2129 }
2130
2131 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2132 {
2133         struct fib6_config cfg;
2134         int err;
2135
2136         err = rtm_to_fib6_config(skb, nlh, &cfg);
2137         if (err < 0)
2138                 return err;
2139
2140         return ip6_route_del(&cfg);
2141 }
2142
2143 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2144 {
2145         struct fib6_config cfg;
2146         int err;
2147
2148         err = rtm_to_fib6_config(skb, nlh, &cfg);
2149         if (err < 0)
2150                 return err;
2151
2152         return ip6_route_add(&cfg);
2153 }
2154
2155 static inline size_t rt6_nlmsg_size(void)
2156 {
2157         return NLMSG_ALIGN(sizeof(struct rtmsg))
2158                + nla_total_size(16) /* RTA_SRC */
2159                + nla_total_size(16) /* RTA_DST */
2160                + nla_total_size(16) /* RTA_GATEWAY */
2161                + nla_total_size(16) /* RTA_PREFSRC */
2162                + nla_total_size(4) /* RTA_TABLE */
2163                + nla_total_size(4) /* RTA_IIF */
2164                + nla_total_size(4) /* RTA_OIF */
2165                + nla_total_size(4) /* RTA_PRIORITY */
2166                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2167                + nla_total_size(sizeof(struct rta_cacheinfo));
2168 }
2169
2170 static int rt6_fill_node(struct net *net,
2171                          struct sk_buff *skb, struct rt6_info *rt,
2172                          struct in6_addr *dst, struct in6_addr *src,
2173                          int iif, int type, u32 pid, u32 seq,
2174                          int prefix, int nowait, unsigned int flags)
2175 {
2176         struct rtmsg *rtm;
2177         struct nlmsghdr *nlh;
2178         long expires;
2179         u32 table;
2180
2181         if (prefix) {   /* user wants prefix routes only */
2182                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2183                         /* success since this is not a prefix route */
2184                         return 1;
2185                 }
2186         }
2187
2188         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2189         if (nlh == NULL)
2190                 return -EMSGSIZE;
2191
2192         rtm = nlmsg_data(nlh);
2193         rtm->rtm_family = AF_INET6;
2194         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2195         rtm->rtm_src_len = rt->rt6i_src.plen;
2196         rtm->rtm_tos = 0;
2197         if (rt->rt6i_table)
2198                 table = rt->rt6i_table->tb6_id;
2199         else
2200                 table = RT6_TABLE_UNSPEC;
2201         rtm->rtm_table = table;
2202         NLA_PUT_U32(skb, RTA_TABLE, table);
2203         if (rt->rt6i_flags&RTF_REJECT)
2204                 rtm->rtm_type = RTN_UNREACHABLE;
2205         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2206                 rtm->rtm_type = RTN_LOCAL;
2207         else
2208                 rtm->rtm_type = RTN_UNICAST;
2209         rtm->rtm_flags = 0;
2210         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2211         rtm->rtm_protocol = rt->rt6i_protocol;
2212         if (rt->rt6i_flags&RTF_DYNAMIC)
2213                 rtm->rtm_protocol = RTPROT_REDIRECT;
2214         else if (rt->rt6i_flags & RTF_ADDRCONF)
2215                 rtm->rtm_protocol = RTPROT_KERNEL;
2216         else if (rt->rt6i_flags&RTF_DEFAULT)
2217                 rtm->rtm_protocol = RTPROT_RA;
2218
2219         if (rt->rt6i_flags&RTF_CACHE)
2220                 rtm->rtm_flags |= RTM_F_CLONED;
2221
2222         if (dst) {
2223                 NLA_PUT(skb, RTA_DST, 16, dst);
2224                 rtm->rtm_dst_len = 128;
2225         } else if (rtm->rtm_dst_len)
2226                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2227 #ifdef CONFIG_IPV6_SUBTREES
2228         if (src) {
2229                 NLA_PUT(skb, RTA_SRC, 16, src);
2230                 rtm->rtm_src_len = 128;
2231         } else if (rtm->rtm_src_len)
2232                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2233 #endif
2234         if (iif) {
2235 #ifdef CONFIG_IPV6_MROUTE
2236                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2237                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2238                         if (err <= 0) {
2239                                 if (!nowait) {
2240                                         if (err == 0)
2241                                                 return 0;
2242                                         goto nla_put_failure;
2243                                 } else {
2244                                         if (err == -EMSGSIZE)
2245                                                 goto nla_put_failure;
2246                                 }
2247                         }
2248                 } else
2249 #endif
2250                         NLA_PUT_U32(skb, RTA_IIF, iif);
2251         } else if (dst) {
2252                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2253                 struct in6_addr saddr_buf;
2254                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2255                                        dst, 0, &saddr_buf) == 0)
2256                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2257         }
2258
2259         if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2260                 goto nla_put_failure;
2261
2262         if (rt->dst.neighbour)
2263                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2264
2265         if (rt->dst.dev)
2266                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2267
2268         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2269
2270         if (!(rt->rt6i_flags & RTF_EXPIRES))
2271                 expires = 0;
2272         else if (rt->rt6i_expires - jiffies < INT_MAX)
2273                 expires = rt->rt6i_expires - jiffies;
2274         else
2275                 expires = INT_MAX;
2276
2277         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2278                                expires, rt->dst.error) < 0)
2279                 goto nla_put_failure;
2280
2281         return nlmsg_end(skb, nlh);
2282
2283 nla_put_failure:
2284         nlmsg_cancel(skb, nlh);
2285         return -EMSGSIZE;
2286 }
2287
2288 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2289 {
2290         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2291         int prefix;
2292
2293         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2294                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2295                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2296         } else
2297                 prefix = 0;
2298
2299         return rt6_fill_node(arg->net,
2300                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2301                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2302                      prefix, 0, NLM_F_MULTI);
2303 }
2304
2305 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2306 {
2307         struct net *net = sock_net(in_skb->sk);
2308         struct nlattr *tb[RTA_MAX+1];
2309         struct rt6_info *rt;
2310         struct sk_buff *skb;
2311         struct rtmsg *rtm;
2312         struct flowi fl;
2313         int err, iif = 0;
2314
2315         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2316         if (err < 0)
2317                 goto errout;
2318
2319         err = -EINVAL;
2320         memset(&fl, 0, sizeof(fl));
2321
2322         if (tb[RTA_SRC]) {
2323                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2324                         goto errout;
2325
2326                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2327         }
2328
2329         if (tb[RTA_DST]) {
2330                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2331                         goto errout;
2332
2333                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2334         }
2335
2336         if (tb[RTA_IIF])
2337                 iif = nla_get_u32(tb[RTA_IIF]);
2338
2339         if (tb[RTA_OIF])
2340                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2341
2342         if (iif) {
2343                 struct net_device *dev;
2344                 dev = __dev_get_by_index(net, iif);
2345                 if (!dev) {
2346                         err = -ENODEV;
2347                         goto errout;
2348                 }
2349         }
2350
2351         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2352         if (skb == NULL) {
2353                 err = -ENOBUFS;
2354                 goto errout;
2355         }
2356
2357         /* Reserve room for dummy headers, this skb can pass
2358            through good chunk of routing engine.
2359          */
2360         skb_reset_mac_header(skb);
2361         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2362
2363         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2364         skb_dst_set(skb, &rt->dst);
2365
2366         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2367                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2368                             nlh->nlmsg_seq, 0, 0, 0);
2369         if (err < 0) {
2370                 kfree_skb(skb);
2371                 goto errout;
2372         }
2373
2374         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2375 errout:
2376         return err;
2377 }
2378
2379 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2380 {
2381         struct sk_buff *skb;
2382         struct net *net = info->nl_net;
2383         u32 seq;
2384         int err;
2385
2386         err = -ENOBUFS;
2387         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2388
2389         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2390         if (skb == NULL)
2391                 goto errout;
2392
2393         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2394                                 event, info->pid, seq, 0, 0, 0);
2395         if (err < 0) {
2396                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2397                 WARN_ON(err == -EMSGSIZE);
2398                 kfree_skb(skb);
2399                 goto errout;
2400         }
2401         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2402                     info->nlh, gfp_any());
2403         return;
2404 errout:
2405         if (err < 0)
2406                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2407 }
2408
2409 static int ip6_route_dev_notify(struct notifier_block *this,
2410                                 unsigned long event, void *data)
2411 {
2412         struct net_device *dev = (struct net_device *)data;
2413         struct net *net = dev_net(dev);
2414
2415         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2416                 net->ipv6.ip6_null_entry->dst.dev = dev;
2417                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2418 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2419                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2420                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2421                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2422                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2423 #endif
2424         }
2425
2426         return NOTIFY_OK;
2427 }
2428
2429 /*
2430  *      /proc
2431  */
2432
2433 #ifdef CONFIG_PROC_FS
2434
/*
 * NOTE(review): neither RT6_INFO_LEN nor struct rt6_proc_arg is
 * referenced by the /proc code visible below, which is seq_file based;
 * confirm against the rest of the tree before removing them.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2445
2446 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2447 {
2448         struct seq_file *m = p_arg;
2449
2450         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2451
2452 #ifdef CONFIG_IPV6_SUBTREES
2453         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2454 #else
2455         seq_puts(m, "00000000000000000000000000000000 00 ");
2456 #endif
2457
2458         if (rt->rt6i_nexthop) {
2459                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2460         } else {
2461                 seq_puts(m, "00000000000000000000000000000000");
2462         }
2463         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2464                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2465                    rt->dst.__use, rt->rt6i_flags,
2466                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2467         return 0;
2468 }
2469
2470 static int ipv6_route_show(struct seq_file *m, void *v)
2471 {
2472         struct net *net = (struct net *)m->private;
2473         fib6_clean_all(net, rt6_info_route, 0, m);
2474         return 0;
2475 }
2476
2477 static int ipv6_route_open(struct inode *inode, struct file *file)
2478 {
2479         return single_open_net(inode, file, ipv6_route_show);
2480 }
2481
2482 static const struct file_operations ipv6_route_proc_fops = {
2483         .owner          = THIS_MODULE,
2484         .open           = ipv6_route_open,
2485         .read           = seq_read,
2486         .llseek         = seq_lseek,
2487         .release        = single_release_net,
2488 };
2489
2490 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2491 {
2492         struct net *net = (struct net *)seq->private;
2493         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2494                    net->ipv6.rt6_stats->fib_nodes,
2495                    net->ipv6.rt6_stats->fib_route_nodes,
2496                    net->ipv6.rt6_stats->fib_rt_alloc,
2497                    net->ipv6.rt6_stats->fib_rt_entries,
2498                    net->ipv6.rt6_stats->fib_rt_cache,
2499                    atomic_read(&net->ipv6.ip6_dst_ops.entries),
2500                    net->ipv6.rt6_stats->fib_discarded_routes);
2501
2502         return 0;
2503 }
2504
2505 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2506 {
2507         return single_open_net(inode, file, rt6_stats_seq_show);
2508 }
2509
2510 static const struct file_operations rt6_stats_seq_fops = {
2511         .owner   = THIS_MODULE,
2512         .open    = rt6_stats_seq_open,
2513         .read    = seq_read,
2514         .llseek  = seq_lseek,
2515         .release = single_release_net,
2516 };
2517 #endif  /* CONFIG_PROC_FS */
2518
2519 #ifdef CONFIG_SYSCTL
2520
2521 static
2522 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2523                               void __user *buffer, size_t *lenp, loff_t *ppos)
2524 {
2525         struct net *net = current->nsproxy->net_ns;
2526         int delay = net->ipv6.sysctl.flush_delay;
2527         if (write) {
2528                 proc_dointvec(ctl, write, buffer, lenp, ppos);
2529                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2530                 return 0;
2531         } else
2532                 return -EINVAL;
2533 }
2534
2535 ctl_table ipv6_route_table_template[] = {
2536         {
2537                 .procname       =       "flush",
2538                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2539                 .maxlen         =       sizeof(int),
2540                 .mode           =       0200,
2541                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2542         },
2543         {
2544                 .procname       =       "gc_thresh",
2545                 .data           =       &ip6_dst_ops_template.gc_thresh,
2546                 .maxlen         =       sizeof(int),
2547                 .mode           =       0644,
2548                 .proc_handler   =       proc_dointvec,
2549         },
2550         {
2551                 .procname       =       "max_size",
2552                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2553                 .maxlen         =       sizeof(int),
2554                 .mode           =       0644,
2555                 .proc_handler   =       proc_dointvec,
2556         },
2557         {
2558                 .procname       =       "gc_min_interval",
2559                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2560                 .maxlen         =       sizeof(int),
2561                 .mode           =       0644,
2562                 .proc_handler   =       proc_dointvec_jiffies,
2563         },
2564         {
2565                 .procname       =       "gc_timeout",
2566                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2567                 .maxlen         =       sizeof(int),
2568                 .mode           =       0644,
2569                 .proc_handler   =       proc_dointvec_jiffies,
2570         },
2571         {
2572                 .procname       =       "gc_interval",
2573                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2574                 .maxlen         =       sizeof(int),
2575                 .mode           =       0644,
2576                 .proc_handler   =       proc_dointvec_jiffies,
2577         },
2578         {
2579                 .procname       =       "gc_elasticity",
2580                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2581                 .maxlen         =       sizeof(int),
2582                 .mode           =       0644,
2583                 .proc_handler   =       proc_dointvec_jiffies,
2584         },
2585         {
2586                 .procname       =       "mtu_expires",
2587                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2588                 .maxlen         =       sizeof(int),
2589                 .mode           =       0644,
2590                 .proc_handler   =       proc_dointvec_jiffies,
2591         },
2592         {
2593                 .procname       =       "min_adv_mss",
2594                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2595                 .maxlen         =       sizeof(int),
2596                 .mode           =       0644,
2597                 .proc_handler   =       proc_dointvec_jiffies,
2598         },
2599         {
2600                 .procname       =       "gc_min_interval_ms",
2601                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2602                 .maxlen         =       sizeof(int),
2603                 .mode           =       0644,
2604                 .proc_handler   =       proc_dointvec_ms_jiffies,
2605         },
2606         { }
2607 };
2608
2609 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2610 {
2611         struct ctl_table *table;
2612
2613         table = kmemdup(ipv6_route_table_template,
2614                         sizeof(ipv6_route_table_template),
2615                         GFP_KERNEL);
2616
2617         if (table) {
2618                 table[0].data = &net->ipv6.sysctl.flush_delay;
2619                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2620                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2621                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2622                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2623                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2624                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2625                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2626                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2627                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2628         }
2629
2630         return table;
2631 }
2632 #endif
2633
2634 static int __net_init ip6_route_net_init(struct net *net)
2635 {
2636         int ret = -ENOMEM;
2637
2638         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2639                sizeof(net->ipv6.ip6_dst_ops));
2640
2641         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2642                                            sizeof(*net->ipv6.ip6_null_entry),
2643                                            GFP_KERNEL);
2644         if (!net->ipv6.ip6_null_entry)
2645                 goto out_ip6_dst_ops;
2646         net->ipv6.ip6_null_entry->dst.path =
2647                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2648         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2649
2650 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2651         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2652                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2653                                                GFP_KERNEL);
2654         if (!net->ipv6.ip6_prohibit_entry)
2655                 goto out_ip6_null_entry;
2656         net->ipv6.ip6_prohibit_entry->dst.path =
2657                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2658         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2659
2660         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2661                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2662                                                GFP_KERNEL);
2663         if (!net->ipv6.ip6_blk_hole_entry)
2664                 goto out_ip6_prohibit_entry;
2665         net->ipv6.ip6_blk_hole_entry->dst.path =
2666                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2667         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2668 #endif
2669
2670         net->ipv6.sysctl.flush_delay = 0;
2671         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2672         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2673         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2674         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2675         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2676         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2677         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2678
2679 #ifdef CONFIG_PROC_FS
2680         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2681         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2682 #endif
2683         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2684
2685         ret = 0;
2686 out:
2687         return ret;
2688
2689 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2690 out_ip6_prohibit_entry:
2691         kfree(net->ipv6.ip6_prohibit_entry);
2692 out_ip6_null_entry:
2693         kfree(net->ipv6.ip6_null_entry);
2694 #endif
2695 out_ip6_dst_ops:
2696         goto out;
2697 }
2698
2699 static void __net_exit ip6_route_net_exit(struct net *net)
2700 {
2701 #ifdef CONFIG_PROC_FS
2702         proc_net_remove(net, "ipv6_route");
2703         proc_net_remove(net, "rt6_stats");
2704 #endif
2705         kfree(net->ipv6.ip6_null_entry);
2706 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2707         kfree(net->ipv6.ip6_prohibit_entry);
2708         kfree(net->ipv6.ip6_blk_hole_entry);
2709 #endif
2710 }
2711
2712 static struct pernet_operations ip6_route_net_ops = {
2713         .init = ip6_route_net_init,
2714         .exit = ip6_route_net_exit,
2715 };
2716
2717 static struct notifier_block ip6_route_dev_notifier = {
2718         .notifier_call = ip6_route_dev_notify,
2719         .priority = 0,
2720 };
2721
2722 int __init ip6_route_init(void)
2723 {
2724         int ret;
2725
2726         ret = -ENOMEM;
2727         ip6_dst_ops_template.kmem_cachep =
2728                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2729                                   SLAB_HWCACHE_ALIGN, NULL);
2730         if (!ip6_dst_ops_template.kmem_cachep)
2731                 goto out;
2732
2733         ret = register_pernet_subsys(&ip6_route_net_ops);
2734         if (ret)
2735                 goto out_kmem_cache;
2736
2737         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2738
2739         /* Registering of the loopback is done before this portion of code,
2740          * the loopback reference in rt6_info will not be taken, do it
2741          * manually for init_net */
2742         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2743         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2744   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2745         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2746         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2747         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2748         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2749   #endif
2750         ret = fib6_init();
2751         if (ret)
2752                 goto out_register_subsys;
2753
2754         ret = xfrm6_init();
2755         if (ret)
2756                 goto out_fib6_init;
2757
2758         ret = fib6_rules_init();
2759         if (ret)
2760                 goto xfrm6_init;
2761
2762         ret = -ENOBUFS;
2763         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2764             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2765             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2766                 goto fib6_rules_init;
2767
2768         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2769         if (ret)
2770                 goto fib6_rules_init;
2771
2772 out:
2773         return ret;
2774
2775 fib6_rules_init:
2776         fib6_rules_cleanup();
2777 xfrm6_init:
2778         xfrm6_fini();
2779 out_fib6_init:
2780         fib6_gc_cleanup();
2781 out_register_subsys:
2782         unregister_pernet_subsys(&ip6_route_net_ops);
2783 out_kmem_cache:
2784         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2785         goto out;
2786 }
2787
2788 void ip6_route_cleanup(void)
2789 {
2790         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2791         fib6_rules_cleanup();
2792         xfrm6_fini();
2793         fib6_gc_cleanup();
2794         unregister_pernet_subsys(&ip6_route_net_ops);
2795         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2796 }