Merge branch 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/netlink.h>
39 #include <linux/if_arp.h>
40
41 #ifdef  CONFIG_PROC_FS
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #endif
45
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56
57 #include <asm/uaccess.h>
58
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62
/*
 * Debug verbosity for this file.  RDBG() and RT6_TRACE() expand to printk
 * calls only when RT6_DEBUG is 3 or higher; otherwise they expand to nothing
 * (RDBG) or to an empty do/while statement (RT6_TRACE).
 */
63 /* Set to 3 to get tracing. */
64 #define RT6_DEBUG 2
65
66 #if RT6_DEBUG >= 3
/* RDBG pastes its argument directly after printk, so the argument must be a
 * complete parenthesised argument list: RDBG(("fmt", args)). */
67 #define RDBG(x) printk x
/* RT6_TRACE prefixes the format string with KERN_DEBUG. */
68 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
69 #else
70 #define RDBG(x)
71 #define RT6_TRACE(x...) do { ; } while (0)
72 #endif
73
/*
 * When 0, ip6_route_input()/ip6_route_output() return a matched route that
 * already has a nexthop (or is RTF_NONEXTHOP) as-is; when non-zero they clone
 * it with rt6_alloc_clone() instead.
 */
74 #define CLONE_OFFLINK_ROUTE 0
75
/*
 * Flags for the "strict" argument of rt6_select()/rt6_score_route():
 *   RT6_SELECT_F_IFACE     - reject routes whose device does not match oif
 *   RT6_SELECT_F_REACHABLE - reject routes for which rt6_check_neigh()
 *                            returns 0
 */
76 #define RT6_SELECT_F_IFACE      0x1
77 #define RT6_SELECT_F_REACHABLE  0x2
78
/*
 * Garbage-collection and MTU tunables.  The values built from HZ are
 * expressed in jiffies.
 */
/* Entry count above which ip6_dst_gc() always runs and reports pressure. */
79 static int ip6_rt_max_size = 4096;
/* Minimum spacing between GC runs while entries <= ip6_rt_max_size. */
80 static int ip6_rt_gc_min_interval = HZ / 2;
/* ip6_dst_gc() resets its expiry to half of this once entries < gc_thresh. */
81 static int ip6_rt_gc_timeout = 60*HZ;
/* Not static and not referenced in this part of the file; presumably used by
 * the fib6 GC timer elsewhere -- TODO confirm. */
82 int ip6_rt_gc_interval = 30*HZ;
/* Shift used by ip6_dst_gc() to decay its expiry: expire -= expire >> this. */
83 static int ip6_rt_gc_elasticity = 9;
/* Not referenced in this part of the file; presumably the lifetime of a
 * learned path MTU -- TODO confirm. */
84 static int ip6_rt_mtu_expires = 10*60*HZ;
/* Lower bound applied by ipv6_advmss(); the 20 and 40 are presumably the TCP
 * and IPv6 header sizes. */
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
86
/*
 * Forward declarations.  The ip6_dst_* / ip6_negative_advice /
 * ip6_link_failure / ip6_rt_update_pmtu functions are referenced by the
 * ip6_dst_ops table, and ip6_pkt_discard{,_out} by ip6_null_entry, before
 * their definitions appear.
 */
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void             ip6_dst_destroy(struct dst_entry *);
91 static void             ip6_dst_ifdown(struct dst_entry *,
92                                        struct net_device *dev, int how);
93 static int               ip6_dst_gc(void);
94
95 static int              ip6_pkt_discard(struct sk_buff *skb);
96 static int              ip6_pkt_discard_out(struct sk_buff *skb);
97 static void             ip6_link_failure(struct sk_buff *skb);
98 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
99
/* Helpers used by rt6_route_rcv(); only built with CONFIG_IPV6_ROUTE_INFO. */
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex,
103                                            unsigned pref);
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105                                            struct in6_addr *gwaddr, int ifindex);
106 #endif
107
/*
 * dst operations table for IPv6 routes.  Every rt6_info allocated through
 * ip6_dst_alloc() is created against this table, and ip6_null_entry points
 * at it as well.  ip6_dst_gc() compares the table's entry count with
 * .gc_thresh and ip6_rt_max_size.
 */
108 static struct dst_ops ip6_dst_ops = {
109         .family                 =       AF_INET6,
110         .protocol               =       __constant_htons(ETH_P_IPV6),
111         .gc                     =       ip6_dst_gc,
112         .gc_thresh              =       1024,
113         .check                  =       ip6_dst_check,
114         .destroy                =       ip6_dst_destroy,
115         .ifdown                 =       ip6_dst_ifdown,
116         .negative_advice        =       ip6_negative_advice,
117         .link_failure           =       ip6_link_failure,
118         .update_pmtu            =       ip6_rt_update_pmtu,
/* Each entry is a full rt6_info, not just a dst_entry. */
119         .entry_size             =       sizeof(struct rt6_info),
120 };
121
/*
 * Sentinel "no route" entry.  rt6_select() and rt6_device_match() return it
 * when nothing usable is found, and ip6_routing_table starts with it as its
 * leaf.  It is statically allocated with reference counts preset to 1, is
 * bound to the loopback device, hands packets to ip6_pkt_discard() /
 * ip6_pkt_discard_out(), and carries -ENETUNREACH as its dst error.
 */
122 struct rt6_info ip6_null_entry = {
123         .u = {
124                 .dst = {
125                         .__refcnt       = ATOMIC_INIT(1),
126                         .__use          = 1,
127                         .dev            = &loopback_dev,
128                         .obsolete       = -1,
129                         .error          = -ENETUNREACH,
130                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
131                         .input          = ip6_pkt_discard,
132                         .output         = ip6_pkt_discard_out,
133                         .ops            = &ip6_dst_ops,
/* The entry is its own path. */
134                         .path           = (struct dst_entry*)&ip6_null_entry,
135                 }
136         },
137         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
/* All-ones metric, i.e. the largest u32 value. */
138         .rt6i_metric    = ~(u32) 0,
139         .rt6i_ref       = ATOMIC_INIT(1),
140 };
141
/*
 * Root node of the IPv6 FIB tree.  It initially holds only ip6_null_entry as
 * its leaf; the lookup and insert functions in this file pass it to
 * fib6_lookup() and fib6_add().
 */
142 struct fib6_node ip6_routing_table = {
143         .leaf           = &ip6_null_entry,
144         .fn_flags       = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
145 };
146
/*
 * Reader/writer lock for the FIB.  In this file lookups take it with
 * read_lock_bh() and ip6_ins_rt()/ndisc_dst_alloc() take it with
 * write_lock_bh().
 */
147 /* Protects all the ip6 fib */
148
149 DEFINE_RWLOCK(rt6_lock);
150
151
152 /* allocate dst with ip6_dst_ops */
153 static __inline__ struct rt6_info *ip6_dst_alloc(void)
154 {
155         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
156 }
157
158 static void ip6_dst_destroy(struct dst_entry *dst)
159 {
160         struct rt6_info *rt = (struct rt6_info *)dst;
161         struct inet6_dev *idev = rt->rt6i_idev;
162
163         if (idev != NULL) {
164                 rt->rt6i_idev = NULL;
165                 in6_dev_put(idev);
166         }       
167 }
168
169 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
170                            int how)
171 {
172         struct rt6_info *rt = (struct rt6_info *)dst;
173         struct inet6_dev *idev = rt->rt6i_idev;
174
175         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
176                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
177                 if (loopback_idev != NULL) {
178                         rt->rt6i_idev = loopback_idev;
179                         in6_dev_put(idev);
180                 }
181         }
182 }
183
184 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
185 {
186         return (rt->rt6i_flags & RTF_EXPIRES &&
187                 time_after(jiffies, rt->rt6i_expires));
188 }
189
190 /*
191  *      Route lookup. Any rt6_lock is implied.
192  */
193
194 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
195                                                     int oif,
196                                                     int strict)
197 {
198         struct rt6_info *local = NULL;
199         struct rt6_info *sprt;
200
201         if (oif) {
202                 for (sprt = rt; sprt; sprt = sprt->u.next) {
203                         struct net_device *dev = sprt->rt6i_dev;
204                         if (dev->ifindex == oif)
205                                 return sprt;
206                         if (dev->flags & IFF_LOOPBACK) {
207                                 if (sprt->rt6i_idev == NULL ||
208                                     sprt->rt6i_idev->dev->ifindex != oif) {
209                                         if (strict && oif)
210                                                 continue;
211                                         if (local && (!oif || 
212                                                       local->rt6i_idev->dev->ifindex == oif))
213                                                 continue;
214                                 }
215                                 local = sprt;
216                         }
217                 }
218
219                 if (local)
220                         return local;
221
222                 if (strict)
223                         return &ip6_null_entry;
224         }
225         return rt;
226 }
227
/*
 * Router reachability probing.
 *
 * With CONFIG_IPV6_ROUTER_PREF: if @rt has a nexthop neighbour that is not
 * in a NUD_VALID state and the per-device rtr_probe_interval has elapsed
 * since neigh->updated, send a Neighbour Solicitation for the neighbour's
 * address to the multicast address computed by addrconf_addr_solict_mult().
 * A NULL @rt or a route without a nexthop is ignored.
 *
 * Without CONFIG_IPV6_ROUTER_PREF this is a no-op.
 */
228 #ifdef CONFIG_IPV6_ROUTER_PREF
229 static void rt6_probe(struct rt6_info *rt)
230 {
231         struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
232         /*
233          * Okay, this does not seem to be appropriate
234          * for now, however, we need to check if it
235          * is really so; aka Router Reachability Probing.
236          *
237          * Router Reachability Probe MUST be rate-limited
238          * to no more than one per minute.
239          */
/* Cheap unlocked test first; the state is re-checked under the lock below. */
240         if (!neigh || (neigh->nud_state & NUD_VALID))
241                 return;
242         read_lock_bh(&neigh->lock);
243         if (!(neigh->nud_state & NUD_VALID) &&
244             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
245                 struct in6_addr mcaddr;
246                 struct in6_addr *target;
247
/* Record the probe time so the next probe is rate-limited.
 * NOTE(review): this writes neigh->updated while holding only the read side
 * of neigh->lock -- confirm that concurrent probers are acceptable here. */
248                 neigh->updated = jiffies;
/* The lock is dropped before the solicitation is sent. */
249                 read_unlock_bh(&neigh->lock);
250
251                 target = (struct in6_addr *)&neigh->primary_key;
252                 addrconf_addr_solict_mult(target, &mcaddr);
253                 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
254         } else
255                 read_unlock_bh(&neigh->lock);
256 }
257 #else
258 static inline void rt6_probe(struct rt6_info *rt)
259 {
260         return;
261 }
262 #endif
263
264 /*
265  * Default Router Selection (RFC 2461 6.3.6)
266  */
267 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
268 {
269         struct net_device *dev = rt->rt6i_dev;
270         if (!oif || dev->ifindex == oif)
271                 return 2;
272         if ((dev->flags & IFF_LOOPBACK) &&
273             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
274                 return 1;
275         return 0;
276 }
277
278 static int inline rt6_check_neigh(struct rt6_info *rt)
279 {
280         struct neighbour *neigh = rt->rt6i_nexthop;
281         int m = 0;
282         if (rt->rt6i_flags & RTF_NONEXTHOP ||
283             !(rt->rt6i_flags & RTF_GATEWAY))
284                 m = 1;
285         else if (neigh) {
286                 read_lock_bh(&neigh->lock);
287                 if (neigh->nud_state & NUD_VALID)
288                         m = 2;
289                 read_unlock_bh(&neigh->lock);
290         }
291         return m;
292 }
293
294 static int rt6_score_route(struct rt6_info *rt, int oif,
295                            int strict)
296 {
297         int m, n;
298                 
299         m = rt6_check_dev(rt, oif);
300         if (!m && (strict & RT6_SELECT_F_IFACE))
301                 return -1;
302 #ifdef CONFIG_IPV6_ROUTER_PREF
303         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
304 #endif
305         n = rt6_check_neigh(rt);
306         if (n > 1)
307                 m |= 16;
308         else if (!n && strict & RT6_SELECT_F_REACHABLE)
309                 return -1;
310         return m;
311 }
312
/*
 * Select the best route among the entries at the front of *head that share
 * the first entry's metric.
 *
 * Expired routes are skipped.  Each remaining candidate is scored with
 * rt6_score_route(); a negative score rejects it.  The highest-scoring
 * candidate wins, the first one seen winning ties.  Candidates that lose
 * (and a previous best when it is superseded) are passed to rt6_probe().
 *
 * If nothing matched, RT6_SELECT_F_REACHABLE was requested and more than one
 * unexpired candidate was seen, the first entry is moved behind the last
 * unexpired candidate so that a later call starts from a different entry.
 *
 * Returns the selected route, or &ip6_null_entry when none qualified.  No
 * reference is taken.
 */
313 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
314                                    int strict)
315 {
316         struct rt6_info *match = NULL, *last = NULL;
/* head is dereferenced unconditionally here, so it must not be NULL even
 * though the trace statement below tests it. */
317         struct rt6_info *rt, *rt0 = *head;
318         u32 metric;
319         int mpri = -1;
320
321         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
322                   __FUNCTION__, head, head ? *head : NULL, oif);
323
/* Walk only the run of entries whose metric equals the first entry's. */
324         for (rt = rt0, metric = rt0->rt6i_metric;
325              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
326              rt = rt->u.next) {
327                 int m;
328
329                 if (rt6_check_expired(rt))
330                         continue;
331
/* "last" tracks the final unexpired entry of the run, scored or not. */
332                 last = rt;
333
334                 m = rt6_score_route(rt, oif, strict);
335                 if (m < 0)
336                         continue;
337
338                 if (m > mpri) {
/* match may still be NULL here; rt6_probe() accepts a NULL route. */
339                         rt6_probe(match);
340                         match = rt;
341                         mpri = m;
342                 } else {
343                         rt6_probe(rt);
344                 }
345         }
346
347         if (!match &&
348             (strict & RT6_SELECT_F_REACHABLE) &&
349             last && last != rt0) {
350                 /* no entries matched; do round-robin */
/* NOTE(review): the callers visible in this file hold rt6_lock only for
 * reading, so this function-local spinlock is what serialises concurrent
 * rotations -- confirm that readers walking the list tolerate the relink. */
351                 static DEFINE_SPINLOCK(lock);
352                 spin_lock(&lock);
353                 *head = rt0->u.next;
354                 rt0->u.next = last->u.next;
355                 last->u.next = rt0;
356                 spin_unlock(&lock);
357         }
358
359         RT6_TRACE("%s() => %p, score=%d\n",
360                   __FUNCTION__, match, mpri);
361
362         return (match ? match : &ip6_null_entry);
363 }
364
365 #ifdef CONFIG_IPV6_ROUTE_INFO
366 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
367                   struct in6_addr *gwaddr)
368 {
369         struct route_info *rinfo = (struct route_info *) opt;
370         struct in6_addr prefix_buf, *prefix;
371         unsigned int pref;
372         u32 lifetime;
373         struct rt6_info *rt;
374
375         if (len < sizeof(struct route_info)) {
376                 return -EINVAL;
377         }
378
379         /* Sanity check for prefix_len and length */
380         if (rinfo->length > 3) {
381                 return -EINVAL;
382         } else if (rinfo->prefix_len > 128) {
383                 return -EINVAL;
384         } else if (rinfo->prefix_len > 64) {
385                 if (rinfo->length < 2) {
386                         return -EINVAL;
387                 }
388         } else if (rinfo->prefix_len > 0) {
389                 if (rinfo->length < 1) {
390                         return -EINVAL;
391                 }
392         }
393
394         pref = rinfo->route_pref;
395         if (pref == ICMPV6_ROUTER_PREF_INVALID)
396                 pref = ICMPV6_ROUTER_PREF_MEDIUM;
397
398         lifetime = htonl(rinfo->lifetime);
399         if (lifetime == 0xffffffff) {
400                 /* infinity */
401         } else if (lifetime > 0x7fffffff/HZ) {
402                 /* Avoid arithmetic overflow */
403                 lifetime = 0x7fffffff/HZ - 1;
404         }
405
406         if (rinfo->length == 3)
407                 prefix = (struct in6_addr *)rinfo->prefix;
408         else {
409                 /* this function is safe */
410                 ipv6_addr_prefix(&prefix_buf,
411                                  (struct in6_addr *)rinfo->prefix,
412                                  rinfo->prefix_len);
413                 prefix = &prefix_buf;
414         }
415
416         rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
417
418         if (rt && !lifetime) {
419                 ip6_del_rt(rt, NULL, NULL, NULL);
420                 rt = NULL;
421         }
422
423         if (!rt && lifetime)
424                 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
425                                         pref);
426         else if (rt)
427                 rt->rt6i_flags = RTF_ROUTEINFO |
428                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
429
430         if (rt) {
431                 if (lifetime == 0xffffffff) {
432                         rt->rt6i_flags &= ~RTF_EXPIRES;
433                 } else {
434                         rt->rt6i_expires = jiffies + HZ * lifetime;
435                         rt->rt6i_flags |= RTF_EXPIRES;
436                 }
437                 dst_release(&rt->u.dst);
438         }
439         return 0;
440 }
441 #endif
442
443 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
444                             int oif, int strict)
445 {
446         struct fib6_node *fn;
447         struct rt6_info *rt;
448
449         read_lock_bh(&rt6_lock);
450         fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
451         rt = rt6_device_match(fn->leaf, oif, strict);
452         dst_hold(&rt->u.dst);
453         rt->u.dst.__use++;
454         read_unlock_bh(&rt6_lock);
455
456         rt->u.dst.lastuse = jiffies;
457         if (rt->u.dst.error == 0)
458                 return rt;
459         dst_release(&rt->u.dst);
460         return NULL;
461 }
462
463 /* ip6_ins_rt is called with FREE rt6_lock.
464    It takes new route entry, the addition fails by any reason the
465    route is freed. In any case, if caller does not hold it, it may
466    be destroyed.
467  */
468
469 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
470                 void *_rtattr, struct netlink_skb_parms *req)
471 {
472         int err;
473
474         write_lock_bh(&rt6_lock);
475         err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
476         write_unlock_bh(&rt6_lock);
477
478         return err;
479 }
480
481 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
482                                       struct in6_addr *saddr)
483 {
484         struct rt6_info *rt;
485
486         /*
487          *      Clone the route.
488          */
489
490         rt = ip6_rt_copy(ort);
491
492         if (rt) {
493                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
494                         if (rt->rt6i_dst.plen != 128 &&
495                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
496                                 rt->rt6i_flags |= RTF_ANYCAST;
497                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
498                 }
499
500                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
501                 rt->rt6i_dst.plen = 128;
502                 rt->rt6i_flags |= RTF_CACHE;
503                 rt->u.dst.flags |= DST_HOST;
504
505 #ifdef CONFIG_IPV6_SUBTREES
506                 if (rt->rt6i_src.plen && saddr) {
507                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
508                         rt->rt6i_src.plen = 128;
509                 }
510 #endif
511
512                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
513
514         }
515
516         return rt;
517 }
518
519 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
520 {
521         struct rt6_info *rt = ip6_rt_copy(ort);
522         if (rt) {
523                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
524                 rt->rt6i_dst.plen = 128;
525                 rt->rt6i_flags |= RTF_CACHE;
526                 if (rt->rt6i_flags & RTF_REJECT)
527                         rt->u.dst.error = ort->u.dst.error;
528                 rt->u.dst.flags |= DST_HOST;
529                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
530         }
531         return rt;
532 }
533
/*
 * Walk up the FIB tree when the current node yielded no route.
 *
 * Expects the enclosing function to provide the variables "rt" and "fn" and
 * the labels "out" and "restart".  If rt is the null entry, fn is moved to
 * successive parents: reaching a root node jumps to "out", reaching a node
 * that carries route information jumps to "restart".  If the parent chain
 * ends without either, execution falls through with fn == NULL.
 *
 * Wrapped in do { } while (0) so that "BACKTRACK();" is a single statement
 * and cannot capture a following "else".
 */
#define BACKTRACK() \
do { \
        if (rt == &ip6_null_entry) { \
                while ((fn = fn->parent) != NULL) { \
                        if (fn->fn_flags & RTN_ROOT) { \
                                goto out; \
                        } \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
544
545
/*
 * Resolve the route for a received packet and store it in skb->dst.
 *
 * The lookup key is the packet's destination and source address; the
 * receiving device's ifindex is the interface hint.  Interface matching is
 * mandatory (RT6_SELECT_F_IFACE) for multicast and link-local destinations.
 *
 * The first pass asks rt6_select() for reachable routers only.  Whenever a
 * pass ends at "out" with that requirement still set, the lookup is repeated
 * once without it.
 *
 * If the chosen route has no nexthop and is not RTF_NONEXTHOP, a host cache
 * entry is created with rt6_alloc_cow() and inserted; on insertion failure
 * the whole lookup is retried, up to "attempts" times in total.
 *
 * The route stored in skb->dst always has a reference held for the skb.
 */
546 void ip6_route_input(struct sk_buff *skb)
547 {
548         struct fib6_node *fn;
549         struct rt6_info *rt, *nrt;
550         int strict;
551         int attempts = 3;
552         int err;
553         int reachable = RT6_SELECT_F_REACHABLE;
554
555         strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
556
557 relookup:
558         read_lock_bh(&rt6_lock);
559
/* Re-entered from "out" with the read lock still held. */
560 restart_2:
561         fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
562                          &skb->nh.ipv6h->saddr);
563
/* Re-entered from BACKTRACK() after fn has moved to a parent node. */
564 restart:
565         rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
566         BACKTRACK();
/* Nothing found, or an existing cache entry: no new entry is needed. */
567         if (rt == &ip6_null_entry ||
568             rt->rt6i_flags & RTF_CACHE)
569                 goto out;
570
/* Pin the route before dropping the lock; allocation happens unlocked. */
571         dst_hold(&rt->u.dst);
572         read_unlock_bh(&rt6_lock);
573
574         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
575                 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
576         else {
577 #if CLONE_OFFLINK_ROUTE
578                 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
579 #else
/* Use the route as-is; the reference taken above is handed to the skb. */
580                 goto out2;
581 #endif
582         }
583
/* Swap the pinned original for the new entry (or the null entry if the
 * allocation failed) and take the reference that the skb will own. */
584         dst_release(&rt->u.dst);
585         rt = nrt ? : &ip6_null_entry;
586
587         dst_hold(&rt->u.dst);
588         if (nrt) {
/* NOTE(review): NETLINK_CB(skb) is passed for a received packet here, while
 * ip6_route_output() passes NULL -- confirm this is intended. */
589                 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
590                 if (!err)
591                         goto out2;
592         }
593
/* Out of retries: return whatever rt currently is. */
594         if (--attempts <= 0)
595                 goto out2;
596
597         /*
598          * Race condition! In the gap, when rt6_lock was
599          * released someone could insert this route.  Relookup.
600          */
601         dst_release(&rt->u.dst);
602         goto relookup;
603
/* Reached with the read lock held and no reference on rt yet. */
604 out:
605         if (reachable) {
606                 reachable = 0;
607                 goto restart_2;
608         }
609         dst_hold(&rt->u.dst);
610         read_unlock_bh(&rt6_lock);
/* Reached with the lock released and one reference held on rt. */
611 out2:
612         rt->u.dst.lastuse = jiffies;
613         rt->u.dst.__use++;
614         skb->dst = (struct dst_entry *) rt;
615         return;
616 }
617
/*
 * Resolve the route for an outgoing flow.
 *
 * The lookup key is fl->fl6_dst / fl->fl6_src with fl->oif as the interface
 * hint.  Interface matching is mandatory (RT6_SELECT_F_IFACE) for multicast
 * and link-local destinations.  @sk is not used in this function.
 *
 * The first pass asks rt6_select() for reachable routers only.  Whenever a
 * pass ends at "out" with that requirement still set, the lookup is repeated
 * once without it.
 *
 * If the chosen route has no nexthop and is not RTF_NONEXTHOP, a host cache
 * entry is created with rt6_alloc_cow() and inserted; on insertion failure
 * the whole lookup is retried, up to "attempts" times in total.
 *
 * Always returns a dst entry with a reference held for the caller; this is
 * ip6_null_entry's dst when no route was found.
 */
618 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
619 {
620         struct fib6_node *fn;
621         struct rt6_info *rt, *nrt;
622         int strict;
623         int attempts = 3;
624         int err;
625         int reachable = RT6_SELECT_F_REACHABLE;
626
627         strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
628
629 relookup:
630         read_lock_bh(&rt6_lock);
631
/* Re-entered from "out" with the read lock still held. */
632 restart_2:
633         fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
634
/* Re-entered from BACKTRACK() after fn has moved to a parent node. */
635 restart:
636         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
637         BACKTRACK();
/* Nothing found, or an existing cache entry: no new entry is needed. */
638         if (rt == &ip6_null_entry ||
639             rt->rt6i_flags & RTF_CACHE)
640                 goto out;
641
/* Pin the route before dropping the lock; allocation happens unlocked. */
642         dst_hold(&rt->u.dst);
643         read_unlock_bh(&rt6_lock);
644
645         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
646                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
647         else {
648 #if CLONE_OFFLINK_ROUTE
649                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
650 #else
/* Use the route as-is; the reference taken above goes to the caller. */
651                 goto out2;
652 #endif
653         }
654
/* Swap the pinned original for the new entry (or the null entry if the
 * allocation failed) and take the reference that the caller will own. */
655         dst_release(&rt->u.dst);
656         rt = nrt ? : &ip6_null_entry;
657
658         dst_hold(&rt->u.dst);
659         if (nrt) {
660                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
661                 if (!err)
662                         goto out2;
663         }
664
/* Out of retries: return whatever rt currently is. */
665         if (--attempts <= 0)
666                 goto out2;
667
668         /*
669          * Race condition! In the gap, when rt6_lock was
670          * released someone could insert this route.  Relookup.
671          */
672         dst_release(&rt->u.dst);
673         goto relookup;
674
/* Reached with the read lock held and no reference on rt yet. */
675 out:
676         if (reachable) {
677                 reachable = 0;
678                 goto restart_2;
679         }
680         dst_hold(&rt->u.dst);
681         read_unlock_bh(&rt6_lock);
/* Reached with the lock released and one reference held on rt. */
682 out2:
683         rt->u.dst.lastuse = jiffies;
684         rt->u.dst.__use++;
685         return &rt->u.dst;
686 }
687
688
689 /*
690  *      Destination cache support functions
691  */
692
693 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
694 {
695         struct rt6_info *rt;
696
697         rt = (struct rt6_info *) dst;
698
699         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
700                 return dst;
701
702         return NULL;
703 }
704
705 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
706 {
707         struct rt6_info *rt = (struct rt6_info *) dst;
708
709         if (rt) {
710                 if (rt->rt6i_flags & RTF_CACHE)
711                         ip6_del_rt(rt, NULL, NULL, NULL);
712                 else
713                         dst_release(dst);
714         }
715         return NULL;
716 }
717
718 static void ip6_link_failure(struct sk_buff *skb)
719 {
720         struct rt6_info *rt;
721
722         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
723
724         rt = (struct rt6_info *) skb->dst;
725         if (rt) {
726                 if (rt->rt6i_flags&RTF_CACHE) {
727                         dst_set_expires(&rt->u.dst, 0);
728                         rt->rt6i_flags |= RTF_EXPIRES;
729                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
730                         rt->rt6i_node->fn_sernum = -1;
731         }
732 }
733
734 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
735 {
736         struct rt6_info *rt6 = (struct rt6_info*)dst;
737
738         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
739                 rt6->rt6i_flags |= RTF_MODIFIED;
740                 if (mtu < IPV6_MIN_MTU) {
741                         mtu = IPV6_MIN_MTU;
742                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
743                 }
744                 dst->metrics[RTAX_MTU-1] = mtu;
745         }
746 }
747
/*
 * Singly linked list (through dst->next) of dst entries created by
 * ndisc_dst_alloc().  ndisc_dst_gc() unlinks and frees entries whose
 * reference count has reached zero.  ndisc_dst_alloc() pushes onto the list
 * under the write side of rt6_lock; ndisc_dst_gc() takes no lock itself, so
 * its caller is presumably expected to hold it.
 */
748 /* Protected by rt6_lock.  */
749 static struct dst_entry *ndisc_dst_gc_list;
/* Defined further down; needed by ndisc_dst_alloc(). */
750 static int ipv6_get_mtu(struct net_device *dev);
751
752 static inline unsigned int ipv6_advmss(unsigned int mtu)
753 {
754         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
755
756         if (mtu < ip6_rt_min_advmss)
757                 mtu = ip6_rt_min_advmss;
758
759         /*
760          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
761          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
762          * IPV6_MAXPLEN is also valid and means: "any MSS, 
763          * rely only on pmtu discovery"
764          */
765         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
766                 mtu = IPV6_MAXPLEN;
767         return mtu;
768 }
769
770 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
771                                   struct neighbour *neigh,
772                                   struct in6_addr *addr,
773                                   int (*output)(struct sk_buff *))
774 {
775         struct rt6_info *rt;
776         struct inet6_dev *idev = in6_dev_get(dev);
777
778         if (unlikely(idev == NULL))
779                 return NULL;
780
781         rt = ip6_dst_alloc();
782         if (unlikely(rt == NULL)) {
783                 in6_dev_put(idev);
784                 goto out;
785         }
786
787         dev_hold(dev);
788         if (neigh)
789                 neigh_hold(neigh);
790         else
791                 neigh = ndisc_get_neigh(dev, addr);
792
793         rt->rt6i_dev      = dev;
794         rt->rt6i_idev     = idev;
795         rt->rt6i_nexthop  = neigh;
796         atomic_set(&rt->u.dst.__refcnt, 1);
797         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
798         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
799         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
800         rt->u.dst.output  = output;
801
802 #if 0   /* there's no chance to use these for ndisc */
803         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
804                                 ? DST_HOST 
805                                 : 0;
806         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
807         rt->rt6i_dst.plen = 128;
808 #endif
809
810         write_lock_bh(&rt6_lock);
811         rt->u.dst.next = ndisc_dst_gc_list;
812         ndisc_dst_gc_list = &rt->u.dst;
813         write_unlock_bh(&rt6_lock);
814
815         fib6_force_start_gc();
816
817 out:
818         return (struct dst_entry *)rt;
819 }
820
821 int ndisc_dst_gc(int *more)
822 {
823         struct dst_entry *dst, *next, **pprev;
824         int freed;
825
826         next = NULL;
827         pprev = &ndisc_dst_gc_list;
828         freed = 0;
829         while ((dst = *pprev) != NULL) {
830                 if (!atomic_read(&dst->__refcnt)) {
831                         *pprev = dst->next;
832                         dst_free(dst);
833                         freed++;
834                 } else {
835                         pprev = &dst->next;
836                         (*more)++;
837                 }
838         }
839
840         return freed;
841 }
842
843 static int ip6_dst_gc(void)
844 {
845         static unsigned expire = 30*HZ;
846         static unsigned long last_gc;
847         unsigned long now = jiffies;
848
849         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
850             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
851                 goto out;
852
853         expire++;
854         fib6_run_gc(expire);
855         last_gc = now;
856         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
857                 expire = ip6_rt_gc_timeout>>1;
858
859 out:
860         expire -= expire>>ip6_rt_gc_elasticity;
861         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
862 }
863
864 /* Clean host part of a prefix. Not necessary in radix tree,
865    but results in cleaner routing tables.
866
867    Remove it only when all the things will work!
868  */
869
870 static int ipv6_get_mtu(struct net_device *dev)
871 {
872         int mtu = IPV6_MIN_MTU;
873         struct inet6_dev *idev;
874
875         idev = in6_dev_get(dev);
876         if (idev) {
877                 mtu = idev->cnf.mtu6;
878                 in6_dev_put(idev);
879         }
880         return mtu;
881 }
882
883 int ipv6_get_hoplimit(struct net_device *dev)
884 {
885         int hoplimit = ipv6_devconf.hop_limit;
886         struct inet6_dev *idev;
887
888         idev = in6_dev_get(dev);
889         if (idev) {
890                 hoplimit = idev->cnf.hop_limit;
891                 in6_dev_put(idev);
892         }
893         return hoplimit;
894 }
895
896 /*
897  *
898  */
899
900 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
901                 void *_rtattr, struct netlink_skb_parms *req)
902 {
903         int err;
904         struct rtmsg *r;
905         struct rtattr **rta;
906         struct rt6_info *rt = NULL;
907         struct net_device *dev = NULL;
908         struct inet6_dev *idev = NULL;
909         int addr_type;
910
911         rta = (struct rtattr **) _rtattr;
912
913         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
914                 return -EINVAL;
915 #ifndef CONFIG_IPV6_SUBTREES
916         if (rtmsg->rtmsg_src_len)
917                 return -EINVAL;
918 #endif
919         if (rtmsg->rtmsg_ifindex) {
920                 err = -ENODEV;
921                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
922                 if (!dev)
923                         goto out;
924                 idev = in6_dev_get(dev);
925                 if (!idev)
926                         goto out;
927         }
928
929         if (rtmsg->rtmsg_metric == 0)
930                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
931
932         rt = ip6_dst_alloc();
933
934         if (rt == NULL) {
935                 err = -ENOMEM;
936                 goto out;
937         }
938
939         rt->u.dst.obsolete = -1;
940         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
941         if (nlh && (r = NLMSG_DATA(nlh))) {
942                 rt->rt6i_protocol = r->rtm_protocol;
943         } else {
944                 rt->rt6i_protocol = RTPROT_BOOT;
945         }
946
947         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
948
949         if (addr_type & IPV6_ADDR_MULTICAST)
950                 rt->u.dst.input = ip6_mc_input;
951         else
952                 rt->u.dst.input = ip6_forward;
953
954         rt->u.dst.output = ip6_output;
955
956         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
957                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
958         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
959         if (rt->rt6i_dst.plen == 128)
960                rt->u.dst.flags = DST_HOST;
961
962 #ifdef CONFIG_IPV6_SUBTREES
963         ipv6_addr_prefix(&rt->rt6i_src.addr, 
964                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
965         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
966 #endif
967
968         rt->rt6i_metric = rtmsg->rtmsg_metric;
969
970         /* We cannot add true routes via loopback here,
971            they would result in kernel looping; promote them to reject routes
972          */
973         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
974             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
975                 /* hold loopback dev/idev if we haven't done so. */
976                 if (dev != &loopback_dev) {
977                         if (dev) {
978                                 dev_put(dev);
979                                 in6_dev_put(idev);
980                         }
981                         dev = &loopback_dev;
982                         dev_hold(dev);
983                         idev = in6_dev_get(dev);
984                         if (!idev) {
985                                 err = -ENODEV;
986                                 goto out;
987                         }
988                 }
989                 rt->u.dst.output = ip6_pkt_discard_out;
990                 rt->u.dst.input = ip6_pkt_discard;
991                 rt->u.dst.error = -ENETUNREACH;
992                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
993                 goto install_route;
994         }
995
996         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
997                 struct in6_addr *gw_addr;
998                 int gwa_type;
999
1000                 gw_addr = &rtmsg->rtmsg_gateway;
1001                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1002                 gwa_type = ipv6_addr_type(gw_addr);
1003
1004                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1005                         struct rt6_info *grt;
1006
1007                         /* IPv6 strictly inhibits using not link-local
1008                            addresses as nexthop address.
1009                            Otherwise, router will not able to send redirects.
1010                            It is very good, but in some (rare!) circumstances
1011                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1012                            some exceptions. --ANK
1013                          */
1014                         err = -EINVAL;
1015                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1016                                 goto out;
1017
1018                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1019
1020                         err = -EHOSTUNREACH;
1021                         if (grt == NULL)
1022                                 goto out;
1023                         if (dev) {
1024                                 if (dev != grt->rt6i_dev) {
1025                                         dst_release(&grt->u.dst);
1026                                         goto out;
1027                                 }
1028                         } else {
1029                                 dev = grt->rt6i_dev;
1030                                 idev = grt->rt6i_idev;
1031                                 dev_hold(dev);
1032                                 in6_dev_hold(grt->rt6i_idev);
1033                         }
1034                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1035                                 err = 0;
1036                         dst_release(&grt->u.dst);
1037
1038                         if (err)
1039                                 goto out;
1040                 }
1041                 err = -EINVAL;
1042                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1043                         goto out;
1044         }
1045
1046         err = -ENODEV;
1047         if (dev == NULL)
1048                 goto out;
1049
1050         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1051                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1052                 if (IS_ERR(rt->rt6i_nexthop)) {
1053                         err = PTR_ERR(rt->rt6i_nexthop);
1054                         rt->rt6i_nexthop = NULL;
1055                         goto out;
1056                 }
1057         }
1058
1059         rt->rt6i_flags = rtmsg->rtmsg_flags;
1060
1061 install_route:
1062         if (rta && rta[RTA_METRICS-1]) {
1063                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1064                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1065
1066                 while (RTA_OK(attr, attrlen)) {
1067                         unsigned flavor = attr->rta_type;
1068                         if (flavor) {
1069                                 if (flavor > RTAX_MAX) {
1070                                         err = -EINVAL;
1071                                         goto out;
1072                                 }
1073                                 rt->u.dst.metrics[flavor-1] =
1074                                         *(u32 *)RTA_DATA(attr);
1075                         }
1076                         attr = RTA_NEXT(attr, attrlen);
1077                 }
1078         }
1079
1080         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1081                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1082         if (!rt->u.dst.metrics[RTAX_MTU-1])
1083                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1084         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1085                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1086         rt->u.dst.dev = dev;
1087         rt->rt6i_idev = idev;
1088         return ip6_ins_rt(rt, nlh, _rtattr, req);
1089
1090 out:
1091         if (dev)
1092                 dev_put(dev);
1093         if (idev)
1094                 in6_dev_put(idev);
1095         if (rt)
1096                 dst_free((struct dst_entry *) rt);
1097         return err;
1098 }
1099
1100 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1101 {
1102         int err;
1103
1104         write_lock_bh(&rt6_lock);
1105
1106         err = fib6_del(rt, nlh, _rtattr, req);
1107         dst_release(&rt->u.dst);
1108
1109         write_unlock_bh(&rt6_lock);
1110
1111         return err;
1112 }
1113
1114 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1115 {
1116         struct fib6_node *fn;
1117         struct rt6_info *rt;
1118         int err = -ESRCH;
1119
1120         read_lock_bh(&rt6_lock);
1121
1122         fn = fib6_locate(&ip6_routing_table,
1123                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1124                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1125         
1126         if (fn) {
1127                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1128                         if (rtmsg->rtmsg_ifindex &&
1129                             (rt->rt6i_dev == NULL ||
1130                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1131                                 continue;
1132                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1133                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1134                                 continue;
1135                         if (rtmsg->rtmsg_metric &&
1136                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1137                                 continue;
1138                         dst_hold(&rt->u.dst);
1139                         read_unlock_bh(&rt6_lock);
1140
1141                         return ip6_del_rt(rt, nlh, _rtattr, req);
1142                 }
1143         }
1144         read_unlock_bh(&rt6_lock);
1145
1146         return err;
1147 }
1148
1149 /*
1150  *      Handle redirects
1151  */
1152 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1153                   struct neighbour *neigh, u8 *lladdr, int on_link)
1154 {
1155         struct rt6_info *rt, *nrt = NULL;
1156         int strict;
1157         struct fib6_node *fn;
1158
1159         /*
1160          * Get the "current" route for this destination and
1161          * check if the redirect has come from approriate router.
1162          *
1163          * RFC 2461 specifies that redirects should only be
1164          * accepted if they come from the nexthop to the target.
1165          * Due to the way the routes are chosen, this notion
1166          * is a bit fuzzy and one might need to check all possible
1167          * routes.
1168          */
1169         strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1170
1171         read_lock_bh(&rt6_lock);
1172         fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1173 restart:
1174         for (rt = fn->leaf; rt; rt = rt->u.next) {
1175                 /*
1176                  * Current route is on-link; redirect is always invalid.
1177                  *
1178                  * Seems, previous statement is not true. It could
1179                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1180                  * But then router serving it might decide, that we should
1181                  * know truth 8)8) --ANK (980726).
1182                  */
1183                 if (rt6_check_expired(rt))
1184                         continue;
1185                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1186                         continue;
1187                 if (neigh->dev != rt->rt6i_dev)
1188                         continue;
1189                 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1190                         continue;
1191                 break;
1192         }
1193         if (rt)
1194                 dst_hold(&rt->u.dst);
1195         else if (strict) {
1196                 while ((fn = fn->parent) != NULL) {
1197                         if (fn->fn_flags & RTN_ROOT)
1198                                 break;
1199                         if (fn->fn_flags & RTN_RTINFO)
1200                                 goto restart;
1201                 }
1202         }
1203         read_unlock_bh(&rt6_lock);
1204
1205         if (!rt) {
1206                 if (net_ratelimit())
1207                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1208                                "for redirect target\n");
1209                 return;
1210         }
1211
1212         /*
1213          *      We have finally decided to accept it.
1214          */
1215
1216         neigh_update(neigh, lladdr, NUD_STALE, 
1217                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1218                      NEIGH_UPDATE_F_OVERRIDE|
1219                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1220                                      NEIGH_UPDATE_F_ISROUTER))
1221                      );
1222
1223         /*
1224          * Redirect received -> path was valid.
1225          * Look, redirects are sent only in response to data packets,
1226          * so that this nexthop apparently is reachable. --ANK
1227          */
1228         dst_confirm(&rt->u.dst);
1229
1230         /* Duplicate redirect: silently ignore. */
1231         if (neigh == rt->u.dst.neighbour)
1232                 goto out;
1233
1234         nrt = ip6_rt_copy(rt);
1235         if (nrt == NULL)
1236                 goto out;
1237
1238         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1239         if (on_link)
1240                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1241
1242         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1243         nrt->rt6i_dst.plen = 128;
1244         nrt->u.dst.flags |= DST_HOST;
1245
1246         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1247         nrt->rt6i_nexthop = neigh_clone(neigh);
1248         /* Reset pmtu, it may be better */
1249         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1250         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1251
1252         if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1253                 goto out;
1254
1255         if (rt->rt6i_flags&RTF_CACHE) {
1256                 ip6_del_rt(rt, NULL, NULL, NULL);
1257                 return;
1258         }
1259
1260 out:
1261         dst_release(&rt->u.dst);
1262         return;
1263 }
1264
1265 /*
1266  *      Handle ICMP "packet too big" messages
1267  *      i.e. Path MTU discovery
1268  */
1269
1270 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1271                         struct net_device *dev, u32 pmtu)
1272 {
1273         struct rt6_info *rt, *nrt;
1274         int allfrag = 0;
1275
1276         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1277         if (rt == NULL)
1278                 return;
1279
1280         if (pmtu >= dst_mtu(&rt->u.dst))
1281                 goto out;
1282
1283         if (pmtu < IPV6_MIN_MTU) {
1284                 /*
1285                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1286                  * MTU (1280) and a fragment header should always be included
1287                  * after a node receiving Too Big message reporting PMTU is
1288                  * less than the IPv6 Minimum Link MTU.
1289                  */
1290                 pmtu = IPV6_MIN_MTU;
1291                 allfrag = 1;
1292         }
1293
1294         /* New mtu received -> path was valid.
1295            They are sent only in response to data packets,
1296            so that this nexthop apparently is reachable. --ANK
1297          */
1298         dst_confirm(&rt->u.dst);
1299
1300         /* Host route. If it is static, it would be better
1301            not to override it, but add new one, so that
1302            when cache entry will expire old pmtu
1303            would return automatically.
1304          */
1305         if (rt->rt6i_flags & RTF_CACHE) {
1306                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1307                 if (allfrag)
1308                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1309                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1310                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1311                 goto out;
1312         }
1313
1314         /* Network route.
1315            Two cases are possible:
1316            1. It is connected route. Action: COW
1317            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1318          */
1319         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1320                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1321         else
1322                 nrt = rt6_alloc_clone(rt, daddr);
1323
1324         if (nrt) {
1325                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1326                 if (allfrag)
1327                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1328
1329                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1330                  * happened within 5 mins, the recommended timer is 10 mins.
1331                  * Here this route expiration time is set to ip6_rt_mtu_expires
1332                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1333                  * and detecting PMTU increase will be automatically happened.
1334                  */
1335                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1336                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1337
1338                 ip6_ins_rt(nrt, NULL, NULL, NULL);
1339         }
1340 out:
1341         dst_release(&rt->u.dst);
1342 }
1343
1344 /*
1345  *      Misc support functions
1346  */
1347
1348 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1349 {
1350         struct rt6_info *rt = ip6_dst_alloc();
1351
1352         if (rt) {
1353                 rt->u.dst.input = ort->u.dst.input;
1354                 rt->u.dst.output = ort->u.dst.output;
1355
1356                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1357                 rt->u.dst.dev = ort->u.dst.dev;
1358                 if (rt->u.dst.dev)
1359                         dev_hold(rt->u.dst.dev);
1360                 rt->rt6i_idev = ort->rt6i_idev;
1361                 if (rt->rt6i_idev)
1362                         in6_dev_hold(rt->rt6i_idev);
1363                 rt->u.dst.lastuse = jiffies;
1364                 rt->rt6i_expires = 0;
1365
1366                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1367                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1368                 rt->rt6i_metric = 0;
1369
1370                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1371 #ifdef CONFIG_IPV6_SUBTREES
1372                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1373 #endif
1374         }
1375         return rt;
1376 }
1377
1378 #ifdef CONFIG_IPV6_ROUTE_INFO
1379 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1380                                            struct in6_addr *gwaddr, int ifindex)
1381 {
1382         struct fib6_node *fn;
1383         struct rt6_info *rt = NULL;
1384
1385         write_lock_bh(&rt6_lock);
1386         fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1387         if (!fn)
1388                 goto out;
1389
1390         for (rt = fn->leaf; rt; rt = rt->u.next) {
1391                 if (rt->rt6i_dev->ifindex != ifindex)
1392                         continue;
1393                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1394                         continue;
1395                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1396                         continue;
1397                 dst_hold(&rt->u.dst);
1398                 break;
1399         }
1400 out:
1401         write_unlock_bh(&rt6_lock);
1402         return rt;
1403 }
1404
1405 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1406                                            struct in6_addr *gwaddr, int ifindex,
1407                                            unsigned pref)
1408 {
1409         struct in6_rtmsg rtmsg;
1410
1411         memset(&rtmsg, 0, sizeof(rtmsg));
1412         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1413         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1414         rtmsg.rtmsg_dst_len = prefixlen;
1415         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1416         rtmsg.rtmsg_metric = 1024;
1417         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1418         /* We should treat it as a default route if prefix length is 0. */
1419         if (!prefixlen)
1420                 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1421         rtmsg.rtmsg_ifindex = ifindex;
1422
1423         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1424
1425         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1426 }
1427 #endif
1428
1429 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1430 {       
1431         struct rt6_info *rt;
1432         struct fib6_node *fn;
1433
1434         fn = &ip6_routing_table;
1435
1436         write_lock_bh(&rt6_lock);
1437         for (rt = fn->leaf; rt; rt=rt->u.next) {
1438                 if (dev == rt->rt6i_dev &&
1439                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1440                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1441                         break;
1442         }
1443         if (rt)
1444                 dst_hold(&rt->u.dst);
1445         write_unlock_bh(&rt6_lock);
1446         return rt;
1447 }
1448
1449 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1450                                      struct net_device *dev,
1451                                      unsigned int pref)
1452 {
1453         struct in6_rtmsg rtmsg;
1454
1455         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1456         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1457         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1458         rtmsg.rtmsg_metric = 1024;
1459         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1460                             RTF_PREF(pref);
1461
1462         rtmsg.rtmsg_ifindex = dev->ifindex;
1463
1464         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1465         return rt6_get_dflt_router(gwaddr, dev);
1466 }
1467
1468 void rt6_purge_dflt_routers(void)
1469 {
1470         struct rt6_info *rt;
1471
1472 restart:
1473         read_lock_bh(&rt6_lock);
1474         for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1475                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1476                         dst_hold(&rt->u.dst);
1477
1478                         read_unlock_bh(&rt6_lock);
1479
1480                         ip6_del_rt(rt, NULL, NULL, NULL);
1481
1482                         goto restart;
1483                 }
1484         }
1485         read_unlock_bh(&rt6_lock);
1486 }
1487
1488 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1489 {
1490         struct in6_rtmsg rtmsg;
1491         int err;
1492
1493         switch(cmd) {
1494         case SIOCADDRT:         /* Add a route */
1495         case SIOCDELRT:         /* Delete a route */
1496                 if (!capable(CAP_NET_ADMIN))
1497                         return -EPERM;
1498                 err = copy_from_user(&rtmsg, arg,
1499                                      sizeof(struct in6_rtmsg));
1500                 if (err)
1501                         return -EFAULT;
1502                         
1503                 rtnl_lock();
1504                 switch (cmd) {
1505                 case SIOCADDRT:
1506                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1507                         break;
1508                 case SIOCDELRT:
1509                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1510                         break;
1511                 default:
1512                         err = -EINVAL;
1513                 }
1514                 rtnl_unlock();
1515
1516                 return err;
1517         };
1518
1519         return -EINVAL;
1520 }
1521
1522 /*
1523  *      Drop the packet on the floor
1524  */
1525
1526 static int ip6_pkt_discard(struct sk_buff *skb)
1527 {
1528         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1529         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1530         kfree_skb(skb);
1531         return 0;
1532 }
1533
1534 static int ip6_pkt_discard_out(struct sk_buff *skb)
1535 {
1536         skb->dev = skb->dst->dev;
1537         return ip6_pkt_discard(skb);
1538 }
1539
1540 /*
1541  *      Allocate a dst for local (unicast / anycast) address.
1542  */
1543
1544 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1545                                     const struct in6_addr *addr,
1546                                     int anycast)
1547 {
1548         struct rt6_info *rt = ip6_dst_alloc();
1549
1550         if (rt == NULL)
1551                 return ERR_PTR(-ENOMEM);
1552
1553         dev_hold(&loopback_dev);
1554         in6_dev_hold(idev);
1555
1556         rt->u.dst.flags = DST_HOST;
1557         rt->u.dst.input = ip6_input;
1558         rt->u.dst.output = ip6_output;
1559         rt->rt6i_dev = &loopback_dev;
1560         rt->rt6i_idev = idev;
1561         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1562         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1563         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1564         rt->u.dst.obsolete = -1;
1565
1566         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1567         if (anycast)
1568                 rt->rt6i_flags |= RTF_ANYCAST;
1569         else
1570                 rt->rt6i_flags |= RTF_LOCAL;
1571         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1572         if (rt->rt6i_nexthop == NULL) {
1573                 dst_free((struct dst_entry *) rt);
1574                 return ERR_PTR(-ENOMEM);
1575         }
1576
1577         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1578         rt->rt6i_dst.plen = 128;
1579
1580         atomic_set(&rt->u.dst.__refcnt, 1);
1581
1582         return rt;
1583 }
1584
1585 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1586 {
1587         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1588             rt != &ip6_null_entry) {
1589                 RT6_TRACE("deleted by ifdown %p\n", rt);
1590                 return -1;
1591         }
1592         return 0;
1593 }
1594
1595 void rt6_ifdown(struct net_device *dev)
1596 {
1597         write_lock_bh(&rt6_lock);
1598         fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1599         write_unlock_bh(&rt6_lock);
1600 }
1601
/* Argument bundle passed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU of that device */
};
1607
1608 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1609 {
1610         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1611         struct inet6_dev *idev;
1612
1613         /* In IPv6 pmtu discovery is not optional,
1614            so that RTAX_MTU lock cannot disable it.
1615            We still use this lock to block changes
1616            caused by addrconf/ndisc.
1617         */
1618
1619         idev = __in6_dev_get(arg->dev);
1620         if (idev == NULL)
1621                 return 0;
1622
1623         /* For administrative MTU increase, there is no way to discover
1624            IPv6 PMTU increase, so PMTU increase should be updated here.
1625            Since RFC 1981 doesn't include administrative MTU increase
1626            update PMTU increase is a MUST. (i.e. jumbo frame)
1627          */
1628         /*
1629            If new MTU is less than route PMTU, this new MTU will be the
1630            lowest MTU in the path, update the route PMTU to reflect PMTU
1631            decreases; if new MTU is greater than route PMTU, and the
1632            old MTU is the lowest MTU in the path, update the route PMTU
1633            to reflect the increase. In this case if the other nodes' MTU
1634            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1635            PMTU discouvery.
1636          */
1637         if (rt->rt6i_dev == arg->dev &&
1638             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1639             (dst_mtu(&rt->u.dst) > arg->mtu ||
1640              (dst_mtu(&rt->u.dst) < arg->mtu &&
1641               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1642                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1643         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1644         return 0;
1645 }
1646
1647 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1648 {
1649         struct rt6_mtu_change_arg arg;
1650
1651         arg.dev = dev;
1652         arg.mtu = mtu;
1653         read_lock_bh(&rt6_lock);
1654         fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1655         read_unlock_bh(&rt6_lock);
1656 }
1657
1658 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1659                               struct in6_rtmsg *rtmsg)
1660 {
1661         memset(rtmsg, 0, sizeof(*rtmsg));
1662
1663         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1664         rtmsg->rtmsg_src_len = r->rtm_src_len;
1665         rtmsg->rtmsg_flags = RTF_UP;
1666         if (r->rtm_type == RTN_UNREACHABLE)
1667                 rtmsg->rtmsg_flags |= RTF_REJECT;
1668
1669         if (rta[RTA_GATEWAY-1]) {
1670                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1671                         return -EINVAL;
1672                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1673                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1674         }
1675         if (rta[RTA_DST-1]) {
1676                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1677                         return -EINVAL;
1678                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1679         }
1680         if (rta[RTA_SRC-1]) {
1681                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1682                         return -EINVAL;
1683                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1684         }
1685         if (rta[RTA_OIF-1]) {
1686                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1687                         return -EINVAL;
1688                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1689         }
1690         if (rta[RTA_PRIORITY-1]) {
1691                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1692                         return -EINVAL;
1693                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1694         }
1695         return 0;
1696 }
1697
1698 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1699 {
1700         struct rtmsg *r = NLMSG_DATA(nlh);
1701         struct in6_rtmsg rtmsg;
1702
1703         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1704                 return -EINVAL;
1705         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1706 }
1707
1708 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1709 {
1710         struct rtmsg *r = NLMSG_DATA(nlh);
1711         struct in6_rtmsg rtmsg;
1712
1713         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1714                 return -EINVAL;
1715         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1716 }
1717
/*
 * Pairs an skb with a netlink callback.  Its users are outside this part
 * of the file; presumably the argument of a route-dump walker -- confirm
 * there.
 */
struct rt6_rtnl_dump_arg {
	struct sk_buff *skb;
	struct netlink_callback *cb;
};
1723
1724 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1725                          struct in6_addr *dst, struct in6_addr *src,
1726                          int iif, int type, u32 pid, u32 seq,
1727                          int prefix, unsigned int flags)
1728 {
1729         struct rtmsg *rtm;
1730         struct nlmsghdr  *nlh;
1731         unsigned char    *b = skb->tail;
1732         struct rta_cacheinfo ci;
1733
1734         if (prefix) {   /* user wants prefix routes only */
1735                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1736                         /* success since this is not a prefix route */
1737                         return 1;
1738                 }
1739         }
1740
1741         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1742         rtm = NLMSG_DATA(nlh);
1743         rtm->rtm_family = AF_INET6;
1744         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1745         rtm->rtm_src_len = rt->rt6i_src.plen;
1746         rtm->rtm_tos = 0;
1747         rtm->rtm_table = RT_TABLE_MAIN;
1748         if (rt->rt6i_flags&RTF_REJECT)
1749                 rtm->rtm_type = RTN_UNREACHABLE;
1750         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1751                 rtm->rtm_type = RTN_LOCAL;
1752         else
1753                 rtm->rtm_type = RTN_UNICAST;
1754         rtm->rtm_flags = 0;
1755         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1756         rtm->rtm_protocol = rt->rt6i_protocol;
1757         if (rt->rt6i_flags&RTF_DYNAMIC)
1758                 rtm->rtm_protocol = RTPROT_REDIRECT;
1759         else if (rt->rt6i_flags & RTF_ADDRCONF)
1760                 rtm->rtm_protocol = RTPROT_KERNEL;
1761         else if (rt->rt6i_flags&RTF_DEFAULT)
1762                 rtm->rtm_protocol = RTPROT_RA;
1763
1764         if (rt->rt6i_flags&RTF_CACHE)
1765                 rtm->rtm_flags |= RTM_F_CLONED;
1766
1767         if (dst) {
1768                 RTA_PUT(skb, RTA_DST, 16, dst);
1769                 rtm->rtm_dst_len = 128;
1770         } else if (rtm->rtm_dst_len)
1771                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1772 #ifdef CONFIG_IPV6_SUBTREES
1773         if (src) {
1774                 RTA_PUT(skb, RTA_SRC, 16, src);
1775                 rtm->rtm_src_len = 128;
1776         } else if (rtm->rtm_src_len)
1777                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1778 #endif
1779         if (iif)
1780                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1781         else if (dst) {
1782                 struct in6_addr saddr_buf;
1783                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1784                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1785         }
1786         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1787                 goto rtattr_failure;
1788         if (rt->u.dst.neighbour)
1789                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1790         if (rt->u.dst.dev)
1791                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1792         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1793         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1794         if (rt->rt6i_expires)
1795                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1796         else
1797                 ci.rta_expires = 0;
1798         ci.rta_used = rt->u.dst.__use;
1799         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1800         ci.rta_error = rt->u.dst.error;
1801         ci.rta_id = 0;
1802         ci.rta_ts = 0;
1803         ci.rta_tsage = 0;
1804         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1805         nlh->nlmsg_len = skb->tail - b;
1806         return skb->len;
1807
1808 nlmsg_failure:
1809 rtattr_failure:
1810         skb_trim(skb, b - skb->data);
1811         return -1;
1812 }
1813
1814 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1815 {
1816         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1817         int prefix;
1818
1819         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1820                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1821                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1822         } else
1823                 prefix = 0;
1824
1825         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1826                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1827                      prefix, NLM_F_MULTI);
1828 }
1829
1830 static int fib6_dump_node(struct fib6_walker_t *w)
1831 {
1832         int res;
1833         struct rt6_info *rt;
1834
1835         for (rt = w->leaf; rt; rt = rt->u.next) {
1836                 res = rt6_dump_route(rt, w->args);
1837                 if (res < 0) {
1838                         /* Frame is full, suspend walking */
1839                         w->leaf = rt;
1840                         return 1;
1841                 }
1842                 BUG_TRAP(res!=0);
1843         }
1844         w->leaf = NULL;
1845         return 0;
1846 }
1847
1848 static void fib6_dump_end(struct netlink_callback *cb)
1849 {
1850         struct fib6_walker_t *w = (void*)cb->args[0];
1851
1852         if (w) {
1853                 cb->args[0] = 0;
1854                 fib6_walker_unlink(w);
1855                 kfree(w);
1856         }
1857         cb->done = (void*)cb->args[1];
1858         cb->args[1] = 0;
1859 }
1860
1861 static int fib6_dump_done(struct netlink_callback *cb)
1862 {
1863         fib6_dump_end(cb);
1864         return cb->done ? cb->done(cb) : 0;
1865 }
1866
1867 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1868 {
1869         struct rt6_rtnl_dump_arg arg;
1870         struct fib6_walker_t *w;
1871         int res;
1872
1873         arg.skb = skb;
1874         arg.cb = cb;
1875
1876         w = (void*)cb->args[0];
1877         if (w == NULL) {
1878                 /* New dump:
1879                  * 
1880                  * 1. hook callback destructor.
1881                  */
1882                 cb->args[1] = (long)cb->done;
1883                 cb->done = fib6_dump_done;
1884
1885                 /*
1886                  * 2. allocate and initialize walker.
1887                  */
1888                 w = kzalloc(sizeof(*w), GFP_ATOMIC);
1889                 if (w == NULL)
1890                         return -ENOMEM;
1891                 RT6_TRACE("dump<%p", w);
1892                 w->root = &ip6_routing_table;
1893                 w->func = fib6_dump_node;
1894                 w->args = &arg;
1895                 cb->args[0] = (long)w;
1896                 read_lock_bh(&rt6_lock);
1897                 res = fib6_walk(w);
1898                 read_unlock_bh(&rt6_lock);
1899         } else {
1900                 w->args = &arg;
1901                 read_lock_bh(&rt6_lock);
1902                 res = fib6_walk_continue(w);
1903                 read_unlock_bh(&rt6_lock);
1904         }
1905 #if RT6_DEBUG >= 3
1906         if (res <= 0 && skb->len == 0)
1907                 RT6_TRACE("%p>dump end\n", w);
1908 #endif
1909         res = res < 0 ? res : skb->len;
1910         /* res < 0 is an error. (really, impossible)
1911            res == 0 means that dump is complete, but skb still can contain data.
1912            res > 0 dump is not complete, but frame is full.
1913          */
1914         /* Destroy walker, if dump of this table is complete. */
1915         if (res <= 0)
1916                 fib6_dump_end(cb);
1917         return res;
1918 }
1919
1920 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1921 {
1922         struct rtattr **rta = arg;
1923         int iif = 0;
1924         int err = -ENOBUFS;
1925         struct sk_buff *skb;
1926         struct flowi fl;
1927         struct rt6_info *rt;
1928
1929         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1930         if (skb == NULL)
1931                 goto out;
1932
1933         /* Reserve room for dummy headers, this skb can pass
1934            through good chunk of routing engine.
1935          */
1936         skb->mac.raw = skb->data;
1937         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1938
1939         memset(&fl, 0, sizeof(fl));
1940         if (rta[RTA_SRC-1])
1941                 ipv6_addr_copy(&fl.fl6_src,
1942                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1943         if (rta[RTA_DST-1])
1944                 ipv6_addr_copy(&fl.fl6_dst,
1945                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1946
1947         if (rta[RTA_IIF-1])
1948                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1949
1950         if (iif) {
1951                 struct net_device *dev;
1952                 dev = __dev_get_by_index(iif);
1953                 if (!dev) {
1954                         err = -ENODEV;
1955                         goto out_free;
1956                 }
1957         }
1958
1959         fl.oif = 0;
1960         if (rta[RTA_OIF-1])
1961                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1962
1963         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1964
1965         skb->dst = &rt->u.dst;
1966
1967         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1968         err = rt6_fill_node(skb, rt, 
1969                             &fl.fl6_dst, &fl.fl6_src,
1970                             iif,
1971                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1972                             nlh->nlmsg_seq, 0, 0);
1973         if (err < 0) {
1974                 err = -EMSGSIZE;
1975                 goto out_free;
1976         }
1977
1978         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1979         if (err > 0)
1980                 err = 0;
1981 out:
1982         return err;
1983 out_free:
1984         kfree_skb(skb);
1985         goto out;       
1986 }
1987
1988 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
1989                         struct netlink_skb_parms *req)
1990 {
1991         struct sk_buff *skb;
1992         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1993         u32 pid = current->pid;
1994         u32 seq = 0;
1995
1996         if (req)
1997                 pid = req->pid;
1998         if (nlh)
1999                 seq = nlh->nlmsg_seq;
2000         
2001         skb = alloc_skb(size, gfp_any());
2002         if (!skb) {
2003                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2004                 return;
2005         }
2006         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2007                 kfree_skb(skb);
2008                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2009                 return;
2010         }
2011         NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2012         netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2013 }
2014
2015 /*
2016  *      /proc
2017  */
2018
2019 #ifdef CONFIG_PROC_FS
2020
/*
 * Nominal size in bytes of one formatted route record; used to convert a
 * read offset into "records to skip" plus a remainder within a record.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and its per-route callback. */
struct rt6_proc_arg {
	char *buffer;	/* output buffer */
	int offset;	/* read offset requested by the caller */
	int length;	/* number of bytes requested by the caller */
	int skip;	/* records skipped so far */
	int len;	/* bytes written to buffer so far */
};
2031
2032 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2033 {
2034         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2035         int i;
2036
2037         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2038                 arg->skip++;
2039                 return 0;
2040         }
2041
2042         if (arg->len >= arg->length)
2043                 return 0;
2044
2045         for (i=0; i<16; i++) {
2046                 sprintf(arg->buffer + arg->len, "%02x",
2047                         rt->rt6i_dst.addr.s6_addr[i]);
2048                 arg->len += 2;
2049         }
2050         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2051                             rt->rt6i_dst.plen);
2052
2053 #ifdef CONFIG_IPV6_SUBTREES
2054         for (i=0; i<16; i++) {
2055                 sprintf(arg->buffer + arg->len, "%02x",
2056                         rt->rt6i_src.addr.s6_addr[i]);
2057                 arg->len += 2;
2058         }
2059         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2060                             rt->rt6i_src.plen);
2061 #else
2062         sprintf(arg->buffer + arg->len,
2063                 "00000000000000000000000000000000 00 ");
2064         arg->len += 36;
2065 #endif
2066
2067         if (rt->rt6i_nexthop) {
2068                 for (i=0; i<16; i++) {
2069                         sprintf(arg->buffer + arg->len, "%02x",
2070                                 rt->rt6i_nexthop->primary_key[i]);
2071                         arg->len += 2;
2072                 }
2073         } else {
2074                 sprintf(arg->buffer + arg->len,
2075                         "00000000000000000000000000000000");
2076                 arg->len += 32;
2077         }
2078         arg->len += sprintf(arg->buffer + arg->len,
2079                             " %08x %08x %08x %08x %8s\n",
2080                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2081                             rt->u.dst.__use, rt->rt6i_flags, 
2082                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2083         return 0;
2084 }
2085
2086 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2087 {
2088         struct rt6_proc_arg arg;
2089         arg.buffer = buffer;
2090         arg.offset = offset;
2091         arg.length = length;
2092         arg.skip = 0;
2093         arg.len = 0;
2094
2095         read_lock_bh(&rt6_lock);
2096         fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2097         read_unlock_bh(&rt6_lock);
2098
2099         *start = buffer;
2100         if (offset)
2101                 *start += offset % RT6_INFO_LEN;
2102
2103         arg.len -= offset % RT6_INFO_LEN;
2104
2105         if (arg.len > length)
2106                 arg.len = length;
2107         if (arg.len < 0)
2108                 arg.len = 0;
2109
2110         return arg.len;
2111 }
2112
2113 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2114 {
2115         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2116                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2117                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2118                       rt6_stats.fib_rt_cache,
2119                       atomic_read(&ip6_dst_ops.entries),
2120                       rt6_stats.fib_discarded_routes);
2121
2122         return 0;
2123 }
2124
2125 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2126 {
2127         return single_open(file, rt6_stats_seq_show, NULL);
2128 }
2129
2130 static struct file_operations rt6_stats_seq_fops = {
2131         .owner   = THIS_MODULE,
2132         .open    = rt6_stats_seq_open,
2133         .read    = seq_read,
2134         .llseek  = seq_lseek,
2135         .release = single_release,
2136 };
2137 #endif  /* CONFIG_PROC_FS */
2138
2139 #ifdef CONFIG_SYSCTL
2140
2141 static int flush_delay;
2142
2143 static
2144 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2145                               void __user *buffer, size_t *lenp, loff_t *ppos)
2146 {
2147         if (write) {
2148                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2149                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2150                 return 0;
2151         } else
2152                 return -EINVAL;
2153 }
2154
2155 ctl_table ipv6_route_table[] = {
2156         {
2157                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2158                 .procname       =       "flush",
2159                 .data           =       &flush_delay,
2160                 .maxlen         =       sizeof(int),
2161                 .mode           =       0200,
2162                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2163         },
2164         {
2165                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2166                 .procname       =       "gc_thresh",
2167                 .data           =       &ip6_dst_ops.gc_thresh,
2168                 .maxlen         =       sizeof(int),
2169                 .mode           =       0644,
2170                 .proc_handler   =       &proc_dointvec,
2171         },
2172         {
2173                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2174                 .procname       =       "max_size",
2175                 .data           =       &ip6_rt_max_size,
2176                 .maxlen         =       sizeof(int),
2177                 .mode           =       0644,
2178                 .proc_handler   =       &proc_dointvec,
2179         },
2180         {
2181                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2182                 .procname       =       "gc_min_interval",
2183                 .data           =       &ip6_rt_gc_min_interval,
2184                 .maxlen         =       sizeof(int),
2185                 .mode           =       0644,
2186                 .proc_handler   =       &proc_dointvec_jiffies,
2187                 .strategy       =       &sysctl_jiffies,
2188         },
2189         {
2190                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2191                 .procname       =       "gc_timeout",
2192                 .data           =       &ip6_rt_gc_timeout,
2193                 .maxlen         =       sizeof(int),
2194                 .mode           =       0644,
2195                 .proc_handler   =       &proc_dointvec_jiffies,
2196                 .strategy       =       &sysctl_jiffies,
2197         },
2198         {
2199                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2200                 .procname       =       "gc_interval",
2201                 .data           =       &ip6_rt_gc_interval,
2202                 .maxlen         =       sizeof(int),
2203                 .mode           =       0644,
2204                 .proc_handler   =       &proc_dointvec_jiffies,
2205                 .strategy       =       &sysctl_jiffies,
2206         },
2207         {
2208                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2209                 .procname       =       "gc_elasticity",
2210                 .data           =       &ip6_rt_gc_elasticity,
2211                 .maxlen         =       sizeof(int),
2212                 .mode           =       0644,
2213                 .proc_handler   =       &proc_dointvec_jiffies,
2214                 .strategy       =       &sysctl_jiffies,
2215         },
2216         {
2217                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2218                 .procname       =       "mtu_expires",
2219                 .data           =       &ip6_rt_mtu_expires,
2220                 .maxlen         =       sizeof(int),
2221                 .mode           =       0644,
2222                 .proc_handler   =       &proc_dointvec_jiffies,
2223                 .strategy       =       &sysctl_jiffies,
2224         },
2225         {
2226                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2227                 .procname       =       "min_adv_mss",
2228                 .data           =       &ip6_rt_min_advmss,
2229                 .maxlen         =       sizeof(int),
2230                 .mode           =       0644,
2231                 .proc_handler   =       &proc_dointvec_jiffies,
2232                 .strategy       =       &sysctl_jiffies,
2233         },
2234         {
2235                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2236                 .procname       =       "gc_min_interval_ms",
2237                 .data           =       &ip6_rt_gc_min_interval,
2238                 .maxlen         =       sizeof(int),
2239                 .mode           =       0644,
2240                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2241                 .strategy       =       &sysctl_ms_jiffies,
2242         },
2243         { .ctl_name = 0 }
2244 };
2245
2246 #endif
2247
2248 void __init ip6_route_init(void)
2249 {
2250         struct proc_dir_entry *p;
2251
2252         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2253                                                      sizeof(struct rt6_info),
2254                                                      0, SLAB_HWCACHE_ALIGN,
2255                                                      NULL, NULL);
2256         if (!ip6_dst_ops.kmem_cachep)
2257                 panic("cannot create ip6_dst_cache");
2258
2259         fib6_init();
2260 #ifdef  CONFIG_PROC_FS
2261         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2262         if (p)
2263                 p->owner = THIS_MODULE;
2264
2265         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2266 #endif
2267 #ifdef CONFIG_XFRM
2268         xfrm6_init();
2269 #endif
2270 }
2271
2272 void ip6_route_cleanup(void)
2273 {
2274 #ifdef CONFIG_PROC_FS
2275         proc_net_remove("ipv6_route");
2276         proc_net_remove("rt6_stats");
2277 #endif
2278 #ifdef CONFIG_XFRM
2279         xfrm6_fini();
2280 #endif
2281         rt6_ifdown(NULL);
2282         fib6_gc_cleanup();
2283         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2284 }