Merge branch 'release' of git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6
[sfrench/cifs-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/netlink.h>
39 #include <linux/if_arp.h>
40
41 #ifdef  CONFIG_PROC_FS
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #endif
45
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  The tracing macros below produce
 * output only when RT6_DEBUG is 3 or more; at lower levels they expand
 * to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
/* RDBG() takes a complete, parenthesised printk() argument list. */
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74
/*
 * When non-zero, the route lookup paths clone routes that already have a
 * nexthop (or are RTF_NONEXTHOP) via rt6_alloc_clone(); when zero such
 * routes are returned as they are.
 */
#define CLONE_OFFLINK_ROUTE 0

/* Bits of the "strict" argument of rt6_select() / rt6_score_route(). */
#define RT6_SELECT_F_IFACE      (1 << 0)	/* route must match the interface */
#define RT6_SELECT_F_REACHABLE  (1 << 1)	/* rt6_check_neigh() must be non-zero */
79
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(void);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct sk_buff *skb);
98 static void             ip6_link_failure(struct sk_buff *skb);
99 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103                                            struct in6_addr *gwaddr, int ifindex,
104                                            unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106                                            struct in6_addr *gwaddr, int ifindex);
107 #endif
108
109 static struct dst_ops ip6_dst_ops = {
110         .family                 =       AF_INET6,
111         .protocol               =       __constant_htons(ETH_P_IPV6),
112         .gc                     =       ip6_dst_gc,
113         .gc_thresh              =       1024,
114         .check                  =       ip6_dst_check,
115         .destroy                =       ip6_dst_destroy,
116         .ifdown                 =       ip6_dst_ifdown,
117         .negative_advice        =       ip6_negative_advice,
118         .link_failure           =       ip6_link_failure,
119         .update_pmtu            =       ip6_rt_update_pmtu,
120         .entry_size             =       sizeof(struct rt6_info),
121 };
122
123 struct rt6_info ip6_null_entry = {
124         .u = {
125                 .dst = {
126                         .__refcnt       = ATOMIC_INIT(1),
127                         .__use          = 1,
128                         .dev            = &loopback_dev,
129                         .obsolete       = -1,
130                         .error          = -ENETUNREACH,
131                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
132                         .input          = ip6_pkt_discard,
133                         .output         = ip6_pkt_discard_out,
134                         .ops            = &ip6_dst_ops,
135                         .path           = (struct dst_entry*)&ip6_null_entry,
136                 }
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
143 struct fib6_node ip6_routing_table = {
144         .leaf           = &ip6_null_entry,
145         .fn_flags       = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147
/*
 * Protects all the ip6 fib.  Lookups in this file take it for reading,
 * insertions (ip6_ins_rt) for writing, always with bottom halves disabled.
 */
DEFINE_RWLOCK(rt6_lock);
151
152
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161         struct rt6_info *rt = (struct rt6_info *)dst;
162         struct inet6_dev *idev = rt->rt6i_idev;
163
164         if (idev != NULL) {
165                 rt->rt6i_idev = NULL;
166                 in6_dev_put(idev);
167         }       
168 }
169
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171                            int how)
172 {
173         struct rt6_info *rt = (struct rt6_info *)dst;
174         struct inet6_dev *idev = rt->rt6i_idev;
175
176         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178                 if (loopback_idev != NULL) {
179                         rt->rt6i_idev = loopback_idev;
180                         in6_dev_put(idev);
181                 }
182         }
183 }
184
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187         return (rt->rt6i_flags & RTF_EXPIRES &&
188                 time_after(jiffies, rt->rt6i_expires));
189 }
190
191 /*
192  *      Route lookup. Any rt6_lock is implied.
193  */
194
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196                                                     int oif,
197                                                     int strict)
198 {
199         struct rt6_info *local = NULL;
200         struct rt6_info *sprt;
201
202         if (oif) {
203                 for (sprt = rt; sprt; sprt = sprt->u.next) {
204                         struct net_device *dev = sprt->rt6i_dev;
205                         if (dev->ifindex == oif)
206                                 return sprt;
207                         if (dev->flags & IFF_LOOPBACK) {
208                                 if (sprt->rt6i_idev == NULL ||
209                                     sprt->rt6i_idev->dev->ifindex != oif) {
210                                         if (strict && oif)
211                                                 continue;
212                                         if (local && (!oif || 
213                                                       local->rt6i_idev->dev->ifindex == oif))
214                                                 continue;
215                                 }
216                                 local = sprt;
217                         }
218                 }
219
220                 if (local)
221                         return local;
222
223                 if (strict)
224                         return &ip6_null_entry;
225         }
226         return rt;
227 }
228
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation towards the
 * route's nexthop when its neighbour entry is not in a NUD_VALID state.
 *
 * Probes are rate-limited: one is only sent when more than
 * cnf.rtr_probe_interval jiffies have passed since neigh->updated, and
 * neigh->updated is refreshed before sending.  A NULL rt, or a route
 * without a nexthop, is ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int due;

	if (!rt)
		return;

	neigh = rt->rt6i_nexthop;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/* Re-check the state and the rate limit under the neighbour lock. */
	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies,
			 neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	/* Solicit the nexthop on its solicited-node multicast address. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
264
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270         struct net_device *dev = rt->rt6i_dev;
271         if (!oif || dev->ifindex == oif)
272                 return 2;
273         if ((dev->flags & IFF_LOOPBACK) &&
274             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275                 return 1;
276         return 0;
277 }
278
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281         struct neighbour *neigh = rt->rt6i_nexthop;
282         int m = 0;
283         if (rt->rt6i_flags & RTF_NONEXTHOP ||
284             !(rt->rt6i_flags & RTF_GATEWAY))
285                 m = 1;
286         else if (neigh) {
287                 read_lock_bh(&neigh->lock);
288                 if (neigh->nud_state & NUD_VALID)
289                         m = 2;
290                 read_unlock_bh(&neigh->lock);
291         }
292         return m;
293 }
294
295 static int rt6_score_route(struct rt6_info *rt, int oif,
296                            int strict)
297 {
298         int m, n;
299                 
300         m = rt6_check_dev(rt, oif);
301         if (!m && (strict & RT6_SELECT_F_IFACE))
302                 return -1;
303 #ifdef CONFIG_IPV6_ROUTER_PREF
304         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
305 #endif
306         n = rt6_check_neigh(rt);
307         if (n > 1)
308                 m |= 16;
309         else if (!n && strict & RT6_SELECT_F_REACHABLE)
310                 return -1;
311         return m;
312 }
313
314 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
315                                    int strict)
316 {
317         struct rt6_info *match = NULL, *last = NULL;
318         struct rt6_info *rt, *rt0 = *head;
319         u32 metric;
320         int mpri = -1;
321
322         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
323                   __FUNCTION__, head, head ? *head : NULL, oif);
324
325         for (rt = rt0, metric = rt0->rt6i_metric;
326              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
327              rt = rt->u.next) {
328                 int m;
329
330                 if (rt6_check_expired(rt))
331                         continue;
332
333                 last = rt;
334
335                 m = rt6_score_route(rt, oif, strict);
336                 if (m < 0)
337                         continue;
338
339                 if (m > mpri) {
340                         rt6_probe(match);
341                         match = rt;
342                         mpri = m;
343                 } else {
344                         rt6_probe(rt);
345                 }
346         }
347
348         if (!match &&
349             (strict & RT6_SELECT_F_REACHABLE) &&
350             last && last != rt0) {
351                 /* no entries matched; do round-robin */
352                 static DEFINE_SPINLOCK(lock);
353                 spin_lock(&lock);
354                 *head = rt0->u.next;
355                 rt0->u.next = last->u.next;
356                 last->u.next = rt0;
357                 spin_unlock(&lock);
358         }
359
360         RT6_TRACE("%s() => %p, score=%d\n",
361                   __FUNCTION__, match, mpri);
362
363         return (match ? match : &ip6_null_entry);
364 }
365
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option (NOTE(review): presumably
 * the RFC 4191 Route Information Option - confirm against the caller).
 *
 * @dev:    device the option arrived on; its ifindex keys the route
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as the gateway
 *
 * A lifetime of zero deletes an existing matching route; otherwise the
 * route is created if missing or has its preference refreshed.  A lifetime
 * of 0xffffffff means "never expires"; any other value sets RTF_EXPIRES
 * with an expiry of jiffies + HZ * lifetime.
 *
 * Returns 0 on success or -EINVAL when the option is too short or its
 * length / prefix_len fields are inconsistent.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	/* An invalid preference value is treated as medium. */
	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/*
	 * The lifetime arrives in network byte order, so the conversion is
	 * network-to-host: ntohl(), not htonl().
	 */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		/* the option carries a full 128-bit prefix: use it in place */
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* Zero lifetime withdraws a previously learned route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
443
444 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
445                             int oif, int strict)
446 {
447         struct fib6_node *fn;
448         struct rt6_info *rt;
449
450         read_lock_bh(&rt6_lock);
451         fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
452         rt = rt6_device_match(fn->leaf, oif, strict);
453         dst_hold(&rt->u.dst);
454         rt->u.dst.__use++;
455         read_unlock_bh(&rt6_lock);
456
457         rt->u.dst.lastuse = jiffies;
458         if (rt->u.dst.error == 0)
459                 return rt;
460         dst_release(&rt->u.dst);
461         return NULL;
462 }
463
464 /* ip6_ins_rt is called with FREE rt6_lock.
465    It takes new route entry, the addition fails by any reason the
466    route is freed. In any case, if caller does not hold it, it may
467    be destroyed.
468  */
469
470 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
471                 void *_rtattr, struct netlink_skb_parms *req)
472 {
473         int err;
474
475         write_lock_bh(&rt6_lock);
476         err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
477         write_unlock_bh(&rt6_lock);
478
479         return err;
480 }
481
482 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
483                                       struct in6_addr *saddr)
484 {
485         struct rt6_info *rt;
486
487         /*
488          *      Clone the route.
489          */
490
491         rt = ip6_rt_copy(ort);
492
493         if (rt) {
494                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
495                         if (rt->rt6i_dst.plen != 128 &&
496                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
497                                 rt->rt6i_flags |= RTF_ANYCAST;
498                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
499                 }
500
501                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
502                 rt->rt6i_dst.plen = 128;
503                 rt->rt6i_flags |= RTF_CACHE;
504                 rt->u.dst.flags |= DST_HOST;
505
506 #ifdef CONFIG_IPV6_SUBTREES
507                 if (rt->rt6i_src.plen && saddr) {
508                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
509                         rt->rt6i_src.plen = 128;
510                 }
511 #endif
512
513                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
514
515         }
516
517         return rt;
518 }
519
520 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
521 {
522         struct rt6_info *rt = ip6_rt_copy(ort);
523         if (rt) {
524                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
525                 rt->rt6i_dst.plen = 128;
526                 rt->rt6i_flags |= RTF_CACHE;
527                 if (rt->rt6i_flags & RTF_REJECT)
528                         rt->u.dst.error = ort->u.dst.error;
529                 rt->u.dst.flags |= DST_HOST;
530                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
531         }
532         return rt;
533 }
534
/*
 * Lookup helper; relies on the caller's local variables "rt" and "fn" and
 * on its labels "out" and "restart".
 *
 * When the current selection is the null entry, climb towards the root of
 * the tree: jump to "out" on reaching a root node, or to "restart" on the
 * first ancestor that carries route information.  If the parent chain
 * ends first, fn is left NULL and execution continues after the macro.
 */
#define BACKTRACK() \
if (rt == &ip6_null_entry) { \
	for (fn = fn->parent; fn != NULL; fn = fn->parent) { \
		if (fn->fn_flags & RTN_ROOT) \
			goto out; \
		if (fn->fn_flags & RTN_RTINFO) \
			goto restart; \
	} \
}
545
546
547 void ip6_route_input(struct sk_buff *skb)
548 {
549         struct fib6_node *fn;
550         struct rt6_info *rt, *nrt;
551         int strict;
552         int attempts = 3;
553         int err;
554         int reachable = RT6_SELECT_F_REACHABLE;
555
556         strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
557
558 relookup:
559         read_lock_bh(&rt6_lock);
560
561 restart_2:
562         fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
563                          &skb->nh.ipv6h->saddr);
564
565 restart:
566         rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
567         BACKTRACK();
568         if (rt == &ip6_null_entry ||
569             rt->rt6i_flags & RTF_CACHE)
570                 goto out;
571
572         dst_hold(&rt->u.dst);
573         read_unlock_bh(&rt6_lock);
574
575         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
576                 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
577         else {
578 #if CLONE_OFFLINK_ROUTE
579                 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
580 #else
581                 goto out2;
582 #endif
583         }
584
585         dst_release(&rt->u.dst);
586         rt = nrt ? : &ip6_null_entry;
587
588         dst_hold(&rt->u.dst);
589         if (nrt) {
590                 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
591                 if (!err)
592                         goto out2;
593         }
594
595         if (--attempts <= 0)
596                 goto out2;
597
598         /*
599          * Race condition! In the gap, when rt6_lock was
600          * released someone could insert this route.  Relookup.
601          */
602         dst_release(&rt->u.dst);
603         goto relookup;
604
605 out:
606         if (reachable) {
607                 reachable = 0;
608                 goto restart_2;
609         }
610         dst_hold(&rt->u.dst);
611         read_unlock_bh(&rt6_lock);
612 out2:
613         rt->u.dst.lastuse = jiffies;
614         rt->u.dst.__use++;
615         skb->dst = (struct dst_entry *) rt;
616         return;
617 }
618
619 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
620 {
621         struct fib6_node *fn;
622         struct rt6_info *rt, *nrt;
623         int strict;
624         int attempts = 3;
625         int err;
626         int reachable = RT6_SELECT_F_REACHABLE;
627
628         strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
629
630 relookup:
631         read_lock_bh(&rt6_lock);
632
633 restart_2:
634         fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
635
636 restart:
637         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
638         BACKTRACK();
639         if (rt == &ip6_null_entry ||
640             rt->rt6i_flags & RTF_CACHE)
641                 goto out;
642
643         dst_hold(&rt->u.dst);
644         read_unlock_bh(&rt6_lock);
645
646         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
647                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
648         else {
649 #if CLONE_OFFLINK_ROUTE
650                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
651 #else
652                 goto out2;
653 #endif
654         }
655
656         dst_release(&rt->u.dst);
657         rt = nrt ? : &ip6_null_entry;
658
659         dst_hold(&rt->u.dst);
660         if (nrt) {
661                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
662                 if (!err)
663                         goto out2;
664         }
665
666         if (--attempts <= 0)
667                 goto out2;
668
669         /*
670          * Race condition! In the gap, when rt6_lock was
671          * released someone could insert this route.  Relookup.
672          */
673         dst_release(&rt->u.dst);
674         goto relookup;
675
676 out:
677         if (reachable) {
678                 reachable = 0;
679                 goto restart_2;
680         }
681         dst_hold(&rt->u.dst);
682         read_unlock_bh(&rt6_lock);
683 out2:
684         rt->u.dst.lastuse = jiffies;
685         rt->u.dst.__use++;
686         return &rt->u.dst;
687 }
688
689
690 /*
691  *      Destination cache support functions
692  */
693
694 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
695 {
696         struct rt6_info *rt;
697
698         rt = (struct rt6_info *) dst;
699
700         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
701                 return dst;
702
703         return NULL;
704 }
705
706 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
707 {
708         struct rt6_info *rt = (struct rt6_info *) dst;
709
710         if (rt) {
711                 if (rt->rt6i_flags & RTF_CACHE)
712                         ip6_del_rt(rt, NULL, NULL, NULL);
713                 else
714                         dst_release(dst);
715         }
716         return NULL;
717 }
718
719 static void ip6_link_failure(struct sk_buff *skb)
720 {
721         struct rt6_info *rt;
722
723         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
724
725         rt = (struct rt6_info *) skb->dst;
726         if (rt) {
727                 if (rt->rt6i_flags&RTF_CACHE) {
728                         dst_set_expires(&rt->u.dst, 0);
729                         rt->rt6i_flags |= RTF_EXPIRES;
730                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
731                         rt->rt6i_node->fn_sernum = -1;
732         }
733 }
734
735 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
736 {
737         struct rt6_info *rt6 = (struct rt6_info*)dst;
738
739         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
740                 rt6->rt6i_flags |= RTF_MODIFIED;
741                 if (mtu < IPV6_MIN_MTU) {
742                         mtu = IPV6_MIN_MTU;
743                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
744                 }
745                 dst->metrics[RTAX_MTU-1] = mtu;
746                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
747         }
748 }
749
/*
 * Singly linked list (through dst->next) of the dst entries created by
 * ndisc_dst_alloc().  Protected by rt6_lock.
 */
static struct dst_entry *ndisc_dst_gc_list;

static int ipv6_get_mtu(struct net_device *dev);
753
754 static inline unsigned int ipv6_advmss(unsigned int mtu)
755 {
756         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
757
758         if (mtu < ip6_rt_min_advmss)
759                 mtu = ip6_rt_min_advmss;
760
761         /*
762          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
763          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
764          * IPV6_MAXPLEN is also valid and means: "any MSS, 
765          * rely only on pmtu discovery"
766          */
767         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
768                 mtu = IPV6_MAXPLEN;
769         return mtu;
770 }
771
772 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
773                                   struct neighbour *neigh,
774                                   struct in6_addr *addr,
775                                   int (*output)(struct sk_buff *))
776 {
777         struct rt6_info *rt;
778         struct inet6_dev *idev = in6_dev_get(dev);
779
780         if (unlikely(idev == NULL))
781                 return NULL;
782
783         rt = ip6_dst_alloc();
784         if (unlikely(rt == NULL)) {
785                 in6_dev_put(idev);
786                 goto out;
787         }
788
789         dev_hold(dev);
790         if (neigh)
791                 neigh_hold(neigh);
792         else
793                 neigh = ndisc_get_neigh(dev, addr);
794
795         rt->rt6i_dev      = dev;
796         rt->rt6i_idev     = idev;
797         rt->rt6i_nexthop  = neigh;
798         atomic_set(&rt->u.dst.__refcnt, 1);
799         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
800         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
801         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
802         rt->u.dst.output  = output;
803
804 #if 0   /* there's no chance to use these for ndisc */
805         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
806                                 ? DST_HOST 
807                                 : 0;
808         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
809         rt->rt6i_dst.plen = 128;
810 #endif
811
812         write_lock_bh(&rt6_lock);
813         rt->u.dst.next = ndisc_dst_gc_list;
814         ndisc_dst_gc_list = &rt->u.dst;
815         write_unlock_bh(&rt6_lock);
816
817         fib6_force_start_gc();
818
819 out:
820         return (struct dst_entry *)rt;
821 }
822
823 int ndisc_dst_gc(int *more)
824 {
825         struct dst_entry *dst, *next, **pprev;
826         int freed;
827
828         next = NULL;
829         pprev = &ndisc_dst_gc_list;
830         freed = 0;
831         while ((dst = *pprev) != NULL) {
832                 if (!atomic_read(&dst->__refcnt)) {
833                         *pprev = dst->next;
834                         dst_free(dst);
835                         freed++;
836                 } else {
837                         pprev = &dst->next;
838                         (*more)++;
839                 }
840         }
841
842         return freed;
843 }
844
845 static int ip6_dst_gc(void)
846 {
847         static unsigned expire = 30*HZ;
848         static unsigned long last_gc;
849         unsigned long now = jiffies;
850
851         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
852             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
853                 goto out;
854
855         expire++;
856         fib6_run_gc(expire);
857         last_gc = now;
858         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
859                 expire = ip6_rt_gc_timeout>>1;
860
861 out:
862         expire -= expire>>ip6_rt_gc_elasticity;
863         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
864 }
865
866 /* Clean host part of a prefix. Not necessary in radix tree,
867    but results in cleaner routing tables.
868
869    Remove it only when all the things will work!
870  */
871
872 static int ipv6_get_mtu(struct net_device *dev)
873 {
874         int mtu = IPV6_MIN_MTU;
875         struct inet6_dev *idev;
876
877         idev = in6_dev_get(dev);
878         if (idev) {
879                 mtu = idev->cnf.mtu6;
880                 in6_dev_put(idev);
881         }
882         return mtu;
883 }
884
885 int ipv6_get_hoplimit(struct net_device *dev)
886 {
887         int hoplimit = ipv6_devconf.hop_limit;
888         struct inet6_dev *idev;
889
890         idev = in6_dev_get(dev);
891         if (idev) {
892                 hoplimit = idev->cnf.hop_limit;
893                 in6_dev_put(idev);
894         }
895         return hoplimit;
896 }
897
898 /*
899  *
900  */
901
/*
 *      Build a route from the request in @rtmsg and insert it into the FIB.
 *
 *      @rtmsg:   destination/source prefix, gateway, flags, metric and
 *                optional output interface index of the new route
 *      @nlh:     netlink header of the request, or NULL; when present its
 *                rtm_protocol is recorded on the route, otherwise
 *                RTPROT_BOOT is used
 *      @_rtattr: netlink attribute array (struct rtattr **), or NULL; an
 *                RTA_METRICS entry, if present, supplies per-route metrics
 *      @req:     handed to ip6_ins_rt() together with @nlh and @_rtattr
 *
 *      Returns the result of ip6_ins_rt() once the route has been built.
 *      On any earlier failure a negative errno is returned after dropping
 *      the device and inet6_dev references taken here and freeing the
 *      partially built route with dst_free().
 */
902 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
903                 void *_rtattr, struct netlink_skb_parms *req)
904 {
905         int err;
906         struct rtmsg *r;
907         struct rtattr **rta;
908         struct rt6_info *rt = NULL;
909         struct net_device *dev = NULL;
910         struct inet6_dev *idev = NULL;
911         int addr_type;
912
913         rta = (struct rtattr **) _rtattr;
914
        /* A prefix cannot be longer than the 128 bits of an IPv6 address. */
915         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
916                 return -EINVAL;
        /* Source-prefix routes are accepted only with CONFIG_IPV6_SUBTREES. */
917 #ifndef CONFIG_IPV6_SUBTREES
918         if (rtmsg->rtmsg_src_len)
919                 return -EINVAL;
920 #endif
        /* Pin the requested output device and its inet6_dev, if one was given. */
921         if (rtmsg->rtmsg_ifindex) {
922                 err = -ENODEV;
923                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
924                 if (!dev)
925                         goto out;
926                 idev = in6_dev_get(dev);
927                 if (!idev)
928                         goto out;
929         }
930
        /* A zero metric in the request means "use the default user priority". */
931         if (rtmsg->rtmsg_metric == 0)
932                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
933
934         rt = ip6_dst_alloc();
935
936         if (rt == NULL) {
937                 err = -ENOMEM;
938                 goto out;
939         }
940
941         rt->u.dst.obsolete = -1;
942         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
943         if (nlh && (r = NLMSG_DATA(nlh))) {
944                 rt->rt6i_protocol = r->rtm_protocol;
945         } else {
946                 rt->rt6i_protocol = RTPROT_BOOT;
947         }
948
949         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
950
        /* Input handler depends on whether the destination is multicast. */
951         if (addr_type & IPV6_ADDR_MULTICAST)
952                 rt->u.dst.input = ip6_mc_input;
953         else
954                 rt->u.dst.input = ip6_forward;
955
956         rt->u.dst.output = ip6_output;
957
958         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
959                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
960         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
        /* A full-length (/128) destination is flagged as a host route. */
961         if (rt->rt6i_dst.plen == 128)
962                rt->u.dst.flags = DST_HOST;
963
964 #ifdef CONFIG_IPV6_SUBTREES
965         ipv6_addr_prefix(&rt->rt6i_src.addr, 
966                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
967         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
968 #endif
969
970         rt->rt6i_metric = rtmsg->rtmsg_metric;
971
972         /* We cannot add true routes via loopback here,
973            they would result in kernel looping; promote them to reject routes
974          */
975         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
976             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
977                 /* hold loopback dev/idev if we haven't done so. */
978                 if (dev != &loopback_dev) {
979                         if (dev) {
980                                 dev_put(dev);
981                                 in6_dev_put(idev);
982                         }
983                         dev = &loopback_dev;
984                         dev_hold(dev);
985                         idev = in6_dev_get(dev);
986                         if (!idev) {
987                                 err = -ENODEV;
988                                 goto out;
989                         }
990                 }
                /* Reject routes discard traffic and report -ENETUNREACH. */
991                 rt->u.dst.output = ip6_pkt_discard_out;
992                 rt->u.dst.input = ip6_pkt_discard;
993                 rt->u.dst.error = -ENETUNREACH;
994                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
995                 goto install_route;
996         }
997
998         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
999                 struct in6_addr *gw_addr;
1000                 int gwa_type;
1001
1002                 gw_addr = &rtmsg->rtmsg_gateway;
1003                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1004                 gwa_type = ipv6_addr_type(gw_addr);
1005
1006                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1007                         struct rt6_info *grt;
1008
1009                         /* IPv6 strictly inhibits using not link-local
1010                            addresses as nexthop address.
1011                            Otherwise, router will not able to send redirects.
1012                            It is very good, but in some (rare!) circumstances
1013                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1014                            some exceptions. --ANK
1015                          */
1016                         err = -EINVAL;
1017                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1018                                 goto out;
1019
                        /* Look up the route by which the gateway itself is reached. */
1020                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1021
1022                         err = -EHOSTUNREACH;
1023                         if (grt == NULL)
1024                                 goto out;
                        /*
                         * With an explicit device the gateway must be reached
                         * through that same device; without one, adopt the
                         * device and inet6_dev of the gateway's route.
                         */
1025                         if (dev) {
1026                                 if (dev != grt->rt6i_dev) {
1027                                         dst_release(&grt->u.dst);
1028                                         goto out;
1029                                 }
1030                         } else {
1031                                 dev = grt->rt6i_dev;
1032                                 idev = grt->rt6i_idev;
1033                                 dev_hold(dev);
1034                                 in6_dev_hold(grt->rt6i_idev);
1035                         }
                        /*
                         * err is cleared only when the gateway's own route is
                         * not itself a gateway route; otherwise it stays
                         * -EHOSTUNREACH and the request fails below.
                         */
1036                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1037                                 err = 0;
1038                         dst_release(&grt->u.dst);
1039
1040                         if (err)
1041                                 goto out;
1042                 }
                /* A gateway route needs a device, and it may not be loopback. */
1043                 err = -EINVAL;
1044                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1045                         goto out;
1046         }
1047
1048         err = -ENODEV;
1049         if (dev == NULL)
1050                 goto out;
1051
        /* Look up the neighbour entry for rt6i_gateway on dev. */
1052         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1053                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1054                 if (IS_ERR(rt->rt6i_nexthop)) {
1055                         err = PTR_ERR(rt->rt6i_nexthop);
1056                         rt->rt6i_nexthop = NULL;
1057                         goto out;
1058                 }
1059         }
1060
1061         rt->rt6i_flags = rtmsg->rtmsg_flags;
1062
1063 install_route:
        /*
         * Copy metrics supplied in RTA_METRICS. The attribute type is the
         * 1-based metric index: type 0 is skipped, anything above RTAX_MAX
         * is rejected with -EINVAL.
         */
1064         if (rta && rta[RTA_METRICS-1]) {
1065                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1066                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1067
1068                 while (RTA_OK(attr, attrlen)) {
1069                         unsigned flavor = attr->rta_type;
1070                         if (flavor) {
1071                                 if (flavor > RTAX_MAX) {
1072                                         err = -EINVAL;
1073                                         goto out;
1074                                 }
1075                                 rt->u.dst.metrics[flavor-1] =
1076                                         *(u32 *)RTA_DATA(attr);
1077                         }
1078                         attr = RTA_NEXT(attr, attrlen);
1079                 }
1080         }
1081
        /* Fill in defaults for the metrics that are still zero. */
1082         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1083                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1084         if (!rt->u.dst.metrics[RTAX_MTU-1])
1085                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1086         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1087                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
        /*
         * The dev/idev references taken above are not dropped on this path;
         * the pointers are stored in the route instead.
         */
1088         rt->u.dst.dev = dev;
1089         rt->rt6i_idev = idev;
1090         return ip6_ins_rt(rt, nlh, _rtattr, req);
1091
        /* Error exit: undo the references and free the unfinished route. */
1092 out:
1093         if (dev)
1094                 dev_put(dev);
1095         if (idev)
1096                 in6_dev_put(idev);
1097         if (rt)
1098                 dst_free((struct dst_entry *) rt);
1099         return err;
1100 }
1101
1102 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1103 {
1104         int err;
1105
1106         write_lock_bh(&rt6_lock);
1107
1108         err = fib6_del(rt, nlh, _rtattr, req);
1109         dst_release(&rt->u.dst);
1110
1111         write_unlock_bh(&rt6_lock);
1112
1113         return err;
1114 }
1115
1116 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1117 {
1118         struct fib6_node *fn;
1119         struct rt6_info *rt;
1120         int err = -ESRCH;
1121
1122         read_lock_bh(&rt6_lock);
1123
1124         fn = fib6_locate(&ip6_routing_table,
1125                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1126                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1127         
1128         if (fn) {
1129                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1130                         if (rtmsg->rtmsg_ifindex &&
1131                             (rt->rt6i_dev == NULL ||
1132                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1133                                 continue;
1134                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1135                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1136                                 continue;
1137                         if (rtmsg->rtmsg_metric &&
1138                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1139                                 continue;
1140                         dst_hold(&rt->u.dst);
1141                         read_unlock_bh(&rt6_lock);
1142
1143                         return ip6_del_rt(rt, nlh, _rtattr, req);
1144                 }
1145         }
1146         read_unlock_bh(&rt6_lock);
1147
1148         return err;
1149 }
1150
1151 /*
1152  *      Handle redirects
1153  */
/*
 *      Process a redirect for @dest that was sent by @saddr.
 *
 *      @dest:    destination the redirect applies to
 *      @saddr:   source address of the redirect; must equal the gateway of
 *                the route currently used towards @dest
 *      @neigh:   neighbour entry for the new next hop
 *      @lladdr:  link-layer address passed to neigh_update()
 *      @on_link: non-zero when the new route should not carry RTF_GATEWAY
 *
 *      When accepted, a /128 RTF_CACHE|RTF_DYNAMIC copy of the current route
 *      pointing at @neigh is inserted, NETEVENT_REDIRECT is raised, and the
 *      old route is deleted if it was itself an RTF_CACHE entry.
 */
1154 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1155                   struct neighbour *neigh, u8 *lladdr, int on_link)
1156 {
1157         struct rt6_info *rt, *nrt = NULL;
1158         int strict;
1159         struct fib6_node *fn;
1160         struct netevent_redirect netevent;
1161
1162         /*
1163          * Get the "current" route for this destination and
1164          * check if the redirect has come from appropriate router.
1165          *
1166          * RFC 2461 specifies that redirects should only be
1167          * accepted if they come from the nexthop to the target.
1168          * Due to the way the routes are chosen, this notion
1169          * is a bit fuzzy and one might need to check all possible
1170          * routes.
1171          */
1172         strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1173
1174         read_lock_bh(&rt6_lock);
1175         fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1176 restart:
1177         for (rt = fn->leaf; rt; rt = rt->u.next) {
1178                 /*
1179                  * Current route is on-link; redirect is always invalid.
1180                  *
1181                  * Seems, previous statement is not true. It could
1182                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1183                  * But then router serving it might decide, that we should
1184                  * know truth 8)8) --ANK (980726).
1185                  */
1186                 if (rt6_check_expired(rt))
1187                         continue;
1188                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1189                         continue;
1190                 if (neigh->dev != rt->rt6i_dev)
1191                         continue;
1192                 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1193                         continue;
1194                 break;
1195         }
        /*
         * Pin the match.  If there is none and the destination is multicast
         * or link-local, retry on the nearest ancestor node that carries
         * routes, stopping at the root.
         */
1196         if (rt)
1197                 dst_hold(&rt->u.dst);
1198         else if (strict) {
1199                 while ((fn = fn->parent) != NULL) {
1200                         if (fn->fn_flags & RTN_ROOT)
1201                                 break;
1202                         if (fn->fn_flags & RTN_RTINFO)
1203                                 goto restart;
1204                 }
1205         }
1206         read_unlock_bh(&rt6_lock);
1207
1208         if (!rt) {
1209                 if (net_ratelimit())
1210                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1211                                "for redirect target\n");
1212                 return;
1213         }
1214
1215         /*
1216          *      We have finally decided to accept it.
1217          */
1218
1219         neigh_update(neigh, lladdr, NUD_STALE, 
1220                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1221                      NEIGH_UPDATE_F_OVERRIDE|
1222                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1223                                      NEIGH_UPDATE_F_ISROUTER))
1224                      );
1225
1226         /*
1227          * Redirect received -> path was valid.
1228          * Look, redirects are sent only in response to data packets,
1229          * so that this nexthop apparently is reachable. --ANK
1230          */
1231         dst_confirm(&rt->u.dst);
1232
1233         /* Duplicate redirect: silently ignore. */
1234         if (neigh == rt->u.dst.neighbour)
1235                 goto out;
1236
1237         nrt = ip6_rt_copy(rt);
1238         if (nrt == NULL)
1239                 goto out;
1240
        /* The copy becomes a dynamic cache entry; on-link targets need no gateway flag. */
1241         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1242         if (on_link)
1243                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1244
        /* Narrow the copy to a host route for exactly this destination. */
1245         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1246         nrt->rt6i_dst.plen = 128;
1247         nrt->u.dst.flags |= DST_HOST;
1248
1249         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1250         nrt->rt6i_nexthop = neigh_clone(neigh);
1251         /* Reset pmtu, it may be better */
1252         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1253         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1254
1255         if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1256                 goto out;
1257
1258         netevent.old = &rt->u.dst;
1259         netevent.new = &nrt->u.dst;
1260         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1261
        /*
         * An old cache entry is superseded by the new one.  ip6_del_rt()
         * releases the reference taken above, hence the direct return.
         */
1262         if (rt->rt6i_flags&RTF_CACHE) {
1263                 ip6_del_rt(rt, NULL, NULL, NULL);
1264                 return;
1265         }
1266
1267 out:
1268         dst_release(&rt->u.dst);
1269         return;
1270 }
1271
1272 /*
1273  *      Handle ICMP "packet too big" messages
1274  *      i.e. Path MTU discovery
1275  */
1276
1277 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1278                         struct net_device *dev, u32 pmtu)
1279 {
1280         struct rt6_info *rt, *nrt;
1281         int allfrag = 0;
1282
1283         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1284         if (rt == NULL)
1285                 return;
1286
1287         if (pmtu >= dst_mtu(&rt->u.dst))
1288                 goto out;
1289
1290         if (pmtu < IPV6_MIN_MTU) {
1291                 /*
1292                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1293                  * MTU (1280) and a fragment header should always be included
1294                  * after a node receiving Too Big message reporting PMTU is
1295                  * less than the IPv6 Minimum Link MTU.
1296                  */
1297                 pmtu = IPV6_MIN_MTU;
1298                 allfrag = 1;
1299         }
1300
1301         /* New mtu received -> path was valid.
1302            They are sent only in response to data packets,
1303            so that this nexthop apparently is reachable. --ANK
1304          */
1305         dst_confirm(&rt->u.dst);
1306
1307         /* Host route. If it is static, it would be better
1308            not to override it, but add new one, so that
1309            when cache entry will expire old pmtu
1310            would return automatically.
1311          */
1312         if (rt->rt6i_flags & RTF_CACHE) {
1313                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1314                 if (allfrag)
1315                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1316                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1317                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1318                 goto out;
1319         }
1320
1321         /* Network route.
1322            Two cases are possible:
1323            1. It is connected route. Action: COW
1324            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1325          */
1326         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1327                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1328         else
1329                 nrt = rt6_alloc_clone(rt, daddr);
1330
1331         if (nrt) {
1332                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1333                 if (allfrag)
1334                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1335
1336                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1337                  * happened within 5 mins, the recommended timer is 10 mins.
1338                  * Here this route expiration time is set to ip6_rt_mtu_expires
1339                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1340                  * and detecting PMTU increase will be automatically happened.
1341                  */
1342                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1343                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1344
1345                 ip6_ins_rt(nrt, NULL, NULL, NULL);
1346         }
1347 out:
1348         dst_release(&rt->u.dst);
1349 }
1350
1351 /*
1352  *      Misc support functions
1353  */
1354
1355 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1356 {
1357         struct rt6_info *rt = ip6_dst_alloc();
1358
1359         if (rt) {
1360                 rt->u.dst.input = ort->u.dst.input;
1361                 rt->u.dst.output = ort->u.dst.output;
1362
1363                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1364                 rt->u.dst.dev = ort->u.dst.dev;
1365                 if (rt->u.dst.dev)
1366                         dev_hold(rt->u.dst.dev);
1367                 rt->rt6i_idev = ort->rt6i_idev;
1368                 if (rt->rt6i_idev)
1369                         in6_dev_hold(rt->rt6i_idev);
1370                 rt->u.dst.lastuse = jiffies;
1371                 rt->rt6i_expires = 0;
1372
1373                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1374                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1375                 rt->rt6i_metric = 0;
1376
1377                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1378 #ifdef CONFIG_IPV6_SUBTREES
1379                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1380 #endif
1381         }
1382         return rt;
1383 }
1384
1385 #ifdef CONFIG_IPV6_ROUTE_INFO
1386 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1387                                            struct in6_addr *gwaddr, int ifindex)
1388 {
1389         struct fib6_node *fn;
1390         struct rt6_info *rt = NULL;
1391
1392         write_lock_bh(&rt6_lock);
1393         fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1394         if (!fn)
1395                 goto out;
1396
1397         for (rt = fn->leaf; rt; rt = rt->u.next) {
1398                 if (rt->rt6i_dev->ifindex != ifindex)
1399                         continue;
1400                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1401                         continue;
1402                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1403                         continue;
1404                 dst_hold(&rt->u.dst);
1405                 break;
1406         }
1407 out:
1408         write_unlock_bh(&rt6_lock);
1409         return rt;
1410 }
1411
1412 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1413                                            struct in6_addr *gwaddr, int ifindex,
1414                                            unsigned pref)
1415 {
1416         struct in6_rtmsg rtmsg;
1417
1418         memset(&rtmsg, 0, sizeof(rtmsg));
1419         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1420         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1421         rtmsg.rtmsg_dst_len = prefixlen;
1422         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1423         rtmsg.rtmsg_metric = 1024;
1424         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1425         /* We should treat it as a default route if prefix length is 0. */
1426         if (!prefixlen)
1427                 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1428         rtmsg.rtmsg_ifindex = ifindex;
1429
1430         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1431
1432         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1433 }
1434 #endif
1435
1436 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1437 {       
1438         struct rt6_info *rt;
1439         struct fib6_node *fn;
1440
1441         fn = &ip6_routing_table;
1442
1443         write_lock_bh(&rt6_lock);
1444         for (rt = fn->leaf; rt; rt=rt->u.next) {
1445                 if (dev == rt->rt6i_dev &&
1446                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1447                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1448                         break;
1449         }
1450         if (rt)
1451                 dst_hold(&rt->u.dst);
1452         write_unlock_bh(&rt6_lock);
1453         return rt;
1454 }
1455
1456 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1457                                      struct net_device *dev,
1458                                      unsigned int pref)
1459 {
1460         struct in6_rtmsg rtmsg;
1461
1462         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1463         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1464         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1465         rtmsg.rtmsg_metric = 1024;
1466         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1467                             RTF_PREF(pref);
1468
1469         rtmsg.rtmsg_ifindex = dev->ifindex;
1470
1471         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1472         return rt6_get_dflt_router(gwaddr, dev);
1473 }
1474
1475 void rt6_purge_dflt_routers(void)
1476 {
1477         struct rt6_info *rt;
1478
1479 restart:
1480         read_lock_bh(&rt6_lock);
1481         for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1482                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1483                         dst_hold(&rt->u.dst);
1484
1485                         read_unlock_bh(&rt6_lock);
1486
1487                         ip6_del_rt(rt, NULL, NULL, NULL);
1488
1489                         goto restart;
1490                 }
1491         }
1492         read_unlock_bh(&rt6_lock);
1493 }
1494
1495 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1496 {
1497         struct in6_rtmsg rtmsg;
1498         int err;
1499
1500         switch(cmd) {
1501         case SIOCADDRT:         /* Add a route */
1502         case SIOCDELRT:         /* Delete a route */
1503                 if (!capable(CAP_NET_ADMIN))
1504                         return -EPERM;
1505                 err = copy_from_user(&rtmsg, arg,
1506                                      sizeof(struct in6_rtmsg));
1507                 if (err)
1508                         return -EFAULT;
1509                         
1510                 rtnl_lock();
1511                 switch (cmd) {
1512                 case SIOCADDRT:
1513                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1514                         break;
1515                 case SIOCDELRT:
1516                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1517                         break;
1518                 default:
1519                         err = -EINVAL;
1520                 }
1521                 rtnl_unlock();
1522
1523                 return err;
1524         };
1525
1526         return -EINVAL;
1527 }
1528
1529 /*
1530  *      Drop the packet on the floor
1531  */
1532
1533 static int ip6_pkt_discard(struct sk_buff *skb)
1534 {
1535         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1536         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1537                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1538
1539         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1540         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1541         kfree_skb(skb);
1542         return 0;
1543 }
1544
1545 static int ip6_pkt_discard_out(struct sk_buff *skb)
1546 {
1547         skb->dev = skb->dst->dev;
1548         return ip6_pkt_discard(skb);
1549 }
1550
1551 /*
1552  *      Allocate a dst for local (unicast / anycast) address.
1553  */
1554
1555 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1556                                     const struct in6_addr *addr,
1557                                     int anycast)
1558 {
1559         struct rt6_info *rt = ip6_dst_alloc();
1560
1561         if (rt == NULL)
1562                 return ERR_PTR(-ENOMEM);
1563
1564         dev_hold(&loopback_dev);
1565         in6_dev_hold(idev);
1566
1567         rt->u.dst.flags = DST_HOST;
1568         rt->u.dst.input = ip6_input;
1569         rt->u.dst.output = ip6_output;
1570         rt->rt6i_dev = &loopback_dev;
1571         rt->rt6i_idev = idev;
1572         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1573         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1574         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1575         rt->u.dst.obsolete = -1;
1576
1577         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1578         if (anycast)
1579                 rt->rt6i_flags |= RTF_ANYCAST;
1580         else
1581                 rt->rt6i_flags |= RTF_LOCAL;
1582         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1583         if (rt->rt6i_nexthop == NULL) {
1584                 dst_free((struct dst_entry *) rt);
1585                 return ERR_PTR(-ENOMEM);
1586         }
1587
1588         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1589         rt->rt6i_dst.plen = 128;
1590
1591         atomic_set(&rt->u.dst.__refcnt, 1);
1592
1593         return rt;
1594 }
1595
1596 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1597 {
1598         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1599             rt != &ip6_null_entry) {
1600                 RT6_TRACE("deleted by ifdown %p\n", rt);
1601                 return -1;
1602         }
1603         return 0;
1604 }
1605
1606 void rt6_ifdown(struct net_device *dev)
1607 {
1608         write_lock_bh(&rt6_lock);
1609         fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1610         write_unlock_bh(&rt6_lock);
1611 }
1612
/* Argument block handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
        struct net_device       *dev;   /* device whose MTU is changing */
        unsigned int            mtu;    /* the new MTU */
};
1618
1619 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1620 {
1621         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1622         struct inet6_dev *idev;
1623
1624         /* In IPv6 pmtu discovery is not optional,
1625            so that RTAX_MTU lock cannot disable it.
1626            We still use this lock to block changes
1627            caused by addrconf/ndisc.
1628         */
1629
1630         idev = __in6_dev_get(arg->dev);
1631         if (idev == NULL)
1632                 return 0;
1633
1634         /* For administrative MTU increase, there is no way to discover
1635            IPv6 PMTU increase, so PMTU increase should be updated here.
1636            Since RFC 1981 doesn't include administrative MTU increase
1637            update PMTU increase is a MUST. (i.e. jumbo frame)
1638          */
1639         /*
1640            If new MTU is less than route PMTU, this new MTU will be the
1641            lowest MTU in the path, update the route PMTU to reflect PMTU
1642            decreases; if new MTU is greater than route PMTU, and the
1643            old MTU is the lowest MTU in the path, update the route PMTU
1644            to reflect the increase. In this case if the other nodes' MTU
1645            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1646            PMTU discouvery.
1647          */
1648         if (rt->rt6i_dev == arg->dev &&
1649             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1650             (dst_mtu(&rt->u.dst) > arg->mtu ||
1651              (dst_mtu(&rt->u.dst) < arg->mtu &&
1652               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1653                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1654         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1655         return 0;
1656 }
1657
1658 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1659 {
1660         struct rt6_mtu_change_arg arg;
1661
1662         arg.dev = dev;
1663         arg.mtu = mtu;
1664         read_lock_bh(&rt6_lock);
1665         fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1666         read_unlock_bh(&rt6_lock);
1667 }
1668
1669 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1670                               struct in6_rtmsg *rtmsg)
1671 {
1672         memset(rtmsg, 0, sizeof(*rtmsg));
1673
1674         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1675         rtmsg->rtmsg_src_len = r->rtm_src_len;
1676         rtmsg->rtmsg_flags = RTF_UP;
1677         if (r->rtm_type == RTN_UNREACHABLE)
1678                 rtmsg->rtmsg_flags |= RTF_REJECT;
1679
1680         if (rta[RTA_GATEWAY-1]) {
1681                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1682                         return -EINVAL;
1683                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1684                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1685         }
1686         if (rta[RTA_DST-1]) {
1687                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1688                         return -EINVAL;
1689                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1690         }
1691         if (rta[RTA_SRC-1]) {
1692                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1693                         return -EINVAL;
1694                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1695         }
1696         if (rta[RTA_OIF-1]) {
1697                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1698                         return -EINVAL;
1699                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1700         }
1701         if (rta[RTA_PRIORITY-1]) {
1702                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1703                         return -EINVAL;
1704                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1705         }
1706         return 0;
1707 }
1708
1709 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1710 {
1711         struct rtmsg *r = NLMSG_DATA(nlh);
1712         struct in6_rtmsg rtmsg;
1713
1714         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1715                 return -EINVAL;
1716         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1717 }
1718
1719 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1720 {
1721         struct rtmsg *r = NLMSG_DATA(nlh);
1722         struct in6_rtmsg rtmsg;
1723
1724         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1725                 return -EINVAL;
1726         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1727 }
1728
/*
 * Pairs a socket buffer with a netlink callback.
 * NOTE(review): presumably the argument block for the route dump walker;
 * the users of this struct are outside this excerpt, so confirm there.
 */
struct rt6_rtnl_dump_arg {
        struct sk_buff          *skb;
        struct netlink_callback *cb;
};
1734
1735 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1736                          struct in6_addr *dst, struct in6_addr *src,
1737                          int iif, int type, u32 pid, u32 seq,
1738                          int prefix, unsigned int flags)
1739 {
1740         struct rtmsg *rtm;
1741         struct nlmsghdr  *nlh;
1742         unsigned char    *b = skb->tail;
1743         struct rta_cacheinfo ci;
1744
1745         if (prefix) {   /* user wants prefix routes only */
1746                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1747                         /* success since this is not a prefix route */
1748                         return 1;
1749                 }
1750         }
1751
1752         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1753         rtm = NLMSG_DATA(nlh);
1754         rtm->rtm_family = AF_INET6;
1755         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1756         rtm->rtm_src_len = rt->rt6i_src.plen;
1757         rtm->rtm_tos = 0;
1758         rtm->rtm_table = RT_TABLE_MAIN;
1759         if (rt->rt6i_flags&RTF_REJECT)
1760                 rtm->rtm_type = RTN_UNREACHABLE;
1761         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1762                 rtm->rtm_type = RTN_LOCAL;
1763         else
1764                 rtm->rtm_type = RTN_UNICAST;
1765         rtm->rtm_flags = 0;
1766         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1767         rtm->rtm_protocol = rt->rt6i_protocol;
1768         if (rt->rt6i_flags&RTF_DYNAMIC)
1769                 rtm->rtm_protocol = RTPROT_REDIRECT;
1770         else if (rt->rt6i_flags & RTF_ADDRCONF)
1771                 rtm->rtm_protocol = RTPROT_KERNEL;
1772         else if (rt->rt6i_flags&RTF_DEFAULT)
1773                 rtm->rtm_protocol = RTPROT_RA;
1774
1775         if (rt->rt6i_flags&RTF_CACHE)
1776                 rtm->rtm_flags |= RTM_F_CLONED;
1777
1778         if (dst) {
1779                 RTA_PUT(skb, RTA_DST, 16, dst);
1780                 rtm->rtm_dst_len = 128;
1781         } else if (rtm->rtm_dst_len)
1782                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1783 #ifdef CONFIG_IPV6_SUBTREES
1784         if (src) {
1785                 RTA_PUT(skb, RTA_SRC, 16, src);
1786                 rtm->rtm_src_len = 128;
1787         } else if (rtm->rtm_src_len)
1788                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1789 #endif
1790         if (iif)
1791                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1792         else if (dst) {
1793                 struct in6_addr saddr_buf;
1794                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1795                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1796         }
1797         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1798                 goto rtattr_failure;
1799         if (rt->u.dst.neighbour)
1800                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1801         if (rt->u.dst.dev)
1802                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1803         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1804         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1805         if (rt->rt6i_expires)
1806                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1807         else
1808                 ci.rta_expires = 0;
1809         ci.rta_used = rt->u.dst.__use;
1810         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1811         ci.rta_error = rt->u.dst.error;
1812         ci.rta_id = 0;
1813         ci.rta_ts = 0;
1814         ci.rta_tsage = 0;
1815         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1816         nlh->nlmsg_len = skb->tail - b;
1817         return skb->len;
1818
1819 nlmsg_failure:
1820 rtattr_failure:
1821         skb_trim(skb, b - skb->data);
1822         return -1;
1823 }
1824
1825 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1826 {
1827         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1828         int prefix;
1829
1830         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1831                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1832                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1833         } else
1834                 prefix = 0;
1835
1836         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1837                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1838                      prefix, NLM_F_MULTI);
1839 }
1840
1841 static int fib6_dump_node(struct fib6_walker_t *w)
1842 {
1843         int res;
1844         struct rt6_info *rt;
1845
1846         for (rt = w->leaf; rt; rt = rt->u.next) {
1847                 res = rt6_dump_route(rt, w->args);
1848                 if (res < 0) {
1849                         /* Frame is full, suspend walking */
1850                         w->leaf = rt;
1851                         return 1;
1852                 }
1853                 BUG_TRAP(res!=0);
1854         }
1855         w->leaf = NULL;
1856         return 0;
1857 }
1858
1859 static void fib6_dump_end(struct netlink_callback *cb)
1860 {
1861         struct fib6_walker_t *w = (void*)cb->args[0];
1862
1863         if (w) {
1864                 cb->args[0] = 0;
1865                 fib6_walker_unlink(w);
1866                 kfree(w);
1867         }
1868         cb->done = (void*)cb->args[1];
1869         cb->args[1] = 0;
1870 }
1871
1872 static int fib6_dump_done(struct netlink_callback *cb)
1873 {
1874         fib6_dump_end(cb);
1875         return cb->done ? cb->done(cb) : 0;
1876 }
1877
1878 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1879 {
1880         struct rt6_rtnl_dump_arg arg;
1881         struct fib6_walker_t *w;
1882         int res;
1883
1884         arg.skb = skb;
1885         arg.cb = cb;
1886
1887         w = (void*)cb->args[0];
1888         if (w == NULL) {
1889                 /* New dump:
1890                  * 
1891                  * 1. hook callback destructor.
1892                  */
1893                 cb->args[1] = (long)cb->done;
1894                 cb->done = fib6_dump_done;
1895
1896                 /*
1897                  * 2. allocate and initialize walker.
1898                  */
1899                 w = kzalloc(sizeof(*w), GFP_ATOMIC);
1900                 if (w == NULL)
1901                         return -ENOMEM;
1902                 RT6_TRACE("dump<%p", w);
1903                 w->root = &ip6_routing_table;
1904                 w->func = fib6_dump_node;
1905                 w->args = &arg;
1906                 cb->args[0] = (long)w;
1907                 read_lock_bh(&rt6_lock);
1908                 res = fib6_walk(w);
1909                 read_unlock_bh(&rt6_lock);
1910         } else {
1911                 w->args = &arg;
1912                 read_lock_bh(&rt6_lock);
1913                 res = fib6_walk_continue(w);
1914                 read_unlock_bh(&rt6_lock);
1915         }
1916 #if RT6_DEBUG >= 3
1917         if (res <= 0 && skb->len == 0)
1918                 RT6_TRACE("%p>dump end\n", w);
1919 #endif
1920         res = res < 0 ? res : skb->len;
1921         /* res < 0 is an error. (really, impossible)
1922            res == 0 means that dump is complete, but skb still can contain data.
1923            res > 0 dump is not complete, but frame is full.
1924          */
1925         /* Destroy walker, if dump of this table is complete. */
1926         if (res <= 0)
1927                 fib6_dump_end(cb);
1928         return res;
1929 }
1930
1931 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1932 {
1933         struct rtattr **rta = arg;
1934         int iif = 0;
1935         int err = -ENOBUFS;
1936         struct sk_buff *skb;
1937         struct flowi fl;
1938         struct rt6_info *rt;
1939
1940         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1941         if (skb == NULL)
1942                 goto out;
1943
1944         /* Reserve room for dummy headers, this skb can pass
1945            through good chunk of routing engine.
1946          */
1947         skb->mac.raw = skb->data;
1948         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1949
1950         memset(&fl, 0, sizeof(fl));
1951         if (rta[RTA_SRC-1])
1952                 ipv6_addr_copy(&fl.fl6_src,
1953                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1954         if (rta[RTA_DST-1])
1955                 ipv6_addr_copy(&fl.fl6_dst,
1956                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1957
1958         if (rta[RTA_IIF-1])
1959                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1960
1961         if (iif) {
1962                 struct net_device *dev;
1963                 dev = __dev_get_by_index(iif);
1964                 if (!dev) {
1965                         err = -ENODEV;
1966                         goto out_free;
1967                 }
1968         }
1969
1970         fl.oif = 0;
1971         if (rta[RTA_OIF-1])
1972                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1973
1974         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1975
1976         skb->dst = &rt->u.dst;
1977
1978         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1979         err = rt6_fill_node(skb, rt, 
1980                             &fl.fl6_dst, &fl.fl6_src,
1981                             iif,
1982                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1983                             nlh->nlmsg_seq, 0, 0);
1984         if (err < 0) {
1985                 err = -EMSGSIZE;
1986                 goto out_free;
1987         }
1988
1989         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1990         if (err > 0)
1991                 err = 0;
1992 out:
1993         return err;
1994 out_free:
1995         kfree_skb(skb);
1996         goto out;       
1997 }
1998
1999 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
2000                         struct netlink_skb_parms *req)
2001 {
2002         struct sk_buff *skb;
2003         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
2004         u32 pid = current->pid;
2005         u32 seq = 0;
2006
2007         if (req)
2008                 pid = req->pid;
2009         if (nlh)
2010                 seq = nlh->nlmsg_seq;
2011         
2012         skb = alloc_skb(size, gfp_any());
2013         if (!skb) {
2014                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2015                 return;
2016         }
2017         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2018                 kfree_skb(skb);
2019                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2020                 return;
2021         }
2022         NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2023         netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2024 }
2025
2026 /*
2027  *      /proc
2028  */
2029
2030 #ifdef CONFIG_PROC_FS
2031
/*
 * Nominal length in bytes of one line of the "ipv6_route" proc entry; the
 * reader divides the requested offset by it to find how many routes to skip.
 * The terms add up to 150: three 32-digit hex addresses, two 4-character
 * prefix-length fields, and 46 characters for the trailing fields and the
 * newline.
 * NOTE(review): the device name is printed with "%8s", which pads but does
 * not truncate, so a longer name yields a longer line than this.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2033
/*
 * Cursor shared between rt6_proc_info() and the per-route callback while the
 * "ipv6_route" proc entry is being generated.
 */
struct rt6_proc_arg {
	char *buffer;	/* output buffer the lines are formatted into */
	int offset;	/* byte offset requested by the reader */
	int length;	/* number of bytes requested by the reader */
	int skip;	/* routes skipped so far to reach offset */
	int len;	/* bytes written to buffer so far */
};
2042
2043 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2044 {
2045         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2046         int i;
2047
2048         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2049                 arg->skip++;
2050                 return 0;
2051         }
2052
2053         if (arg->len >= arg->length)
2054                 return 0;
2055
2056         for (i=0; i<16; i++) {
2057                 sprintf(arg->buffer + arg->len, "%02x",
2058                         rt->rt6i_dst.addr.s6_addr[i]);
2059                 arg->len += 2;
2060         }
2061         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2062                             rt->rt6i_dst.plen);
2063
2064 #ifdef CONFIG_IPV6_SUBTREES
2065         for (i=0; i<16; i++) {
2066                 sprintf(arg->buffer + arg->len, "%02x",
2067                         rt->rt6i_src.addr.s6_addr[i]);
2068                 arg->len += 2;
2069         }
2070         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2071                             rt->rt6i_src.plen);
2072 #else
2073         sprintf(arg->buffer + arg->len,
2074                 "00000000000000000000000000000000 00 ");
2075         arg->len += 36;
2076 #endif
2077
2078         if (rt->rt6i_nexthop) {
2079                 for (i=0; i<16; i++) {
2080                         sprintf(arg->buffer + arg->len, "%02x",
2081                                 rt->rt6i_nexthop->primary_key[i]);
2082                         arg->len += 2;
2083                 }
2084         } else {
2085                 sprintf(arg->buffer + arg->len,
2086                         "00000000000000000000000000000000");
2087                 arg->len += 32;
2088         }
2089         arg->len += sprintf(arg->buffer + arg->len,
2090                             " %08x %08x %08x %08x %8s\n",
2091                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2092                             rt->u.dst.__use, rt->rt6i_flags, 
2093                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2094         return 0;
2095 }
2096
2097 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2098 {
2099         struct rt6_proc_arg arg;
2100         arg.buffer = buffer;
2101         arg.offset = offset;
2102         arg.length = length;
2103         arg.skip = 0;
2104         arg.len = 0;
2105
2106         read_lock_bh(&rt6_lock);
2107         fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2108         read_unlock_bh(&rt6_lock);
2109
2110         *start = buffer;
2111         if (offset)
2112                 *start += offset % RT6_INFO_LEN;
2113
2114         arg.len -= offset % RT6_INFO_LEN;
2115
2116         if (arg.len > length)
2117                 arg.len = length;
2118         if (arg.len < 0)
2119                 arg.len = 0;
2120
2121         return arg.len;
2122 }
2123
2124 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2125 {
2126         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2127                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2128                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2129                       rt6_stats.fib_rt_cache,
2130                       atomic_read(&ip6_dst_ops.entries),
2131                       rt6_stats.fib_discarded_routes);
2132
2133         return 0;
2134 }
2135
2136 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2137 {
2138         return single_open(file, rt6_stats_seq_show, NULL);
2139 }
2140
2141 static struct file_operations rt6_stats_seq_fops = {
2142         .owner   = THIS_MODULE,
2143         .open    = rt6_stats_seq_open,
2144         .read    = seq_read,
2145         .llseek  = seq_lseek,
2146         .release = single_release,
2147 };
2148 #endif  /* CONFIG_PROC_FS */
2149
2150 #ifdef CONFIG_SYSCTL
2151
2152 static int flush_delay;
2153
2154 static
2155 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2156                               void __user *buffer, size_t *lenp, loff_t *ppos)
2157 {
2158         if (write) {
2159                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2160                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2161                 return 0;
2162         } else
2163                 return -EINVAL;
2164 }
2165
2166 ctl_table ipv6_route_table[] = {
2167         {
2168                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2169                 .procname       =       "flush",
2170                 .data           =       &flush_delay,
2171                 .maxlen         =       sizeof(int),
2172                 .mode           =       0200,
2173                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2174         },
2175         {
2176                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2177                 .procname       =       "gc_thresh",
2178                 .data           =       &ip6_dst_ops.gc_thresh,
2179                 .maxlen         =       sizeof(int),
2180                 .mode           =       0644,
2181                 .proc_handler   =       &proc_dointvec,
2182         },
2183         {
2184                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2185                 .procname       =       "max_size",
2186                 .data           =       &ip6_rt_max_size,
2187                 .maxlen         =       sizeof(int),
2188                 .mode           =       0644,
2189                 .proc_handler   =       &proc_dointvec,
2190         },
2191         {
2192                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2193                 .procname       =       "gc_min_interval",
2194                 .data           =       &ip6_rt_gc_min_interval,
2195                 .maxlen         =       sizeof(int),
2196                 .mode           =       0644,
2197                 .proc_handler   =       &proc_dointvec_jiffies,
2198                 .strategy       =       &sysctl_jiffies,
2199         },
2200         {
2201                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2202                 .procname       =       "gc_timeout",
2203                 .data           =       &ip6_rt_gc_timeout,
2204                 .maxlen         =       sizeof(int),
2205                 .mode           =       0644,
2206                 .proc_handler   =       &proc_dointvec_jiffies,
2207                 .strategy       =       &sysctl_jiffies,
2208         },
2209         {
2210                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2211                 .procname       =       "gc_interval",
2212                 .data           =       &ip6_rt_gc_interval,
2213                 .maxlen         =       sizeof(int),
2214                 .mode           =       0644,
2215                 .proc_handler   =       &proc_dointvec_jiffies,
2216                 .strategy       =       &sysctl_jiffies,
2217         },
2218         {
2219                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2220                 .procname       =       "gc_elasticity",
2221                 .data           =       &ip6_rt_gc_elasticity,
2222                 .maxlen         =       sizeof(int),
2223                 .mode           =       0644,
2224                 .proc_handler   =       &proc_dointvec_jiffies,
2225                 .strategy       =       &sysctl_jiffies,
2226         },
2227         {
2228                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2229                 .procname       =       "mtu_expires",
2230                 .data           =       &ip6_rt_mtu_expires,
2231                 .maxlen         =       sizeof(int),
2232                 .mode           =       0644,
2233                 .proc_handler   =       &proc_dointvec_jiffies,
2234                 .strategy       =       &sysctl_jiffies,
2235         },
2236         {
2237                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2238                 .procname       =       "min_adv_mss",
2239                 .data           =       &ip6_rt_min_advmss,
2240                 .maxlen         =       sizeof(int),
2241                 .mode           =       0644,
2242                 .proc_handler   =       &proc_dointvec_jiffies,
2243                 .strategy       =       &sysctl_jiffies,
2244         },
2245         {
2246                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2247                 .procname       =       "gc_min_interval_ms",
2248                 .data           =       &ip6_rt_gc_min_interval,
2249                 .maxlen         =       sizeof(int),
2250                 .mode           =       0644,
2251                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2252                 .strategy       =       &sysctl_ms_jiffies,
2253         },
2254         { .ctl_name = 0 }
2255 };
2256
2257 #endif
2258
2259 void __init ip6_route_init(void)
2260 {
2261         struct proc_dir_entry *p;
2262
2263         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2264                                                      sizeof(struct rt6_info),
2265                                                      0, SLAB_HWCACHE_ALIGN,
2266                                                      NULL, NULL);
2267         if (!ip6_dst_ops.kmem_cachep)
2268                 panic("cannot create ip6_dst_cache");
2269
2270         fib6_init();
2271 #ifdef  CONFIG_PROC_FS
2272         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2273         if (p)
2274                 p->owner = THIS_MODULE;
2275
2276         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2277 #endif
2278 #ifdef CONFIG_XFRM
2279         xfrm6_init();
2280 #endif
2281 }
2282
2283 void ip6_route_cleanup(void)
2284 {
2285 #ifdef CONFIG_PROC_FS
2286         proc_net_remove("ipv6_route");
2287         proc_net_remove("rt6_stats");
2288 #endif
2289 #ifdef CONFIG_XFRM
2290         xfrm6_fini();
2291 #endif
2292         rt6_ifdown(NULL);
2293         fib6_gc_cleanup();
2294         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2295 }