97cc494126842ed78ef3efd9b0caf14383442f9f
[sfrench/cifs-2.6.git] / net / ipv4 / fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:     $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  */
17
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/netlink.h>
46 #include <net/nexthop.h>
47
48 #include "fib_lookup.h"
49
/* Taken around every update and walk of the three hash tables below. */
static DEFINE_SPINLOCK(fib_info_lock);
/* fib_info records, bucket chosen by fib_info_hashfn(). */
static struct hlist_head *fib_info_hash;
/* fib_info records that have a fib_prefsrc, bucket chosen by fib_laddr_hashfn(). */
static struct hlist_head *fib_info_laddrhash;
/* Number of buckets in each of the two tables above; used as (size - 1) mask. */
static unsigned int fib_hash_size;
/* Number of fib_info records currently allocated by fib_create_info(). */
static unsigned int fib_info_cnt;

/* Nexthops that have a device, bucket chosen by fib_devindex_hashfn(ifindex). */
#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
59
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) / change_nexthops(fi) open a brace scope that declares
 * two locals, "nhsel" (index) and "nh" (current nexthop), followed by a for
 * loop header; the statement or block written after the macro becomes the
 * loop body.  endfor_nexthops(fi) closes the brace scope again and must
 * follow every use.  for_nexthops() yields a const nexthop pointer,
 * change_nexthops() a writable one.
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

/* Visit all fi->fib_nhs nexthops in array order. */
#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

/* Without multipath the body runs exactly once, on the first nexthop. */
#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the scope opened by for_nexthops() / change_nexthops(). */
#define endfor_nexthops(fi) }
83
84
85 static const struct
86 {
87         int     error;
88         u8      scope;
89 } fib_props[RTN_MAX + 1] = {
90         {
91                 .error  = 0,
92                 .scope  = RT_SCOPE_NOWHERE,
93         },      /* RTN_UNSPEC */
94         {
95                 .error  = 0,
96                 .scope  = RT_SCOPE_UNIVERSE,
97         },      /* RTN_UNICAST */
98         {
99                 .error  = 0,
100                 .scope  = RT_SCOPE_HOST,
101         },      /* RTN_LOCAL */
102         {
103                 .error  = 0,
104                 .scope  = RT_SCOPE_LINK,
105         },      /* RTN_BROADCAST */
106         {
107                 .error  = 0,
108                 .scope  = RT_SCOPE_LINK,
109         },      /* RTN_ANYCAST */
110         {
111                 .error  = 0,
112                 .scope  = RT_SCOPE_UNIVERSE,
113         },      /* RTN_MULTICAST */
114         {
115                 .error  = -EINVAL,
116                 .scope  = RT_SCOPE_UNIVERSE,
117         },      /* RTN_BLACKHOLE */
118         {
119                 .error  = -EHOSTUNREACH,
120                 .scope  = RT_SCOPE_UNIVERSE,
121         },      /* RTN_UNREACHABLE */
122         {
123                 .error  = -EACCES,
124                 .scope  = RT_SCOPE_UNIVERSE,
125         },      /* RTN_PROHIBIT */
126         {
127                 .error  = -EAGAIN,
128                 .scope  = RT_SCOPE_UNIVERSE,
129         },      /* RTN_THROW */
130         {
131                 .error  = -EINVAL,
132                 .scope  = RT_SCOPE_NOWHERE,
133         },      /* RTN_NAT */
134         {
135                 .error  = -EINVAL,
136                 .scope  = RT_SCOPE_NOWHERE,
137         },      /* RTN_XRESOLVE */
138 };
139
140
141 /* Release a nexthop info record */
142
143 void free_fib_info(struct fib_info *fi)
144 {
145         if (fi->fib_dead == 0) {
146                 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
147                 return;
148         }
149         change_nexthops(fi) {
150                 if (nh->nh_dev)
151                         dev_put(nh->nh_dev);
152                 nh->nh_dev = NULL;
153         } endfor_nexthops(fi);
154         fib_info_cnt--;
155         kfree(fi);
156 }
157
158 void fib_release_info(struct fib_info *fi)
159 {
160         spin_lock_bh(&fib_info_lock);
161         if (fi && --fi->fib_treeref == 0) {
162                 hlist_del(&fi->fib_hash);
163                 if (fi->fib_prefsrc)
164                         hlist_del(&fi->fib_lhash);
165                 change_nexthops(fi) {
166                         if (!nh->nh_dev)
167                                 continue;
168                         hlist_del(&nh->nh_hash);
169                 } endfor_nexthops(fi)
170                 fi->fib_dead = 1;
171                 fib_info_put(fi);
172         }
173         spin_unlock_bh(&fib_info_lock);
174 }
175
176 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
177 {
178         const struct fib_nh *onh = ofi->fib_nh;
179
180         for_nexthops(fi) {
181                 if (nh->nh_oif != onh->nh_oif ||
182                     nh->nh_gw  != onh->nh_gw ||
183                     nh->nh_scope != onh->nh_scope ||
184 #ifdef CONFIG_IP_ROUTE_MULTIPATH
185                     nh->nh_weight != onh->nh_weight ||
186 #endif
187 #ifdef CONFIG_NET_CLS_ROUTE
188                     nh->nh_tclassid != onh->nh_tclassid ||
189 #endif
190                     ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
191                         return -1;
192                 onh++;
193         } endfor_nexthops(fi);
194         return 0;
195 }
196
197 static inline unsigned int fib_devindex_hashfn(unsigned int val)
198 {
199         unsigned int mask = DEVINDEX_HASHSIZE - 1;
200
201         return (val ^
202                 (val >> DEVINDEX_HASHBITS) ^
203                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
204 }
205
206 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
207 {
208         unsigned int mask = (fib_hash_size - 1);
209         unsigned int val = fi->fib_nhs;
210
211         val ^= fi->fib_protocol;
212         val ^= (__force u32)fi->fib_prefsrc;
213         val ^= fi->fib_priority;
214         for_nexthops(fi) {
215                 val ^= fib_devindex_hashfn(nh->nh_oif);
216         } endfor_nexthops(fi)
217
218         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
219 }
220
221 static struct fib_info *fib_find_info(const struct fib_info *nfi)
222 {
223         struct hlist_head *head;
224         struct hlist_node *node;
225         struct fib_info *fi;
226         unsigned int hash;
227
228         hash = fib_info_hashfn(nfi);
229         head = &fib_info_hash[hash];
230
231         hlist_for_each_entry(fi, node, head, fib_hash) {
232                 if (fi->fib_nhs != nfi->fib_nhs)
233                         continue;
234                 if (nfi->fib_protocol == fi->fib_protocol &&
235                     nfi->fib_prefsrc == fi->fib_prefsrc &&
236                     nfi->fib_priority == fi->fib_priority &&
237                     memcmp(nfi->fib_metrics, fi->fib_metrics,
238                            sizeof(fi->fib_metrics)) == 0 &&
239                     ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
240                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
241                         return fi;
242         }
243
244         return NULL;
245 }
246
247 /* Check, that the gateway is already configured.
248    Used only by redirect accept routine.
249  */
250
251 int ip_fib_check_default(__be32 gw, struct net_device *dev)
252 {
253         struct hlist_head *head;
254         struct hlist_node *node;
255         struct fib_nh *nh;
256         unsigned int hash;
257
258         spin_lock(&fib_info_lock);
259
260         hash = fib_devindex_hashfn(dev->ifindex);
261         head = &fib_info_devhash[hash];
262         hlist_for_each_entry(nh, node, head, nh_hash) {
263                 if (nh->nh_dev == dev &&
264                     nh->nh_gw == gw &&
265                     !(nh->nh_flags&RTNH_F_DEAD)) {
266                         spin_unlock(&fib_info_lock);
267                         return 0;
268                 }
269         }
270
271         spin_unlock(&fib_info_lock);
272
273         return -1;
274 }
275
276 static inline size_t fib_nlmsg_size(struct fib_info *fi)
277 {
278         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
279                          + nla_total_size(4) /* RTA_TABLE */
280                          + nla_total_size(4) /* RTA_DST */
281                          + nla_total_size(4) /* RTA_PRIORITY */
282                          + nla_total_size(4); /* RTA_PREFSRC */
283
284         /* space for nested metrics */
285         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
286
287         if (fi->fib_nhs) {
288                 /* Also handles the special case fib_nhs == 1 */
289
290                 /* each nexthop is packed in an attribute */
291                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
292
293                 /* may contain flow and gateway attribute */
294                 nhsize += 2 * nla_total_size(4);
295
296                 /* all nexthops are packed in a nested attribute */
297                 payload += nla_total_size(fi->fib_nhs * nhsize);
298         }
299
300         return payload;
301 }
302
303 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
304                int dst_len, u32 tb_id, struct nl_info *info,
305                unsigned int nlm_flags)
306 {
307         struct sk_buff *skb;
308         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
309         int err = -ENOBUFS;
310
311         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
312         if (skb == NULL)
313                 goto errout;
314
315         err = fib_dump_info(skb, info->pid, seq, event, tb_id,
316                             fa->fa_type, fa->fa_scope, key, dst_len,
317                             fa->fa_tos, fa->fa_info, nlm_flags);
318         if (err < 0) {
319                 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
320                 WARN_ON(err == -EMSGSIZE);
321                 kfree_skb(skb);
322                 goto errout;
323         }
324         err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
325                           info->nlh, GFP_KERNEL);
326 errout:
327         if (err < 0)
328                 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
329 }
330
331 /* Return the first fib alias matching TOS with
332  * priority less than or equal to PRIO.
333  */
334 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
335 {
336         if (fah) {
337                 struct fib_alias *fa;
338                 list_for_each_entry(fa, fah, fa_list) {
339                         if (fa->fa_tos > tos)
340                                 continue;
341                         if (fa->fa_info->fib_priority >= prio ||
342                             fa->fa_tos < tos)
343                                 return fa;
344                 }
345         }
346         return NULL;
347 }
348
349 int fib_detect_death(struct fib_info *fi, int order,
350                      struct fib_info **last_resort, int *last_idx, int dflt)
351 {
352         struct neighbour *n;
353         int state = NUD_NONE;
354
355         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
356         if (n) {
357                 state = n->nud_state;
358                 neigh_release(n);
359         }
360         if (state==NUD_REACHABLE)
361                 return 0;
362         if ((state&NUD_VALID) && order != dflt)
363                 return 0;
364         if ((state&NUD_VALID) ||
365             (*last_idx<0 && order > dflt)) {
366                 *last_resort = fi;
367                 *last_idx = order;
368         }
369         return 1;
370 }
371
372 #ifdef CONFIG_IP_ROUTE_MULTIPATH
373
/*
 * Count the rtnexthop entries in a multipath attribute of @remaining
 * bytes.  Returns 0 when bytes are left over after the last well-formed
 * entry, since that means the configuration is invalid.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
        int count;

        for (count = 0; rtnh_ok(rtnh, remaining); count++)
                rtnh = rtnh_next(rtnh, &remaining);

        /* leftover implies invalid nexthop configuration, discard it */
        if (remaining > 0)
                return 0;

        return count;
}
386
387 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
388                        int remaining, struct fib_config *cfg)
389 {
390         change_nexthops(fi) {
391                 int attrlen;
392
393                 if (!rtnh_ok(rtnh, remaining))
394                         return -EINVAL;
395
396                 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
397                 nh->nh_oif = rtnh->rtnh_ifindex;
398                 nh->nh_weight = rtnh->rtnh_hops + 1;
399
400                 attrlen = rtnh_attrlen(rtnh);
401                 if (attrlen > 0) {
402                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
403
404                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
405                         nh->nh_gw = nla ? nla_get_be32(nla) : 0;
406 #ifdef CONFIG_NET_CLS_ROUTE
407                         nla = nla_find(attrs, attrlen, RTA_FLOW);
408                         nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
409 #endif
410                 }
411
412                 rtnh = rtnh_next(rtnh, &remaining);
413         } endfor_nexthops(fi);
414
415         return 0;
416 }
417
418 #endif
419
420 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
421 {
422 #ifdef CONFIG_IP_ROUTE_MULTIPATH
423         struct rtnexthop *rtnh;
424         int remaining;
425 #endif
426
427         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
428                 return 1;
429
430         if (cfg->fc_oif || cfg->fc_gw) {
431                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
432                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
433                         return 0;
434                 return 1;
435         }
436
437 #ifdef CONFIG_IP_ROUTE_MULTIPATH
438         if (cfg->fc_mp == NULL)
439                 return 0;
440
441         rtnh = cfg->fc_mp;
442         remaining = cfg->fc_mp_len;
443
444         for_nexthops(fi) {
445                 int attrlen;
446
447                 if (!rtnh_ok(rtnh, remaining))
448                         return -EINVAL;
449
450                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
451                         return 1;
452
453                 attrlen = rtnh_attrlen(rtnh);
454                 if (attrlen < 0) {
455                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
456
457                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
458                         if (nla && nla_get_be32(nla) != nh->nh_gw)
459                                 return 1;
460 #ifdef CONFIG_NET_CLS_ROUTE
461                         nla = nla_find(attrs, attrlen, RTA_FLOW);
462                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
463                                 return 1;
464 #endif
465                 }
466
467                 rtnh = rtnh_next(rtnh, &remaining);
468         } endfor_nexthops(fi);
469 #endif
470         return 0;
471 }
472
473
474 /*
475    Picture
476    -------
477
478    Semantics of nexthop is very messy by historical reasons.
479    We have to take into account, that:
480    a) gateway can be actually local interface address,
481       so that gatewayed route is direct.
482    b) gateway must be on-link address, possibly
483       described not by an ifaddr, but also by a direct route.
484    c) If both gateway and interface are specified, they should not
485       contradict.
486    d) If we use tunnel routes, gateway could be not on-link.
487
488    Attempt to reconcile all of these (alas, self-contradictory) conditions
489    results in pretty ugly and hairy code with obscure logic.
490
   I chose to generalize it instead, so that the size
   of the code practically does not increase, but it becomes
   much more general.
494    Every prefix is assigned a "scope" value: "host" is local address,
495    "link" is direct route,
496    [ ... "site" ... "interior" ... ]
497    and "universe" is true gateway route with global meaning.
498
499    Every prefix refers to a set of "nexthop"s (gw, oif),
500    where gw must have narrower scope. This recursion stops
501    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
502    which means that gw is forced to be on link.
503
   The code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g. as a by-product it allows
   independent exterior and interior routing processes to
   co-exist in peace.
508
509    Normally it looks as following.
510
511    {universe prefix}  -> (gw, oif) [scope link]
512                           |
513                           |-> {link prefix} -> (gw, oif) [scope local]
514                                                 |
515                                                 |-> {local prefix} (terminal node)
516  */
517
518 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
519                         struct fib_nh *nh)
520 {
521         int err;
522         struct net *net;
523
524         net = cfg->fc_nlinfo.nl_net;
525         if (nh->nh_gw) {
526                 struct fib_result res;
527
528 #ifdef CONFIG_IP_ROUTE_PERVASIVE
529                 if (nh->nh_flags&RTNH_F_PERVASIVE)
530                         return 0;
531 #endif
532                 if (nh->nh_flags&RTNH_F_ONLINK) {
533                         struct net_device *dev;
534
535                         if (cfg->fc_scope >= RT_SCOPE_LINK)
536                                 return -EINVAL;
537                         if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
538                                 return -EINVAL;
539                         if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
540                                 return -ENODEV;
541                         if (!(dev->flags&IFF_UP))
542                                 return -ENETDOWN;
543                         nh->nh_dev = dev;
544                         dev_hold(dev);
545                         nh->nh_scope = RT_SCOPE_LINK;
546                         return 0;
547                 }
548                 {
549                         struct flowi fl = {
550                                 .nl_u = {
551                                         .ip4_u = {
552                                                 .daddr = nh->nh_gw,
553                                                 .scope = cfg->fc_scope + 1,
554                                         },
555                                 },
556                                 .oif = nh->nh_oif,
557                         };
558
559                         /* It is not necessary, but requires a bit of thinking */
560                         if (fl.fl4_scope < RT_SCOPE_LINK)
561                                 fl.fl4_scope = RT_SCOPE_LINK;
562                         if ((err = fib_lookup(net, &fl, &res)) != 0)
563                                 return err;
564                 }
565                 err = -EINVAL;
566                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
567                         goto out;
568                 nh->nh_scope = res.scope;
569                 nh->nh_oif = FIB_RES_OIF(res);
570                 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
571                         goto out;
572                 dev_hold(nh->nh_dev);
573                 err = -ENETDOWN;
574                 if (!(nh->nh_dev->flags & IFF_UP))
575                         goto out;
576                 err = 0;
577 out:
578                 fib_res_put(&res);
579                 return err;
580         } else {
581                 struct in_device *in_dev;
582
583                 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
584                         return -EINVAL;
585
586                 in_dev = inetdev_by_index(net, nh->nh_oif);
587                 if (in_dev == NULL)
588                         return -ENODEV;
589                 if (!(in_dev->dev->flags&IFF_UP)) {
590                         in_dev_put(in_dev);
591                         return -ENETDOWN;
592                 }
593                 nh->nh_dev = in_dev->dev;
594                 dev_hold(nh->nh_dev);
595                 nh->nh_scope = RT_SCOPE_HOST;
596                 in_dev_put(in_dev);
597         }
598         return 0;
599 }
600
601 static inline unsigned int fib_laddr_hashfn(__be32 val)
602 {
603         unsigned int mask = (fib_hash_size - 1);
604
605         return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
606 }
607
608 static struct hlist_head *fib_hash_alloc(int bytes)
609 {
610         if (bytes <= PAGE_SIZE)
611                 return kzalloc(bytes, GFP_KERNEL);
612         else
613                 return (struct hlist_head *)
614                         __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
615 }
616
617 static void fib_hash_free(struct hlist_head *hash, int bytes)
618 {
619         if (!hash)
620                 return;
621
622         if (bytes <= PAGE_SIZE)
623                 kfree(hash);
624         else
625                 free_pages((unsigned long) hash, get_order(bytes));
626 }
627
628 static void fib_hash_move(struct hlist_head *new_info_hash,
629                           struct hlist_head *new_laddrhash,
630                           unsigned int new_size)
631 {
632         struct hlist_head *old_info_hash, *old_laddrhash;
633         unsigned int old_size = fib_hash_size;
634         unsigned int i, bytes;
635
636         spin_lock_bh(&fib_info_lock);
637         old_info_hash = fib_info_hash;
638         old_laddrhash = fib_info_laddrhash;
639         fib_hash_size = new_size;
640
641         for (i = 0; i < old_size; i++) {
642                 struct hlist_head *head = &fib_info_hash[i];
643                 struct hlist_node *node, *n;
644                 struct fib_info *fi;
645
646                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
647                         struct hlist_head *dest;
648                         unsigned int new_hash;
649
650                         hlist_del(&fi->fib_hash);
651
652                         new_hash = fib_info_hashfn(fi);
653                         dest = &new_info_hash[new_hash];
654                         hlist_add_head(&fi->fib_hash, dest);
655                 }
656         }
657         fib_info_hash = new_info_hash;
658
659         for (i = 0; i < old_size; i++) {
660                 struct hlist_head *lhead = &fib_info_laddrhash[i];
661                 struct hlist_node *node, *n;
662                 struct fib_info *fi;
663
664                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
665                         struct hlist_head *ldest;
666                         unsigned int new_hash;
667
668                         hlist_del(&fi->fib_lhash);
669
670                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
671                         ldest = &new_laddrhash[new_hash];
672                         hlist_add_head(&fi->fib_lhash, ldest);
673                 }
674         }
675         fib_info_laddrhash = new_laddrhash;
676
677         spin_unlock_bh(&fib_info_lock);
678
679         bytes = old_size * sizeof(struct hlist_head *);
680         fib_hash_free(old_info_hash, bytes);
681         fib_hash_free(old_laddrhash, bytes);
682 }
683
684 struct fib_info *fib_create_info(struct fib_config *cfg)
685 {
686         int err;
687         struct fib_info *fi = NULL;
688         struct fib_info *ofi;
689         int nhs = 1;
690         struct net *net = cfg->fc_nlinfo.nl_net;
691
692         /* Fast check to catch the most weird cases */
693         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
694                 goto err_inval;
695
696 #ifdef CONFIG_IP_ROUTE_MULTIPATH
697         if (cfg->fc_mp) {
698                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
699                 if (nhs == 0)
700                         goto err_inval;
701         }
702 #endif
703
704         err = -ENOBUFS;
705         if (fib_info_cnt >= fib_hash_size) {
706                 unsigned int new_size = fib_hash_size << 1;
707                 struct hlist_head *new_info_hash;
708                 struct hlist_head *new_laddrhash;
709                 unsigned int bytes;
710
711                 if (!new_size)
712                         new_size = 1;
713                 bytes = new_size * sizeof(struct hlist_head *);
714                 new_info_hash = fib_hash_alloc(bytes);
715                 new_laddrhash = fib_hash_alloc(bytes);
716                 if (!new_info_hash || !new_laddrhash) {
717                         fib_hash_free(new_info_hash, bytes);
718                         fib_hash_free(new_laddrhash, bytes);
719                 } else
720                         fib_hash_move(new_info_hash, new_laddrhash, new_size);
721
722                 if (!fib_hash_size)
723                         goto failure;
724         }
725
726         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
727         if (fi == NULL)
728                 goto failure;
729         fib_info_cnt++;
730
731         fi->fib_net = net;
732         fi->fib_protocol = cfg->fc_protocol;
733         fi->fib_flags = cfg->fc_flags;
734         fi->fib_priority = cfg->fc_priority;
735         fi->fib_prefsrc = cfg->fc_prefsrc;
736
737         fi->fib_nhs = nhs;
738         change_nexthops(fi) {
739                 nh->nh_parent = fi;
740         } endfor_nexthops(fi)
741
742         if (cfg->fc_mx) {
743                 struct nlattr *nla;
744                 int remaining;
745
746                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
747                         int type = nla_type(nla);
748
749                         if (type) {
750                                 if (type > RTAX_MAX)
751                                         goto err_inval;
752                                 fi->fib_metrics[type - 1] = nla_get_u32(nla);
753                         }
754                 }
755         }
756
757         if (cfg->fc_mp) {
758 #ifdef CONFIG_IP_ROUTE_MULTIPATH
759                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
760                 if (err != 0)
761                         goto failure;
762                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
763                         goto err_inval;
764                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
765                         goto err_inval;
766 #ifdef CONFIG_NET_CLS_ROUTE
767                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
768                         goto err_inval;
769 #endif
770 #else
771                 goto err_inval;
772 #endif
773         } else {
774                 struct fib_nh *nh = fi->fib_nh;
775
776                 nh->nh_oif = cfg->fc_oif;
777                 nh->nh_gw = cfg->fc_gw;
778                 nh->nh_flags = cfg->fc_flags;
779 #ifdef CONFIG_NET_CLS_ROUTE
780                 nh->nh_tclassid = cfg->fc_flow;
781 #endif
782 #ifdef CONFIG_IP_ROUTE_MULTIPATH
783                 nh->nh_weight = 1;
784 #endif
785         }
786
787         if (fib_props[cfg->fc_type].error) {
788                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
789                         goto err_inval;
790                 goto link_it;
791         }
792
793         if (cfg->fc_scope > RT_SCOPE_HOST)
794                 goto err_inval;
795
796         if (cfg->fc_scope == RT_SCOPE_HOST) {
797                 struct fib_nh *nh = fi->fib_nh;
798
799                 /* Local address is added. */
800                 if (nhs != 1 || nh->nh_gw)
801                         goto err_inval;
802                 nh->nh_scope = RT_SCOPE_NOWHERE;
803                 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
804                 err = -ENODEV;
805                 if (nh->nh_dev == NULL)
806                         goto failure;
807         } else {
808                 change_nexthops(fi) {
809                         if ((err = fib_check_nh(cfg, fi, nh)) != 0)
810                                 goto failure;
811                 } endfor_nexthops(fi)
812         }
813
814         if (fi->fib_prefsrc) {
815                 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
816                     fi->fib_prefsrc != cfg->fc_dst)
817                         if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
818                                 goto err_inval;
819         }
820
821 link_it:
822         if ((ofi = fib_find_info(fi)) != NULL) {
823                 fi->fib_dead = 1;
824                 free_fib_info(fi);
825                 ofi->fib_treeref++;
826                 return ofi;
827         }
828
829         fi->fib_treeref++;
830         atomic_inc(&fi->fib_clntref);
831         spin_lock_bh(&fib_info_lock);
832         hlist_add_head(&fi->fib_hash,
833                        &fib_info_hash[fib_info_hashfn(fi)]);
834         if (fi->fib_prefsrc) {
835                 struct hlist_head *head;
836
837                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
838                 hlist_add_head(&fi->fib_lhash, head);
839         }
840         change_nexthops(fi) {
841                 struct hlist_head *head;
842                 unsigned int hash;
843
844                 if (!nh->nh_dev)
845                         continue;
846                 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
847                 head = &fib_info_devhash[hash];
848                 hlist_add_head(&nh->nh_hash, head);
849         } endfor_nexthops(fi)
850         spin_unlock_bh(&fib_info_lock);
851         return fi;
852
853 err_inval:
854         err = -EINVAL;
855
856 failure:
857         if (fi) {
858                 fi->fib_dead = 1;
859                 free_fib_info(fi);
860         }
861
862         return ERR_PTR(err);
863 }
864
865 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
866 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
867                        struct fib_result *res, __be32 zone, __be32 mask,
868                         int prefixlen)
869 {
870         struct fib_alias *fa;
871         int nh_sel = 0;
872
873         list_for_each_entry_rcu(fa, head, fa_list) {
874                 int err;
875
876                 if (fa->fa_tos &&
877                     fa->fa_tos != flp->fl4_tos)
878                         continue;
879
880                 if (fa->fa_scope < flp->fl4_scope)
881                         continue;
882
883                 fa->fa_state |= FA_S_ACCESSED;
884
885                 err = fib_props[fa->fa_type].error;
886                 if (err == 0) {
887                         struct fib_info *fi = fa->fa_info;
888
889                         if (fi->fib_flags & RTNH_F_DEAD)
890                                 continue;
891
892                         switch (fa->fa_type) {
893                         case RTN_UNICAST:
894                         case RTN_LOCAL:
895                         case RTN_BROADCAST:
896                         case RTN_ANYCAST:
897                         case RTN_MULTICAST:
898                                 for_nexthops(fi) {
899                                         if (nh->nh_flags&RTNH_F_DEAD)
900                                                 continue;
901                                         if (!flp->oif || flp->oif == nh->nh_oif)
902                                                 break;
903                                 }
904 #ifdef CONFIG_IP_ROUTE_MULTIPATH
905                                 if (nhsel < fi->fib_nhs) {
906                                         nh_sel = nhsel;
907                                         goto out_fill_res;
908                                 }
909 #else
910                                 if (nhsel < 1) {
911                                         goto out_fill_res;
912                                 }
913 #endif
914                                 endfor_nexthops(fi);
915                                 continue;
916
917                         default:
918                                 printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
919                                         fa->fa_type);
920                                 return -EINVAL;
921                         }
922                 }
923                 return err;
924         }
925         return 1;
926
927 out_fill_res:
928         res->prefixlen = prefixlen;
929         res->nh_sel = nh_sel;
930         res->type = fa->fa_type;
931         res->scope = fa->fa_scope;
932         res->fi = fa->fa_info;
933         atomic_inc(&res->fi->fib_clntref);
934         return 0;
935 }
936
937 /* Find appropriate source address to this destination */
938
939 __be32 __fib_res_prefsrc(struct fib_result *res)
940 {
941         return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
942 }
943
944 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
945                   u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
946                   struct fib_info *fi, unsigned int flags)
947 {
948         struct nlmsghdr *nlh;
949         struct rtmsg *rtm;
950
951         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
952         if (nlh == NULL)
953                 return -EMSGSIZE;
954
955         rtm = nlmsg_data(nlh);
956         rtm->rtm_family = AF_INET;
957         rtm->rtm_dst_len = dst_len;
958         rtm->rtm_src_len = 0;
959         rtm->rtm_tos = tos;
960         rtm->rtm_table = tb_id;
961         NLA_PUT_U32(skb, RTA_TABLE, tb_id);
962         rtm->rtm_type = type;
963         rtm->rtm_flags = fi->fib_flags;
964         rtm->rtm_scope = scope;
965         rtm->rtm_protocol = fi->fib_protocol;
966
967         if (rtm->rtm_dst_len)
968                 NLA_PUT_BE32(skb, RTA_DST, dst);
969
970         if (fi->fib_priority)
971                 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
972
973         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
974                 goto nla_put_failure;
975
976         if (fi->fib_prefsrc)
977                 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
978
979         if (fi->fib_nhs == 1) {
980                 if (fi->fib_nh->nh_gw)
981                         NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
982
983                 if (fi->fib_nh->nh_oif)
984                         NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
985 #ifdef CONFIG_NET_CLS_ROUTE
986                 if (fi->fib_nh[0].nh_tclassid)
987                         NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
988 #endif
989         }
990 #ifdef CONFIG_IP_ROUTE_MULTIPATH
991         if (fi->fib_nhs > 1) {
992                 struct rtnexthop *rtnh;
993                 struct nlattr *mp;
994
995                 mp = nla_nest_start(skb, RTA_MULTIPATH);
996                 if (mp == NULL)
997                         goto nla_put_failure;
998
999                 for_nexthops(fi) {
1000                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1001                         if (rtnh == NULL)
1002                                 goto nla_put_failure;
1003
1004                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1005                         rtnh->rtnh_hops = nh->nh_weight - 1;
1006                         rtnh->rtnh_ifindex = nh->nh_oif;
1007
1008                         if (nh->nh_gw)
1009                                 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1010 #ifdef CONFIG_NET_CLS_ROUTE
1011                         if (nh->nh_tclassid)
1012                                 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1013 #endif
1014                         /* length of rtnetlink header + attributes */
1015                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1016                 } endfor_nexthops(fi);
1017
1018                 nla_nest_end(skb, mp);
1019         }
1020 #endif
1021         return nlmsg_end(skb, nlh);
1022
1023 nla_put_failure:
1024         nlmsg_cancel(skb, nlh);
1025         return -EMSGSIZE;
1026 }
1027
1028 /*
1029    Update FIB if:
1030    - local address disappeared -> we must delete all the entries
1031      referring to it.
1032    - device went down -> we must shutdown all nexthops going via it.
1033  */
1034 int fib_sync_down_addr(__be32 local)
1035 {
1036         int ret = 0;
1037         unsigned int hash = fib_laddr_hashfn(local);
1038         struct hlist_head *head = &fib_info_laddrhash[hash];
1039         struct hlist_node *node;
1040         struct fib_info *fi;
1041
1042         if (fib_info_laddrhash == NULL || local == 0)
1043                 return 0;
1044
1045         hlist_for_each_entry(fi, node, head, fib_lhash) {
1046                 if (fi->fib_prefsrc == local) {
1047                         fi->fib_flags |= RTNH_F_DEAD;
1048                         ret++;
1049                 }
1050         }
1051         return ret;
1052 }
1053
1054 int fib_sync_down_dev(struct net_device *dev, int force)
1055 {
1056         int ret = 0;
1057         int scope = RT_SCOPE_NOWHERE;
1058         struct fib_info *prev_fi = NULL;
1059         unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1060         struct hlist_head *head = &fib_info_devhash[hash];
1061         struct hlist_node *node;
1062         struct fib_nh *nh;
1063
1064         if (force)
1065                 scope = -1;
1066
1067         hlist_for_each_entry(nh, node, head, nh_hash) {
1068                 struct fib_info *fi = nh->nh_parent;
1069                 int dead;
1070
1071                 BUG_ON(!fi->fib_nhs);
1072                 if (nh->nh_dev != dev || fi == prev_fi)
1073                         continue;
1074                 prev_fi = fi;
1075                 dead = 0;
1076                 change_nexthops(fi) {
1077                         if (nh->nh_flags&RTNH_F_DEAD)
1078                                 dead++;
1079                         else if (nh->nh_dev == dev &&
1080                                         nh->nh_scope != scope) {
1081                                 nh->nh_flags |= RTNH_F_DEAD;
1082 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1083                                 spin_lock_bh(&fib_multipath_lock);
1084                                 fi->fib_power -= nh->nh_power;
1085                                 nh->nh_power = 0;
1086                                 spin_unlock_bh(&fib_multipath_lock);
1087 #endif
1088                                 dead++;
1089                         }
1090 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1091                         if (force > 1 && nh->nh_dev == dev) {
1092                                 dead = fi->fib_nhs;
1093                                 break;
1094                         }
1095 #endif
1096                 } endfor_nexthops(fi)
1097                 if (dead == fi->fib_nhs) {
1098                         fi->fib_flags |= RTNH_F_DEAD;
1099                         ret++;
1100                 }
1101         }
1102
1103         return ret;
1104 }
1105
1106 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1107
1108 /*
1109    Dead device goes up. We wake up dead nexthops.
1110    It takes sense only on multipath routes.
1111  */
1112
1113 int fib_sync_up(struct net_device *dev)
1114 {
1115         struct fib_info *prev_fi;
1116         unsigned int hash;
1117         struct hlist_head *head;
1118         struct hlist_node *node;
1119         struct fib_nh *nh;
1120         int ret;
1121
1122         if (!(dev->flags&IFF_UP))
1123                 return 0;
1124
1125         prev_fi = NULL;
1126         hash = fib_devindex_hashfn(dev->ifindex);
1127         head = &fib_info_devhash[hash];
1128         ret = 0;
1129
1130         hlist_for_each_entry(nh, node, head, nh_hash) {
1131                 struct fib_info *fi = nh->nh_parent;
1132                 int alive;
1133
1134                 BUG_ON(!fi->fib_nhs);
1135                 if (nh->nh_dev != dev || fi == prev_fi)
1136                         continue;
1137
1138                 prev_fi = fi;
1139                 alive = 0;
1140                 change_nexthops(fi) {
1141                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1142                                 alive++;
1143                                 continue;
1144                         }
1145                         if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1146                                 continue;
1147                         if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1148                                 continue;
1149                         alive++;
1150                         spin_lock_bh(&fib_multipath_lock);
1151                         nh->nh_power = 0;
1152                         nh->nh_flags &= ~RTNH_F_DEAD;
1153                         spin_unlock_bh(&fib_multipath_lock);
1154                 } endfor_nexthops(fi)
1155
1156                 if (alive > 0) {
1157                         fi->fib_flags &= ~RTNH_F_DEAD;
1158                         ret++;
1159                 }
1160         }
1161
1162         return ret;
1163 }
1164
1165 /*
1166    The algorithm is suboptimal, but it provides really
1167    fair weighted route distribution.
1168  */
1169
1170 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1171 {
1172         struct fib_info *fi = res->fi;
1173         int w;
1174
1175         spin_lock_bh(&fib_multipath_lock);
1176         if (fi->fib_power <= 0) {
1177                 int power = 0;
1178                 change_nexthops(fi) {
1179                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1180                                 power += nh->nh_weight;
1181                                 nh->nh_power = nh->nh_weight;
1182                         }
1183                 } endfor_nexthops(fi);
1184                 fi->fib_power = power;
1185                 if (power <= 0) {
1186                         spin_unlock_bh(&fib_multipath_lock);
1187                         /* Race condition: route has just become dead. */
1188                         res->nh_sel = 0;
1189                         return;
1190                 }
1191         }
1192
1193
1194         /* w should be random number [0..fi->fib_power-1],
1195            it is pretty bad approximation.
1196          */
1197
1198         w = jiffies % fi->fib_power;
1199
1200         change_nexthops(fi) {
1201                 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1202                         if ((w -= nh->nh_power) <= 0) {
1203                                 nh->nh_power--;
1204                                 fi->fib_power--;
1205                                 res->nh_sel = nhsel;
1206                                 spin_unlock_bh(&fib_multipath_lock);
1207                                 return;
1208                         }
1209                 }
1210         } endfor_nexthops(fi);
1211
1212         /* Race condition: route has just become dead. */
1213         res->nh_sel = 0;
1214         spin_unlock_bh(&fib_multipath_lock);
1215 }
1216 #endif