Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
[sfrench/cifs-2.6.git] / net / ipv4 / ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@redhat.com>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *      Fixes:
15  *      Michael Chastain        :       Incorrect size of copying.
16  *      Alan Cox                :       Added the cache manager code
17  *      Alan Cox                :       Fixed the clone/copy bug and device race.
18  *      Mike McLagan            :       Routing by source
19  *      Malcolm Beattie         :       Buffer handling fixes.
20  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
21  *      SVR Anand               :       Fixed several multicast bugs and problems.
22  *      Alexey Kuznetsov        :       Status, optimisations and more.
23  *      Brad Parker             :       Better behaviour on mrouted upcall
24  *                                      overflow.
25  *      Carlos Picoto           :       PIMv1 Support
26  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
27  *                                      Relax this requirement to work with older peers.
28  *
29  */
30
31 #include <asm/system.h>
32 #include <asm/uaccess.h>
33 #include <linux/types.h>
34 #include <linux/sched.h>
35 #include <linux/capability.h>
36 #include <linux/errno.h>
37 #include <linux/timer.h>
38 #include <linux/mm.h>
39 #include <linux/kernel.h>
40 #include <linux/fcntl.h>
41 #include <linux/stat.h>
42 #include <linux/socket.h>
43 #include <linux/in.h>
44 #include <linux/inet.h>
45 #include <linux/netdevice.h>
46 #include <linux/inetdevice.h>
47 #include <linux/igmp.h>
48 #include <linux/proc_fs.h>
49 #include <linux/seq_file.h>
50 #include <linux/mroute.h>
51 #include <linux/init.h>
52 #include <linux/if_ether.h>
53 #include <net/ip.h>
54 #include <net/protocol.h>
55 #include <linux/skbuff.h>
56 #include <net/route.h>
57 #include <net/sock.h>
58 #include <net/icmp.h>
59 #include <net/udp.h>
60 #include <net/raw.h>
61 #include <linux/notifier.h>
62 #include <linux/if_arp.h>
63 #include <linux/netfilter_ipv4.h>
64 #include <net/ipip.h>
65 #include <net/checksum.h>
66
67 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
68 #define CONFIG_IP_PIMSM 1
69 #endif
70
71 static struct sock *mroute_socket;
72
73
74 /* Big lock, protecting vif table, mrt cache and mroute socket state.
75    Note that the changes are semaphored via rtnl_lock.
76  */
77
78 static DEFINE_RWLOCK(mrt_lock);
79
80 /*
81  *      Multicast router control variables
82  */
83
84 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
85 static int maxvif;
86
87 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
88
89 static int mroute_do_assert;                            /* Set in PIM assert    */
90 static int mroute_do_pim;
91
92 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
93
94 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
95 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
96
97 /* Special spinlock for queue of unresolved entries */
98 static DEFINE_SPINLOCK(mfc_unres_lock);
99
100 /* We return to original Alan's scheme. Hash table of resolved
101    entries is changed only in process context and protected
102    with weak lock mrt_lock. Queue of unresolved entries is protected
103    with strong spinlock mfc_unres_lock.
104
105    In this case data path is free of exclusive locks at all.
106  */
107
108 static struct kmem_cache *mrt_cachep __read_mostly;
109
110 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
111 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
112 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
113
114 #ifdef CONFIG_IP_PIMSM_V2
115 static struct net_protocol pim_protocol;
116 #endif
117
118 static struct timer_list ipmr_expire_timer;
119
120 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
121
122 static
123 struct net_device *ipmr_new_tunnel(struct vifctl *v)
124 {
125         struct net_device  *dev;
126
127         dev = __dev_get_by_name("tunl0");
128
129         if (dev) {
130                 int err;
131                 struct ifreq ifr;
132                 mm_segment_t    oldfs;
133                 struct ip_tunnel_parm p;
134                 struct in_device  *in_dev;
135
136                 memset(&p, 0, sizeof(p));
137                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
138                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
139                 p.iph.version = 4;
140                 p.iph.ihl = 5;
141                 p.iph.protocol = IPPROTO_IPIP;
142                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
143                 ifr.ifr_ifru.ifru_data = (void*)&p;
144
145                 oldfs = get_fs(); set_fs(KERNEL_DS);
146                 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
147                 set_fs(oldfs);
148
149                 dev = NULL;
150
151                 if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
152                         dev->flags |= IFF_MULTICAST;
153
154                         in_dev = __in_dev_get_rtnl(dev);
155                         if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
156                                 goto failure;
157                         in_dev->cnf.rp_filter = 0;
158
159                         if (dev_open(dev))
160                                 goto failure;
161                 }
162         }
163         return dev;
164
165 failure:
166         /* allow the register to be completed before unregistering. */
167         rtnl_unlock();
168         rtnl_lock();
169
170         unregister_netdevice(dev);
171         return NULL;
172 }
173
174 #ifdef CONFIG_IP_PIMSM
175
176 static int reg_vif_num = -1;
177
178 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
179 {
180         read_lock(&mrt_lock);
181         ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
182         ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
183         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
184         read_unlock(&mrt_lock);
185         kfree_skb(skb);
186         return 0;
187 }
188
/* Statistics of the register device live in its private area. */
static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
	struct net_device_stats *stats;

	stats = (struct net_device_stats *)netdev_priv(dev);
	return stats;
}
193
194 static void reg_vif_setup(struct net_device *dev)
195 {
196         dev->type               = ARPHRD_PIMREG;
197         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
198         dev->flags              = IFF_NOARP;
199         dev->hard_start_xmit    = reg_vif_xmit;
200         dev->get_stats          = reg_vif_get_stats;
201         dev->destructor         = free_netdev;
202 }
203
204 static struct net_device *ipmr_reg_vif(void)
205 {
206         struct net_device *dev;
207         struct in_device *in_dev;
208
209         dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
210                            reg_vif_setup);
211
212         if (dev == NULL)
213                 return NULL;
214
215         if (register_netdevice(dev)) {
216                 free_netdev(dev);
217                 return NULL;
218         }
219         dev->iflink = 0;
220
221         if ((in_dev = inetdev_init(dev)) == NULL)
222                 goto failure;
223
224         in_dev->cnf.rp_filter = 0;
225
226         if (dev_open(dev))
227                 goto failure;
228
229         return dev;
230
231 failure:
232         /* allow the register to be completed before unregistering. */
233         rtnl_unlock();
234         rtnl_lock();
235
236         unregister_netdevice(dev);
237         return NULL;
238 }
239 #endif
240
241 /*
242  *      Delete a VIF entry
243  */
244  
245 static int vif_delete(int vifi)
246 {
247         struct vif_device *v;
248         struct net_device *dev;
249         struct in_device *in_dev;
250
251         if (vifi < 0 || vifi >= maxvif)
252                 return -EADDRNOTAVAIL;
253
254         v = &vif_table[vifi];
255
256         write_lock_bh(&mrt_lock);
257         dev = v->dev;
258         v->dev = NULL;
259
260         if (!dev) {
261                 write_unlock_bh(&mrt_lock);
262                 return -EADDRNOTAVAIL;
263         }
264
265 #ifdef CONFIG_IP_PIMSM
266         if (vifi == reg_vif_num)
267                 reg_vif_num = -1;
268 #endif
269
270         if (vifi+1 == maxvif) {
271                 int tmp;
272                 for (tmp=vifi-1; tmp>=0; tmp--) {
273                         if (VIF_EXISTS(tmp))
274                                 break;
275                 }
276                 maxvif = tmp+1;
277         }
278
279         write_unlock_bh(&mrt_lock);
280
281         dev_set_allmulti(dev, -1);
282
283         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
284                 in_dev->cnf.mc_forwarding--;
285                 ip_rt_multicast_event(in_dev);
286         }
287
288         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
289                 unregister_netdevice(dev);
290
291         dev_put(dev);
292         return 0;
293 }
294
295 /* Destroy an unresolved cache entry, killing queued skbs
296    and reporting error to netlink readers.
297  */
298
299 static void ipmr_destroy_unres(struct mfc_cache *c)
300 {
301         struct sk_buff *skb;
302         struct nlmsgerr *e;
303
304         atomic_dec(&cache_resolve_queue_len);
305
306         while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
307                 if (skb->nh.iph->version == 0) {
308                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
309                         nlh->nlmsg_type = NLMSG_ERROR;
310                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
311                         skb_trim(skb, nlh->nlmsg_len);
312                         e = NLMSG_DATA(nlh);
313                         e->error = -ETIMEDOUT;
314                         memset(&e->msg, 0, sizeof(e->msg));
315
316                         rtnl_unicast(skb, NETLINK_CB(skb).pid);
317                 } else
318                         kfree_skb(skb);
319         }
320
321         kmem_cache_free(mrt_cachep, c);
322 }
323
324
325 /* Single timer process for all the unresolved queue. */
326
327 static void ipmr_expire_process(unsigned long dummy)
328 {
329         unsigned long now;
330         unsigned long expires;
331         struct mfc_cache *c, **cp;
332
333         if (!spin_trylock(&mfc_unres_lock)) {
334                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
335                 return;
336         }
337
338         if (atomic_read(&cache_resolve_queue_len) == 0)
339                 goto out;
340
341         now = jiffies;
342         expires = 10*HZ;
343         cp = &mfc_unres_queue;
344
345         while ((c=*cp) != NULL) {
346                 if (time_after(c->mfc_un.unres.expires, now)) {
347                         unsigned long interval = c->mfc_un.unres.expires - now;
348                         if (interval < expires)
349                                 expires = interval;
350                         cp = &c->next;
351                         continue;
352                 }
353
354                 *cp = c->next;
355
356                 ipmr_destroy_unres(c);
357         }
358
359         if (atomic_read(&cache_resolve_queue_len))
360                 mod_timer(&ipmr_expire_timer, jiffies + expires);
361
362 out:
363         spin_unlock(&mfc_unres_lock);
364 }
365
366 /* Fill oifs list. It is called under write locked mrt_lock. */
367
368 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
369 {
370         int vifi;
371
372         cache->mfc_un.res.minvif = MAXVIFS;
373         cache->mfc_un.res.maxvif = 0;
374         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
375
376         for (vifi=0; vifi<maxvif; vifi++) {
377                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
378                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
379                         if (cache->mfc_un.res.minvif > vifi)
380                                 cache->mfc_un.res.minvif = vifi;
381                         if (cache->mfc_un.res.maxvif <= vifi)
382                                 cache->mfc_un.res.maxvif = vifi + 1;
383                 }
384         }
385 }
386
387 static int vif_add(struct vifctl *vifc, int mrtsock)
388 {
389         int vifi = vifc->vifc_vifi;
390         struct vif_device *v = &vif_table[vifi];
391         struct net_device *dev;
392         struct in_device *in_dev;
393
394         /* Is vif busy ? */
395         if (VIF_EXISTS(vifi))
396                 return -EADDRINUSE;
397
398         switch (vifc->vifc_flags) {
399 #ifdef CONFIG_IP_PIMSM
400         case VIFF_REGISTER:
401                 /*
402                  * Special Purpose VIF in PIM
403                  * All the packets will be sent to the daemon
404                  */
405                 if (reg_vif_num >= 0)
406                         return -EADDRINUSE;
407                 dev = ipmr_reg_vif();
408                 if (!dev)
409                         return -ENOBUFS;
410                 break;
411 #endif
412         case VIFF_TUNNEL:       
413                 dev = ipmr_new_tunnel(vifc);
414                 if (!dev)
415                         return -ENOBUFS;
416                 break;
417         case 0:
418                 dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
419                 if (!dev)
420                         return -EADDRNOTAVAIL;
421                 dev_put(dev);
422                 break;
423         default:
424                 return -EINVAL;
425         }
426
427         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
428                 return -EADDRNOTAVAIL;
429         in_dev->cnf.mc_forwarding++;
430         dev_set_allmulti(dev, +1);
431         ip_rt_multicast_event(in_dev);
432
433         /*
434          *      Fill in the VIF structures
435          */
436         v->rate_limit=vifc->vifc_rate_limit;
437         v->local=vifc->vifc_lcl_addr.s_addr;
438         v->remote=vifc->vifc_rmt_addr.s_addr;
439         v->flags=vifc->vifc_flags;
440         if (!mrtsock)
441                 v->flags |= VIFF_STATIC;
442         v->threshold=vifc->vifc_threshold;
443         v->bytes_in = 0;
444         v->bytes_out = 0;
445         v->pkt_in = 0;
446         v->pkt_out = 0;
447         v->link = dev->ifindex;
448         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
449                 v->link = dev->iflink;
450
451         /* And finish update writing critical data */
452         write_lock_bh(&mrt_lock);
453         dev_hold(dev);
454         v->dev=dev;
455 #ifdef CONFIG_IP_PIMSM
456         if (v->flags&VIFF_REGISTER)
457                 reg_vif_num = vifi;
458 #endif
459         if (vifi+1 > maxvif)
460                 maxvif = vifi+1;
461         write_unlock_bh(&mrt_lock);
462         return 0;
463 }
464
465 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
466 {
467         int line=MFC_HASH(mcastgrp,origin);
468         struct mfc_cache *c;
469
470         for (c=mfc_cache_array[line]; c; c = c->next) {
471                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
472                         break;
473         }
474         return c;
475 }
476
477 /*
478  *      Allocate a multicast cache entry
479  */
480 static struct mfc_cache *ipmr_cache_alloc(void)
481 {
482         struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
483         if(c==NULL)
484                 return NULL;
485         memset(c, 0, sizeof(*c));
486         c->mfc_un.res.minvif = MAXVIFS;
487         return c;
488 }
489
490 static struct mfc_cache *ipmr_cache_alloc_unres(void)
491 {
492         struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
493         if(c==NULL)
494                 return NULL;
495         memset(c, 0, sizeof(*c));
496         skb_queue_head_init(&c->mfc_un.unres.unresolved);
497         c->mfc_un.unres.expires = jiffies + 10*HZ;
498         return c;
499 }
500
501 /*
502  *      A cache entry has gone into a resolved state from queued
503  */
504  
505 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
506 {
507         struct sk_buff *skb;
508         struct nlmsgerr *e;
509
510         /*
511          *      Play the pending entries through our router
512          */
513
514         while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
515                 if (skb->nh.iph->version == 0) {
516                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
517
518                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
519                                 nlh->nlmsg_len = skb->tail - (u8*)nlh;
520                         } else {
521                                 nlh->nlmsg_type = NLMSG_ERROR;
522                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
523                                 skb_trim(skb, nlh->nlmsg_len);
524                                 e = NLMSG_DATA(nlh);
525                                 e->error = -EMSGSIZE;
526                                 memset(&e->msg, 0, sizeof(e->msg));
527                         }
528
529                         rtnl_unicast(skb, NETLINK_CB(skb).pid);
530                 } else
531                         ip_mr_forward(skb, c, 0);
532         }
533 }
534
535 /*
536  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
537  *      expects the following bizarre scheme.
538  *
539  *      Called under mrt_lock.
540  */
541  
542 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
543 {
544         struct sk_buff *skb;
545         int ihl = pkt->nh.iph->ihl<<2;
546         struct igmphdr *igmp;
547         struct igmpmsg *msg;
548         int ret;
549
550 #ifdef CONFIG_IP_PIMSM
551         if (assert == IGMPMSG_WHOLEPKT)
552                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
553         else
554 #endif
555                 skb = alloc_skb(128, GFP_ATOMIC);
556
557         if(!skb)
558                 return -ENOBUFS;
559
560 #ifdef CONFIG_IP_PIMSM
561         if (assert == IGMPMSG_WHOLEPKT) {
562                 /* Ugly, but we have no choice with this interface.
563                    Duplicate old header, fix ihl, length etc.
564                    And all this only to mangle msg->im_msgtype and
565                    to set msg->im_mbz to "mbz" :-)
566                  */
567                 msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
568                 skb->nh.raw = skb->h.raw = (u8*)msg;
569                 memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
570                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
571                 msg->im_mbz = 0;
572                 msg->im_vif = reg_vif_num;
573                 skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
574                 skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
575         } else 
576 #endif
577         {       
578                 
579         /*
580          *      Copy the IP header
581          */
582
583         skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
584         memcpy(skb->data,pkt->data,ihl);
585         skb->nh.iph->protocol = 0;                      /* Flag to the kernel this is a route add */
586         msg = (struct igmpmsg*)skb->nh.iph;
587         msg->im_vif = vifi;
588         skb->dst = dst_clone(pkt->dst);
589
590         /*
591          *      Add our header
592          */
593
594         igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
595         igmp->type      =
596         msg->im_msgtype = assert;
597         igmp->code      =       0;
598         skb->nh.iph->tot_len=htons(skb->len);                   /* Fix the length */
599         skb->h.raw = skb->nh.raw;
600         }
601
602         if (mroute_socket == NULL) {
603                 kfree_skb(skb);
604                 return -EINVAL;
605         }
606
607         /*
608          *      Deliver to mrouted
609          */
610         if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
611                 if (net_ratelimit())
612                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
613                 kfree_skb(skb);
614         }
615
616         return ret;
617 }
618
619 /*
620  *      Queue a packet for resolution. It gets locked cache entry!
621  */
622  
623 static int
624 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
625 {
626         int err;
627         struct mfc_cache *c;
628
629         spin_lock_bh(&mfc_unres_lock);
630         for (c=mfc_unres_queue; c; c=c->next) {
631                 if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
632                     c->mfc_origin == skb->nh.iph->saddr)
633                         break;
634         }
635
636         if (c == NULL) {
637                 /*
638                  *      Create a new entry if allowable
639                  */
640
641                 if (atomic_read(&cache_resolve_queue_len)>=10 ||
642                     (c=ipmr_cache_alloc_unres())==NULL) {
643                         spin_unlock_bh(&mfc_unres_lock);
644
645                         kfree_skb(skb);
646                         return -ENOBUFS;
647                 }
648
649                 /*
650                  *      Fill in the new cache entry
651                  */
652                 c->mfc_parent=-1;
653                 c->mfc_origin=skb->nh.iph->saddr;
654                 c->mfc_mcastgrp=skb->nh.iph->daddr;
655
656                 /*
657                  *      Reflect first query at mrouted.
658                  */
659                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
660                         /* If the report failed throw the cache entry 
661                            out - Brad Parker
662                          */
663                         spin_unlock_bh(&mfc_unres_lock);
664
665                         kmem_cache_free(mrt_cachep, c);
666                         kfree_skb(skb);
667                         return err;
668                 }
669
670                 atomic_inc(&cache_resolve_queue_len);
671                 c->next = mfc_unres_queue;
672                 mfc_unres_queue = c;
673
674                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
675         }
676
677         /*
678          *      See if we can append the packet
679          */
680         if (c->mfc_un.unres.unresolved.qlen>3) {
681                 kfree_skb(skb);
682                 err = -ENOBUFS;
683         } else {
684                 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
685                 err = 0;
686         }
687
688         spin_unlock_bh(&mfc_unres_lock);
689         return err;
690 }
691
692 /*
693  *      MFC cache manipulation by user space mroute daemon
694  */
695
696 static int ipmr_mfc_delete(struct mfcctl *mfc)
697 {
698         int line;
699         struct mfc_cache *c, **cp;
700
701         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
702
703         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
704                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
705                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
706                         write_lock_bh(&mrt_lock);
707                         *cp = c->next;
708                         write_unlock_bh(&mrt_lock);
709
710                         kmem_cache_free(mrt_cachep, c);
711                         return 0;
712                 }
713         }
714         return -ENOENT;
715 }
716
717 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
718 {
719         int line;
720         struct mfc_cache *uc, *c, **cp;
721
722         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
723
724         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
725                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
726                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
727                         break;
728         }
729
730         if (c != NULL) {
731                 write_lock_bh(&mrt_lock);
732                 c->mfc_parent = mfc->mfcc_parent;
733                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
734                 if (!mrtsock)
735                         c->mfc_flags |= MFC_STATIC;
736                 write_unlock_bh(&mrt_lock);
737                 return 0;
738         }
739
740         if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
741                 return -EINVAL;
742
743         c=ipmr_cache_alloc();
744         if (c==NULL)
745                 return -ENOMEM;
746
747         c->mfc_origin=mfc->mfcc_origin.s_addr;
748         c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
749         c->mfc_parent=mfc->mfcc_parent;
750         ipmr_update_thresholds(c, mfc->mfcc_ttls);
751         if (!mrtsock)
752                 c->mfc_flags |= MFC_STATIC;
753
754         write_lock_bh(&mrt_lock);
755         c->next = mfc_cache_array[line];
756         mfc_cache_array[line] = c;
757         write_unlock_bh(&mrt_lock);
758
759         /*
760          *      Check to see if we resolved a queued list. If so we
761          *      need to send on the frames and tidy up.
762          */
763         spin_lock_bh(&mfc_unres_lock);
764         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
765              cp = &uc->next) {
766                 if (uc->mfc_origin == c->mfc_origin &&
767                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
768                         *cp = uc->next;
769                         if (atomic_dec_and_test(&cache_resolve_queue_len))
770                                 del_timer(&ipmr_expire_timer);
771                         break;
772                 }
773         }
774         spin_unlock_bh(&mfc_unres_lock);
775
776         if (uc) {
777                 ipmr_cache_resolve(uc, c);
778                 kmem_cache_free(mrt_cachep, uc);
779         }
780         return 0;
781 }
782
783 /*
784  *      Close the multicast socket, and clear the vif tables etc
785  */
786  
787 static void mroute_clean_tables(struct sock *sk)
788 {
789         int i;
790                 
791         /*
792          *      Shut down all active vif entries
793          */
794         for(i=0; i<maxvif; i++) {
795                 if (!(vif_table[i].flags&VIFF_STATIC))
796                         vif_delete(i);
797         }
798
799         /*
800          *      Wipe the cache
801          */
802         for (i=0;i<MFC_LINES;i++) {
803                 struct mfc_cache *c, **cp;
804
805                 cp = &mfc_cache_array[i];
806                 while ((c = *cp) != NULL) {
807                         if (c->mfc_flags&MFC_STATIC) {
808                                 cp = &c->next;
809                                 continue;
810                         }
811                         write_lock_bh(&mrt_lock);
812                         *cp = c->next;
813                         write_unlock_bh(&mrt_lock);
814
815                         kmem_cache_free(mrt_cachep, c);
816                 }
817         }
818
819         if (atomic_read(&cache_resolve_queue_len) != 0) {
820                 struct mfc_cache *c;
821
822                 spin_lock_bh(&mfc_unres_lock);
823                 while (mfc_unres_queue != NULL) {
824                         c = mfc_unres_queue;
825                         mfc_unres_queue = c->next;
826                         spin_unlock_bh(&mfc_unres_lock);
827
828                         ipmr_destroy_unres(c);
829
830                         spin_lock_bh(&mfc_unres_lock);
831                 }
832                 spin_unlock_bh(&mfc_unres_lock);
833         }
834 }
835
836 static void mrtsock_destruct(struct sock *sk)
837 {
838         rtnl_lock();
839         if (sk == mroute_socket) {
840                 ipv4_devconf.mc_forwarding--;
841
842                 write_lock_bh(&mrt_lock);
843                 mroute_socket=NULL;
844                 write_unlock_bh(&mrt_lock);
845
846                 mroute_clean_tables(sk);
847         }
848         rtnl_unlock();
849 }
850
851 /*
852  *      Socket options and virtual interface manipulation. The whole
853  *      virtual interface system is a complete heap, but unfortunately
854  *      that's how BSD mrouted happens to think. Maybe one day with a proper
855  *      MOSPF/PIM router set up we can clean this up.
856  */
857  
858 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
859 {
860         int ret;
861         struct vifctl vif;
862         struct mfcctl mfc;
863         
864         if(optname!=MRT_INIT)
865         {
866                 if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
867                         return -EACCES;
868         }
869
870         switch(optname)
871         {
872                 case MRT_INIT:
873                         if (sk->sk_type != SOCK_RAW ||
874                             inet_sk(sk)->num != IPPROTO_IGMP)
875                                 return -EOPNOTSUPP;
876                         if(optlen!=sizeof(int))
877                                 return -ENOPROTOOPT;
878
879                         rtnl_lock();
880                         if (mroute_socket) {
881                                 rtnl_unlock();
882                                 return -EADDRINUSE;
883                         }
884
885                         ret = ip_ra_control(sk, 1, mrtsock_destruct);
886                         if (ret == 0) {
887                                 write_lock_bh(&mrt_lock);
888                                 mroute_socket=sk;
889                                 write_unlock_bh(&mrt_lock);
890
891                                 ipv4_devconf.mc_forwarding++;
892                         }
893                         rtnl_unlock();
894                         return ret;
895                 case MRT_DONE:
896                         if (sk!=mroute_socket)
897                                 return -EACCES;
898                         return ip_ra_control(sk, 0, NULL);
899                 case MRT_ADD_VIF:
900                 case MRT_DEL_VIF:
901                         if(optlen!=sizeof(vif))
902                                 return -EINVAL;
903                         if (copy_from_user(&vif,optval,sizeof(vif)))
904                                 return -EFAULT; 
905                         if(vif.vifc_vifi >= MAXVIFS)
906                                 return -ENFILE;
907                         rtnl_lock();
908                         if (optname==MRT_ADD_VIF) {
909                                 ret = vif_add(&vif, sk==mroute_socket);
910                         } else {
911                                 ret = vif_delete(vif.vifc_vifi);
912                         }
913                         rtnl_unlock();
914                         return ret;
915
916                 /*
917                  *      Manipulate the forwarding caches. These live
918                  *      in a sort of kernel/user symbiosis.
919                  */
920                 case MRT_ADD_MFC:
921                 case MRT_DEL_MFC:
922                         if(optlen!=sizeof(mfc))
923                                 return -EINVAL;
924                         if (copy_from_user(&mfc,optval, sizeof(mfc)))
925                                 return -EFAULT;
926                         rtnl_lock();
927                         if (optname==MRT_DEL_MFC)
928                                 ret = ipmr_mfc_delete(&mfc);
929                         else
930                                 ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
931                         rtnl_unlock();
932                         return ret;
933                 /*
934                  *      Control PIM assert.
935                  */
936                 case MRT_ASSERT:
937                 {
938                         int v;
939                         if(get_user(v,(int __user *)optval))
940                                 return -EFAULT;
941                         mroute_do_assert=(v)?1:0;
942                         return 0;
943                 }
944 #ifdef CONFIG_IP_PIMSM
945                 case MRT_PIM:
946                 {
947                         int v, ret;
948                         if(get_user(v,(int __user *)optval))
949                                 return -EFAULT;
950                         v = (v)?1:0;
951                         rtnl_lock();
952                         ret = 0;
953                         if (v != mroute_do_pim) {
954                                 mroute_do_pim = v;
955                                 mroute_do_assert = v;
956 #ifdef CONFIG_IP_PIMSM_V2
957                                 if (mroute_do_pim)
958                                         ret = inet_add_protocol(&pim_protocol,
959                                                                 IPPROTO_PIM);
960                                 else
961                                         ret = inet_del_protocol(&pim_protocol,
962                                                                 IPPROTO_PIM);
963                                 if (ret < 0)
964                                         ret = -EAGAIN;
965 #endif
966                         }
967                         rtnl_unlock();
968                         return ret;
969                 }
970 #endif
971                 /*
972                  *      Spurious command, or MRT_VERSION which you cannot
973                  *      set.
974                  */
975                 default:
976                         return -ENOPROTOOPT;
977         }
978 }
979
980 /*
981  *      Getsock opt support for the multicast routing system.
982  */
983  
984 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
985 {
986         int olr;
987         int val;
988
989         if(optname!=MRT_VERSION && 
990 #ifdef CONFIG_IP_PIMSM
991            optname!=MRT_PIM &&
992 #endif
993            optname!=MRT_ASSERT)
994                 return -ENOPROTOOPT;
995
996         if (get_user(olr, optlen))
997                 return -EFAULT;
998
999         olr = min_t(unsigned int, olr, sizeof(int));
1000         if (olr < 0)
1001                 return -EINVAL;
1002                 
1003         if(put_user(olr,optlen))
1004                 return -EFAULT;
1005         if(optname==MRT_VERSION)
1006                 val=0x0305;
1007 #ifdef CONFIG_IP_PIMSM
1008         else if(optname==MRT_PIM)
1009                 val=mroute_do_pim;
1010 #endif
1011         else
1012                 val=mroute_do_assert;
1013         if(copy_to_user(optval,&val,olr))
1014                 return -EFAULT;
1015         return 0;
1016 }
1017
1018 /*
1019  *      The IP multicast ioctl support routines.
1020  */
1021  
1022 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1023 {
1024         struct sioc_sg_req sr;
1025         struct sioc_vif_req vr;
1026         struct vif_device *vif;
1027         struct mfc_cache *c;
1028         
1029         switch(cmd)
1030         {
1031                 case SIOCGETVIFCNT:
1032                         if (copy_from_user(&vr,arg,sizeof(vr)))
1033                                 return -EFAULT; 
1034                         if(vr.vifi>=maxvif)
1035                                 return -EINVAL;
1036                         read_lock(&mrt_lock);
1037                         vif=&vif_table[vr.vifi];
1038                         if(VIF_EXISTS(vr.vifi)) {
1039                                 vr.icount=vif->pkt_in;
1040                                 vr.ocount=vif->pkt_out;
1041                                 vr.ibytes=vif->bytes_in;
1042                                 vr.obytes=vif->bytes_out;
1043                                 read_unlock(&mrt_lock);
1044
1045                                 if (copy_to_user(arg,&vr,sizeof(vr)))
1046                                         return -EFAULT;
1047                                 return 0;
1048                         }
1049                         read_unlock(&mrt_lock);
1050                         return -EADDRNOTAVAIL;
1051                 case SIOCGETSGCNT:
1052                         if (copy_from_user(&sr,arg,sizeof(sr)))
1053                                 return -EFAULT;
1054
1055                         read_lock(&mrt_lock);
1056                         c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1057                         if (c) {
1058                                 sr.pktcnt = c->mfc_un.res.pkt;
1059                                 sr.bytecnt = c->mfc_un.res.bytes;
1060                                 sr.wrong_if = c->mfc_un.res.wrong_if;
1061                                 read_unlock(&mrt_lock);
1062
1063                                 if (copy_to_user(arg,&sr,sizeof(sr)))
1064                                         return -EFAULT;
1065                                 return 0;
1066                         }
1067                         read_unlock(&mrt_lock);
1068                         return -EADDRNOTAVAIL;
1069                 default:
1070                         return -ENOIOCTLCMD;
1071         }
1072 }
1073
1074
1075 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1076 {
1077         struct vif_device *v;
1078         int ct;
1079         if (event != NETDEV_UNREGISTER)
1080                 return NOTIFY_DONE;
1081         v=&vif_table[0];
1082         for(ct=0;ct<maxvif;ct++,v++) {
1083                 if (v->dev==ptr)
1084                         vif_delete(ct);
1085         }
1086         return NOTIFY_DONE;
1087 }
1088
1089
1090 static struct notifier_block ip_mr_notifier={
1091         .notifier_call = ipmr_device_event,
1092 };
1093
1094 /*
1095  *      Encapsulate a packet by attaching a valid IPIP header to it.
1096  *      This avoids tunnel drivers and other mess and gives us the speed so
1097  *      important for multicast video.
1098  */
1099  
1100 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1101 {
1102         struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1103
1104         iph->version    =       4;
1105         iph->tos        =       skb->nh.iph->tos;
1106         iph->ttl        =       skb->nh.iph->ttl;
1107         iph->frag_off   =       0;
1108         iph->daddr      =       daddr;
1109         iph->saddr      =       saddr;
1110         iph->protocol   =       IPPROTO_IPIP;
1111         iph->ihl        =       5;
1112         iph->tot_len    =       htons(skb->len);
1113         ip_select_ident(iph, skb->dst, NULL);
1114         ip_send_check(iph);
1115
1116         skb->h.ipiph = skb->nh.iph;
1117         skb->nh.iph = iph;
1118         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1119         nf_reset(skb);
1120 }
1121
1122 static inline int ipmr_forward_finish(struct sk_buff *skb)
1123 {
1124         struct ip_options * opt = &(IPCB(skb)->opt);
1125
1126         IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1127
1128         if (unlikely(opt->optlen))
1129                 ip_forward_options(skb);
1130
1131         return dst_output(skb);
1132 }
1133
1134 /*
1135  *      Processing handlers for ipmr_forward
1136  */
1137
1138 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1139 {
1140         struct iphdr *iph = skb->nh.iph;
1141         struct vif_device *vif = &vif_table[vifi];
1142         struct net_device *dev;
1143         struct rtable *rt;
1144         int    encap = 0;
1145
1146         if (vif->dev == NULL)
1147                 goto out_free;
1148
1149 #ifdef CONFIG_IP_PIMSM
1150         if (vif->flags & VIFF_REGISTER) {
1151                 vif->pkt_out++;
1152                 vif->bytes_out+=skb->len;
1153                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
1154                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
1155                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1156                 kfree_skb(skb);
1157                 return;
1158         }
1159 #endif
1160
1161         if (vif->flags&VIFF_TUNNEL) {
1162                 struct flowi fl = { .oif = vif->link,
1163                                     .nl_u = { .ip4_u =
1164                                               { .daddr = vif->remote,
1165                                                 .saddr = vif->local,
1166                                                 .tos = RT_TOS(iph->tos) } },
1167                                     .proto = IPPROTO_IPIP };
1168                 if (ip_route_output_key(&rt, &fl))
1169                         goto out_free;
1170                 encap = sizeof(struct iphdr);
1171         } else {
1172                 struct flowi fl = { .oif = vif->link,
1173                                     .nl_u = { .ip4_u =
1174                                               { .daddr = iph->daddr,
1175                                                 .tos = RT_TOS(iph->tos) } },
1176                                     .proto = IPPROTO_IPIP };
1177                 if (ip_route_output_key(&rt, &fl))
1178                         goto out_free;
1179         }
1180
1181         dev = rt->u.dst.dev;
1182
1183         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1184                 /* Do not fragment multicasts. Alas, IPv4 does not
1185                    allow to send ICMP, so that packets will disappear
1186                    to blackhole.
1187                  */
1188
1189                 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1190                 ip_rt_put(rt);
1191                 goto out_free;
1192         }
1193
1194         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1195
1196         if (skb_cow(skb, encap)) {
1197                 ip_rt_put(rt);
1198                 goto out_free;
1199         }
1200
1201         vif->pkt_out++;
1202         vif->bytes_out+=skb->len;
1203
1204         dst_release(skb->dst);
1205         skb->dst = &rt->u.dst;
1206         iph = skb->nh.iph;
1207         ip_decrease_ttl(iph);
1208
1209         /* FIXME: forward and output firewalls used to be called here.
1210          * What do we do with netfilter? -- RR */
1211         if (vif->flags & VIFF_TUNNEL) {
1212                 ip_encap(skb, vif->local, vif->remote);
1213                 /* FIXME: extra output firewall step used to be here. --RR */
1214                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
1215                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
1216         }
1217
1218         IPCB(skb)->flags |= IPSKB_FORWARDED;
1219
1220         /*
1221          * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
1222          * not only before forwarding, but after forwarding on all output
1223          * interfaces. It is clear, if mrouter runs a multicasting
1224          * program, it should receive packets not depending to what interface
1225          * program is joined.
1226          * If we will not make it, the program will have to join on all
1227          * interfaces. On the other hand, multihoming host (or router, but
1228          * not mrouter) cannot join to more than one interface - it will
1229          * result in receiving multiple packets.
1230          */
1231         NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev, 
1232                 ipmr_forward_finish);
1233         return;
1234
1235 out_free:
1236         kfree_skb(skb);
1237         return;
1238 }
1239
1240 static int ipmr_find_vif(struct net_device *dev)
1241 {
1242         int ct;
1243         for (ct=maxvif-1; ct>=0; ct--) {
1244                 if (vif_table[ct].dev == dev)
1245                         break;
1246         }
1247         return ct;
1248 }
1249
1250 /* "local" means that we should preserve one skb (for local delivery) */
1251
1252 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1253 {
1254         int psend = -1;
1255         int vif, ct;
1256
1257         vif = cache->mfc_parent;
1258         cache->mfc_un.res.pkt++;
1259         cache->mfc_un.res.bytes += skb->len;
1260
1261         /*
1262          * Wrong interface: drop packet and (maybe) send PIM assert.
1263          */
1264         if (vif_table[vif].dev != skb->dev) {
1265                 int true_vifi;
1266
1267                 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1268                         /* It is our own packet, looped back.
1269                            Very complicated situation...
1270
1271                            The best workaround until routing daemons will be
1272                            fixed is not to redistribute packet, if it was
1273                            send through wrong interface. It means, that
1274                            multicast applications WILL NOT work for
1275                            (S,G), which have default multicast route pointing
1276                            to wrong oif. In any case, it is not a good
1277                            idea to use multicasting applications on router.
1278                          */
1279                         goto dont_forward;
1280                 }
1281
1282                 cache->mfc_un.res.wrong_if++;
1283                 true_vifi = ipmr_find_vif(skb->dev);
1284
1285                 if (true_vifi >= 0 && mroute_do_assert &&
1286                     /* pimsm uses asserts, when switching from RPT to SPT,
1287                        so that we cannot check that packet arrived on an oif.
1288                        It is bad, but otherwise we would need to move pretty
1289                        large chunk of pimd to kernel. Ough... --ANK
1290                      */
1291                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1292                     time_after(jiffies, 
1293                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1294                         cache->mfc_un.res.last_assert = jiffies;
1295                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1296                 }
1297                 goto dont_forward;
1298         }
1299
1300         vif_table[vif].pkt_in++;
1301         vif_table[vif].bytes_in+=skb->len;
1302
1303         /*
1304          *      Forward the frame
1305          */
1306         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1307                 if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1308                         if (psend != -1) {
1309                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1310                                 if (skb2)
1311                                         ipmr_queue_xmit(skb2, cache, psend);
1312                         }
1313                         psend=ct;
1314                 }
1315         }
1316         if (psend != -1) {
1317                 if (local) {
1318                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1319                         if (skb2)
1320                                 ipmr_queue_xmit(skb2, cache, psend);
1321                 } else {
1322                         ipmr_queue_xmit(skb, cache, psend);
1323                         return 0;
1324                 }
1325         }
1326
1327 dont_forward:
1328         if (!local)
1329                 kfree_skb(skb);
1330         return 0;
1331 }
1332
1333
1334 /*
1335  *      Multicast packets for forwarding arrive here
1336  */
1337
1338 int ip_mr_input(struct sk_buff *skb)
1339 {
1340         struct mfc_cache *cache;
1341         int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1342
1343         /* Packet is looped back after forward, it should not be
1344            forwarded second time, but still can be delivered locally.
1345          */
1346         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1347                 goto dont_forward;
1348
1349         if (!local) {
1350                     if (IPCB(skb)->opt.router_alert) {
1351                             if (ip_call_ra_chain(skb))
1352                                     return 0;
1353                     } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1354                             /* IGMPv1 (and broken IGMPv2 implementations sort of
1355                                Cisco IOS <= 11.2(8)) do not put router alert
1356                                option to IGMP packets destined to routable
1357                                groups. It is very bad, because it means
1358                                that we can forward NO IGMP messages.
1359                              */
1360                             read_lock(&mrt_lock);
1361                             if (mroute_socket) {
1362                                     nf_reset(skb);
1363                                     raw_rcv(mroute_socket, skb);
1364                                     read_unlock(&mrt_lock);
1365                                     return 0;
1366                             }
1367                             read_unlock(&mrt_lock);
1368                     }
1369         }
1370
1371         read_lock(&mrt_lock);
1372         cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1373
1374         /*
1375          *      No usable cache entry
1376          */
1377         if (cache==NULL) {
1378                 int vif;
1379
1380                 if (local) {
1381                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1382                         ip_local_deliver(skb);
1383                         if (skb2 == NULL) {
1384                                 read_unlock(&mrt_lock);
1385                                 return -ENOBUFS;
1386                         }
1387                         skb = skb2;
1388                 }
1389
1390                 vif = ipmr_find_vif(skb->dev);
1391                 if (vif >= 0) {
1392                         int err = ipmr_cache_unresolved(vif, skb);
1393                         read_unlock(&mrt_lock);
1394
1395                         return err;
1396                 }
1397                 read_unlock(&mrt_lock);
1398                 kfree_skb(skb);
1399                 return -ENODEV;
1400         }
1401
1402         ip_mr_forward(skb, cache, local);
1403
1404         read_unlock(&mrt_lock);
1405
1406         if (local)
1407                 return ip_local_deliver(skb);
1408
1409         return 0;
1410
1411 dont_forward:
1412         if (local)
1413                 return ip_local_deliver(skb);
1414         kfree_skb(skb);
1415         return 0;
1416 }
1417
1418 #ifdef CONFIG_IP_PIMSM_V1
1419 /*
1420  * Handle IGMP messages of PIMv1
1421  */
1422
1423 int pim_rcv_v1(struct sk_buff * skb)
1424 {
1425         struct igmphdr *pim;
1426         struct iphdr   *encap;
1427         struct net_device  *reg_dev = NULL;
1428
1429         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
1430                 goto drop;
1431
1432         pim = (struct igmphdr*)skb->h.raw;
1433
1434         if (!mroute_do_pim ||
1435             skb->len < sizeof(*pim) + sizeof(*encap) ||
1436             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 
1437                 goto drop;
1438
1439         encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1440         /*
1441            Check that:
1442            a. packet is really destinted to a multicast group
1443            b. packet is not a NULL-REGISTER
1444            c. packet is not truncated
1445          */
1446         if (!MULTICAST(encap->daddr) ||
1447             encap->tot_len == 0 ||
1448             ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
1449                 goto drop;
1450
1451         read_lock(&mrt_lock);
1452         if (reg_vif_num >= 0)
1453                 reg_dev = vif_table[reg_vif_num].dev;
1454         if (reg_dev)
1455                 dev_hold(reg_dev);
1456         read_unlock(&mrt_lock);
1457
1458         if (reg_dev == NULL) 
1459                 goto drop;
1460
1461         skb->mac.raw = skb->nh.raw;
1462         skb_pull(skb, (u8*)encap - skb->data);
1463         skb->nh.iph = (struct iphdr *)skb->data;
1464         skb->dev = reg_dev;
1465         skb->protocol = htons(ETH_P_IP);
1466         skb->ip_summed = 0;
1467         skb->pkt_type = PACKET_HOST;
1468         dst_release(skb->dst);
1469         skb->dst = NULL;
1470         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1471         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1472         nf_reset(skb);
1473         netif_rx(skb);
1474         dev_put(reg_dev);
1475         return 0;
1476  drop:
1477         kfree_skb(skb);
1478         return 0;
1479 }
1480 #endif
1481
1482 #ifdef CONFIG_IP_PIMSM_V2
1483 static int pim_rcv(struct sk_buff * skb)
1484 {
1485         struct pimreghdr *pim;
1486         struct iphdr   *encap;
1487         struct net_device  *reg_dev = NULL;
1488
1489         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
1490                 goto drop;
1491
1492         pim = (struct pimreghdr*)skb->h.raw;
1493         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1494             (pim->flags&PIM_NULL_REGISTER) ||
1495             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 
1496              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1497                 goto drop;
1498
1499         /* check if the inner packet is destined to mcast group */
1500         encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1501         if (!MULTICAST(encap->daddr) ||
1502             encap->tot_len == 0 ||
1503             ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
1504                 goto drop;
1505
1506         read_lock(&mrt_lock);
1507         if (reg_vif_num >= 0)
1508                 reg_dev = vif_table[reg_vif_num].dev;
1509         if (reg_dev)
1510                 dev_hold(reg_dev);
1511         read_unlock(&mrt_lock);
1512
1513         if (reg_dev == NULL) 
1514                 goto drop;
1515
1516         skb->mac.raw = skb->nh.raw;
1517         skb_pull(skb, (u8*)encap - skb->data);
1518         skb->nh.iph = (struct iphdr *)skb->data;
1519         skb->dev = reg_dev;
1520         skb->protocol = htons(ETH_P_IP);
1521         skb->ip_summed = 0;
1522         skb->pkt_type = PACKET_HOST;
1523         dst_release(skb->dst);
1524         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1525         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1526         skb->dst = NULL;
1527         nf_reset(skb);
1528         netif_rx(skb);
1529         dev_put(reg_dev);
1530         return 0;
1531  drop:
1532         kfree_skb(skb);
1533         return 0;
1534 }
1535 #endif
1536
1537 static int
1538 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1539 {
1540         int ct;
1541         struct rtnexthop *nhp;
1542         struct net_device *dev = vif_table[c->mfc_parent].dev;
1543         u8 *b = skb->tail;
1544         struct rtattr *mp_head;
1545
1546         if (dev)
1547                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1548
1549         mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1550
1551         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1552                 if (c->mfc_un.res.ttls[ct] < 255) {
1553                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1554                                 goto rtattr_failure;
1555                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1556                         nhp->rtnh_flags = 0;
1557                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1558                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1559                         nhp->rtnh_len = sizeof(*nhp);
1560                 }
1561         }
1562         mp_head->rta_type = RTA_MULTIPATH;
1563         mp_head->rta_len = skb->tail - (u8*)mp_head;
1564         rtm->rtm_type = RTN_MULTICAST;
1565         return 1;
1566
1567 rtattr_failure:
1568         skb_trim(skb, b - skb->data);
1569         return -EMSGSIZE;
1570 }
1571
1572 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1573 {
1574         int err;
1575         struct mfc_cache *cache;
1576         struct rtable *rt = (struct rtable*)skb->dst;
1577
1578         read_lock(&mrt_lock);
1579         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1580
1581         if (cache==NULL) {
1582                 struct sk_buff *skb2;
1583                 struct net_device *dev;
1584                 int vif;
1585
1586                 if (nowait) {
1587                         read_unlock(&mrt_lock);
1588                         return -EAGAIN;
1589                 }
1590
1591                 dev = skb->dev;
1592                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1593                         read_unlock(&mrt_lock);
1594                         return -ENODEV;
1595                 }
1596                 skb2 = skb_clone(skb, GFP_ATOMIC);
1597                 if (!skb2) {
1598                         read_unlock(&mrt_lock);
1599                         return -ENOMEM;
1600                 }
1601
1602                 skb2->nh.raw = skb_push(skb2, sizeof(struct iphdr));
1603                 skb2->nh.iph->ihl = sizeof(struct iphdr)>>2;
1604                 skb2->nh.iph->saddr = rt->rt_src;
1605                 skb2->nh.iph->daddr = rt->rt_dst;
1606                 skb2->nh.iph->version = 0;
1607                 err = ipmr_cache_unresolved(vif, skb2);
1608                 read_unlock(&mrt_lock);
1609                 return err;
1610         }
1611
1612         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1613                 cache->mfc_flags |= MFC_NOTIFY;
1614         err = ipmr_fill_mroute(skb, cache, rtm);
1615         read_unlock(&mrt_lock);
1616         return err;
1617 }
1618
1619 #ifdef CONFIG_PROC_FS   
1620 /*
1621  *      The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
1622  */
/* Per-open iterator state for the vif seq_file: current vif_table index. */
struct ipmr_vif_iter {
	int	ct;
};
1626
1627 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1628                                            loff_t pos)
1629 {
1630         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1631                 if(!VIF_EXISTS(iter->ct))
1632                         continue;
1633                 if (pos-- == 0) 
1634                         return &vif_table[iter->ct];
1635         }
1636         return NULL;
1637 }
1638
1639 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1640 {
1641         read_lock(&mrt_lock);
1642         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1) 
1643                 : SEQ_START_TOKEN;
1644 }
1645
1646 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1647 {
1648         struct ipmr_vif_iter *iter = seq->private;
1649
1650         ++*pos;
1651         if (v == SEQ_START_TOKEN)
1652                 return ipmr_vif_seq_idx(iter, 0);
1653         
1654         while (++iter->ct < maxvif) {
1655                 if(!VIF_EXISTS(iter->ct))
1656                         continue;
1657                 return &vif_table[iter->ct];
1658         }
1659         return NULL;
1660 }
1661
1662 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1663 {
1664         read_unlock(&mrt_lock);
1665 }
1666
1667 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1668 {
1669         if (v == SEQ_START_TOKEN) {
1670                 seq_puts(seq, 
1671                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1672         } else {
1673                 const struct vif_device *vif = v;
1674                 const char *name =  vif->dev ? vif->dev->name : "none";
1675
1676                 seq_printf(seq,
1677                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1678                            vif - vif_table,
1679                            name, vif->bytes_in, vif->pkt_in, 
1680                            vif->bytes_out, vif->pkt_out,
1681                            vif->flags, vif->local, vif->remote);
1682         }
1683         return 0;
1684 }
1685
1686 static struct seq_operations ipmr_vif_seq_ops = {
1687         .start = ipmr_vif_seq_start,
1688         .next  = ipmr_vif_seq_next,
1689         .stop  = ipmr_vif_seq_stop,
1690         .show  = ipmr_vif_seq_show,
1691 };
1692
1693 static int ipmr_vif_open(struct inode *inode, struct file *file)
1694 {
1695         struct seq_file *seq;
1696         int rc = -ENOMEM;
1697         struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1698        
1699         if (!s)
1700                 goto out;
1701
1702         rc = seq_open(file, &ipmr_vif_seq_ops);
1703         if (rc)
1704                 goto out_kfree;
1705
1706         s->ct = 0;
1707         seq = file->private_data;
1708         seq->private = s;
1709 out:
1710         return rc;
1711 out_kfree:
1712         kfree(s);
1713         goto out;
1714
1715 }
1716
1717 static struct file_operations ipmr_vif_fops = {
1718         .owner   = THIS_MODULE,
1719         .open    = ipmr_vif_open,
1720         .read    = seq_read,
1721         .llseek  = seq_lseek,
1722         .release = seq_release_private,
1723 };
1724
/*
 * Per-open iterator state for the forwarding-cache seq_file.
 * cache records which list is being walked (mfc_cache_array,
 * &mfc_unres_queue, or NULL for none); ct is the current hash line while
 * walking mfc_cache_array.
 */
struct ipmr_mfc_iter {
	struct mfc_cache	**cache;
	int			ct;
};
1729
1730
1731 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1732 {
1733         struct mfc_cache *mfc;
1734
1735         it->cache = mfc_cache_array;
1736         read_lock(&mrt_lock);
1737         for (it->ct = 0; it->ct < MFC_LINES; it->ct++) 
1738                 for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) 
1739                         if (pos-- == 0) 
1740                                 return mfc;
1741         read_unlock(&mrt_lock);
1742
1743         it->cache = &mfc_unres_queue;
1744         spin_lock_bh(&mfc_unres_lock);
1745         for(mfc = mfc_unres_queue; mfc; mfc = mfc->next) 
1746                 if (pos-- == 0)
1747                         return mfc;
1748         spin_unlock_bh(&mfc_unres_lock);
1749
1750         it->cache = NULL;
1751         return NULL;
1752 }
1753
1754
1755 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1756 {
1757         struct ipmr_mfc_iter *it = seq->private;
1758         it->cache = NULL;
1759         it->ct = 0;
1760         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1) 
1761                 : SEQ_START_TOKEN;
1762 }
1763
1764 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1765 {
1766         struct mfc_cache *mfc = v;
1767         struct ipmr_mfc_iter *it = seq->private;
1768
1769         ++*pos;
1770
1771         if (v == SEQ_START_TOKEN)
1772                 return ipmr_mfc_seq_idx(seq->private, 0);
1773
1774         if (mfc->next)
1775                 return mfc->next;
1776         
1777         if (it->cache == &mfc_unres_queue) 
1778                 goto end_of_list;
1779
1780         BUG_ON(it->cache != mfc_cache_array);
1781
1782         while (++it->ct < MFC_LINES) {
1783                 mfc = mfc_cache_array[it->ct];
1784                 if (mfc)
1785                         return mfc;
1786         }
1787
1788         /* exhausted cache_array, show unresolved */
1789         read_unlock(&mrt_lock);
1790         it->cache = &mfc_unres_queue;
1791         it->ct = 0;
1792                 
1793         spin_lock_bh(&mfc_unres_lock);
1794         mfc = mfc_unres_queue;
1795         if (mfc) 
1796                 return mfc;
1797
1798  end_of_list:
1799         spin_unlock_bh(&mfc_unres_lock);
1800         it->cache = NULL;
1801
1802         return NULL;
1803 }
1804
1805 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1806 {
1807         struct ipmr_mfc_iter *it = seq->private;
1808
1809         if (it->cache == &mfc_unres_queue)
1810                 spin_unlock_bh(&mfc_unres_lock);
1811         else if (it->cache == mfc_cache_array)
1812                 read_unlock(&mrt_lock);
1813 }
1814
1815 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1816 {
1817         int n;
1818
1819         if (v == SEQ_START_TOKEN) {
1820                 seq_puts(seq, 
1821                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1822         } else {
1823                 const struct mfc_cache *mfc = v;
1824                 const struct ipmr_mfc_iter *it = seq->private;
1825                 
1826                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1827                            (unsigned long) mfc->mfc_mcastgrp,
1828                            (unsigned long) mfc->mfc_origin,
1829                            mfc->mfc_parent,
1830                            mfc->mfc_un.res.pkt,
1831                            mfc->mfc_un.res.bytes,
1832                            mfc->mfc_un.res.wrong_if);
1833
1834                 if (it->cache != &mfc_unres_queue) {
1835                         for(n = mfc->mfc_un.res.minvif; 
1836                             n < mfc->mfc_un.res.maxvif; n++ ) {
1837                                 if(VIF_EXISTS(n) 
1838                                    && mfc->mfc_un.res.ttls[n] < 255)
1839                                 seq_printf(seq, 
1840                                            " %2d:%-3d", 
1841                                            n, mfc->mfc_un.res.ttls[n]);
1842                         }
1843                 }
1844                 seq_putc(seq, '\n');
1845         }
1846         return 0;
1847 }
1848
1849 static struct seq_operations ipmr_mfc_seq_ops = {
1850         .start = ipmr_mfc_seq_start,
1851         .next  = ipmr_mfc_seq_next,
1852         .stop  = ipmr_mfc_seq_stop,
1853         .show  = ipmr_mfc_seq_show,
1854 };
1855
1856 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1857 {
1858         struct seq_file *seq;
1859         int rc = -ENOMEM;
1860         struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1861        
1862         if (!s)
1863                 goto out;
1864
1865         rc = seq_open(file, &ipmr_mfc_seq_ops);
1866         if (rc)
1867                 goto out_kfree;
1868
1869         seq = file->private_data;
1870         seq->private = s;
1871 out:
1872         return rc;
1873 out_kfree:
1874         kfree(s);
1875         goto out;
1876
1877 }
1878
1879 static struct file_operations ipmr_mfc_fops = {
1880         .owner   = THIS_MODULE,
1881         .open    = ipmr_mfc_open,
1882         .read    = seq_read,
1883         .llseek  = seq_lseek,
1884         .release = seq_release_private,
1885 };
1886 #endif  
1887
#ifdef CONFIG_IP_PIMSM_V2
/* Protocol descriptor whose receive handler is pim_rcv (PIMv2 builds only). */
static struct net_protocol pim_protocol = {
	.handler = pim_rcv,
};
#endif
1893
1894
1895 /*
1896  *      Setup for IP multicast routing
1897  */
1898  
1899 void __init ip_mr_init(void)
1900 {
1901         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1902                                        sizeof(struct mfc_cache),
1903                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1904                                        NULL, NULL);
1905         init_timer(&ipmr_expire_timer);
1906         ipmr_expire_timer.function=ipmr_expire_process;
1907         register_netdevice_notifier(&ip_mr_notifier);
1908 #ifdef CONFIG_PROC_FS   
1909         proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1910         proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1911 #endif  
1912 }