[SK_BUFF] ipmr: Another skb_push related conversion to skb_reset_network_header
net/ipv4/ipmr.c
/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@redhat.com>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

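/* Socket through which the user-space routing daemon (mrouted/pimd)
   receives upcalls; NULL while no daemon is attached. */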
static struct sock *mroute_socket;


/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];            /* Devices              */
static int maxvif;

#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;                            /* Set in PIM assert    */
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */

static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and is protected by
   the weak read-write lock mrt_lock. The queue of unresolved entries
   is protected by the strong spinlock mfc_unres_lock.

   This way the data path needs no exclusive locks at all.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

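/* Create a DVMRP tunnel VIF: ask the generic "tunl0" device to set up
   an IPIP tunnel named "dvmrp%d" via the SIOCADDTUNNEL ioctl (issued
   under KERNEL_DS, since the ioctl expects a user-space pointer), then
   bring the new device up with multicasting enabled. */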
static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name("tunl0");

        if (dev) {
                int err;
                struct ifreq ifr;
                mm_segment_t    oldfs;
                struct ip_tunnel_parm p;
                struct in_device  *in_dev;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (void*)&p;

                oldfs = get_fs(); set_fs(KERNEL_DS);
                err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                set_fs(oldfs);

                dev = NULL;

                if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;

                        in_dev = __in_dev_get_rtnl(dev);
                        if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
                                goto failure;
                        in_dev->cnf.rp_filter = 0;

                        if (dev_open(dev))
                                goto failure;
                }
        }
        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;

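/* Transmitting on the PIM register pseudo-device means handing the
   whole packet to the daemon as an IGMPMSG_WHOLEPKT upcall, so that
   user space can build the PIM REGISTER message. */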
static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        read_lock(&mrt_lock);
        ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
        ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return 0;
}

static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
        return (struct net_device_stats*)netdev_priv(dev);
}

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->hard_start_xmit    = reg_vif_xmit;
        dev->get_stats          = reg_vif_get_stats;
        dev->destructor         = free_netdev;
}

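/* Allocate and register the "pimreg" pseudo-device used as the PIM
   register VIF; returns NULL on failure. */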
static struct net_device *ipmr_reg_vif(void)
{
        struct net_device *dev;
        struct in_device *in_dev;

        dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
                           reg_vif_setup);

        if (dev == NULL)
                return NULL;

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }
        dev->iflink = 0;

        if ((in_dev = inetdev_init(dev)) == NULL)
                goto failure;

        in_dev->cnf.rp_filter = 0;

        if (dev_open(dev))
                goto failure;

        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
#endif

/*
 *      Delete a VIF entry
 */

static int vif_delete(int vifi)
{
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= maxvif)
                return -EADDRNOTAVAIL;

        v = &vif_table[vifi];

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

#ifdef CONFIG_IP_PIMSM
        if (vifi == reg_vif_num)
                reg_vif_num = -1;
#endif

        if (vifi+1 == maxvif) {
                int tmp;
                for (tmp=vifi-1; tmp>=0; tmp--) {
                        if (VIF_EXISTS(tmp))
                                break;
                }
                maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
                in_dev->cnf.mc_forwarding--;
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                unregister_netdevice(dev);

        dev_put(dev);
        return 0;
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&cache_resolve_queue_len);

        while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
                if (skb->nh.iph->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = NLMSG_DATA(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));

                        rtnl_unicast(skb, NETLINK_CB(skb).pid);
                } else
                        kfree_skb(skb);
        }

        kmem_cache_free(mrt_cachep, c);
}


/* Single timer process for all the unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
        unsigned long now;
        unsigned long expires;
        struct mfc_cache *c, **cp;

        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (atomic_read(&cache_resolve_queue_len) == 0)
                goto out;

        now = jiffies;
        expires = 10*HZ;
        cp = &mfc_unres_queue;

        while ((c=*cp) != NULL) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        cp = &c->next;
                        continue;
                }

                *cp = c->next;

                ipmr_destroy_unres(c);
        }

        if (atomic_read(&cache_resolve_queue_len))
                mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi=0; vifi<maxvif; vifi++) {
                if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
}

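/* Allocate and initialise a VIF: create the backing device (PIM
   register VIF, DVMRP tunnel, or an existing local interface), enable
   multicast forwarding on it, and publish it in vif_table under
   mrt_lock. */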
static int vif_add(struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;

        /* Is vif busy ? */
        if (VIF_EXISTS(vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
        case VIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif();
                if (!dev)
                        return -ENOBUFS;
                break;
#endif
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(vifc);
                if (!dev)
                        return -ENOBUFS;
                break;
        case 0:
                dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
                if (!dev)
                        return -EADDRNOTAVAIL;
                dev_put(dev);
                break;
        default:
                return -EINVAL;
        }

        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
                return -EADDRNOTAVAIL;
        in_dev->cnf.mc_forwarding++;
        dev_set_allmulti(dev, +1);
        ip_rt_multicast_event(in_dev);

        /*
         *      Fill in the VIF structures
         */
        v->rate_limit=vifc->vifc_rate_limit;
        v->local=vifc->vifc_lcl_addr.s_addr;
        v->remote=vifc->vifc_rmt_addr.s_addr;
        v->flags=vifc->vifc_flags;
        if (!mrtsock)
                v->flags |= VIFF_STATIC;
        v->threshold=vifc->vifc_threshold;
        v->bytes_in = 0;
        v->bytes_out = 0;
        v->pkt_in = 0;
        v->pkt_out = 0;
        v->link = dev->ifindex;
        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                v->link = dev->iflink;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        dev_hold(dev);
        v->dev=dev;
#ifdef CONFIG_IP_PIMSM
        if (v->flags&VIFF_REGISTER)
                reg_vif_num = vifi;
#endif
        if (vifi+1 > maxvif)
                maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        return 0;
}

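/* Look up a resolved (origin, group) entry in the MFC hash table.
   Called under mrt_lock. */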
static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
{
        int line=MFC_HASH(mcastgrp,origin);
        struct mfc_cache *c;

        for (c=mfc_cache_array[line]; c; c = c->next) {
                if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
                        break;
        }
        return c;
}

/*
 *      Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
        if (c==NULL)
                return NULL;
        c->mfc_un.res.minvif = MAXVIFS;
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
        if (c==NULL)
                return NULL;
        skb_queue_head_init(&c->mfc_un.unres.unresolved);
        c->mfc_un.unres.expires = jiffies + 10*HZ;
        return c;
}

/*
 *      A cache entry has gone from the unresolved queue to the resolved state
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /*
         *      Play the pending entries through our router
         */

        while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
                if (skb->nh.iph->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

                        if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
                                nlh->nlmsg_len = skb->tail - (u8*)nlh;
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = NLMSG_DATA(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }

                        rtnl_unicast(skb, NETLINK_CB(skb).pid);
                } else
                        ip_mr_forward(skb, c, 0);
        }
}

/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 */

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        int ihl = pkt->nh.iph->ihl<<2;
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb->h.raw = skb->data;
                msg = (struct igmpmsg *)skb->nh.raw;
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = reg_vif_num;
                skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
                skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
        memcpy(skb->data,pkt->data,ihl);
        skb->nh.iph->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg*)skb->nh.iph;
        msg->im_vif = vifi;
        skb->dst = dst_clone(pkt->dst);

        /*
         *      Add our header
         */

        igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        skb->nh.iph->tot_len=htons(skb->len);                   /* Fix the length */
        skb->h.raw = skb->nh.raw;
        }

        if (mroute_socket == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}

/*
 *      Queue a packet for resolution; the cache entry is looked up or
 *      created under mfc_unres_lock.
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
        int err;
        struct mfc_cache *c;

        spin_lock_bh(&mfc_unres_lock);
        for (c=mfc_unres_queue; c; c=c->next) {
                if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
                    c->mfc_origin == skb->nh.iph->saddr)
                        break;
        }

        if (c == NULL) {
                /*
                 *      Create a new entry if allowable
                 */

                if (atomic_read(&cache_resolve_queue_len)>=10 ||
                    (c=ipmr_cache_alloc_unres())==NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /*
                 *      Fill in the new cache entry
                 */
                c->mfc_parent=-1;
                c->mfc_origin=skb->nh.iph->saddr;
                c->mfc_mcastgrp=skb->nh.iph->daddr;

                /*
                 *      Reflect first query at mrouted.
                 */
                if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        kmem_cache_free(mrt_cachep, c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&cache_resolve_queue_len);
                c->next = mfc_unres_queue;
                mfc_unres_queue = c;

                mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
        }

        /*
         *      See if we can append the packet
         */
        if (c->mfc_un.unres.unresolved.qlen>3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *      MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
        int line;
        struct mfc_cache *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                        return 0;
                }
        }
        return -ENOENT;
}

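/* Add or update an MFC entry; if a matching entry was sitting on the
   unresolved queue, resolve it and replay its pending packets. */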
static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
        int line;
        struct mfc_cache *uc, *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
                        break;
        }

        if (c != NULL) {
                write_lock_bh(&mrt_lock);
                c->mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                return 0;
        }

        if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c=ipmr_cache_alloc();
        if (c==NULL)
                return -ENOMEM;

        c->mfc_origin=mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
        c->mfc_parent=mfc->mfcc_parent;
        ipmr_update_thresholds(c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->mfc_flags |= MFC_STATIC;

        write_lock_bh(&mrt_lock);
        c->next = mfc_cache_array[line];
        mfc_cache_array[line] = c;
        write_unlock_bh(&mrt_lock);

        /*
         *      Check to see if we resolved a queued list. If so we
         *      need to send on the frames and tidy up.
         */
        spin_lock_bh(&mfc_unres_lock);
        for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
             cp = &uc->next) {
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        *cp = uc->next;
                        if (atomic_dec_and_test(&cache_resolve_queue_len))
                                del_timer(&ipmr_expire_timer);
                        break;
                }
        }
        spin_unlock_bh(&mfc_unres_lock);

        if (uc) {
                ipmr_cache_resolve(uc, c);
                kmem_cache_free(mrt_cachep, uc);
        }
        return 0;
}

/*
 *      Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
        int i;

        /*
         *      Shut down all active vif entries
         */
        for (i=0; i<maxvif; i++) {
                if (!(vif_table[i].flags&VIFF_STATIC))
                        vif_delete(i);
        }

        /*
         *      Wipe the cache
         */
        for (i=0;i<MFC_LINES;i++) {
                struct mfc_cache *c, **cp;

                cp = &mfc_cache_array[i];
                while ((c = *cp) != NULL) {
                        if (c->mfc_flags&MFC_STATIC) {
                                cp = &c->next;
                                continue;
                        }
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                }
        }

        if (atomic_read(&cache_resolve_queue_len) != 0) {
                struct mfc_cache *c;

                spin_lock_bh(&mfc_unres_lock);
                while (mfc_unres_queue != NULL) {
                        c = mfc_unres_queue;
                        mfc_unres_queue = c->next;
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_destroy_unres(c);

                        spin_lock_bh(&mfc_unres_lock);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

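/* Destructor run when the routing daemon's socket goes away: detach it
   and tear down all non-static multicast state. */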
static void mrtsock_destruct(struct sock *sk)
{
        rtnl_lock();
        if (sk == mroute_socket) {
                ipv4_devconf.mc_forwarding--;

                write_lock_bh(&mrt_lock);
                mroute_socket=NULL;
                write_unlock_bh(&mrt_lock);

                mroute_clean_tables(sk);
        }
        rtnl_unlock();
}

/*
 *      Socket options and virtual interface manipulation. The whole
 *      virtual interface system is a complete heap, but unfortunately
 *      that's how BSD mrouted happens to think. Maybe one day with a proper
 *      MOSPF/PIM router set up we can clean this up.
 */

int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
{
        int ret;
        struct vifctl vif;
        struct mfcctl mfc;

        if (optname != MRT_INIT) {
                if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch (optname) {
        case MRT_INIT:
                if (sk->sk_type != SOCK_RAW ||
                    inet_sk(sk)->num != IPPROTO_IGMP)
                        return -EOPNOTSUPP;
                if (optlen!=sizeof(int))
                        return -ENOPROTOOPT;

                rtnl_lock();
                if (mroute_socket) {
                        rtnl_unlock();
                        return -EADDRINUSE;
                }

                ret = ip_ra_control(sk, 1, mrtsock_destruct);
                if (ret == 0) {
                        write_lock_bh(&mrt_lock);
                        mroute_socket=sk;
                        write_unlock_bh(&mrt_lock);

                        ipv4_devconf.mc_forwarding++;
                }
                rtnl_unlock();
                return ret;
        case MRT_DONE:
                if (sk!=mroute_socket)
                        return -EACCES;
                return ip_ra_control(sk, 0, NULL);
        case MRT_ADD_VIF:
        case MRT_DEL_VIF:
                if (optlen!=sizeof(vif))
                        return -EINVAL;
                if (copy_from_user(&vif,optval,sizeof(vif)))
                        return -EFAULT;
                if (vif.vifc_vifi >= MAXVIFS)
                        return -ENFILE;
                rtnl_lock();
                if (optname==MRT_ADD_VIF) {
                        ret = vif_add(&vif, sk==mroute_socket);
                } else {
                        ret = vif_delete(vif.vifc_vifi);
                }
                rtnl_unlock();
                return ret;

                /*
                 *      Manipulate the forwarding caches. These live
                 *      in a sort of kernel/user symbiosis.
                 */
        case MRT_ADD_MFC:
        case MRT_DEL_MFC:
                if (optlen!=sizeof(mfc))
                        return -EINVAL;
                if (copy_from_user(&mfc,optval, sizeof(mfc)))
                        return -EFAULT;
                rtnl_lock();
                if (optname==MRT_DEL_MFC)
                        ret = ipmr_mfc_delete(&mfc);
                else
                        ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
                rtnl_unlock();
                return ret;
                /*
                 *      Control PIM assert.
                 */
        case MRT_ASSERT:
        {
                int v;
                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                mroute_do_assert=(v)?1:0;
                return 0;
        }
#ifdef CONFIG_IP_PIMSM
        case MRT_PIM:
        {
                int v, ret;
                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                v = (v)?1:0;
                rtnl_lock();
                ret = 0;
                if (v != mroute_do_pim) {
                        mroute_do_pim = v;
                        mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
                        if (mroute_do_pim)
                                ret = inet_add_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        else
                                ret = inet_del_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        if (ret < 0)
                                ret = -EAGAIN;
#endif
                }
                rtnl_unlock();
                return ret;
        }
#endif
        /*
         *      Spurious command, or MRT_VERSION which you cannot
         *      set.
         */
        default:
                return -ENOPROTOOPT;
        }
}

/*
 *      Getsock opt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
{
        int olr;
        int val;

        if (optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
           optname!=MRT_PIM &&
#endif
           optname!=MRT_ASSERT)
                return -ENOPROTOOPT;

        if (get_user(olr, optlen))
                return -EFAULT;

        olr = min_t(unsigned int, olr, sizeof(int));
        if (olr < 0)
                return -EINVAL;

        if (put_user(olr,optlen))
                return -EFAULT;
        if (optname==MRT_VERSION)
                val=0x0305;
#ifdef CONFIG_IP_PIMSM
        else if (optname==MRT_PIM)
                val=mroute_do_pim;
#endif
        else
                val=mroute_do_assert;
        if (copy_to_user(optval,&val,olr))
                return -EFAULT;
        return 0;
}

/*
 *      The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
        struct sioc_sg_req sr;
        struct sioc_vif_req vr;
        struct vif_device *vif;
        struct mfc_cache *c;

        switch (cmd) {
        case SIOCGETVIFCNT:
                if (copy_from_user(&vr,arg,sizeof(vr)))
                        return -EFAULT;
                if (vr.vifi>=maxvif)
                        return -EINVAL;
                read_lock(&mrt_lock);
                vif=&vif_table[vr.vifi];
                if (VIF_EXISTS(vr.vifi))        {
                        vr.icount=vif->pkt_in;
                        vr.ocount=vif->pkt_out;
                        vr.ibytes=vif->bytes_in;
                        vr.obytes=vif->bytes_out;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&vr,sizeof(vr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        case SIOCGETSGCNT:
                if (copy_from_user(&sr,arg,sizeof(sr)))
                        return -EFAULT;

                read_lock(&mrt_lock);
                c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
                if (c) {
                        sr.pktcnt = c->mfc_un.res.pkt;
                        sr.bytecnt = c->mfc_un.res.bytes;
                        sr.wrong_if = c->mfc_un.res.wrong_if;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&sr,sizeof(sr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        default:
                return -ENOIOCTLCMD;
        }
}

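/* Netdevice notifier: when an interface is unregistered, delete any
   VIFs that were using it. */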
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct vif_device *v;
        int ct;
        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;
        v=&vif_table[0];
        for (ct=0;ct<maxvif;ct++,v++) {
                if (v->dev==ptr)
                        vif_delete(ct);
        }
        return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier={
        .notifier_call = ipmr_device_event,
};

/*
 *      Encapsulate a packet by attaching a valid IPIP header to it.
 *      This avoids tunnel drivers and other mess and gives us the speed so
 *      important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct iphdr *iph;
        struct iphdr *old_iph = skb->nh.iph;    /* inner header, saved before the push */

        skb_push(skb, sizeof(struct iphdr));
        skb->h.ipiph = old_iph;
        skb_reset_network_header(skb);
        iph = skb->nh.iph;

        iph->version    =       4;
        iph->tos        =       old_iph->tos;   /* copy TOS/TTL from the inner
                                                   header, not from the freshly
                                                   pushed, uninitialised one */
        iph->ttl        =       old_iph->ttl;
        iph->frag_off   =       0;
        iph->daddr      =       daddr;
        iph->saddr      =       saddr;
        iph->protocol   =       IPPROTO_IPIP;
        iph->ihl        =       5;
        iph->tot_len    =       htons(skb->len);
        ip_select_ident(iph, skb->dst, NULL);
        ip_send_check(iph);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        nf_reset(skb);
}

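/* Output path tail called from the NF_IP_FORWARD hook: account the
   forwarded datagram, handle IP options, and hand off to dst_output(). */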
static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options * opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

/*
 *      Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        struct iphdr *iph = skb->nh.iph;
        struct vif_device *vif = &vif_table[vifi];
        struct net_device *dev;
        struct rtable *rt;
        int    encap = 0;

        if (vif->dev == NULL)
                goto out_free;

#ifdef CONFIG_IP_PIMSM
        if (vif->flags & VIFF_REGISTER) {
                vif->pkt_out++;
                vif->bytes_out+=skb->len;
                ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
                ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
                ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
                kfree_skb(skb);
                return;
        }
#endif

        if (vif->flags&VIFF_TUNNEL) {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = vif->remote,
                                                .saddr = vif->local,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
                encap = sizeof(struct iphdr);
        } else {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
        }

        dev = rt->u.dst.dev;

        if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 does not
                   allow us to send ICMP here, so the packets will
                   just disappear into a black hole.
                 */

                IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                ip_rt_put(rt);
                goto out_free;
        }

        encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

        if (skb_cow(skb, encap)) {
                ip_rt_put(rt);
                goto out_free;
        }

        vif->pkt_out++;
        vif->bytes_out+=skb->len;

        dst_release(skb->dst);
        skb->dst = &rt->u.dst;
        iph = skb->nh.iph;
        ip_decrease_ttl(iph);

        /* FIXME: forward and output firewalls used to be called here.
         * What do we do with netfilter? -- RR */
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(skb, vif->local, vif->remote);
                /* FIXME: extra output firewall step used to be here. --RR */
                ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
                ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
        }

        IPCB(skb)->flags |= IPSKB_FORWARDED;

        /*
         * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
         * locally not only before forwarding, but also after forwarding on
         * all output interfaces. Clearly, if an mrouter runs a multicasting
         * program, it should receive packets regardless of which interface
         * the program joined on. Without this, the program would have to
         * join on all interfaces. On the other hand, a multihomed host
         * (or a router, but not an mrouter) cannot join on more than one
         * interface - that would result in receiving multiple packets.
         */
        NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;

out_free:
        kfree_skb(skb);
        return;
}

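/* Map a network device back to its VIF index; returns -1 if the device
   is not a VIF. Called under mrt_lock. */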
static int ipmr_find_vif(struct net_device *dev)
{
        int ct;
        for (ct=maxvif-1; ct>=0; ct--) {
                if (vif_table[ct].dev == dev)
                        break;
        }
        return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
        int psend = -1;
        int vif, ct;

        vif = cache->mfc_parent;
        cache->mfc_un.res.pkt++;
        cache->mfc_un.res.bytes += skb->len;

        /*
         * Wrong interface: drop packet and (maybe) send PIM assert.
         */
        if (vif_table[vif].dev != skb->dev) {
                int true_vifi;

                if (((struct rtable*)skb->dst)->fl.iif == 0) {
                        /* It is our own packet, looped back.
                           Very complicated situation...

                           The best workaround until the routing daemons are
                           fixed is not to redistribute a packet if it was
                           sent through the wrong interface. This means that
                           multicast applications WILL NOT work for (S,G)
                           entries whose default multicast route points to
                           the wrong oif. In any case, it is not a good idea
                           to run multicasting applications on a router.
                         */
                        goto dont_forward;
                }

                cache->mfc_un.res.wrong_if++;
                true_vifi = ipmr_find_vif(skb->dev);

                if (true_vifi >= 0 && mroute_do_assert &&
                    /* pimsm uses asserts when switching from RPT to SPT,
                       so we cannot check that the packet arrived on an oif.
                       It is bad, but otherwise we would need to move a
                       pretty large chunk of pimd into the kernel. Ough... --ANK
                     */
                    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
                    time_after(jiffies,
                               cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
                        cache->mfc_un.res.last_assert = jiffies;
                        ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
                }
                goto dont_forward;
        }

        vif_table[vif].pkt_in++;
        vif_table[vif].bytes_in+=skb->len;

        /*
         *      Forward the frame
         */
        for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
                if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
                        if (psend != -1) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        ipmr_queue_xmit(skb2, cache, psend);
                        }
                        psend=ct;
                }
        }
        if (psend != -1) {
                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (skb2)
                                ipmr_queue_xmit(skb2, cache, psend);
                } else {
                        ipmr_queue_xmit(skb, cache, psend);
                        return 0;
                }
        }

dont_forward:
        if (!local)
                kfree_skb(skb);
        return 0;
}


/*
 *      Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
        struct mfc_cache *cache;
        int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;

        /* A packet looped back after forwarding must not be forwarded
           a second time, but it can still be delivered locally.
         */
        if (IPCB(skb)->flags&IPSKB_FORWARDED)
                goto dont_forward;

        if (!local) {
                    if (IPCB(skb)->opt.router_alert) {
                            if (ip_call_ra_chain(skb))
                                    return 0;
                    } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
                            /* IGMPv1 (and broken IGMPv2 implementations such
                               as Cisco IOS <= 11.2(8)) do not put the router
                               alert option in IGMP packets destined to
                               routable groups. It is very bad, because it
                               means that we can forward NO IGMP messages.
                             */
                            read_lock(&mrt_lock);
                            if (mroute_socket) {
                                    nf_reset(skb);
                                    raw_rcv(mroute_socket, skb);
                                    read_unlock(&mrt_lock);
                                    return 0;
                            }
                            read_unlock(&mrt_lock);
                    }
        }

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);

        /*
         *      No usable cache entry
         */
        if (cache==NULL) {
                int vif;

                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        ip_local_deliver(skb);
                        if (skb2 == NULL) {
                                read_unlock(&mrt_lock);
                                return -ENOBUFS;
                        }
                        skb = skb2;
                }

                vif = ipmr_find_vif(skb->dev);
                if (vif >= 0) {
                        int err = ipmr_cache_unresolved(vif, skb);
                        read_unlock(&mrt_lock);

                        return err;
                }
                read_unlock(&mrt_lock);
                kfree_skb(skb);
                return -ENODEV;
        }

        ip_mr_forward(skb, cache, local);

        read_unlock(&mrt_lock);

        if (local)
                return ip_local_deliver(skb);

        return 0;

dont_forward:
        if (local)
                return ip_local_deliver(skb);
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

int pim_rcv_v1(struct sk_buff * skb)
{
        struct igmphdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = (struct igmphdr*)skb->h.raw;

        if (!mroute_do_pim ||
            skb->len < sizeof(*pim) + sizeof(*encap) ||
            pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
                goto drop;

        encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
        /*
           Check that:
           a. packet is really destined to a multicast group
           b. packet is not a NULL-REGISTER
           c. packet is not truncated
         */
        if (!MULTICAST(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac.raw = skb->nh.raw;
        skb_pull(skb, (u8*)encap - skb->data);
        skb_reset_network_header(skb);
        skb->dev = reg_dev;
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = 0;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        skb->dst = NULL;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
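/* Handle PIMv2 REGISTER messages: validate the register header and its
   checksum plus the encapsulated packet, then re-inject the inner
   packet through the pimreg device. */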
static int pim_rcv(struct sk_buff * skb)
{
        struct pimreghdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = (struct pimreghdr*)skb->h.raw;
        if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
            (pim->flags&PIM_NULL_REGISTER) ||
            (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
             csum_fold(skb_checksum(skb, 0, skb->len, 0))))
                goto drop;

        /* check if the inner packet is destined to mcast group */
        encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
        if (!MULTICAST(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac.raw = skb->nh.raw;
        skb_pull(skb, (u8*)encap - skb->data);
        skb_reset_network_header(skb);
        skb->dev = reg_dev;
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = 0;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
        skb->dst = NULL;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif

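/* Fill an rtnetlink reply for a resolved cache entry: the incoming
   interface plus an RTA_MULTIPATH attribute listing the output VIFs. */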
static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
        int ct;
        struct rtnexthop *nhp;
        struct net_device *dev = vif_table[c->mfc_parent].dev;
        u8 *b = skb->tail;
        struct rtattr *mp_head;

        if (dev)
                RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

        mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));

        for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
                if (c->mfc_un.res.ttls[ct] < 255) {
                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
                                goto rtattr_failure;
                        nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
                        nhp->rtnh_flags = 0;
                        nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
                        nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
                        nhp->rtnh_len = sizeof(*nhp);
                }
        }
        mp_head->rta_type = RTA_MULTIPATH;
        mp_head->rta_len = skb->tail - (u8*)mp_head;
        rtm->rtm_type = RTN_MULTICAST;
        return 1;

rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -EMSGSIZE;
}

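/* Resolve an rtnetlink route request for a multicast destination. If no
   cache entry exists (and nowait is not set), queue a dummy skb (marked
   by iph->version == 0) on the unresolved queue; the answer is sent
   once the daemon resolves the entry. */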
1571 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1572 {
1573         int err;
1574         struct mfc_cache *cache;
1575         struct rtable *rt = (struct rtable*)skb->dst;
1576
1577         read_lock(&mrt_lock);
1578         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1579
1580         if (cache==NULL) {
1581                 struct sk_buff *skb2;
1582                 struct net_device *dev;
1583                 int vif;
1584
1585                 if (nowait) {
1586                         read_unlock(&mrt_lock);
1587                         return -EAGAIN;
1588                 }
1589
1590                 dev = skb->dev;
1591                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1592                         read_unlock(&mrt_lock);
1593                         return -ENODEV;
1594                 }
1595                 skb2 = skb_clone(skb, GFP_ATOMIC);
1596                 if (!skb2) {
1597                         read_unlock(&mrt_lock);
1598                         return -ENOMEM;
1599                 }
1600
1601                 skb_push(skb2, sizeof(struct iphdr));
1602                 skb_reset_network_header(skb2);
1603                 skb2->nh.iph->ihl = sizeof(struct iphdr) >> 2;
1604                 skb2->nh.iph->saddr = rt->rt_src;
1605                 skb2->nh.iph->daddr = rt->rt_dst;
1606                 skb2->nh.iph->version = 0;	/* fake header: ipmr_cache_resolve() replies via rtnetlink */
1607                 err = ipmr_cache_unresolved(vif, skb2);
1608                 read_unlock(&mrt_lock);
1609                 return err;
1610         }
1611
1612         if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1613                 cache->mfc_flags |= MFC_NOTIFY;
1614         err = ipmr_fill_mroute(skb, cache, rtm);
1615         read_unlock(&mrt_lock);
1616         return err;
1617 }
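/*
 * ipmr_get_route() is the hook the unicast routing code calls when an
 * RTM_GETROUTE query or routing dump hits a multicast destination. The
 * nowait flag exists because netlink dumps run in contexts that must
 * not block: with nowait set a cache miss returns -EAGAIN instead of
 * queueing the dummy packet built above for mrouted to resolve
 * asynchronously.
 */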
1618
1619 #ifdef CONFIG_PROC_FS
1620 /*
1621  *	The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1622  */
1623 struct ipmr_vif_iter {
1624         int ct;
1625 };
1626
1627 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1628                                            loff_t pos)
1629 {
1630         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1631                 if (!VIF_EXISTS(iter->ct))
1632                         continue;
1633                 if (pos-- == 0)
1634                         return &vif_table[iter->ct];
1635         }
1636         return NULL;
1637 }
1638
1639 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1640 {
1641         read_lock(&mrt_lock);
1642         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1643                 : SEQ_START_TOKEN;
1644 }
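/*
 * Standard seq_file idiom: position 0 maps to SEQ_START_TOKEN so that
 * ..._seq_show() emits the header line, and real entries start at
 * *pos - 1. mrt_lock is taken here and only dropped in ..._seq_stop().
 */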
1645
1646 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1647 {
1648         struct ipmr_vif_iter *iter = seq->private;
1649
1650         ++*pos;
1651         if (v == SEQ_START_TOKEN)
1652                 return ipmr_vif_seq_idx(iter, 0);
1653
1654         while (++iter->ct < maxvif) {
1655                 if (!VIF_EXISTS(iter->ct))
1656                         continue;
1657                 return &vif_table[iter->ct];
1658         }
1659         return NULL;
1660 }
1661
1662 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1663 {
1664         read_unlock(&mrt_lock);
1665 }
1666
1667 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1668 {
1669         if (v == SEQ_START_TOKEN) {
1670                 seq_puts(seq,
1671                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1672         } else {
1673                 const struct vif_device *vif = v;
1674                 const char *name = vif->dev ? vif->dev->name : "none";
1675
1676                 seq_printf(seq,
1677                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1678                            vif - vif_table,
1679                            name, vif->bytes_in, vif->pkt_in,
1680                            vif->bytes_out, vif->pkt_out,
1681                            vif->flags, vif->local, vif->remote);
1682         }
1683         return 0;
1684 }
1685
1686 static struct seq_operations ipmr_vif_seq_ops = {
1687         .start = ipmr_vif_seq_start,
1688         .next  = ipmr_vif_seq_next,
1689         .stop  = ipmr_vif_seq_stop,
1690         .show  = ipmr_vif_seq_show,
1691 };
1692
1693 static int ipmr_vif_open(struct inode *inode, struct file *file)
1694 {
1695         struct seq_file *seq;
1696         int rc = -ENOMEM;
1697         struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1698
1699         if (!s)
1700                 goto out;
1701
1702         rc = seq_open(file, &ipmr_vif_seq_ops);
1703         if (rc)
1704                 goto out_kfree;
1705
1706         s->ct = 0;
1707         seq = file->private_data;
1708         seq->private = s;
1709 out:
1710         return rc;
1711 out_kfree:
1712         kfree(s);
1713         goto out;
1714
1715 }
1716
1717 static const struct file_operations ipmr_vif_fops = {
1718         .owner   = THIS_MODULE,
1719         .open    = ipmr_vif_open,
1720         .read    = seq_read,
1721         .llseek  = seq_lseek,
1722         .release = seq_release_private,
1723 };
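/*
 * Reading /proc/net/ip_mr_vif yields one line per configured vif; the
 * values below are illustrative only:
 *
 * Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *  0 eth0            1234      12      5678      34 00000 C0A80101 00000000
 *
 * Local and Remote are hex IPv4 addresses; Flags holds the VIFF_* bits.
 */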
1724
1725 struct ipmr_mfc_iter {
1726         struct mfc_cache **cache;
1727         int ct;
1728 };
1729
1730
1731 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1732 {
1733         struct mfc_cache *mfc;
1734
1735         it->cache = mfc_cache_array;
1736         read_lock(&mrt_lock);
1737         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1738                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1739                         if (pos-- == 0)
1740                                 return mfc;
1741         read_unlock(&mrt_lock);
1742
1743         it->cache = &mfc_unres_queue;
1744         spin_lock_bh(&mfc_unres_lock);
1745         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1746                 if (pos-- == 0)
1747                         return mfc;
1748         spin_unlock_bh(&mfc_unres_lock);
1749
1750         it->cache = NULL;
1751         return NULL;
1752 }
1753
1754
1755 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1756 {
1757         struct ipmr_mfc_iter *it = seq->private;
1758         it->cache = NULL;
1759         it->ct = 0;
1760         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1761                 : SEQ_START_TOKEN;
1762 }
1763
1764 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1765 {
1766         struct mfc_cache *mfc = v;
1767         struct ipmr_mfc_iter *it = seq->private;
1768
1769         ++*pos;
1770
1771         if (v == SEQ_START_TOKEN)
1772                 return ipmr_mfc_seq_idx(seq->private, 0);
1773
1774         if (mfc->next)
1775                 return mfc->next;
1776
1777         if (it->cache == &mfc_unres_queue)
1778                 goto end_of_list;
1779
1780         BUG_ON(it->cache != mfc_cache_array);
1781
1782         while (++it->ct < MFC_LINES) {
1783                 mfc = mfc_cache_array[it->ct];
1784                 if (mfc)
1785                         return mfc;
1786         }
1787
1788         /* exhausted cache_array, show unresolved */
1789         read_unlock(&mrt_lock);
1790         it->cache = &mfc_unres_queue;
1791         it->ct = 0;
1792
1793         spin_lock_bh(&mfc_unres_lock);
1794         mfc = mfc_unres_queue;
1795         if (mfc)
1796                 return mfc;
1797
1798  end_of_list:
1799         spin_unlock_bh(&mfc_unres_lock);
1800         it->cache = NULL;
1801
1802         return NULL;
1803 }
1804
1805 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1806 {
1807         struct ipmr_mfc_iter *it = seq->private;
1808
1809         if (it->cache == &mfc_unres_queue)
1810                 spin_unlock_bh(&mfc_unres_lock);
1811         else if (it->cache == mfc_cache_array)
1812                 read_unlock(&mrt_lock);
1813 }
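/*
 * Note the lock handoff: the iterator holds mrt_lock while walking the
 * resolved cache and swaps it for mfc_unres_lock when it crosses into
 * the unresolved queue, so ..._seq_stop() consults it->cache to decide
 * which of the two locks is still held.
 */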
1814
1815 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1816 {
1817         int n;
1818
1819         if (v == SEQ_START_TOKEN) {
1820                 seq_puts(seq,
1821                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1822         } else {
1823                 const struct mfc_cache *mfc = v;
1824                 const struct ipmr_mfc_iter *it = seq->private;
1825
1826                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1827                            (unsigned long) mfc->mfc_mcastgrp,
1828                            (unsigned long) mfc->mfc_origin,
1829                            mfc->mfc_parent,
1830                            mfc->mfc_un.res.pkt,
1831                            mfc->mfc_un.res.bytes,
1832                            mfc->mfc_un.res.wrong_if);
1833
1834                 if (it->cache != &mfc_unres_queue) {
1835                         for (n = mfc->mfc_un.res.minvif;
1836                              n < mfc->mfc_un.res.maxvif; n++) {
1837                                 if (VIF_EXISTS(n) &&
1838                                     mfc->mfc_un.res.ttls[n] < 255)
1839                                         seq_printf(seq,
1840                                                    " %2d:%-3d",
1841                                                    n, mfc->mfc_un.res.ttls[n]);
1842                         }
1843                 }
1844                 seq_putc(seq, '\n');
1845         }
1846         return 0;
1847 }
1848
1849 static struct seq_operations ipmr_mfc_seq_ops = {
1850         .start = ipmr_mfc_seq_start,
1851         .next  = ipmr_mfc_seq_next,
1852         .stop  = ipmr_mfc_seq_stop,
1853         .show  = ipmr_mfc_seq_show,
1854 };
1855
1856 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1857 {
1858         struct seq_file *seq;
1859         int rc = -ENOMEM;
1860         struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1861
1862         if (!s)
1863                 goto out;
1864
1865         rc = seq_open(file, &ipmr_mfc_seq_ops);
1866         if (rc)
1867                 goto out_kfree;
1868
1869         seq = file->private_data;
1870         seq->private = s;
1871 out:
1872         return rc;
1873 out_kfree:
1874         kfree(s);
1875         goto out;
1876
1877 }
1878
1879 static const struct file_operations ipmr_mfc_fops = {
1880         .owner   = THIS_MODULE,
1881         .open    = ipmr_mfc_open,
1882         .read    = seq_read,
1883         .llseek  = seq_lseek,
1884         .release = seq_release_private,
1885 };
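/*
 * Reading /proc/net/ip_mr_cache yields one (S,G) entry per line; the
 * values below are illustrative only:
 *
 * Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 * E1010203 C0A80101 0          10     8400        0  1:1  2:64
 *
 * Group and Origin are hex IPv4 addresses, Iif is the parent vif, and
 * each "vif:ttl" pair is an output vif with its TTL threshold.
 */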
1886 #endif
1887
1888 #ifdef CONFIG_IP_PIMSM_V2
1889 static struct net_protocol pim_protocol = {
1890         .handler        =       pim_rcv,
1891 };
1892 #endif
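/*
 * pim_protocol is not registered at boot. Elsewhere in this file the
 * MRT_PIM setsockopt attaches and detaches the handler, roughly as
 * sketched below (a paraphrase, not a verbatim quote of that code):
 */
#if 0
	if (v != mroute_do_pim) {
		mroute_do_pim = v;
		if (v)
			inet_add_protocol(&pim_protocol, IPPROTO_PIM);
		else
			inet_del_protocol(&pim_protocol, IPPROTO_PIM);
	}
#endif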
1893
1894
1895 /*
1896  *      Setup for IP multicast routing
1897  */
1898
1899 void __init ip_mr_init(void)
1900 {
1901         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1902                                        sizeof(struct mfc_cache),
1903                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1904                                        NULL, NULL);
1905         init_timer(&ipmr_expire_timer);
1906         ipmr_expire_timer.function = ipmr_expire_process;
1907         register_netdevice_notifier(&ip_mr_notifier);
1908 #ifdef CONFIG_PROC_FS
1909         proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1910         proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1911 #endif
1912 }
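/*
 * ip_mr_init() is called once from inet_init() during IPv4 setup; with
 * SLAB_PANIC the cache allocation either succeeds or panics, so there
 * is no error to propagate back to the caller.
 */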