Merge branch 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6
[sfrench/cifs-2.6.git] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and
38  *                                      packet_set_ring memory leak.
39  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
40  *                                      The convention is that longer addresses
41  *                                      will simply extend the hardware address
42  *                                      byte arrays at the end of sockaddr_ll
43  *                                      and packet_mreq.
44  *
45  *              This program is free software; you can redistribute it and/or
46  *              modify it under the terms of the GNU General Public License
47  *              as published by the Free Software Foundation; either version
48  *              2 of the License, or (at your option) any later version.
49  *
50  */
51
52 #include <linux/types.h>
53 #include <linux/mm.h>
54 #include <linux/capability.h>
55 #include <linux/fcntl.h>
56 #include <linux/socket.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/if_packet.h>
61 #include <linux/wireless.h>
62 #include <linux/kernel.h>
63 #include <linux/kmod.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81
82 #ifdef CONFIG_INET
83 #include <net/inet_common.h>
84 #endif
85
86 #define CONFIG_SOCK_PACKET      1
87
88 /*
89    Proposed replacement for SIOC{ADD,DEL}MULTI and
90    IFF_PROMISC, IFF_ALLMULTI flags.
91
92    It is more expensive, but I believe
93    it is really the correct solution: reentrant, safe and fault tolerant.
94
95    IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping
96    reference count and global flag, so that real status is
97    (gflag|(count != 0)), so that we can use obsolete faulty interface
98    not harming clever users.
99  */
100 #define CONFIG_PACKET_MULTICAST 1
101
102 /*
103    Assumptions:
104    - if device has no dev->hard_header routine, it adds and removes ll header
105      inside itself. In this case ll header is invisible outside of device,
106      but higher levels still should reserve dev->hard_header_len.
107      Some devices are enough clever to reallocate skb, when header
108      will not fit to reserved space (tunnel), another ones are silly
109      (PPP).
110    - packet socket receives packets with pulled ll header,
111      so that SOCK_RAW should push it back.
112
113 On receive:
114 -----------
115
116 Incoming, dev->hard_header!=NULL
117    mac.raw -> ll header
118    data    -> data
119
120 Outgoing, dev->hard_header!=NULL
121    mac.raw -> ll header
122    data    -> ll header
123
124 Incoming, dev->hard_header==NULL
125    mac.raw -> UNKNOWN position. It is very likely that it points to the ll header.
126               PPP does this, which is wrong because it introduces an asymmetry
127               between the rx and tx paths.
128    data    -> data
129
130 Outgoing, dev->hard_header==NULL
131    mac.raw -> data. ll header is still not built!
132    data    -> data
133
134 Resume
135   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
136
137
138 On transmit:
139 ------------
140
141 dev->hard_header != NULL
142    mac.raw -> ll header
143    data    -> ll header
144
145 dev->hard_header == NULL (ll header is added by device, we cannot control it)
146    mac.raw -> data
147    data -> data
148
149    We should set nh.raw on output to the correct position;
150    the packet classifier depends on it.
151  */
152
153 /* List of all packet sockets. */
154 static HLIST_HEAD(packet_sklist);
155 static DEFINE_RWLOCK(packet_sklist_lock);
156
157 static atomic_t packet_socks_nr;
158
159
160 /* Private packet socket structures. */
161
162 #ifdef CONFIG_PACKET_MULTICAST
163 struct packet_mclist
164 {
165         struct packet_mclist    *next;
166         int                     ifindex;
167         int                     count;
168         unsigned short          type;
169         unsigned short          alen;
170         unsigned char           addr[MAX_ADDR_LEN];
171 };
172 /* identical to struct packet_mreq except it has
173  * a longer address field.
174  */
175 struct packet_mreq_max
176 {
177         int             mr_ifindex;
178         unsigned short  mr_type;
179         unsigned short  mr_alen;
180         unsigned char   mr_address[MAX_ADDR_LEN];
181 };
182 #endif
183 #ifdef CONFIG_PACKET_MMAP
184 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
185 #endif
186
187 static void packet_flush_mclist(struct sock *sk);
188
189 struct packet_sock {
190         /* struct sock has to be the first member of packet_sock */
191         struct sock             sk;
192         struct tpacket_stats    stats;
193 #ifdef CONFIG_PACKET_MMAP
194         char *                  *pg_vec;
195         unsigned int            head;
196         unsigned int            frames_per_block;
197         unsigned int            frame_size;
198         unsigned int            frame_max;
199         int                     copy_thresh;
200 #endif
201         struct packet_type      prot_hook;
202         spinlock_t              bind_lock;
203         unsigned int            running:1,      /* prot_hook is attached*/
204                                 auxdata:1;
205         int                     ifindex;        /* bound device         */
206         __be16                  num;
207 #ifdef CONFIG_PACKET_MULTICAST
208         struct packet_mclist    *mclist;
209 #endif
210 #ifdef CONFIG_PACKET_MMAP
211         atomic_t                mapped;
212         unsigned int            pg_vec_order;
213         unsigned int            pg_vec_pages;
214         unsigned int            pg_vec_len;
215 #endif
216 };
217
218 struct packet_skb_cb {
219         unsigned int origlen;
220         union {
221                 struct sockaddr_pkt pkt;
222                 struct sockaddr_ll ll;
223         } sa;
224 };
225
226 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
227
228 #ifdef CONFIG_PACKET_MMAP
229
230 static inline char *packet_lookup_frame(struct packet_sock *po, unsigned int position)
231 {
232         unsigned int pg_vec_pos, frame_offset;
233         char *frame;
234
235         pg_vec_pos = position / po->frames_per_block;
236         frame_offset = position % po->frames_per_block;
237
238         frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
239
240         return frame;
241 }
242 #endif
243
/*
 * Convert a generic socket pointer into the packet socket that embeds it.
 * This is a plain pointer cast; the address is returned unchanged.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	struct packet_sock *po = (struct packet_sock *)sk;

	return po;
}
248
249 static void packet_sock_destruct(struct sock *sk)
250 {
251         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
252         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
253
254         if (!sock_flag(sk, SOCK_DEAD)) {
255                 printk("Attempt to release alive packet socket: %p\n", sk);
256                 return;
257         }
258
259         atomic_dec(&packet_socks_nr);
260 #ifdef PACKET_REFCNT_DEBUG
261         printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
262 #endif
263 }
264
265
266 static const struct proto_ops packet_ops;
267
268 #ifdef CONFIG_SOCK_PACKET
269 static const struct proto_ops packet_ops_spkt;
270
271 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
272 {
273         struct sock *sk;
274         struct sockaddr_pkt *spkt;
275
276         /*
277          *      When we registered the protocol we saved the socket in the data
278          *      field for just this event.
279          */
280
281         sk = pt->af_packet_priv;
282
283         /*
284          *      Yank back the headers [hope the device set this
285          *      right or kerboom...]
286          *
287          *      Incoming packets have ll header pulled,
288          *      push it back.
289          *
290          *      For outgoing ones skb->data == skb->mac.raw
291          *      so that this procedure is noop.
292          */
293
294         if (skb->pkt_type == PACKET_LOOPBACK)
295                 goto out;
296
297         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
298                 goto oom;
299
300         /* drop any routing info */
301         dst_release(skb->dst);
302         skb->dst = NULL;
303
304         /* drop conntrack reference */
305         nf_reset(skb);
306
307         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
308
309         skb_push(skb, skb->data-skb->mac.raw);
310
311         /*
312          *      The SOCK_PACKET socket receives _all_ frames.
313          */
314
315         spkt->spkt_family = dev->type;
316         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
317         spkt->spkt_protocol = skb->protocol;
318
319         /*
320          *      Charge the memory to the socket. This is done specifically
321          *      to prevent sockets using all the memory up.
322          */
323
324         if (sock_queue_rcv_skb(sk,skb) == 0)
325                 return 0;
326
327 out:
328         kfree_skb(skb);
329 oom:
330         return 0;
331 }
332
333
334 /*
335  *      Output a raw packet to a device layer. This bypasses all the other
336  *      protocol layers and you must therefore supply it with a complete frame
337  */
338
339 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
340                                struct msghdr *msg, size_t len)
341 {
342         struct sock *sk = sock->sk;
343         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
344         struct sk_buff *skb;
345         struct net_device *dev;
346         __be16 proto=0;
347         int err;
348
349         /*
350          *      Get and verify the address.
351          */
352
353         if (saddr)
354         {
355                 if (msg->msg_namelen < sizeof(struct sockaddr))
356                         return(-EINVAL);
357                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
358                         proto=saddr->spkt_protocol;
359         }
360         else
361                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
362
363         /*
364          *      Find the device first to size check it
365          */
366
367         saddr->spkt_device[13] = 0;
368         dev = dev_get_by_name(saddr->spkt_device);
369         err = -ENODEV;
370         if (dev == NULL)
371                 goto out_unlock;
372
373         err = -ENETDOWN;
374         if (!(dev->flags & IFF_UP))
375                 goto out_unlock;
376
377         /*
378          *      You may not queue a frame bigger than the mtu. This is the lowest level
379          *      raw protocol and you must do your own fragmentation at this level.
380          */
381
382         err = -EMSGSIZE;
383         if (len > dev->mtu + dev->hard_header_len)
384                 goto out_unlock;
385
386         err = -ENOBUFS;
387         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
388
389         /*
390          *      If the write buffer is full, then tough. At this level the user gets to
391          *      deal with the problem - do your own algorithmic backoffs. That's far
392          *      more flexible.
393          */
394
395         if (skb == NULL)
396                 goto out_unlock;
397
398         /*
399          *      Fill it in
400          */
401
402         /* FIXME: Save some space for broken drivers that write a
403          * hard header at transmission time by themselves. PPP is the
404          * notable one here. This should really be fixed at the driver level.
405          */
406         skb_reserve(skb, LL_RESERVED_SPACE(dev));
407         skb->nh.raw = skb->data;
408
409         /* Try to align data part correctly */
410         if (dev->hard_header) {
411                 skb->data -= dev->hard_header_len;
412                 skb->tail -= dev->hard_header_len;
413                 if (len < dev->hard_header_len)
414                         skb->nh.raw = skb->data;
415         }
416
417         /* Returns -EFAULT on error */
418         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
419         skb->protocol = proto;
420         skb->dev = dev;
421         skb->priority = sk->sk_priority;
422         if (err)
423                 goto out_free;
424
425         /*
426          *      Now send it
427          */
428
429         dev_queue_xmit(skb);
430         dev_put(dev);
431         return(len);
432
433 out_free:
434         kfree_skb(skb);
435 out_unlock:
436         if (dev)
437                 dev_put(dev);
438         return err;
439 }
440 #endif
441
442 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
443                                       unsigned int res)
444 {
445         struct sk_filter *filter;
446
447         rcu_read_lock_bh();
448         filter = rcu_dereference(sk->sk_filter);
449         if (filter != NULL)
450                 res = sk_run_filter(skb, filter->insns, filter->len);
451         rcu_read_unlock_bh();
452
453         return res;
454 }
455
456 /*
457    This function makes lazy skb cloning in hope that most of packets
458    are discarded by BPF.
459
460    Note tricky part: we DO mangle shared skb! skb->data, skb->len
461    and skb->cb are mangled. It works because (and until) packets
462    falling here are owned by current CPU. Output packets are cloned
463    by dev_queue_xmit_nit(), input packets are processed by net_bh
464    sequencially, so that if we return skb to original state on exit,
465    we will not harm anyone.
466  */
467
468 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
469 {
470         struct sock *sk;
471         struct sockaddr_ll *sll;
472         struct packet_sock *po;
473         u8 * skb_head = skb->data;
474         int skb_len = skb->len;
475         unsigned int snaplen, res;
476
477         if (skb->pkt_type == PACKET_LOOPBACK)
478                 goto drop;
479
480         sk = pt->af_packet_priv;
481         po = pkt_sk(sk);
482
483         skb->dev = dev;
484
485         if (dev->hard_header) {
486                 /* The device has an explicit notion of ll header,
487                    exported to higher levels.
488
489                    Otherwise, the device hides datails of it frame
490                    structure, so that corresponding packet head
491                    never delivered to user.
492                  */
493                 if (sk->sk_type != SOCK_DGRAM)
494                         skb_push(skb, skb->data - skb->mac.raw);
495                 else if (skb->pkt_type == PACKET_OUTGOING) {
496                         /* Special case: outgoing packets have ll header at head */
497                         skb_pull(skb, skb->nh.raw - skb->data);
498                 }
499         }
500
501         snaplen = skb->len;
502
503         res = run_filter(skb, sk, snaplen);
504         if (!res)
505                 goto drop_n_restore;
506         if (snaplen > res)
507                 snaplen = res;
508
509         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
510             (unsigned)sk->sk_rcvbuf)
511                 goto drop_n_acct;
512
513         if (skb_shared(skb)) {
514                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
515                 if (nskb == NULL)
516                         goto drop_n_acct;
517
518                 if (skb_head != skb->data) {
519                         skb->data = skb_head;
520                         skb->len = skb_len;
521                 }
522                 kfree_skb(skb);
523                 skb = nskb;
524         }
525
526         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
527                      sizeof(skb->cb));
528
529         sll = &PACKET_SKB_CB(skb)->sa.ll;
530         sll->sll_family = AF_PACKET;
531         sll->sll_hatype = dev->type;
532         sll->sll_protocol = skb->protocol;
533         sll->sll_pkttype = skb->pkt_type;
534         sll->sll_ifindex = dev->ifindex;
535         sll->sll_halen = 0;
536
537         if (dev->hard_header_parse)
538                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
539
540         PACKET_SKB_CB(skb)->origlen = skb->len;
541
542         if (pskb_trim(skb, snaplen))
543                 goto drop_n_acct;
544
545         skb_set_owner_r(skb, sk);
546         skb->dev = NULL;
547         dst_release(skb->dst);
548         skb->dst = NULL;
549
550         /* drop conntrack reference */
551         nf_reset(skb);
552
553         spin_lock(&sk->sk_receive_queue.lock);
554         po->stats.tp_packets++;
555         __skb_queue_tail(&sk->sk_receive_queue, skb);
556         spin_unlock(&sk->sk_receive_queue.lock);
557         sk->sk_data_ready(sk, skb->len);
558         return 0;
559
560 drop_n_acct:
561         spin_lock(&sk->sk_receive_queue.lock);
562         po->stats.tp_drops++;
563         spin_unlock(&sk->sk_receive_queue.lock);
564
565 drop_n_restore:
566         if (skb_head != skb->data && skb_shared(skb)) {
567                 skb->data = skb_head;
568                 skb->len = skb_len;
569         }
570 drop:
571         kfree_skb(skb);
572         return 0;
573 }
574
575 #ifdef CONFIG_PACKET_MMAP
576 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
577 {
578         struct sock *sk;
579         struct packet_sock *po;
580         struct sockaddr_ll *sll;
581         struct tpacket_hdr *h;
582         u8 * skb_head = skb->data;
583         int skb_len = skb->len;
584         unsigned int snaplen, res;
585         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
586         unsigned short macoff, netoff;
587         struct sk_buff *copy_skb = NULL;
588
589         if (skb->pkt_type == PACKET_LOOPBACK)
590                 goto drop;
591
592         sk = pt->af_packet_priv;
593         po = pkt_sk(sk);
594
595         if (dev->hard_header) {
596                 if (sk->sk_type != SOCK_DGRAM)
597                         skb_push(skb, skb->data - skb->mac.raw);
598                 else if (skb->pkt_type == PACKET_OUTGOING) {
599                         /* Special case: outgoing packets have ll header at head */
600                         skb_pull(skb, skb->nh.raw - skb->data);
601                 }
602         }
603
604         if (skb->ip_summed == CHECKSUM_PARTIAL)
605                 status |= TP_STATUS_CSUMNOTREADY;
606
607         snaplen = skb->len;
608
609         res = run_filter(skb, sk, snaplen);
610         if (!res)
611                 goto drop_n_restore;
612         if (snaplen > res)
613                 snaplen = res;
614
615         if (sk->sk_type == SOCK_DGRAM) {
616                 macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
617         } else {
618                 unsigned maclen = skb->nh.raw - skb->data;
619                 netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
620                 macoff = netoff - maclen;
621         }
622
623         if (macoff + snaplen > po->frame_size) {
624                 if (po->copy_thresh &&
625                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
626                     (unsigned)sk->sk_rcvbuf) {
627                         if (skb_shared(skb)) {
628                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
629                         } else {
630                                 copy_skb = skb_get(skb);
631                                 skb_head = skb->data;
632                         }
633                         if (copy_skb)
634                                 skb_set_owner_r(copy_skb, sk);
635                 }
636                 snaplen = po->frame_size - macoff;
637                 if ((int)snaplen < 0)
638                         snaplen = 0;
639         }
640
641         spin_lock(&sk->sk_receive_queue.lock);
642         h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
643
644         if (h->tp_status)
645                 goto ring_is_full;
646         po->head = po->head != po->frame_max ? po->head+1 : 0;
647         po->stats.tp_packets++;
648         if (copy_skb) {
649                 status |= TP_STATUS_COPY;
650                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
651         }
652         if (!po->stats.tp_drops)
653                 status &= ~TP_STATUS_LOSING;
654         spin_unlock(&sk->sk_receive_queue.lock);
655
656         skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
657
658         h->tp_len = skb->len;
659         h->tp_snaplen = snaplen;
660         h->tp_mac = macoff;
661         h->tp_net = netoff;
662         if (skb->tstamp.off_sec == 0) {
663                 __net_timestamp(skb);
664                 sock_enable_timestamp(sk);
665         }
666         h->tp_sec = skb->tstamp.off_sec;
667         h->tp_usec = skb->tstamp.off_usec;
668
669         sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
670         sll->sll_halen = 0;
671         if (dev->hard_header_parse)
672                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
673         sll->sll_family = AF_PACKET;
674         sll->sll_hatype = dev->type;
675         sll->sll_protocol = skb->protocol;
676         sll->sll_pkttype = skb->pkt_type;
677         sll->sll_ifindex = dev->ifindex;
678
679         h->tp_status = status;
680         smp_mb();
681
682         {
683                 struct page *p_start, *p_end;
684                 u8 *h_end = (u8 *)h + macoff + snaplen - 1;
685
686                 p_start = virt_to_page(h);
687                 p_end = virt_to_page(h_end);
688                 while (p_start <= p_end) {
689                         flush_dcache_page(p_start);
690                         p_start++;
691                 }
692         }
693
694         sk->sk_data_ready(sk, 0);
695
696 drop_n_restore:
697         if (skb_head != skb->data && skb_shared(skb)) {
698                 skb->data = skb_head;
699                 skb->len = skb_len;
700         }
701 drop:
702         kfree_skb(skb);
703         return 0;
704
705 ring_is_full:
706         po->stats.tp_drops++;
707         spin_unlock(&sk->sk_receive_queue.lock);
708
709         sk->sk_data_ready(sk, 0);
710         if (copy_skb)
711                 kfree_skb(copy_skb);
712         goto drop_n_restore;
713 }
714
715 #endif
716
717
718 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
719                           struct msghdr *msg, size_t len)
720 {
721         struct sock *sk = sock->sk;
722         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
723         struct sk_buff *skb;
724         struct net_device *dev;
725         __be16 proto;
726         unsigned char *addr;
727         int ifindex, err, reserve = 0;
728
729         /*
730          *      Get and verify the address.
731          */
732
733         if (saddr == NULL) {
734                 struct packet_sock *po = pkt_sk(sk);
735
736                 ifindex = po->ifindex;
737                 proto   = po->num;
738                 addr    = NULL;
739         } else {
740                 err = -EINVAL;
741                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
742                         goto out;
743                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
744                         goto out;
745                 ifindex = saddr->sll_ifindex;
746                 proto   = saddr->sll_protocol;
747                 addr    = saddr->sll_addr;
748         }
749
750
751         dev = dev_get_by_index(ifindex);
752         err = -ENXIO;
753         if (dev == NULL)
754                 goto out_unlock;
755         if (sock->type == SOCK_RAW)
756                 reserve = dev->hard_header_len;
757
758         err = -ENETDOWN;
759         if (!(dev->flags & IFF_UP))
760                 goto out_unlock;
761
762         err = -EMSGSIZE;
763         if (len > dev->mtu+reserve)
764                 goto out_unlock;
765
766         skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
767                                 msg->msg_flags & MSG_DONTWAIT, &err);
768         if (skb==NULL)
769                 goto out_unlock;
770
771         skb_reserve(skb, LL_RESERVED_SPACE(dev));
772         skb->nh.raw = skb->data;
773
774         if (dev->hard_header) {
775                 int res;
776                 err = -EINVAL;
777                 res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
778                 if (sock->type != SOCK_DGRAM) {
779                         skb->tail = skb->data;
780                         skb->len = 0;
781                 } else if (res < 0)
782                         goto out_free;
783         }
784
785         /* Returns -EFAULT on error */
786         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
787         if (err)
788                 goto out_free;
789
790         skb->protocol = proto;
791         skb->dev = dev;
792         skb->priority = sk->sk_priority;
793
794         /*
795          *      Now send it
796          */
797
798         err = dev_queue_xmit(skb);
799         if (err > 0 && (err = net_xmit_errno(err)) != 0)
800                 goto out_unlock;
801
802         dev_put(dev);
803
804         return(len);
805
806 out_free:
807         kfree_skb(skb);
808 out_unlock:
809         if (dev)
810                 dev_put(dev);
811 out:
812         return err;
813 }
814
815 /*
816  *      Close a PACKET socket. This is fairly simple. We immediately go
817  *      to 'closed' state and remove our protocol entry in the device list.
818  */
819
820 static int packet_release(struct socket *sock)
821 {
822         struct sock *sk = sock->sk;
823         struct packet_sock *po;
824
825         if (!sk)
826                 return 0;
827
828         po = pkt_sk(sk);
829
830         write_lock_bh(&packet_sklist_lock);
831         sk_del_node_init(sk);
832         write_unlock_bh(&packet_sklist_lock);
833
834         /*
835          *      Unhook packet receive handler.
836          */
837
838         if (po->running) {
839                 /*
840                  *      Remove the protocol hook
841                  */
842                 dev_remove_pack(&po->prot_hook);
843                 po->running = 0;
844                 po->num = 0;
845                 __sock_put(sk);
846         }
847
848 #ifdef CONFIG_PACKET_MULTICAST
849         packet_flush_mclist(sk);
850 #endif
851
852 #ifdef CONFIG_PACKET_MMAP
853         if (po->pg_vec) {
854                 struct tpacket_req req;
855                 memset(&req, 0, sizeof(req));
856                 packet_set_ring(sk, &req, 1);
857         }
858 #endif
859
860         /*
861          *      Now the socket is dead. No more input will appear.
862          */
863
864         sock_orphan(sk);
865         sock->sk = NULL;
866
867         /* Purge queues */
868
869         skb_queue_purge(&sk->sk_receive_queue);
870
871         sock_put(sk);
872         return 0;
873 }
874
875 /*
876  *      Attach a packet hook.
877  */
878
879 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
880 {
881         struct packet_sock *po = pkt_sk(sk);
882         /*
883          *      Detach an existing hook if present.
884          */
885
886         lock_sock(sk);
887
888         spin_lock(&po->bind_lock);
889         if (po->running) {
890                 __sock_put(sk);
891                 po->running = 0;
892                 po->num = 0;
893                 spin_unlock(&po->bind_lock);
894                 dev_remove_pack(&po->prot_hook);
895                 spin_lock(&po->bind_lock);
896         }
897
898         po->num = protocol;
899         po->prot_hook.type = protocol;
900         po->prot_hook.dev = dev;
901
902         po->ifindex = dev ? dev->ifindex : 0;
903
904         if (protocol == 0)
905                 goto out_unlock;
906
907         if (dev) {
908                 if (dev->flags&IFF_UP) {
909                         dev_add_pack(&po->prot_hook);
910                         sock_hold(sk);
911                         po->running = 1;
912                 } else {
913                         sk->sk_err = ENETDOWN;
914                         if (!sock_flag(sk, SOCK_DEAD))
915                                 sk->sk_error_report(sk);
916                 }
917         } else {
918                 dev_add_pack(&po->prot_hook);
919                 sock_hold(sk);
920                 po->running = 1;
921         }
922
923 out_unlock:
924         spin_unlock(&po->bind_lock);
925         release_sock(sk);
926         return 0;
927 }
928
929 /*
930  *      Bind a packet socket to a device
931  */
932
933 #ifdef CONFIG_SOCK_PACKET
934
935 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
936 {
937         struct sock *sk=sock->sk;
938         char name[15];
939         struct net_device *dev;
940         int err = -ENODEV;
941
942         /*
943          *      Check legality
944          */
945
946         if (addr_len != sizeof(struct sockaddr))
947                 return -EINVAL;
948         strlcpy(name,uaddr->sa_data,sizeof(name));
949
950         dev = dev_get_by_name(name);
951         if (dev) {
952                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
953                 dev_put(dev);
954         }
955         return err;
956 }
957 #endif
958
959 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
960 {
961         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
962         struct sock *sk=sock->sk;
963         struct net_device *dev = NULL;
964         int err;
965
966
967         /*
968          *      Check legality
969          */
970
971         if (addr_len < sizeof(struct sockaddr_ll))
972                 return -EINVAL;
973         if (sll->sll_family != AF_PACKET)
974                 return -EINVAL;
975
976         if (sll->sll_ifindex) {
977                 err = -ENODEV;
978                 dev = dev_get_by_index(sll->sll_ifindex);
979                 if (dev == NULL)
980                         goto out;
981         }
982         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
983         if (dev)
984                 dev_put(dev);
985
986 out:
987         return err;
988 }
989
/*
 *      Protocol descriptor handed to sk_alloc() by packet_create().
 *      obj_size is sizeof(struct packet_sock), the per-socket structure
 *      that pkt_sk() gives access to.
 */
static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};
995
996 /*
 *      Create a packet socket (SOCK_DGRAM, SOCK_RAW or, if configured, SOCK_PACKET).
998  */
999
1000 static int packet_create(struct socket *sock, int protocol)
1001 {
1002         struct sock *sk;
1003         struct packet_sock *po;
1004         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1005         int err;
1006
1007         if (!capable(CAP_NET_RAW))
1008                 return -EPERM;
1009         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
1010 #ifdef CONFIG_SOCK_PACKET
1011             && sock->type != SOCK_PACKET
1012 #endif
1013             )
1014                 return -ESOCKTNOSUPPORT;
1015
1016         sock->state = SS_UNCONNECTED;
1017
1018         err = -ENOBUFS;
1019         sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
1020         if (sk == NULL)
1021                 goto out;
1022
1023         sock->ops = &packet_ops;
1024 #ifdef CONFIG_SOCK_PACKET
1025         if (sock->type == SOCK_PACKET)
1026                 sock->ops = &packet_ops_spkt;
1027 #endif
1028         sock_init_data(sock, sk);
1029
1030         po = pkt_sk(sk);
1031         sk->sk_family = PF_PACKET;
1032         po->num = proto;
1033
1034         sk->sk_destruct = packet_sock_destruct;
1035         atomic_inc(&packet_socks_nr);
1036
1037         /*
1038          *      Attach a protocol block
1039          */
1040
1041         spin_lock_init(&po->bind_lock);
1042         po->prot_hook.func = packet_rcv;
1043 #ifdef CONFIG_SOCK_PACKET
1044         if (sock->type == SOCK_PACKET)
1045                 po->prot_hook.func = packet_rcv_spkt;
1046 #endif
1047         po->prot_hook.af_packet_priv = sk;
1048
1049         if (proto) {
1050                 po->prot_hook.type = proto;
1051                 dev_add_pack(&po->prot_hook);
1052                 sock_hold(sk);
1053                 po->running = 1;
1054         }
1055
1056         write_lock_bh(&packet_sklist_lock);
1057         sk_add_node(sk, &packet_sklist);
1058         write_unlock_bh(&packet_sklist_lock);
1059         return(0);
1060 out:
1061         return err;
1062 }
1063
1064 /*
1065  *      Pull a packet from our receive queue and hand it to the user.
1066  *      If necessary we block.
1067  */
1068
1069 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1070                           struct msghdr *msg, size_t len, int flags)
1071 {
1072         struct sock *sk = sock->sk;
1073         struct sk_buff *skb;
1074         int copied, err;
1075         struct sockaddr_ll *sll;
1076
1077         err = -EINVAL;
1078         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1079                 goto out;
1080
1081 #if 0
1082         /* What error should we return now? EUNATTACH? */
1083         if (pkt_sk(sk)->ifindex < 0)
1084                 return -ENODEV;
1085 #endif
1086
1087         /*
1088          *      Call the generic datagram receiver. This handles all sorts
1089          *      of horrible races and re-entrancy so we can forget about it
1090          *      in the protocol layers.
1091          *
1092          *      Now it will return ENETDOWN, if device have just gone down,
1093          *      but then it will block.
1094          */
1095
1096         skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1097
1098         /*
1099          *      An error occurred so return it. Because skb_recv_datagram()
1100          *      handles the blocking we don't see and worry about blocking
1101          *      retries.
1102          */
1103
1104         if (skb == NULL)
1105                 goto out;
1106
1107         /*
1108          *      If the address length field is there to be filled in, we fill
1109          *      it in now.
1110          */
1111
1112         sll = &PACKET_SKB_CB(skb)->sa.ll;
1113         if (sock->type == SOCK_PACKET)
1114                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1115         else
1116                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1117
1118         /*
1119          *      You lose any data beyond the buffer you gave. If it worries a
1120          *      user program they can ask the device for its MTU anyway.
1121          */
1122
1123         copied = skb->len;
1124         if (copied > len)
1125         {
1126                 copied=len;
1127                 msg->msg_flags|=MSG_TRUNC;
1128         }
1129
1130         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1131         if (err)
1132                 goto out_free;
1133
1134         sock_recv_timestamp(msg, sk, skb);
1135
1136         if (msg->msg_name)
1137                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1138                        msg->msg_namelen);
1139
1140         if (pkt_sk(sk)->auxdata) {
1141                 struct tpacket_auxdata aux;
1142
1143                 aux.tp_status = TP_STATUS_USER;
1144                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1145                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1146                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1147                 aux.tp_snaplen = skb->len;
1148                 aux.tp_mac = 0;
1149                 aux.tp_net = skb->nh.raw - skb->data;
1150
1151                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1152         }
1153
1154         /*
1155          *      Free or return the buffer as appropriate. Again this
1156          *      hides all the races and re-entrancy issues from us.
1157          */
1158         err = (flags&MSG_TRUNC) ? skb->len : copied;
1159
1160 out_free:
1161         skb_free_datagram(sk, skb);
1162 out:
1163         return err;
1164 }
1165
1166 #ifdef CONFIG_SOCK_PACKET
1167 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1168                                int *uaddr_len, int peer)
1169 {
1170         struct net_device *dev;
1171         struct sock *sk = sock->sk;
1172
1173         if (peer)
1174                 return -EOPNOTSUPP;
1175
1176         uaddr->sa_family = AF_PACKET;
1177         dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1178         if (dev) {
1179                 strlcpy(uaddr->sa_data, dev->name, 15);
1180                 dev_put(dev);
1181         } else
1182                 memset(uaddr->sa_data, 0, 14);
1183         *uaddr_len = sizeof(*uaddr);
1184
1185         return 0;
1186 }
1187 #endif
1188
1189 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1190                           int *uaddr_len, int peer)
1191 {
1192         struct net_device *dev;
1193         struct sock *sk = sock->sk;
1194         struct packet_sock *po = pkt_sk(sk);
1195         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1196
1197         if (peer)
1198                 return -EOPNOTSUPP;
1199
1200         sll->sll_family = AF_PACKET;
1201         sll->sll_ifindex = po->ifindex;
1202         sll->sll_protocol = po->num;
1203         dev = dev_get_by_index(po->ifindex);
1204         if (dev) {
1205                 sll->sll_hatype = dev->type;
1206                 sll->sll_halen = dev->addr_len;
1207                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1208                 dev_put(dev);
1209         } else {
1210                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1211                 sll->sll_halen = 0;
1212         }
1213         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1214
1215         return 0;
1216 }
1217
1218 #ifdef CONFIG_PACKET_MULTICAST
1219 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1220 {
1221         switch (i->type) {
1222         case PACKET_MR_MULTICAST:
1223                 if (what > 0)
1224                         dev_mc_add(dev, i->addr, i->alen, 0);
1225                 else
1226                         dev_mc_delete(dev, i->addr, i->alen, 0);
1227                 break;
1228         case PACKET_MR_PROMISC:
1229                 dev_set_promiscuity(dev, what);
1230                 break;
1231         case PACKET_MR_ALLMULTI:
1232                 dev_set_allmulti(dev, what);
1233                 break;
1234         default:;
1235         }
1236 }
1237
1238 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1239 {
1240         for ( ; i; i=i->next) {
1241                 if (i->ifindex == dev->ifindex)
1242                         packet_dev_mc(dev, i, what);
1243         }
1244 }
1245
1246 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1247 {
1248         struct packet_sock *po = pkt_sk(sk);
1249         struct packet_mclist *ml, *i;
1250         struct net_device *dev;
1251         int err;
1252
1253         rtnl_lock();
1254
1255         err = -ENODEV;
1256         dev = __dev_get_by_index(mreq->mr_ifindex);
1257         if (!dev)
1258                 goto done;
1259
1260         err = -EINVAL;
1261         if (mreq->mr_alen > dev->addr_len)
1262                 goto done;
1263
1264         err = -ENOBUFS;
1265         i = kmalloc(sizeof(*i), GFP_KERNEL);
1266         if (i == NULL)
1267                 goto done;
1268
1269         err = 0;
1270         for (ml = po->mclist; ml; ml = ml->next) {
1271                 if (ml->ifindex == mreq->mr_ifindex &&
1272                     ml->type == mreq->mr_type &&
1273                     ml->alen == mreq->mr_alen &&
1274                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1275                         ml->count++;
1276                         /* Free the new element ... */
1277                         kfree(i);
1278                         goto done;
1279                 }
1280         }
1281
1282         i->type = mreq->mr_type;
1283         i->ifindex = mreq->mr_ifindex;
1284         i->alen = mreq->mr_alen;
1285         memcpy(i->addr, mreq->mr_address, i->alen);
1286         i->count = 1;
1287         i->next = po->mclist;
1288         po->mclist = i;
1289         packet_dev_mc(dev, i, +1);
1290
1291 done:
1292         rtnl_unlock();
1293         return err;
1294 }
1295
1296 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1297 {
1298         struct packet_mclist *ml, **mlp;
1299
1300         rtnl_lock();
1301
1302         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1303                 if (ml->ifindex == mreq->mr_ifindex &&
1304                     ml->type == mreq->mr_type &&
1305                     ml->alen == mreq->mr_alen &&
1306                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1307                         if (--ml->count == 0) {
1308                                 struct net_device *dev;
1309                                 *mlp = ml->next;
1310                                 dev = dev_get_by_index(ml->ifindex);
1311                                 if (dev) {
1312                                         packet_dev_mc(dev, ml, -1);
1313                                         dev_put(dev);
1314                                 }
1315                                 kfree(ml);
1316                         }
1317                         rtnl_unlock();
1318                         return 0;
1319                 }
1320         }
1321         rtnl_unlock();
1322         return -EADDRNOTAVAIL;
1323 }
1324
1325 static void packet_flush_mclist(struct sock *sk)
1326 {
1327         struct packet_sock *po = pkt_sk(sk);
1328         struct packet_mclist *ml;
1329
1330         if (!po->mclist)
1331                 return;
1332
1333         rtnl_lock();
1334         while ((ml = po->mclist) != NULL) {
1335                 struct net_device *dev;
1336
1337                 po->mclist = ml->next;
1338                 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1339                         packet_dev_mc(dev, ml, -1);
1340                         dev_put(dev);
1341                 }
1342                 kfree(ml);
1343         }
1344         rtnl_unlock();
1345 }
1346 #endif
1347
1348 static int
1349 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1350 {
1351         struct sock *sk = sock->sk;
1352         struct packet_sock *po = pkt_sk(sk);
1353         int ret;
1354
1355         if (level != SOL_PACKET)
1356                 return -ENOPROTOOPT;
1357
1358         switch(optname) {
1359 #ifdef CONFIG_PACKET_MULTICAST
1360         case PACKET_ADD_MEMBERSHIP:
1361         case PACKET_DROP_MEMBERSHIP:
1362         {
1363                 struct packet_mreq_max mreq;
1364                 int len = optlen;
1365                 memset(&mreq, 0, sizeof(mreq));
1366                 if (len < sizeof(struct packet_mreq))
1367                         return -EINVAL;
1368                 if (len > sizeof(mreq))
1369                         len = sizeof(mreq);
1370                 if (copy_from_user(&mreq,optval,len))
1371                         return -EFAULT;
1372                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1373                         return -EINVAL;
1374                 if (optname == PACKET_ADD_MEMBERSHIP)
1375                         ret = packet_mc_add(sk, &mreq);
1376                 else
1377                         ret = packet_mc_drop(sk, &mreq);
1378                 return ret;
1379         }
1380 #endif
1381 #ifdef CONFIG_PACKET_MMAP
1382         case PACKET_RX_RING:
1383         {
1384                 struct tpacket_req req;
1385
1386                 if (optlen<sizeof(req))
1387                         return -EINVAL;
1388                 if (copy_from_user(&req,optval,sizeof(req)))
1389                         return -EFAULT;
1390                 return packet_set_ring(sk, &req, 0);
1391         }
1392         case PACKET_COPY_THRESH:
1393         {
1394                 int val;
1395
1396                 if (optlen!=sizeof(val))
1397                         return -EINVAL;
1398                 if (copy_from_user(&val,optval,sizeof(val)))
1399                         return -EFAULT;
1400
1401                 pkt_sk(sk)->copy_thresh = val;
1402                 return 0;
1403         }
1404 #endif
1405         case PACKET_AUXDATA:
1406         {
1407                 int val;
1408
1409                 if (optlen < sizeof(val))
1410                         return -EINVAL;
1411                 if (copy_from_user(&val, optval, sizeof(val)))
1412                         return -EFAULT;
1413
1414                 po->auxdata = !!val;
1415                 return 0;
1416         }
1417         default:
1418                 return -ENOPROTOOPT;
1419         }
1420 }
1421
1422 static int packet_getsockopt(struct socket *sock, int level, int optname,
1423                              char __user *optval, int __user *optlen)
1424 {
1425         int len;
1426         int val;
1427         struct sock *sk = sock->sk;
1428         struct packet_sock *po = pkt_sk(sk);
1429         void *data;
1430         struct tpacket_stats st;
1431
1432         if (level != SOL_PACKET)
1433                 return -ENOPROTOOPT;
1434
1435         if (get_user(len, optlen))
1436                 return -EFAULT;
1437
1438         if (len < 0)
1439                 return -EINVAL;
1440
1441         switch(optname) {
1442         case PACKET_STATISTICS:
1443                 if (len > sizeof(struct tpacket_stats))
1444                         len = sizeof(struct tpacket_stats);
1445                 spin_lock_bh(&sk->sk_receive_queue.lock);
1446                 st = po->stats;
1447                 memset(&po->stats, 0, sizeof(st));
1448                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1449                 st.tp_packets += st.tp_drops;
1450
1451                 data = &st;
1452                 break;
1453         case PACKET_AUXDATA:
1454                 if (len > sizeof(int))
1455                         len = sizeof(int);
1456                 val = po->auxdata;
1457
1458                 data = &val;
1459                 break;
1460         default:
1461                 return -ENOPROTOOPT;
1462         }
1463
1464         if (put_user(len, optlen))
1465                 return -EFAULT;
1466         if (copy_to_user(optval, data, len))
1467                 return -EFAULT;
1468         return 0;
1469 }
1470
1471
1472 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1473 {
1474         struct sock *sk;
1475         struct hlist_node *node;
1476         struct net_device *dev = (struct net_device*)data;
1477
1478         read_lock(&packet_sklist_lock);
1479         sk_for_each(sk, node, &packet_sklist) {
1480                 struct packet_sock *po = pkt_sk(sk);
1481
1482                 switch (msg) {
1483                 case NETDEV_UNREGISTER:
1484 #ifdef CONFIG_PACKET_MULTICAST
1485                         if (po->mclist)
1486                                 packet_dev_mclist(dev, po->mclist, -1);
1487                         // fallthrough
1488 #endif
1489                 case NETDEV_DOWN:
1490                         if (dev->ifindex == po->ifindex) {
1491                                 spin_lock(&po->bind_lock);
1492                                 if (po->running) {
1493                                         __dev_remove_pack(&po->prot_hook);
1494                                         __sock_put(sk);
1495                                         po->running = 0;
1496                                         sk->sk_err = ENETDOWN;
1497                                         if (!sock_flag(sk, SOCK_DEAD))
1498                                                 sk->sk_error_report(sk);
1499                                 }
1500                                 if (msg == NETDEV_UNREGISTER) {
1501                                         po->ifindex = -1;
1502                                         po->prot_hook.dev = NULL;
1503                                 }
1504                                 spin_unlock(&po->bind_lock);
1505                         }
1506                         break;
1507                 case NETDEV_UP:
1508                         spin_lock(&po->bind_lock);
1509                         if (dev->ifindex == po->ifindex && po->num &&
1510                             !po->running) {
1511                                 dev_add_pack(&po->prot_hook);
1512                                 sock_hold(sk);
1513                                 po->running = 1;
1514                         }
1515                         spin_unlock(&po->bind_lock);
1516                         break;
1517                 }
1518         }
1519         read_unlock(&packet_sklist_lock);
1520         return NOTIFY_DONE;
1521 }
1522
1523
1524 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1525                         unsigned long arg)
1526 {
1527         struct sock *sk = sock->sk;
1528
1529         switch(cmd) {
1530                 case SIOCOUTQ:
1531                 {
1532                         int amount = atomic_read(&sk->sk_wmem_alloc);
1533                         return put_user(amount, (int __user *)arg);
1534                 }
1535                 case SIOCINQ:
1536                 {
1537                         struct sk_buff *skb;
1538                         int amount = 0;
1539
1540                         spin_lock_bh(&sk->sk_receive_queue.lock);
1541                         skb = skb_peek(&sk->sk_receive_queue);
1542                         if (skb)
1543                                 amount = skb->len;
1544                         spin_unlock_bh(&sk->sk_receive_queue.lock);
1545                         return put_user(amount, (int __user *)arg);
1546                 }
1547                 case SIOCGSTAMP:
1548                         return sock_get_timestamp(sk, (struct timeval __user *)arg);
1549
1550 #ifdef CONFIG_INET
1551                 case SIOCADDRT:
1552                 case SIOCDELRT:
1553                 case SIOCDARP:
1554                 case SIOCGARP:
1555                 case SIOCSARP:
1556                 case SIOCGIFADDR:
1557                 case SIOCSIFADDR:
1558                 case SIOCGIFBRDADDR:
1559                 case SIOCSIFBRDADDR:
1560                 case SIOCGIFNETMASK:
1561                 case SIOCSIFNETMASK:
1562                 case SIOCGIFDSTADDR:
1563                 case SIOCSIFDSTADDR:
1564                 case SIOCSIFFLAGS:
1565                         return inet_dgram_ops.ioctl(sock, cmd, arg);
1566 #endif
1567
1568                 default:
1569                         return -ENOIOCTLCMD;
1570         }
1571         return 0;
1572 }
1573
1574 #ifndef CONFIG_PACKET_MMAP
1575 #define packet_mmap sock_no_mmap
1576 #define packet_poll datagram_poll
1577 #else
1578
1579 static unsigned int packet_poll(struct file * file, struct socket *sock,
1580                                 poll_table *wait)
1581 {
1582         struct sock *sk = sock->sk;
1583         struct packet_sock *po = pkt_sk(sk);
1584         unsigned int mask = datagram_poll(file, sock, wait);
1585
1586         spin_lock_bh(&sk->sk_receive_queue.lock);
1587         if (po->pg_vec) {
1588                 unsigned last = po->head ? po->head-1 : po->frame_max;
1589                 struct tpacket_hdr *h;
1590
1591                 h = (struct tpacket_hdr *)packet_lookup_frame(po, last);
1592
1593                 if (h->tp_status)
1594                         mask |= POLLIN | POLLRDNORM;
1595         }
1596         spin_unlock_bh(&sk->sk_receive_queue.lock);
1597         return mask;
1598 }
1599
1600
/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */
1604
1605 static void packet_mm_open(struct vm_area_struct *vma)
1606 {
1607         struct file *file = vma->vm_file;
1608         struct socket * sock = file->private_data;
1609         struct sock *sk = sock->sk;
1610
1611         if (sk)
1612                 atomic_inc(&pkt_sk(sk)->mapped);
1613 }
1614
1615 static void packet_mm_close(struct vm_area_struct *vma)
1616 {
1617         struct file *file = vma->vm_file;
1618         struct socket * sock = file->private_data;
1619         struct sock *sk = sock->sk;
1620
1621         if (sk)
1622                 atomic_dec(&pkt_sk(sk)->mapped);
1623 }
1624
/*
 *      VMA operations that packet_mmap() installs on a ring mapping;
 *      the open/close callbacks adjust the socket's "mapped" counter.
 */
static struct vm_operations_struct packet_mmap_ops = {
        .open = packet_mm_open,
        .close =packet_mm_close,
};
1629
1630 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1631 {
1632         return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1633 }
1634
/*
 *      Free a block vector: every non-NULL block of the given order,
 *      then the vector itself. NULL slots are skipped, so a partially
 *      filled vector is fine.
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
        int i = 0;

        while (i < len) {
                char *block = pg_vec[i];

                if (likely(block))
                        free_pages((unsigned long) block, order);
                i++;
        }
        kfree(pg_vec);
}
1645
1646 static inline char *alloc_one_pg_vec_page(unsigned long order)
1647 {
1648         return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1649                                          order);
1650 }
1651
1652 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1653 {
1654         unsigned int block_nr = req->tp_block_nr;
1655         char **pg_vec;
1656         int i;
1657
1658         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1659         if (unlikely(!pg_vec))
1660                 goto out;
1661
1662         for (i = 0; i < block_nr; i++) {
1663                 pg_vec[i] = alloc_one_pg_vec_page(order);
1664                 if (unlikely(!pg_vec[i]))
1665                         goto out_free_pgvec;
1666         }
1667
1668 out:
1669         return pg_vec;
1670
1671 out_free_pgvec:
1672         free_pg_vec(pg_vec, order, block_nr);
1673         pg_vec = NULL;
1674         goto out;
1675 }
1676
1677 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1678 {
1679         char **pg_vec = NULL;
1680         struct packet_sock *po = pkt_sk(sk);
1681         int was_running, order = 0;
1682         __be16 num;
1683         int err = 0;
1684
1685         if (req->tp_block_nr) {
1686                 int i, l;
1687
1688                 /* Sanity tests and some calculations */
1689
1690                 if (unlikely(po->pg_vec))
1691                         return -EBUSY;
1692
1693                 if (unlikely((int)req->tp_block_size <= 0))
1694                         return -EINVAL;
1695                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1696                         return -EINVAL;
1697                 if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1698                         return -EINVAL;
1699                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1700                         return -EINVAL;
1701
1702                 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1703                 if (unlikely(po->frames_per_block <= 0))
1704                         return -EINVAL;
1705                 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1706                              req->tp_frame_nr))
1707                         return -EINVAL;
1708
1709                 err = -ENOMEM;
1710                 order = get_order(req->tp_block_size);
1711                 pg_vec = alloc_pg_vec(req, order);
1712                 if (unlikely(!pg_vec))
1713                         goto out;
1714
1715                 l = 0;
1716                 for (i = 0; i < req->tp_block_nr; i++) {
1717                         char *ptr = pg_vec[i];
1718                         struct tpacket_hdr *header;
1719                         int k;
1720
1721                         for (k = 0; k < po->frames_per_block; k++) {
1722                                 header = (struct tpacket_hdr *) ptr;
1723                                 header->tp_status = TP_STATUS_KERNEL;
1724                                 ptr += req->tp_frame_size;
1725                         }
1726                 }
1727                 /* Done */
1728         } else {
1729                 if (unlikely(req->tp_frame_nr))
1730                         return -EINVAL;
1731         }
1732
1733         lock_sock(sk);
1734
1735         /* Detach socket from network */
1736         spin_lock(&po->bind_lock);
1737         was_running = po->running;
1738         num = po->num;
1739         if (was_running) {
1740                 __dev_remove_pack(&po->prot_hook);
1741                 po->num = 0;
1742                 po->running = 0;
1743                 __sock_put(sk);
1744         }
1745         spin_unlock(&po->bind_lock);
1746
1747         synchronize_net();
1748
1749         err = -EBUSY;
1750         if (closing || atomic_read(&po->mapped) == 0) {
1751                 err = 0;
1752 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1753
1754                 spin_lock_bh(&sk->sk_receive_queue.lock);
1755                 pg_vec = XC(po->pg_vec, pg_vec);
1756                 po->frame_max = (req->tp_frame_nr - 1);
1757                 po->head = 0;
1758                 po->frame_size = req->tp_frame_size;
1759                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1760
1761                 order = XC(po->pg_vec_order, order);
1762                 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1763
1764                 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1765                 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1766                 skb_queue_purge(&sk->sk_receive_queue);
1767 #undef XC
1768                 if (atomic_read(&po->mapped))
1769                         printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1770         }
1771
1772         spin_lock(&po->bind_lock);
1773         if (was_running && !po->running) {
1774                 sock_hold(sk);
1775                 po->running = 1;
1776                 po->num = num;
1777                 dev_add_pack(&po->prot_hook);
1778         }
1779         spin_unlock(&po->bind_lock);
1780
1781         release_sock(sk);
1782
1783         if (pg_vec)
1784                 free_pg_vec(pg_vec, order, req->tp_block_nr);
1785 out:
1786         return err;
1787 }
1788
1789 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1790 {
1791         struct sock *sk = sock->sk;
1792         struct packet_sock *po = pkt_sk(sk);
1793         unsigned long size;
1794         unsigned long start;
1795         int err = -EINVAL;
1796         int i;
1797
1798         if (vma->vm_pgoff)
1799                 return -EINVAL;
1800
1801         size = vma->vm_end - vma->vm_start;
1802
1803         lock_sock(sk);
1804         if (po->pg_vec == NULL)
1805                 goto out;
1806         if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1807                 goto out;
1808
1809         start = vma->vm_start;
1810         for (i = 0; i < po->pg_vec_len; i++) {
1811                 struct page *page = virt_to_page(po->pg_vec[i]);
1812                 int pg_num;
1813
1814                 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1815                         err = vm_insert_page(vma, start, page);
1816                         if (unlikely(err))
1817                                 goto out;
1818                         start += PAGE_SIZE;
1819                 }
1820         }
1821         atomic_inc(&po->mapped);
1822         vma->vm_ops = &packet_mmap_ops;
1823         err = 0;
1824
1825 out:
1826         release_sock(sk);
1827         return err;
1828 }
1829 #endif
1830
1831
1832 #ifdef CONFIG_SOCK_PACKET
/*
 *      Socket operations for SOCK_PACKET sockets (selected by
 *      packet_create() when the socket type is SOCK_PACKET).
 *      Socket options and mmap are not offered: those slots point at
 *      the generic sock_no_* stubs, and poll is plain datagram_poll.
 */
static const struct proto_ops packet_ops_spkt = {
        .family =       PF_PACKET,
        .owner =        THIS_MODULE,
        .release =      packet_release,
        .bind =         packet_bind_spkt,
        .connect =      sock_no_connect,
        .socketpair =   sock_no_socketpair,
        .accept =       sock_no_accept,
        .getname =      packet_getname_spkt,
        .poll =         datagram_poll,
        .ioctl =        packet_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      packet_sendmsg_spkt,
        .recvmsg =      packet_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};
1853 #endif
1854
/*
 *      Default socket operations installed by packet_create().
 *      Without CONFIG_PACKET_MMAP, packet_poll and packet_mmap are
 *      macros for datagram_poll and sock_no_mmap.
 */
static const struct proto_ops packet_ops = {
        .family =       PF_PACKET,
        .owner =        THIS_MODULE,
        .release =      packet_release,
        .bind =         packet_bind,
        .connect =      sock_no_connect,
        .socketpair =   sock_no_socketpair,
        .accept =       sock_no_accept,
        .getname =      packet_getname,
        .poll =         packet_poll,
        .ioctl =        packet_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   packet_setsockopt,
        .getsockopt =   packet_getsockopt,
        .sendmsg =      packet_sendmsg,
        .recvmsg =      packet_recvmsg,
        .mmap =         packet_mmap,
        .sendpage =     sock_no_sendpage,
};
1875
1876 static struct net_proto_family packet_family_ops = {
1877         .family =       PF_PACKET,
1878         .create =       packet_create,
1879         .owner  =       THIS_MODULE,
1880 };
1881
1882 static struct notifier_block packet_netdev_notifier = {
1883         .notifier_call =packet_notifier,
1884 };
1885
1886 #ifdef CONFIG_PROC_FS
1887 static inline struct sock *packet_seq_idx(loff_t off)
1888 {
1889         struct sock *s;
1890         struct hlist_node *node;
1891
1892         sk_for_each(s, node, &packet_sklist) {
1893                 if (!off--)
1894                         return s;
1895         }
1896         return NULL;
1897 }
1898
1899 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1900 {
1901         read_lock(&packet_sklist_lock);
1902         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1903 }
1904
1905 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1906 {
1907         ++*pos;
1908         return  (v == SEQ_START_TOKEN)
1909                 ? sk_head(&packet_sklist)
1910                 : sk_next((struct sock*)v) ;
1911 }
1912
1913 static void packet_seq_stop(struct seq_file *seq, void *v)
1914 {
1915         read_unlock(&packet_sklist_lock);
1916 }
1917
1918 static int packet_seq_show(struct seq_file *seq, void *v)
1919 {
1920         if (v == SEQ_START_TOKEN)
1921                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1922         else {
1923                 struct sock *s = v;
1924                 const struct packet_sock *po = pkt_sk(s);
1925
1926                 seq_printf(seq,
1927                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1928                            s,
1929                            atomic_read(&s->sk_refcnt),
1930                            s->sk_type,
1931                            ntohs(po->num),
1932                            po->ifindex,
1933                            po->running,
1934                            atomic_read(&s->sk_rmem_alloc),
1935                            sock_i_uid(s),
1936                            sock_i_ino(s) );
1937         }
1938
1939         return 0;
1940 }
1941
1942 static struct seq_operations packet_seq_ops = {
1943         .start  = packet_seq_start,
1944         .next   = packet_seq_next,
1945         .stop   = packet_seq_stop,
1946         .show   = packet_seq_show,
1947 };
1948
1949 static int packet_seq_open(struct inode *inode, struct file *file)
1950 {
1951         return seq_open(file, &packet_seq_ops);
1952 }
1953
1954 static const struct file_operations packet_seq_fops = {
1955         .owner          = THIS_MODULE,
1956         .open           = packet_seq_open,
1957         .read           = seq_read,
1958         .llseek         = seq_lseek,
1959         .release        = seq_release,
1960 };
1961
1962 #endif
1963
1964 static void __exit packet_exit(void)
1965 {
1966         proc_net_remove("packet");
1967         unregister_netdevice_notifier(&packet_netdev_notifier);
1968         sock_unregister(PF_PACKET);
1969         proto_unregister(&packet_proto);
1970 }
1971
1972 static int __init packet_init(void)
1973 {
1974         int rc = proto_register(&packet_proto, 0);
1975
1976         if (rc != 0)
1977                 goto out;
1978
1979         sock_register(&packet_family_ops);
1980         register_netdevice_notifier(&packet_netdev_notifier);
1981         proc_net_fops_create("packet", 0, &packet_seq_fops);
1982 out:
1983         return rc;
1984 }
1985
1986 module_init(packet_init);
1987 module_exit(packet_exit);
1988 MODULE_LICENSE("GPL");
1989 MODULE_ALIAS_NETPROTO(PF_PACKET);