/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Generic socket support routines. Memory allocators, socket lock/release
 *              handler for protocols to use and generic option handler.
 *
 *
 * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() problems
 *              Alan Cox        :       Connecting on a connecting socket
 *                                      now returns an error for tcp.
 *              Alan Cox        :       sock->protocol is set correctly,
 *                                      and is not sometimes left as 0.
 *              Alan Cox        :       connect handles icmp errors on a
 *                                      connect properly. Unfortunately there
 *                                      is a restart syscall nasty there. I
 *                                      can't match BSD without hacking the C
 *                                      library. Ideas urgently sought!
 *              Alan Cox        :       Disallow bind() to addresses that are
 *                                      not ours - especially broadcast ones!!
 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 *                                      instead they leave that for the DESTROY timer.
 *              Alan Cox        :       Clean up error flag in accept
 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 *                                      was buggy. Put a remove_sock() in the handler
 *                                      for memory when we hit 0. Also altered the timer
 *                                      code. The ACK stuff can wait and needs major
 *                                      TCP layer surgery.
 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 *                                      and fixed timer/inet_bh race.
 *              Alan Cox        :       Added zapped flag for TCP
 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 *      Pauline Middelink       :       identd support
 *              Alan Cox        :       Fixed connect() taking signals I think.
 *              Alan Cox        :       SO_LINGER supported
 *              Alan Cox        :       Error reporting fixes
 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 *              Alan Cox        :       inet sockets don't set sk->type!
 *              Alan Cox        :       Split socket option code
 *              Alan Cox        :       Callbacks
 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 *              Alex            :       Removed restriction on inet fioctl
 *              Alan Cox        :       Splitting INET from NET core
 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 *              Alan Cox        :       Split IP from generic code
 *              Alan Cox        :       New kfree_skbmem()
 *              Alan Cox        :       Make SO_DEBUG superuser only.
 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 *                                      (compatibility fix)
 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 *              Alan Cox        :       Allocator for a socket is settable.
 *              Alan Cox        :       SO_ERROR includes soft errors.
 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 *              Alan Cox        :       Generic socket allocation to make hooks
 *                                      easier (suggested by Craig Metz).
 *              Michael Pall    :       SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 *              Andi Kleen      :       Fix write_space callback
 *              Chris Evans     :       Security fixes - signedness again
 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
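
/*
 * Worked example (illustrative only; the exact figure is platform
 * dependent): if sizeof(struct sk_buff) were 256 bytes, the overhead
 * term would be 256 + 256 = 512 bytes per packet, giving
 *
 *      SK_WMEM_MAX = SK_RMEM_MAX = 512 * 256 = 131072 bytes (128 KiB)
 *
 * as the compile-time default before sk_init() rescales the sysctls
 * below for very small or very large machines.
 */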

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
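
/*
 * Illustrative arithmetic, assuming UIO_MAXIOV == 1024: on a 64-bit
 * machine this evaluates to 8 * (2 * 1024 + 512) = 20480 bytes, and on
 * a 32-bit machine to 4 * 2560 = 10240 bytes of per-socket option
 * memory.
 */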

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
        struct timeval tv;

        if (optlen < sizeof(tv))
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;

        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
        return 0;
}
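
/*
 * For illustration only (userspace, not part of this file): the
 * function above is reached via setsockopt() with SO_RCVTIMEO or
 * SO_SNDTIMEO, e.g.
 *
 *      struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *      setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * A zero timeval selects MAX_SCHEDULE_TIMEOUT ("wait forever"), and
 * sub-jiffy values are rounded up to one jiffy.
 */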

static void sock_warn_obsolete_bsdism(const char *name)
{
        static int warned;
        static char warncomm[TASK_COMM_LEN];
        if (strcmp(warncomm, current->comm) && warned < 5) {
                strcpy(warncomm, current->comm);
                printk(KERN_WARNING "process `%s' is using obsolete "
                       "%s SO_BSDCOMPAT\n", warncomm, name);
                warned++;
        }
}

static void sock_disable_timestamp(struct sock *sk)
{
        if (sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_reset_flag(sk, SOCK_TIMESTAMP);
                net_disable_timestamp();
        }
}


/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;
        struct sk_filter *filter;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

#ifdef SO_DONTLINGER            /* Compatibility item... */
        if (optname == SO_DONTLINGER) {
                lock_sock(sk);
                sock_reset_flag(sk, SOCK_LINGER);
                release_sock(sk);
                return 0;
        }
#endif

        if (optlen < sizeof(int))
                return -EINVAL;

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        valbool = val ? 1 : 0;

        lock_sock(sk);

        switch(optname)
        {
                case SO_DEBUG:
                        if (val && !capable(CAP_NET_ADMIN))
                                ret = -EACCES;
                        else if (valbool)
                                sock_set_flag(sk, SOCK_DBG);
                        else
                                sock_reset_flag(sk, SOCK_DBG);
                        break;
                case SO_REUSEADDR:
                        sk->sk_reuse = valbool;
                        break;
                case SO_TYPE:
                case SO_ERROR:
                        ret = -ENOPROTOOPT;
                        break;
                case SO_DONTROUTE:
                        if (valbool)
                                sock_set_flag(sk, SOCK_LOCALROUTE);
                        else
                                sock_reset_flag(sk, SOCK_LOCALROUTE);
                        break;
                case SO_BROADCAST:
                        sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                        break;
                case SO_SNDBUF:
                        /* Don't error on this; BSD doesn't, and if you
                           think about it this is right. Otherwise apps
                           have to play 'guess the biggest size' games.
                           RCVBUF/SNDBUF are treated in BSD as hints. */

                        if (val > sysctl_wmem_max)
                                val = sysctl_wmem_max;
set_sndbuf:
                        sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                        if ((val * 2) < SOCK_MIN_SNDBUF)
                                sk->sk_sndbuf = SOCK_MIN_SNDBUF;
                        else
                                sk->sk_sndbuf = val * 2;

                        /*
                         *      Wake up sending tasks if we
                         *      upped the value.
                         */
                        sk->sk_write_space(sk);
                        break;

                case SO_SNDBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_sndbuf;

                case SO_RCVBUF:
                        /* Don't error on this; BSD doesn't, and if you
                           think about it this is right. Otherwise apps
                           have to play 'guess the biggest size' games.
                           RCVBUF/SNDBUF are treated in BSD as hints. */

                        if (val > sysctl_rmem_max)
                                val = sysctl_rmem_max;
set_rcvbuf:
                        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                        /* FIXME: is this lower bound the right one? */
                        if ((val * 2) < SOCK_MIN_RCVBUF)
                                sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
                        else
                                sk->sk_rcvbuf = val * 2;
                        break;

                case SO_RCVBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_rcvbuf;

                case SO_KEEPALIVE:
#ifdef CONFIG_INET
                        if (sk->sk_protocol == IPPROTO_TCP)
                                tcp_set_keepalive(sk, valbool);
#endif
                        sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                        break;

                case SO_OOBINLINE:
                        sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                        break;

                case SO_NO_CHECK:
                        sk->sk_no_check = valbool;
                        break;

                case SO_PRIORITY:
                        if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
                                sk->sk_priority = val;
                        else
                                ret = -EPERM;
                        break;

                case SO_LINGER:
                        if (optlen < sizeof(ling)) {
                                ret = -EINVAL;  /* 1003.1g */
                                break;
                        }
                        if (copy_from_user(&ling, optval, sizeof(ling))) {
                                ret = -EFAULT;
                                break;
                        }
                        if (!ling.l_onoff)
                                sock_reset_flag(sk, SOCK_LINGER);
                        else {
#if (BITS_PER_LONG == 32)
                                if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                        sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                                else
#endif
                                        sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                                sock_set_flag(sk, SOCK_LINGER);
                        }
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("setsockopt");
                        break;

                case SO_PASSCRED:
                        if (valbool)
                                set_bit(SOCK_PASSCRED, &sock->flags);
                        else
                                clear_bit(SOCK_PASSCRED, &sock->flags);
                        break;

                case SO_TIMESTAMP:
                        if (valbool) {
                                sock_set_flag(sk, SOCK_RCVTSTAMP);
                                sock_enable_timestamp(sk);
                        } else
                                sock_reset_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVLOWAT:
                        if (val < 0)
                                val = INT_MAX;
                        sk->sk_rcvlowat = val ? : 1;
                        break;

                case SO_RCVTIMEO:
                        ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
                        break;

                case SO_SNDTIMEO:
                        ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
                        break;

#ifdef CONFIG_NETDEVICES
                case SO_BINDTODEVICE:
                {
                        char devname[IFNAMSIZ];

                        /* Sorry... */
                        if (!capable(CAP_NET_RAW)) {
                                ret = -EPERM;
                                break;
                        }

                        /* Bind this socket to a particular device like "eth0",
                         * as specified in the passed interface name. If the
                         * name is "" or the option length is zero the socket
                         * is not bound.
                         */

                        if (!valbool) {
                                sk->sk_bound_dev_if = 0;
                        } else {
                                if (optlen > IFNAMSIZ)
                                        optlen = IFNAMSIZ;
                                if (copy_from_user(devname, optval, optlen)) {
                                        ret = -EFAULT;
                                        break;
                                }

                                /* Remove any cached route for this socket. */
                                sk_dst_reset(sk);

                                if (devname[0] == '\0') {
                                        sk->sk_bound_dev_if = 0;
                                } else {
                                        struct net_device *dev = dev_get_by_name(devname);
                                        if (!dev) {
                                                ret = -ENODEV;
                                                break;
                                        }
                                        sk->sk_bound_dev_if = dev->ifindex;
                                        dev_put(dev);
                                }
                        }
                        break;
                }
#endif


                case SO_ATTACH_FILTER:
                        ret = -EINVAL;
                        if (optlen == sizeof(struct sock_fprog)) {
                                struct sock_fprog fprog;

                                ret = -EFAULT;
                                if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                        break;

                                ret = sk_attach_filter(&fprog, sk);
                        }
                        break;

                case SO_DETACH_FILTER:
                        spin_lock_bh(&sk->sk_lock.slock);
                        filter = sk->sk_filter;
                        if (filter) {
                                sk->sk_filter = NULL;
                                spin_unlock_bh(&sk->sk_lock.slock);
                                sk_filter_release(sk, filter);
                                break;
                        }
                        spin_unlock_bh(&sk->sk_lock.slock);
                        ret = -ENONET;
                        break;

                /* We implement the SO_SNDLOWAT etc to
                   not be settable (1003.1g 5.3) */
                default:
                        ret = -ENOPROTOOPT;
                        break;
        }
        release_sock(sk);
        return ret;
}
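
/*
 * Illustrative userspace sketch (not part of this file) of the
 * SO_ATTACH_FILTER case above: a classic BPF program is handed in as
 * a struct sock_fprog. The single-instruction program below accepts
 * every packet:
 *
 *      struct sock_filter code[] = {
 *              { BPF_RET | BPF_K, 0, 0, 0xffffffff },
 *      };
 *      struct sock_fprog prog = { .len = 1, .filter = code };
 *      setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */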


int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        union
        {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        unsigned int lv = sizeof(int);
        int len;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        switch(optname)
        {
                case SO_DEBUG:
                        v.val = sock_flag(sk, SOCK_DBG);
                        break;

                case SO_DONTROUTE:
                        v.val = sock_flag(sk, SOCK_LOCALROUTE);
                        break;

                case SO_BROADCAST:
                        v.val = !!sock_flag(sk, SOCK_BROADCAST);
                        break;

                case SO_SNDBUF:
                        v.val = sk->sk_sndbuf;
                        break;

                case SO_RCVBUF:
                        v.val = sk->sk_rcvbuf;
                        break;

                case SO_REUSEADDR:
                        v.val = sk->sk_reuse;
                        break;

                case SO_KEEPALIVE:
                        v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
                        break;

                case SO_TYPE:
                        v.val = sk->sk_type;
                        break;

                case SO_ERROR:
                        v.val = -sock_error(sk);
                        if (v.val == 0)
                                v.val = xchg(&sk->sk_err_soft, 0);
                        break;

                case SO_OOBINLINE:
                        v.val = !!sock_flag(sk, SOCK_URGINLINE);
                        break;

                case SO_NO_CHECK:
                        v.val = sk->sk_no_check;
                        break;

                case SO_PRIORITY:
                        v.val = sk->sk_priority;
                        break;

                case SO_LINGER:
                        lv              = sizeof(v.ling);
                        v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
                        v.ling.l_linger = sk->sk_lingertime / HZ;
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("getsockopt");
                        break;

                case SO_TIMESTAMP:
                        v.val = sock_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVTIMEO:
                        lv = sizeof(struct timeval);
                        if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_SNDTIMEO:
                        lv = sizeof(struct timeval);
                        if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_RCVLOWAT:
                        v.val = sk->sk_rcvlowat;
                        break;

                case SO_SNDLOWAT:
                        v.val = 1;
                        break;

                case SO_PASSCRED:
                        v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
                        break;

                case SO_PEERCRED:
                        if (len > sizeof(sk->sk_peercred))
                                len = sizeof(sk->sk_peercred);
                        if (copy_to_user(optval, &sk->sk_peercred, len))
                                return -EFAULT;
                        goto lenout;

                case SO_PEERNAME:
                {
                        char address[128];

                        if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                                return -ENOTCONN;
                        if (lv < len)
                                return -EINVAL;
                        if (copy_to_user(optval, address, len))
                                return -EFAULT;
                        goto lenout;
                }

                /* Dubious BSD thing... Probably nobody even uses it, but
                 * the UNIX standard wants it for whatever reason... -DaveM
                 */
                case SO_ACCEPTCONN:
                        v.val = sk->sk_state == TCP_LISTEN;
                        break;

                case SO_PEERSEC:
                        return security_socket_getpeersec_stream(sock, optval, optlen, len);

                default:
                        return -ENOPROTOOPT;
        }
        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
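
/*
 * Illustrative userspace counterpart (not part of this file) for the
 * SO_ERROR case above: fetching, and thereby clearing, a pending
 * asynchronous error, e.g. after a non-blocking connect() completes:
 *
 *      int err = 0;
 *      socklen_t len = sizeof(err);
 *      getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *
 * A zero err means success; otherwise it holds the positive errno.
 */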

/**
 *      sk_alloc - All socket objects are allocated here
 *      @family: protocol family
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *      @prot: struct proto associated with this new sock instance
 *      @zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, gfp_t priority,
                      struct proto *prot, int zero_it)
{
        struct sock *sk = NULL;
        kmem_cache_t *slab = prot->slab;

        if (slab != NULL)
                sk = kmem_cache_alloc(slab, priority);
        else
                sk = kmalloc(prot->obj_size, priority);

        if (sk) {
                if (zero_it) {
                        memset(sk, 0, prot->obj_size);
                        sk->sk_family = family;
                        /*
                         * See comment in struct sock definition to understand
                         * why we need sk_prot_creator -acme
                         */
                        sk->sk_prot = sk->sk_prot_creator = prot;
                        sock_lock_init(sk);
                }

                if (security_sk_alloc(sk, family, priority))
                        goto out_free;

                if (!try_module_get(prot->owner))
                        goto out_free;
        }
        return sk;

out_free:
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}
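
/*
 * Minimal in-kernel usage sketch ("myproto" is hypothetical, for
 * illustration only): a protocol family's create routine typically
 * does something like
 *
 *      struct sock *sk = sk_alloc(PF_INET, GFP_KERNEL, &myproto_prot, 1);
 *      if (!sk)
 *              return -ENOBUFS;
 *      sock_init_data(sock, sk);
 *
 * with zero_it set so the new sock starts out fully cleared.
 */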

void sk_free(struct sock *sk)
{
        struct sk_filter *filter;
        struct module *owner = sk->sk_prot_creator->owner;

        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        filter = sk->sk_filter;
        if (filter) {
                sk_filter_release(sk, filter);
                sk->sk_filter = NULL;
        }

        sock_disable_timestamp(sk);

        if (atomic_read(&sk->sk_omem_alloc))
                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
                       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

        security_sk_free(sk);
        if (sk->sk_prot_creator->slab != NULL)
                kmem_cache_free(sk->sk_prot_creator->slab, sk);
        else
                kfree(sk);
        module_put(owner);
}

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

        if (newsk != NULL) {
                struct sk_filter *filter;

                memcpy(newsk, sk, sk->sk_prot->obj_size);

                /* SANITY */
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);

                atomic_set(&newsk->sk_rmem_alloc, 0);
                atomic_set(&newsk->sk_wmem_alloc, 0);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);

                rwlock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                filter = newsk->sk_filter;
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still a raw copy of the parent, so invalidate
                         * the destructor and do a plain sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                newsk->sk_socket = NULL;
                newsk->sk_sleep  = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        atomic_inc(newsk->sk_prot->sockets_allocated);
        }
out:
        return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

void __init sk_init(void)
{
        if (num_physpages <= 4096) {
                sysctl_wmem_max = 32767;
                sysctl_rmem_max = 32767;
                sysctl_wmem_default = 32767;
                sysctl_rmem_default = 32767;
        } else if (num_physpages >= 131072) {
                sysctl_wmem_max = 131071;
                sysctl_rmem_max = 131071;
        }
}

/*
 *      Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        /* In case it might be waiting for more memory. */
        atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
                sk->sk_write_space(sk);
        sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}


int sock_i_uid(struct sock *sk)
{
        int uid;

        read_lock(&sk->sk_callback_lock);
        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
        read_unlock(&sk->sk_callback_lock);
        return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        read_lock(&sk->sk_callback_lock);
        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
        read_unlock(&sk->sk_callback_lock);
        return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_w(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_r(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        if ((unsigned)size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
                 * might sleep.
                 */
                atomic_add(size, &sk->sk_omem_alloc);
                mem = kmalloc(size, priority);
                if (mem)
                        return mem;
                atomic_sub(size, &sk->sk_omem_alloc);
        }
        return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        kfree(mem);
        atomic_sub(size, &sk->sk_omem_alloc);
}
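
/*
 * sock_kmalloc()/sock_kfree_s() must be used as a pair with the same
 * size so that sk_omem_alloc balances out. A sketch of the usual
 * pattern (struct my_opt is hypothetical, for illustration):
 *
 *      struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *      if (!opt)
 *              return -ENOBUFS;
 *      ...
 *      sock_kfree_s(sk, opt, sizeof(*opt));
 */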

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think, these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
        DEFINE_WAIT(wait);

        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
                        break;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        break;
                if (sk->sk_err)
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk->sk_sleep, &wait);
        return timeo;
}


/*
 *      Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
                                            unsigned long header_len,
                                            unsigned long data_len,
                                            int noblock, int *errcode)
{
        struct sk_buff *skb;
        gfp_t gfp_mask;
        long timeo;
        int err;

        gfp_mask = sk->sk_allocation;
        if (gfp_mask & __GFP_WAIT)
                gfp_mask |= __GFP_REPEAT;

        timeo = sock_sndtimeo(sk, noblock);
        while (1) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        goto failure;

                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                        skb = alloc_skb(header_len, sk->sk_allocation);
                        if (skb) {
                                int npages;
                                int i;

                                /* No pages, we're done... */
                                if (!data_len)
                                        break;

                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
                                skb->truesize += data_len;
                                skb_shinfo(skb)->nr_frags = npages;
                                for (i = 0; i < npages; i++) {
                                        struct page *page;
                                        skb_frag_t *frag;

                                        page = alloc_pages(sk->sk_allocation, 0);
                                        if (!page) {
                                                err = -ENOBUFS;
                                                skb_shinfo(skb)->nr_frags = i;
                                                kfree_skb(skb);
                                                goto failure;
                                        }

                                        frag = &skb_shinfo(skb)->frags[i];
                                        frag->page = page;
                                        frag->page_offset = 0;
                                        frag->size = (data_len >= PAGE_SIZE ?
                                                      PAGE_SIZE :
                                                      data_len);
                                        data_len -= PAGE_SIZE;
                                }

                                /* Full success... */
                                break;
                        }
                        err = -ENOBUFS;
                        goto failure;
                }
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }

        skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}

static void __lock_sock(struct sock *sk)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
                if (!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
        struct sk_buff *skb = sk->sk_backlog.head;

        do {
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
                bh_unlock_sock(sk);

                do {
                        struct sk_buff *next = skb->next;

                        skb->next = NULL;
                        sk->sk_backlog_rcv(sk, skb);

                        /*
                         * We are in process context here with softirqs
                         * disabled, use cond_resched_softirq() to preempt.
                         * This is safe to do because we've taken the backlog
                         * queue private:
                         */
                        cond_resched_softirq();

                        skb = next;
                } while (skb != NULL);

                bh_lock_sock(sk);
        } while ((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
        int rc;
        DEFINE_WAIT(wait);

        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        finish_wait(sk->sk_sleep, &wait);
        return rc;
}

EXPORT_SYMBOL(sk_wait_data);
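
/*
 * Typical caller pattern (a simplified sketch, not lifted from any one
 * protocol): with the socket locked, block until data arrives or the
 * timeout runs out:
 *
 *      long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *      while (skb_queue_empty(&sk->sk_receive_queue)) {
 *              if (!timeo || signal_pending(current))
 *                      break;
 *              sk_wait_data(sk, &timeo);
 *      }
 */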

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
        return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len)
{
        return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
        ssize_t res;
        struct msghdr msg = {.msg_flags = flags};
        struct kvec iov;
        char *kaddr = kmap(page);
        iov.iov_base = kaddr + offset;
        iov.iov_len = size;
        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
        kunmap(page);
        return res;
}
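
/*
 * Illustrative use of the stubs above (PF_MYPROTO and myproto_ops are
 * hypothetical): a family that does not support an operation points
 * the corresponding proto_ops slot at the matching sock_no_*() helper
 * rather than leaving a NULL for callers to check:
 *
 *      static struct proto_ops myproto_ops = {
 *              .family   = PF_MYPROTO,
 *              .bind     = sock_no_bind,
 *              .accept   = sock_no_accept,
 *              .listen   = sock_no_listen,
 *              .mmap     = sock_no_mmap,
 *              .sendpage = sock_no_sendpage,
 *              ...
 *      };
 */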

/*
 *      Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible_all(sk->sk_sleep);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk, 0, POLL_ERR);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk, 1, POLL_IN);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                        wake_up_interruptible(sk->sk_sleep);

                /* Should agree with poll, otherwise some programs break */
                if (sock_writeable(sk))
                        sk_wake_async(sk, 2, POLL_OUT);
        }

        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
        kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
        if (sk->sk_socket && sk->sk_socket->file)
                if (send_sigurg(&sk->sk_socket->file->f_owner))
                        sk_wake_async(sk, 3, POLL_PRI);
}

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires)
{
        if (!mod_timer(timer, expires))
                sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
        if (timer_pending(timer) && del_timer(timer))
                __sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
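
/*
 * Pairing sketch (illustrative; the timer used is per-protocol):
 * sk_reset_timer() takes a reference on the sock when it arms an idle
 * timer, and sk_stop_timer() drops it again when the timer is
 * cancelled while still pending:
 *
 *      sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *      ...
 *      sk_stop_timer(sk, &sk->sk_timer);
 */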

void sock_init_data(struct socket *sock, struct sock *sk)
{
        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);

        sk->sk_send_head        =       NULL;

        init_timer(&sk->sk_timer);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk->sk_socket           =       sock;

        sock_set_flag(sk, SOCK_ZAPPED);

        if (sock)
        {
                sk->sk_type     =       sock->type;
                sk->sk_sleep    =       &sock->wait;
                sock->sk        =       sk;
        } else
                sk->sk_sleep    =       NULL;

        rwlock_init(&sk->sk_dst_lock);
        rwlock_init(&sk->sk_callback_lock);

        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_sndmsg_page      =       NULL;
        sk->sk_sndmsg_off       =       0;

        sk->sk_peercred.pid     =       0;
        sk->sk_peercred.uid     =       -1;
        sk->sk_peercred.gid     =       -1;
        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp.tv_sec     = -1L;
        sk->sk_stamp.tv_usec    = -1L;

        atomic_set(&sk->sk_refcnt, 1);
}

void fastcall lock_sock(struct sock *sk)
{
        might_sleep();
        spin_lock_bh(&(sk->sk_lock.slock));
        if (sk->sk_lock.owner)
                __lock_sock(sk);
        sk->sk_lock.owner = (void *)1;
        spin_unlock_bh(&(sk->sk_lock.slock));
}

EXPORT_SYMBOL(lock_sock);

void fastcall release_sock(struct sock *sk)
{
        spin_lock_bh(&(sk->sk_lock.slock));
        if (sk->sk_backlog.tail)
                __release_sock(sk);
        sk->sk_lock.owner = NULL;
        if (waitqueue_active(&(sk->sk_lock.wq)))
                wake_up(&(sk->sk_lock.wq));
        spin_unlock_bh(&(sk->sk_lock.slock));
}
EXPORT_SYMBOL(release_sock);
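
/*
 * Usage sketch: lock_sock()/release_sock() bracket process-context
 * access to a socket; softirq receive paths that find the lock owned
 * queue packets on sk_backlog, which __release_sock() replays here:
 *
 *      lock_sock(sk);
 *      ...modify socket state...
 *      release_sock(sk);    (also drains any accumulated backlog)
 */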

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
        if (!sock_flag(sk, SOCK_TIMESTAMP))
                sock_enable_timestamp(sk);
        if (sk->sk_stamp.tv_sec == -1)
                return -ENOENT;
        if (sk->sk_stamp.tv_sec == 0)
                do_gettimeofday(&sk->sk_stamp);
        return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
                -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

void sock_enable_timestamp(struct sock *sk)
{
        if (!sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_set_flag(sk, SOCK_TIMESTAMP);
                net_enable_timestamp();
        }
}
EXPORT_SYMBOL(sock_enable_timestamp);

/*
 *      Get a socket option on a socket.
 *
 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
 *      asynchronous errors should be reported by getsockopt. We assume
 *      this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_getsockopt != NULL)
                return sk->sk_prot->compat_getsockopt(sk, level, optname,
                                                      optval, optlen);
        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
                        struct msghdr *msg, size_t size, int flags)
{
        struct sock *sk = sock->sk;
        int addr_len = 0;
        int err;

        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
                                   flags & ~MSG_DONTWAIT, &addr_len);
        if (err >= 0)
                msg->msg_namelen = addr_len;
        return err;
}

EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *      Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_setsockopt != NULL)
                return sk->sk_prot->compat_setsockopt(sk, level, optname,
                                                      optval, optlen);
        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
1444
1445 void sk_common_release(struct sock *sk)
1446 {
1447         if (sk->sk_prot->destroy)
1448                 sk->sk_prot->destroy(sk);
1449
1450         /*
1451          * Observation: when sk_common_release is called, processes have
1452          * no access to the socket, but the network still does.
1453          * Step one: detach it from networking:
1454          *
1455          * A. Remove it from the hash tables.
1456          */
1457
1458         sk->sk_prot->unhash(sk);
1459
1460         /*
1461          * At this point the socket cannot receive new packets, but some may
1462          * be in flight: another CPU may have run the receive path and done
1463          * the hash table lookup before we unhashed the socket. Those packets
1464          * will reach the receive queue and be purged by the socket destructor.
1465          *
1466          * We also still have packets pending on the receive queue and,
1467          * probably, our own packets waiting in device queues. sock_destroy
1468          * will drain the receive queue, but transmitted packets will delay
1469          * socket destruction until the last reference is released.
1470          */
1471
1472         sock_orphan(sk);
1473
1474         xfrm_sk_free_policy(sk);
1475
1476         sk_refcnt_debug_release(sk);
1477         sock_put(sk);
1478 }
1479
1480 EXPORT_SYMBOL(sk_common_release);
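
/*
 * Illustration, not part of the original file: a minimal protocol ->close
 * handler built on sk_common_release(), in the style of the raw/UDP-family
 * protocols ('myproto_close' is hypothetical):
 */
static void myproto_close(struct sock *sk, long timeout)
{
	/* protocol-private teardown would go here, then ... */
	sk_common_release(sk);
}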
1481
1482 static DEFINE_RWLOCK(proto_list_lock);
1483 static LIST_HEAD(proto_list);
1484
1485 int proto_register(struct proto *prot, int alloc_slab)
1486 {
1487         char *request_sock_slab_name = NULL;
1488         char *timewait_sock_slab_name = NULL;
1489         int rc = -ENOBUFS;
1490
1491         if (alloc_slab) {
1492                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1493                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
1494
1495                 if (prot->slab == NULL) {
1496                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1497                                prot->name);
1498                         goto out;
1499                 }
1500
1501                 if (prot->rsk_prot != NULL) {
1502                         static const char mask[] = "request_sock_%s";
1503
1504                         request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1505                         if (request_sock_slab_name == NULL)
1506                                 goto out_free_sock_slab;
1507
1508                         sprintf(request_sock_slab_name, mask, prot->name);
1509                         prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1510                                                                  prot->rsk_prot->obj_size, 0,
1511                                                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
1512
1513                         if (prot->rsk_prot->slab == NULL) {
1514                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1515                                        prot->name);
1516                                 goto out_free_request_sock_slab_name;
1517                         }
1518                 }
1519
1520                 if (prot->twsk_prot != NULL) {
1521                         static const char mask[] = "tw_sock_%s";
1522
1523                         timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1524
1525                         if (timewait_sock_slab_name == NULL)
1526                                 goto out_free_request_sock_slab;
1527
1528                         sprintf(timewait_sock_slab_name, mask, prot->name);
1529                         prot->twsk_prot->twsk_slab =
1530                                 kmem_cache_create(timewait_sock_slab_name,
1531                                                   prot->twsk_prot->twsk_obj_size,
1532                                                   0, SLAB_HWCACHE_ALIGN,
1533                                                   NULL, NULL);
1534                         if (prot->twsk_prot->twsk_slab == NULL)
1535                                 goto out_free_timewait_sock_slab_name;
1536                 }
1537         }
1538
1539         write_lock(&proto_list_lock);
1540         list_add(&prot->node, &proto_list);
1541         write_unlock(&proto_list_lock);
1542         rc = 0;
1543 out:
1544         return rc;
1545 out_free_timewait_sock_slab_name:
1546         kfree(timewait_sock_slab_name);
1547 out_free_request_sock_slab:
1548         if (prot->rsk_prot && prot->rsk_prot->slab) {
1549                 kmem_cache_destroy(prot->rsk_prot->slab);
1550                 prot->rsk_prot->slab = NULL;
1551         }
1552 out_free_request_sock_slab_name:
1553         kfree(request_sock_slab_name);
1554 out_free_sock_slab:
1555         kmem_cache_destroy(prot->slab);
1556         prot->slab = NULL;
1557         goto out;
1558 }
1559
1560 EXPORT_SYMBOL(proto_register);
1561
1562 void proto_unregister(struct proto *prot)
1563 {
1564         write_lock(&proto_list_lock);
1565         list_del(&prot->node);
1566         write_unlock(&proto_list_lock);
1567
1568         if (prot->slab != NULL) {
1569                 kmem_cache_destroy(prot->slab);
1570                 prot->slab = NULL;
1571         }
1572
1573         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1574                 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1575
1576                 kmem_cache_destroy(prot->rsk_prot->slab);
1577                 kfree(name);
1578                 prot->rsk_prot->slab = NULL;
1579         }
1580
1581         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1582                 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1583
1584                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1585                 kfree(name);
1586                 prot->twsk_prot->twsk_slab = NULL;
1587         }
1588 }
1589
1590 EXPORT_SYMBOL(proto_unregister);
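
/*
 * Illustration, not part of the original file: proto_register() and
 * proto_unregister() are normally paired in a protocol module's init/exit
 * paths. A hedged sketch ('myproto' names are hypothetical; passing 1 for
 * alloc_slab requests a dedicated slab cache sized by .obj_size):
 */
static struct proto myproto_proto = {
	.name		= "MYPROTO",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),	/* usually a larger wrapper */
};

static int __init myproto_module_init(void)
{
	return proto_register(&myproto_proto, 1);
}

static void __exit myproto_module_exit(void)
{
	proto_unregister(&myproto_proto);
}

module_init(myproto_module_init);
module_exit(myproto_module_exit);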
1591
1592 #ifdef CONFIG_PROC_FS
1593 static inline struct proto *__proto_head(void)
1594 {
1595         return list_entry(proto_list.next, struct proto, node);
1596 }
1597
1598 static inline struct proto *proto_head(void)
1599 {
1600         return list_empty(&proto_list) ? NULL : __proto_head();
1601 }
1602
1603 static inline struct proto *proto_next(struct proto *proto)
1604 {
1605         return proto->node.next == &proto_list ? NULL :
1606                 list_entry(proto->node.next, struct proto, node);
1607 }
1608
1609 static inline struct proto *proto_get_idx(loff_t pos)
1610 {
1611         struct proto *proto;
1612         loff_t i = 0;
1613
1614         list_for_each_entry(proto, &proto_list, node)
1615                 if (i++ == pos)
1616                         goto out;
1617
1618         proto = NULL;
1619 out:
1620         return proto;
1621 }
1622
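/*
 * The walk yields SEQ_START_TOKEN first so that ->show can emit the header
 * row; position 0 therefore maps to the token and position N to list entry
 * N - 1, which is why proto_seq_start() passes '*pos - 1' below.
 */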
1623 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1624 {
1625         read_lock(&proto_list_lock);
1626         return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1627 }
1628
1629 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1630 {
1631         ++*pos;
1632         return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1633 }
1634
1635 static void proto_seq_stop(struct seq_file *seq, void *v)
1636 {
1637         read_unlock(&proto_list_lock);
1638 }
1639
1640 static char proto_method_implemented(const void *method)
1641 {
1642         return method == NULL ? 'n' : 'y';
1643 }
1644
1645 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1646 {
1647         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1648                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1649                    proto->name,
1650                    proto->obj_size,
1651                    proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1652                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1653                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1654                    proto->max_header,
1655                    proto->slab == NULL ? "no" : "yes",
1656                    module_name(proto->owner),
1657                    proto_method_implemented(proto->close),
1658                    proto_method_implemented(proto->connect),
1659                    proto_method_implemented(proto->disconnect),
1660                    proto_method_implemented(proto->accept),
1661                    proto_method_implemented(proto->ioctl),
1662                    proto_method_implemented(proto->init),
1663                    proto_method_implemented(proto->destroy),
1664                    proto_method_implemented(proto->shutdown),
1665                    proto_method_implemented(proto->setsockopt),
1666                    proto_method_implemented(proto->getsockopt),
1667                    proto_method_implemented(proto->sendmsg),
1668                    proto_method_implemented(proto->recvmsg),
1669                    proto_method_implemented(proto->sendpage),
1670                    proto_method_implemented(proto->bind),
1671                    proto_method_implemented(proto->backlog_rcv),
1672                    proto_method_implemented(proto->hash),
1673                    proto_method_implemented(proto->unhash),
1674                    proto_method_implemented(proto->get_port),
1675                    proto_method_implemented(proto->enter_memory_pressure));
1676 }
1677
1678 static int proto_seq_show(struct seq_file *seq, void *v)
1679 {
1680         if (v == SEQ_START_TOKEN)
1681                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1682                            "protocol",
1683                            "size",
1684                            "sockets",
1685                            "memory",
1686                            "press",
1687                            "maxhdr",
1688                            "slab",
1689                            "module",
1690                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1691         else
1692                 proto_seq_printf(seq, v);
1693         return 0;
1694 }
1695
1696 static struct seq_operations proto_seq_ops = {
1697         .start  = proto_seq_start,
1698         .next   = proto_seq_next,
1699         .stop   = proto_seq_stop,
1700         .show   = proto_seq_show,
1701 };
1702
1703 static int proto_seq_open(struct inode *inode, struct file *file)
1704 {
1705         return seq_open(file, &proto_seq_ops);
1706 }
1707
1708 static struct file_operations proto_seq_fops = {
1709         .owner          = THIS_MODULE,
1710         .open           = proto_seq_open,
1711         .read           = seq_read,
1712         .llseek         = seq_lseek,
1713         .release        = seq_release,
1714 };
1715
1716 static int __init proto_init(void)
1717 {
1718         /* register /proc/net/protocols */
1719         return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1720 }
1721
1722 subsys_initcall(proto_init);
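
/*
 * Illustration, not part of the original file: with this registered,
 * 'cat /proc/net/protocols' prints one row per protocol. Shape only; values
 * vary by kernel and configuration:
 *
 *   protocol  size sockets  memory press maxhdr  slab module     cl co di ...
 *   TCP        ...     ...     ...   no     ...  yes  kernel      y  y  y ...
 */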
1723
1724 #endif /* CONFIG_PROC_FS */
1725
1726 EXPORT_SYMBOL(sk_alloc);
1727 EXPORT_SYMBOL(sk_free);
1728 EXPORT_SYMBOL(sk_send_sigurg);
1729 EXPORT_SYMBOL(sock_alloc_send_skb);
1730 EXPORT_SYMBOL(sock_init_data);
1731 EXPORT_SYMBOL(sock_kfree_s);
1732 EXPORT_SYMBOL(sock_kmalloc);
1733 EXPORT_SYMBOL(sock_no_accept);
1734 EXPORT_SYMBOL(sock_no_bind);
1735 EXPORT_SYMBOL(sock_no_connect);
1736 EXPORT_SYMBOL(sock_no_getname);
1737 EXPORT_SYMBOL(sock_no_getsockopt);
1738 EXPORT_SYMBOL(sock_no_ioctl);
1739 EXPORT_SYMBOL(sock_no_listen);
1740 EXPORT_SYMBOL(sock_no_mmap);
1741 EXPORT_SYMBOL(sock_no_poll);
1742 EXPORT_SYMBOL(sock_no_recvmsg);
1743 EXPORT_SYMBOL(sock_no_sendmsg);
1744 EXPORT_SYMBOL(sock_no_sendpage);
1745 EXPORT_SYMBOL(sock_no_setsockopt);
1746 EXPORT_SYMBOL(sock_no_shutdown);
1747 EXPORT_SYMBOL(sock_no_socketpair);
1748 EXPORT_SYMBOL(sock_rfree);
1749 EXPORT_SYMBOL(sock_setsockopt);
1750 EXPORT_SYMBOL(sock_wfree);
1751 EXPORT_SYMBOL(sock_wmalloc);
1752 EXPORT_SYMBOL(sock_i_uid);
1753 EXPORT_SYMBOL(sock_i_ino);
1754 EXPORT_SYMBOL(sysctl_optmem_max);
1755 #ifdef CONFIG_SYSCTL
1756 EXPORT_SYMBOL(sysctl_rmem_max);
1757 EXPORT_SYMBOL(sysctl_wmem_max);
1758 #endif