Linux-2.6.12-rc2
[sfrench/cifs-2.6.git] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid huge amount
40  *                                      of socks hashed (this for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/major.h>
89 #include <linux/signal.h>
90 #include <linux/sched.h>
91 #include <linux/errno.h>
92 #include <linux/string.h>
93 #include <linux/stat.h>
94 #include <linux/dcache.h>
95 #include <linux/namei.h>
96 #include <linux/socket.h>
97 #include <linux/un.h>
98 #include <linux/fcntl.h>
99 #include <linux/termios.h>
100 #include <linux/sockios.h>
101 #include <linux/net.h>
102 #include <linux/in.h>
103 #include <linux/fs.h>
104 #include <linux/slab.h>
105 #include <asm/uaccess.h>
106 #include <linux/skbuff.h>
107 #include <linux/netdevice.h>
108 #include <net/sock.h>
109 #include <linux/tcp.h>
110 #include <net/af_unix.h>
111 #include <linux/proc_fs.h>
112 #include <linux/seq_file.h>
113 #include <net/scm.h>
114 #include <linux/init.h>
115 #include <linux/poll.h>
116 #include <linux/smp_lock.h>
117 #include <linux/rtnetlink.h>
118 #include <linux/mount.h>
119 #include <net/checksum.h>
120 #include <linux/security.h>
121
/* Initial sk_max_ack_backlog given to every socket by unix_create1(). */
int sysctl_unix_max_dgram_qlen = 10;

/*
 * Hash table of AF_UNIX sockets.  The extra slot at index UNIX_HASH_SIZE
 * is the chain of sockets that have not been bound to a name yet.
 */
struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
DEFINE_RWLOCK(unix_table_lock);
/* Count of allocated AF_UNIX socks; checked against a limit in unix_create1(). */
static atomic_t unix_nr_socks = ATOMIC_INIT(0);

/* Chain that freshly created (unbound) sockets are inserted into. */
#define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])

/*
 * True when the bound address does not carry the UNIX_HASH_SIZE marker
 * that unix_bind() stores in addr->hash for filesystem names.
 * Dereferences addr, so the socket must be bound.
 */
#define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
131
132 /*
133  *  SMP locking strategy:
134  *    hash table is protected with rwlock unix_table_lock
135  *    each socket state is protected by separate rwlock.
136  */
137
138 static inline unsigned unix_hash_fold(unsigned hash)
139 {
140         hash ^= hash>>16;
141         hash ^= hash>>8;
142         return hash&(UNIX_HASH_SIZE-1);
143 }
144
/* Accessor (usable as an lvalue) for the peer pointer kept in sk's unix_sock. */
#define unix_peer(sk) (unix_sk(sk)->peer)
146
/* Non-zero when the peer recorded in osk is sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
        struct sock *peer_of_other = unix_peer(osk);

        return peer_of_other == sk;
}
151
152 static inline int unix_may_send(struct sock *sk, struct sock *osk)
153 {
154         return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
155 }
156
157 static struct sock *unix_peer_get(struct sock *s)
158 {
159         struct sock *peer;
160
161         unix_state_rlock(s);
162         peer = unix_peer(s);
163         if (peer)
164                 sock_hold(peer);
165         unix_state_runlock(s);
166         return peer;
167 }
168
169 static inline void unix_release_addr(struct unix_address *addr)
170 {
171         if (atomic_dec_and_test(&addr->refcnt))
172                 kfree(addr);
173 }
174
175 /*
176  *      Check unix socket name:
177  *              - should be not zero length.
178  *              - if started by not zero, should be NULL terminated (FS object)
179  *              - if started by zero, it is abstract name.
180  */
181  
182 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
183 {
184         if (len <= sizeof(short) || len > sizeof(*sunaddr))
185                 return -EINVAL;
186         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
187                 return -EINVAL;
188         if (sunaddr->sun_path[0]) {
189                 /*
190                  * This may look like an off by one error but it is a bit more
191                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
192                  * sun_path[108] doesnt as such exist.  However in kernel space
193                  * we are guaranteed that it is a valid memory location in our
194                  * kernel address buffer.
195                  */
196                 ((char *)sunaddr)[len]=0;
197                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
198                 return len;
199         }
200
201         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
202         return len;
203 }
204
/*
 * Unlink sk from whatever hash chain it is on.  The double underscore
 * variant does no locking; see unix_remove_socket() for the locked form.
 */
static void __unix_remove_socket(struct sock *sk)
{
        (void)sk_del_node_init(sk);
}
209
/*
 * Add sk to the hash chain 'list'.  sk is expected to be unhashed on
 * entry (checked with BUG_TRAP).  No locking is done here; see
 * unix_insert_socket() for the variant that takes unix_table_lock.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
        BUG_TRAP(sk_unhashed(sk));
        sk_add_node(sk, list);
}
215
216 static inline void unix_remove_socket(struct sock *sk)
217 {
218         write_lock(&unix_table_lock);
219         __unix_remove_socket(sk);
220         write_unlock(&unix_table_lock);
221 }
222
223 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
224 {
225         write_lock(&unix_table_lock);
226         __unix_insert_socket(list, sk);
227         write_unlock(&unix_table_lock);
228 }
229
230 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
231                                               int len, int type, unsigned hash)
232 {
233         struct sock *s;
234         struct hlist_node *node;
235
236         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
237                 struct unix_sock *u = unix_sk(s);
238
239                 if (u->addr->len == len &&
240                     !memcmp(u->addr->name, sunname, len))
241                         goto found;
242         }
243         s = NULL;
244 found:
245         return s;
246 }
247
248 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
249                                                    int len, int type,
250                                                    unsigned hash)
251 {
252         struct sock *s;
253
254         read_lock(&unix_table_lock);
255         s = __unix_find_socket_byname(sunname, len, type, hash);
256         if (s)
257                 sock_hold(s);
258         read_unlock(&unix_table_lock);
259         return s;
260 }
261
262 static struct sock *unix_find_socket_byinode(struct inode *i)
263 {
264         struct sock *s;
265         struct hlist_node *node;
266
267         read_lock(&unix_table_lock);
268         sk_for_each(s, node,
269                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
270                 struct dentry *dentry = unix_sk(s)->dentry;
271
272                 if(dentry && dentry->d_inode == i)
273                 {
274                         sock_hold(s);
275                         goto found;
276                 }
277         }
278         s = NULL;
279 found:
280         read_unlock(&unix_table_lock);
281         return s;
282 }
283
284 static inline int unix_writable(struct sock *sk)
285 {
286         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
287 }
288
289 static void unix_write_space(struct sock *sk)
290 {
291         read_lock(&sk->sk_callback_lock);
292         if (unix_writable(sk)) {
293                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
294                         wake_up_interruptible(sk->sk_sleep);
295                 sk_wake_async(sk, 2, POLL_OUT);
296         }
297         read_unlock(&sk->sk_callback_lock);
298 }
299
300 /* When dgram socket disconnects (or changes its peer), we clear its receive
301  * queue of packets arrived from previous peer. First, it allows to do
302  * flow control based only on wmem_alloc; second, sk connected to peer
303  * may receive messages only from that peer. */
304 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
305 {
306         if (skb_queue_len(&sk->sk_receive_queue)) {
307                 skb_queue_purge(&sk->sk_receive_queue);
308                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
309
310                 /* If one link of bidirectional dgram pipe is disconnected,
311                  * we signal error. Messages are lost. Do not make this,
312                  * when peer was not connected to us.
313                  */
314                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
315                         other->sk_err = ECONNRESET;
316                         other->sk_error_report(other);
317                 }
318         }
319 }
320
321 static void unix_sock_destructor(struct sock *sk)
322 {
323         struct unix_sock *u = unix_sk(sk);
324
325         skb_queue_purge(&sk->sk_receive_queue);
326
327         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
328         BUG_TRAP(sk_unhashed(sk));
329         BUG_TRAP(!sk->sk_socket);
330         if (!sock_flag(sk, SOCK_DEAD)) {
331                 printk("Attempt to release alive unix socket: %p\n", sk);
332                 return;
333         }
334
335         if (u->addr)
336                 unix_release_addr(u->addr);
337
338         atomic_dec(&unix_nr_socks);
339 #ifdef UNIX_REFCNT_DEBUG
340         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
341 #endif
342 }
343
/*
 * Tear down an AF_UNIX sock: unhash it, mark it closed, notify and drop
 * the peer, flush the receive queue, release the bound dentry/mount and
 * finally drop the caller's reference on sk.
 *
 * @sk:      socket to release.
 * @embrion: non-zero when sk is a queued connection being discarded
 *           from a listening socket (see the recursive call below);
 *           it forces ECONNRESET on the peer.
 *
 * Always returns 0.
 */
static int unix_release_sock (struct sock *sk, int embrion)
{
        struct unix_sock *u = unix_sk(sk);
        struct dentry *dentry;
        struct vfsmount *mnt;
        struct sock *skpair;
        struct sk_buff *skb;
        int state;

        unix_remove_socket(sk);

        /* Clear state */
        unix_state_wlock(sk);
        sock_orphan(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;
        /* Detach dentry/mnt under the lock; they are released further down. */
        dentry       = u->dentry;
        u->dentry    = NULL;
        mnt          = u->mnt;
        u->mnt       = NULL;
        /* Remember the old state: a listener needs its queue drained specially. */
        state = sk->sk_state;
        sk->sk_state = TCP_CLOSE;
        unix_state_wunlock(sk);

        wake_up_interruptible_all(&u->peer_wait);

        skpair=unix_peer(sk);

        if (skpair!=NULL) {
                if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
                        unix_state_wlock(skpair);
                        /* No more writes */
                        skpair->sk_shutdown = SHUTDOWN_MASK;
                        /* Unread data (or a dropped embryo) is reported as a reset. */
                        if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
                                skpair->sk_err = ECONNRESET;
                        unix_state_wunlock(skpair);
                        skpair->sk_state_change(skpair);
                        read_lock(&skpair->sk_callback_lock);
                        sk_wake_async(skpair,1,POLL_HUP);
                        read_unlock(&skpair->sk_callback_lock);
                }
                sock_put(skpair); /* It may now die */
                unix_peer(sk) = NULL;
        }

        /* Try to flush out this socket. Throw out buffers at least */

        while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                /* On a listener each queued skb stands for a pending connection. */
                if (state==TCP_LISTEN)
                        unix_release_sock(skb->sk, 1);
                /* passed fds are erased in the kfree_skb hook        */
                kfree_skb(skb);
        }

        if (dentry) {
                dput(dentry);
                mntput(mnt);
        }

        sock_put(sk);

        /* ---- Socket is dead now and most probably destroyed ---- */

        /*
         * Fixme: BSD difference: In BSD all sockets connected to us get
         *        ECONNRESET and we die on the spot. In Linux we behave
         *        like files and pipes do and wait for the last
         *        dereference.
         *
         * Can't we simply set sock->err?
         *
         *        What the above comment does talk about? --ANK(980817)
         */

        if (atomic_read(&unix_tot_inflight))
                unix_gc();              /* Garbage collect fds */       

        return 0;
}
422
423 static int unix_listen(struct socket *sock, int backlog)
424 {
425         int err;
426         struct sock *sk = sock->sk;
427         struct unix_sock *u = unix_sk(sk);
428
429         err = -EOPNOTSUPP;
430         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
431                 goto out;                       /* Only stream/seqpacket sockets accept */
432         err = -EINVAL;
433         if (!u->addr)
434                 goto out;                       /* No listens on an unbound socket */
435         unix_state_wlock(sk);
436         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
437                 goto out_unlock;
438         if (backlog > sk->sk_max_ack_backlog)
439                 wake_up_interruptible_all(&u->peer_wait);
440         sk->sk_max_ack_backlog  = backlog;
441         sk->sk_state            = TCP_LISTEN;
442         /* set credentials so connect can copy them */
443         sk->sk_peercred.pid     = current->tgid;
444         sk->sk_peercred.uid     = current->euid;
445         sk->sk_peercred.gid     = current->egid;
446         err = 0;
447
448 out_unlock:
449         unix_state_wunlock(sk);
450 out:
451         return err;
452 }
453
/*
 * Forward declarations of the handlers that the proto_ops tables below
 * refer to.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
                               int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
                               struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
                               struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
                              struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
                              struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
                              int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
                                  struct msghdr *, size_t);
476
/*
 * Socket operations installed by unix_create() for SOCK_STREAM sockets.
 * Socket options, mmap and sendpage use the generic sock_no_* stubs.
 */
static struct proto_ops unix_stream_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_poll,
        .ioctl =        unix_ioctl,
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_stream_sendmsg,
        .recvmsg =      unix_stream_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};
497
/*
 * Socket operations installed by unix_create() for SOCK_DGRAM sockets
 * (and for SOCK_RAW, which unix_create() turns into SOCK_DGRAM).
 * accept and listen are the sock_no_* stubs; poll is datagram_poll.
 */
static struct proto_ops unix_dgram_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_dgram_connect,
        .socketpair =   unix_socketpair,
        .accept =       sock_no_accept,
        .getname =      unix_getname,
        .poll =         datagram_poll,
        .ioctl =        unix_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_dgram_sendmsg,
        .recvmsg =      unix_dgram_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};
518
/*
 * Socket operations installed by unix_create() for SOCK_SEQPACKET
 * sockets: connect/accept/listen are shared with the stream table,
 * while poll and recvmsg are shared with the datagram table.
 */
static struct proto_ops unix_seqpacket_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         datagram_poll,
        .ioctl =        unix_ioctl,
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_seqpacket_sendmsg,
        .recvmsg =      unix_dgram_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};
539
/*
 * Protocol descriptor handed to sk_alloc() in unix_create1(); obj_size
 * is the size of struct unix_sock.
 */
static struct proto unix_proto = {
        .name     = "UNIX",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct unix_sock),
};
545
546 static struct sock * unix_create1(struct socket *sock)
547 {
548         struct sock *sk = NULL;
549         struct unix_sock *u;
550
551         if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
552                 goto out;
553
554         sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
555         if (!sk)
556                 goto out;
557
558         atomic_inc(&unix_nr_socks);
559
560         sock_init_data(sock,sk);
561
562         sk->sk_write_space      = unix_write_space;
563         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
564         sk->sk_destruct         = unix_sock_destructor;
565         u         = unix_sk(sk);
566         u->dentry = NULL;
567         u->mnt    = NULL;
568         rwlock_init(&u->lock);
569         atomic_set(&u->inflight, sock ? 0 : -1);
570         init_MUTEX(&u->readsem); /* single task reading lock */
571         init_waitqueue_head(&u->peer_wait);
572         unix_insert_socket(unix_sockets_unbound, sk);
573 out:
574         return sk;
575 }
576
577 static int unix_create(struct socket *sock, int protocol)
578 {
579         if (protocol && protocol != PF_UNIX)
580                 return -EPROTONOSUPPORT;
581
582         sock->state = SS_UNCONNECTED;
583
584         switch (sock->type) {
585         case SOCK_STREAM:
586                 sock->ops = &unix_stream_ops;
587                 break;
588                 /*
589                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
590                  *      nothing uses it.
591                  */
592         case SOCK_RAW:
593                 sock->type=SOCK_DGRAM;
594         case SOCK_DGRAM:
595                 sock->ops = &unix_dgram_ops;
596                 break;
597         case SOCK_SEQPACKET:
598                 sock->ops = &unix_seqpacket_ops;
599                 break;
600         default:
601                 return -ESOCKTNOSUPPORT;
602         }
603
604         return unix_create1(sock) ? 0 : -ENOMEM;
605 }
606
607 static int unix_release(struct socket *sock)
608 {
609         struct sock *sk = sock->sk;
610
611         if (!sk)
612                 return 0;
613
614         sock->sk = NULL;
615
616         return unix_release_sock (sk, 0);
617 }
618
619 static int unix_autobind(struct socket *sock)
620 {
621         struct sock *sk = sock->sk;
622         struct unix_sock *u = unix_sk(sk);
623         static u32 ordernum = 1;
624         struct unix_address * addr;
625         int err;
626
627         down(&u->readsem);
628
629         err = 0;
630         if (u->addr)
631                 goto out;
632
633         err = -ENOMEM;
634         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
635         if (!addr)
636                 goto out;
637
638         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
639         addr->name->sun_family = AF_UNIX;
640         atomic_set(&addr->refcnt, 1);
641
642 retry:
643         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
644         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
645
646         write_lock(&unix_table_lock);
647         ordernum = (ordernum+1)&0xFFFFF;
648
649         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
650                                       addr->hash)) {
651                 write_unlock(&unix_table_lock);
652                 /* Sanity yield. It is unusual case, but yet... */
653                 if (!(ordernum&0xFF))
654                         yield();
655                 goto retry;
656         }
657         addr->hash ^= sk->sk_type;
658
659         __unix_remove_socket(sk);
660         u->addr = addr;
661         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
662         write_unlock(&unix_table_lock);
663         err = 0;
664
665 out:    up(&u->readsem);
666         return err;
667 }
668
/*
 * Resolve a destination address to a socket.
 *
 * A name whose first path byte is non-zero is looked up in the
 * filesystem: the caller needs write permission on it, it must be a
 * socket inode and a socket must be bound to that inode.  Any other
 * name is looked up in the hash table by its bytes.
 *
 * Returns the socket with a reference held.  On failure returns NULL
 * and stores a negative errno in *error: the path_lookup()/permission()
 * result, -ECONNREFUSED when nothing is bound there, or -EPROTOTYPE
 * when the bound socket has a different type.
 */
static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
                                    int type, unsigned hash, int *error)
{
        struct sock *peer;
        struct nameidata nd;
        int rc;

        if (!sunname->sun_path[0]) {
                struct dentry *bound;

                peer = unix_find_socket_byname(sunname, len, type, hash);
                if (peer == NULL) {
                        rc = -ECONNREFUSED;
                        goto fail;
                }

                bound = unix_sk(peer)->dentry;
                if (bound)
                        touch_atime(unix_sk(peer)->mnt, bound);
                return peer;
        }

        rc = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
        if (rc)
                goto fail;

        rc = permission(nd.dentry->d_inode, MAY_WRITE, &nd);
        if (rc)
                goto release;

        rc = -ECONNREFUSED;
        if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
                goto release;

        peer = unix_find_socket_byinode(nd.dentry->d_inode);
        if (peer == NULL)
                goto release;

        if (peer->sk_type != type) {
                /* Wrong kind of socket: no atime update, drop our reference. */
                path_release(&nd);
                sock_put(peer);
                rc = -EPROTOTYPE;
                goto fail;
        }

        touch_atime(nd.mnt, nd.dentry);
        path_release(&nd);
        return peer;

release:
        path_release(&nd);
fail:
        *error = rc;
        return NULL;
}
720
721
/*
 * bind() for AF_UNIX sockets.
 *
 * An addr_len of exactly sizeof(short) (family only) requests an
 * automatically chosen name via unix_autobind().  Otherwise the name is
 * validated by unix_mkname().  A name whose first path byte is non-zero
 * is created in the filesystem as a socket inode with vfs_mknod(); a
 * name starting with a zero byte only has to be unique in the hash
 * table.
 *
 * Returns 0 on success or a negative errno: -EINVAL for a wrong family,
 * a bad name or a socket that is already bound; -ENOMEM when the
 * address cannot be allocated; -EADDRINUSE when the name is taken
 * (-EEXIST from the filesystem path is mapped to it); or the error
 * returned by path_lookup(), lookup_hash() or vfs_mknod().
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk = sock->sk;
        struct unix_sock *u = unix_sk(sk);
        struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
        struct dentry * dentry = NULL;
        struct nameidata nd;
        int err;
        unsigned hash;
        struct unix_address *addr;
        struct hlist_head *list;

        err = -EINVAL;
        if (sunaddr->sun_family != AF_UNIX)
                goto out;

        if (addr_len==sizeof(short)) {
                err = unix_autobind(sock);
                goto out;
        }

        err = unix_mkname(sunaddr, addr_len, &hash);
        if (err < 0)
                goto out;
        /* unix_mkname() returns the length to use from here on. */
        addr_len = err;

        /* readsem serialises binds on this socket. */
        down(&u->readsem);

        err = -EINVAL;
        if (u->addr)
                goto out_up;

        err = -ENOMEM;
        addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
        if (!addr)
                goto out_up;

        memcpy(addr->name, sunaddr, addr_len);
        addr->len = addr_len;
        /*
         * Provisional value; for filesystem names it is overwritten
         * below with UNIX_HASH_SIZE.
         */
        addr->hash = hash ^ sk->sk_type;
        atomic_set(&addr->refcnt, 1);

        if (sunaddr->sun_path[0]) {
                unsigned int mode;
                err = 0;
                /*
                 * Get the parent directory, calculate the hash for last
                 * component.
                 */
                err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
                if (err)
                        goto out_mknod_parent;
                /*
                 * Yucky last component or no last component at all?
                 * (foo/., foo/.., /////)
                 */
                err = -EEXIST;
                if (nd.last_type != LAST_NORM)
                        goto out_mknod;
                /*
                 * Lock the directory.
                 */
                down(&nd.dentry->d_inode->i_sem);
                /*
                 * Do the final lookup.
                 */
                dentry = lookup_hash(&nd.last, nd.dentry);
                err = PTR_ERR(dentry);
                if (IS_ERR(dentry))
                        goto out_mknod_unlock;
                err = -ENOENT;
                /*
                 * Special case - lookup gave negative, but... we had foo/bar/
                 * From the vfs_mknod() POV we just have a negative dentry -
                 * all is fine. Let's be bastards - you had / on the end, you've
                 * been asking for (non-existent) directory. -ENOENT for you.
                 */
                if (nd.last.name[nd.last.len] && !dentry->d_inode)
                        goto out_mknod_dput;
                /*
                 * All right, let's create it.
                 */
                mode = S_IFSOCK |
                       (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
                err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
                if (err)
                        goto out_mknod_dput;
                up(&nd.dentry->d_inode->i_sem);
                /* From here on nd.dentry refers to the new socket node, not its parent. */
                dput(nd.dentry);
                nd.dentry = dentry;

                /* Marker tested by UNIX_ABSTRACT(): this is a filesystem name. */
                addr->hash = UNIX_HASH_SIZE;
        }

        write_lock(&unix_table_lock);

        if (!sunaddr->sun_path[0]) {
                err = -EADDRINUSE;
                if (__unix_find_socket_byname(sunaddr, addr_len,
                                              sk->sk_type, hash)) {
                        unix_release_addr(addr);
                        goto out_unlock;
                }

                list = &unix_socket_table[addr->hash];
        } else {
                /* Filesystem sockets are chained by inode number (see unix_find_socket_byinode()). */
                list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
                u->dentry = nd.dentry;
                u->mnt    = nd.mnt;
        }

        err = 0;
        /* Move the socket from its current chain to the chain for its name. */
        __unix_remove_socket(sk);
        u->addr = addr;
        __unix_insert_socket(list, sk);

out_unlock:
        write_unlock(&unix_table_lock);
out_up:
        up(&u->readsem);
out:
        return err;

        /* Error unwinding for the filesystem path, in reverse order of acquisition. */
out_mknod_dput:
        dput(dentry);
out_mknod_unlock:
        up(&nd.dentry->d_inode->i_sem);
out_mknod:
        path_release(&nd);
out_mknod_parent:
        if (err==-EEXIST)
                err=-EADDRINUSE;
        unix_release_addr(addr);
        goto out_up;
}
857
858 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
859                               int alen, int flags)
860 {
861         struct sock *sk = sock->sk;
862         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
863         struct sock *other;
864         unsigned hash;
865         int err;
866
867         if (addr->sa_family != AF_UNSPEC) {
868                 err = unix_mkname(sunaddr, alen, &hash);
869                 if (err < 0)
870                         goto out;
871                 alen = err;
872
873                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
874                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
875                         goto out;
876
877                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
878                 if (!other)
879                         goto out;
880
881                 unix_state_wlock(sk);
882
883                 err = -EPERM;
884                 if (!unix_may_send(sk, other))
885                         goto out_unlock;
886
887                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
888                 if (err)
889                         goto out_unlock;
890
891         } else {
892                 /*
893                  *      1003.1g breaking connected state with AF_UNSPEC
894                  */
895                 other = NULL;
896                 unix_state_wlock(sk);
897         }
898
899         /*
900          * If it was connected, reconnect.
901          */
902         if (unix_peer(sk)) {
903                 struct sock *old_peer = unix_peer(sk);
904                 unix_peer(sk)=other;
905                 unix_state_wunlock(sk);
906
907                 if (other != old_peer)
908                         unix_dgram_disconnected(sk, old_peer);
909                 sock_put(old_peer);
910         } else {
911                 unix_peer(sk)=other;
912                 unix_state_wunlock(sk);
913         }
914         return 0;
915
916 out_unlock:
917         unix_state_wunlock(sk);
918         sock_put(other);
919 out:
920         return err;
921 }
922
923 static long unix_wait_for_peer(struct sock *other, long timeo)
924 {
925         struct unix_sock *u = unix_sk(other);
926         int sched;
927         DEFINE_WAIT(wait);
928
929         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
930
931         sched = !sock_flag(other, SOCK_DEAD) &&
932                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
933                 (skb_queue_len(&other->sk_receive_queue) >
934                  other->sk_max_ack_backlog);
935
936         unix_state_runlock(other);
937
938         if (sched)
939                 timeo = schedule_timeout(timeo);
940
941         finish_wait(&u->peer_wait, &wait);
942         return timeo;
943 }
944
/*
 *      Connect a connection-oriented socket to a listening socket.
 *
 *      A fresh sock (newsk) is created to act as the server-side end and
 *      a one-byte skb charged to newsk is allocated.  Once the listener
 *      has been found and both sockets are locked, sk and newsk are made
 *      peers of each other and the skb is queued on the listener's receive
 *      queue, where unix_accept() picks newsk up through skb->sk.
 *
 *      Lock order: read lock on the listener first, then write lock on sk.
 *
 *      Returns 0 on success or a negative errno (-ENOMEM, -ECONNREFUSED,
 *      -EAGAIN, -EISCONN, -EINVAL, or an error from unix_mkname(),
 *      unix_autobind(), unix_find_other(), sock_intr_errno() or
 *      security_unix_stream_connect()).
 */
945 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
946                                int addr_len, int flags)
947 {
948         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
949         struct sock *sk = sock->sk;
950         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
951         struct sock *newsk = NULL;
952         struct sock *other = NULL;
953         struct sk_buff *skb = NULL;
954         unsigned hash;
955         int st;
956         int err;
957         long timeo;
958
959         err = unix_mkname(sunaddr, addr_len, &hash);
960         if (err < 0)
961                 goto out;
962         addr_len = err;
963
            /* Credential passing on an unbound socket: autobind first. */
964         if (test_bit(SOCK_PASSCRED, &sock->flags)
965                 && !u->addr && (err = unix_autobind(sock)) != 0)
966                 goto out;
967
968         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
969
970         /* First of all allocate resources.
971            If we will make it after state is locked,
972            we will have to recheck all again in any case.
973          */
974
975         err = -ENOMEM;
976
977         /* create new sock for complete connection */
978         newsk = unix_create1(NULL);
979         if (newsk == NULL)
980                 goto out;
981
982         /* Allocate skb for sending to listening sock */
983         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
984         if (skb == NULL)
985                 goto out;
986
987 restart:
988         /*  Find listening sock. */
989         other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
990         if (!other)
991                 goto out;
992
993         /* Latch state of peer */
994         unix_state_rlock(other);
995
996         /* Apparently VFS overslept socket death. Retry. */
997         if (sock_flag(other, SOCK_DEAD)) {
998                 unix_state_runlock(other);
999                 sock_put(other);
1000                 goto restart;
1001         }
1002
1003         err = -ECONNREFUSED;
1004         if (other->sk_state != TCP_LISTEN)
1005                 goto out_unlock;
1006
1007         if (skb_queue_len(&other->sk_receive_queue) >
1008             other->sk_max_ack_backlog) {
1009                 err = -EAGAIN;
1010                 if (!timeo)
1011                         goto out_unlock;
1012
                     /* unix_wait_for_peer() releases the read lock on other. */
1013                 timeo = unix_wait_for_peer(other, timeo);
1014
1015                 err = sock_intr_errno(timeo);
1016                 if (signal_pending(current))
1017                         goto out;
1018                 sock_put(other);
1019                 goto restart;
1020         }
1021
1022         /* Latch our state.
1023
1024            It is tricky place. We need to grab write lock and cannot
1025            drop lock on peer. It is dangerous because deadlock is
1026            possible. Connect to self case and simultaneous
1027            attempt to connect are eliminated by checking socket
1028            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1029            check this before attempt to grab lock.
1030
1031            Well, and we have to recheck the state after socket locked.
1032          */
1033         st = sk->sk_state;
1034
1035         switch (st) {
1036         case TCP_CLOSE:
1037                 /* This is ok... continue with connect */
1038                 break;
1039         case TCP_ESTABLISHED:
1040                 /* Socket is already connected */
1041                 err = -EISCONN;
1042                 goto out_unlock;
1043         default:
1044                 err = -EINVAL;
1045                 goto out_unlock;
1046         }
1047
1048         unix_state_wlock(sk);
1049
             /* State changed between the unlocked sample and the lock: retry. */
1050         if (sk->sk_state != st) {
1051                 unix_state_wunlock(sk);
1052                 unix_state_runlock(other);
1053                 sock_put(other);
1054                 goto restart;
1055         }
1056
1057         err = security_unix_stream_connect(sock, other->sk_socket, newsk);
1058         if (err) {
1059                 unix_state_wunlock(sk);
1060                 goto out_unlock;
1061         }
1062
1063         /* The way is open! Fastly set all the necessary fields... */
1064
             /* newsk's peer pointer holds a reference on sk. */
1065         sock_hold(sk);
1066         unix_peer(newsk)        = sk;
1067         newsk->sk_state         = TCP_ESTABLISHED;
1068         newsk->sk_type          = sk->sk_type;
1069         newsk->sk_peercred.pid  = current->tgid;
1070         newsk->sk_peercred.uid  = current->euid;
1071         newsk->sk_peercred.gid  = current->egid;
1072         newu = unix_sk(newsk);
1073         newsk->sk_sleep         = &newu->peer_wait;
1074         otheru = unix_sk(other);
1075
1076         /* copy address information from listening to new sock*/
1077         if (otheru->addr) {
1078                 atomic_inc(&otheru->addr->refcnt);
1079                 newu->addr = otheru->addr;
1080         }
1081         if (otheru->dentry) {
1082                 newu->dentry    = dget(otheru->dentry);
1083                 newu->mnt       = mntget(otheru->mnt);
1084         }
1085
1086         /* Set credentials */
1087         sk->sk_peercred = other->sk_peercred;
1088
             /* sk's peer pointer holds a reference on newsk. */
1089         sock_hold(newsk);
1090         unix_peer(sk)   = newsk;
1091         sock->state     = SS_CONNECTED;
1092         sk->sk_state    = TCP_ESTABLISHED;
1093
1094         unix_state_wunlock(sk);
1095
1096         /* take ten and send info to listening sock */
1097         spin_lock(&other->sk_receive_queue.lock);
1098         __skb_queue_tail(&other->sk_receive_queue, skb);
1099         /* Undo artificially decreased inflight after embryo
1100          * is installed to listening socket. */
1101         atomic_inc(&newu->inflight);
1102         spin_unlock(&other->sk_receive_queue.lock);
1103         unix_state_runlock(other);
1104         other->sk_data_ready(other, 0);
1105         sock_put(other);
1106         return 0;
1107
     /* Error exit with the read lock on other still held. */
1108 out_unlock:
1109         if (other)
1110                 unix_state_runlock(other);
1111
     /* Common error exit: release whatever was allocated or looked up. */
1112 out:
1113         if (skb)
1114                 kfree_skb(skb);
1115         if (newsk)
1116                 unix_release_sock(newsk, 0);
1117         if (other)
1118                 sock_put(other);
1119         return err;
1120 }
1121
1122 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1123 {
1124         struct sock *ska=socka->sk, *skb = sockb->sk;
1125
1126         /* Join our sockets back to back */
1127         sock_hold(ska);
1128         sock_hold(skb);
1129         unix_peer(ska)=skb;
1130         unix_peer(skb)=ska;
1131         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1132         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1133         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1134
1135         if (ska->sk_type != SOCK_DGRAM) {
1136                 ska->sk_state = TCP_ESTABLISHED;
1137                 skb->sk_state = TCP_ESTABLISHED;
1138                 socka->state  = SS_CONNECTED;
1139                 sockb->state  = SS_CONNECTED;
1140         }
1141         return 0;
1142 }
1143
1144 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1145 {
1146         struct sock *sk = sock->sk;
1147         struct sock *tsk;
1148         struct sk_buff *skb;
1149         int err;
1150
1151         err = -EOPNOTSUPP;
1152         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1153                 goto out;
1154
1155         err = -EINVAL;
1156         if (sk->sk_state != TCP_LISTEN)
1157                 goto out;
1158
1159         /* If socket state is TCP_LISTEN it cannot change (for now...),
1160          * so that no locks are necessary.
1161          */
1162
1163         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1164         if (!skb) {
1165                 /* This means receive shutdown. */
1166                 if (err == 0)
1167                         err = -EINVAL;
1168                 goto out;
1169         }
1170
1171         tsk = skb->sk;
1172         skb_free_datagram(sk, skb);
1173         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1174
1175         /* attach accepted sock to socket */
1176         unix_state_wlock(tsk);
1177         newsock->state = SS_CONNECTED;
1178         sock_graft(tsk, newsock);
1179         unix_state_wunlock(tsk);
1180         return 0;
1181
1182 out:
1183         return err;
1184 }
1185
1186
1187 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1188 {
1189         struct sock *sk = sock->sk;
1190         struct unix_sock *u;
1191         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1192         int err = 0;
1193
1194         if (peer) {
1195                 sk = unix_peer_get(sk);
1196
1197                 err = -ENOTCONN;
1198                 if (!sk)
1199                         goto out;
1200                 err = 0;
1201         } else {
1202                 sock_hold(sk);
1203         }
1204
1205         u = unix_sk(sk);
1206         unix_state_rlock(sk);
1207         if (!u->addr) {
1208                 sunaddr->sun_family = AF_UNIX;
1209                 sunaddr->sun_path[0] = 0;
1210                 *uaddr_len = sizeof(short);
1211         } else {
1212                 struct unix_address *addr = u->addr;
1213
1214                 *uaddr_len = addr->len;
1215                 memcpy(sunaddr, addr->name, *uaddr_len);
1216         }
1217         unix_state_runlock(sk);
1218         sock_put(sk);
1219 out:
1220         return err;
1221 }
1222
1223 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1224 {
1225         int i;
1226
1227         scm->fp = UNIXCB(skb).fp;
1228         skb->destructor = sock_wfree;
1229         UNIXCB(skb).fp = NULL;
1230
1231         for (i=scm->fp->count-1; i>=0; i--)
1232                 unix_notinflight(scm->fp->fp[i]);
1233 }
1234
1235 static void unix_destruct_fds(struct sk_buff *skb)
1236 {
1237         struct scm_cookie scm;
1238         memset(&scm, 0, sizeof(scm));
1239         unix_detach_fds(&scm, skb);
1240
1241         /* Alas, it calls VFS */
1242         /* So fscking what? fput() had been SMP-safe since the last Summer */
1243         scm_destroy(&scm);
1244         sock_wfree(skb);
1245 }
1246
1247 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1248 {
1249         int i;
1250         for (i=scm->fp->count-1; i>=0; i--)
1251                 unix_inflight(scm->fp->fp[i]);
1252         UNIXCB(skb).fp = scm->fp;
1253         skb->destructor = unix_destruct_fds;
1254         scm->fp = NULL;
1255 }
1256
1257 /*
1258  *      Send AF_UNIX data.
1259  */
1260
/*
 *      Send one datagram.
 *
 *      The destination is the address in msg->msg_name when
 *      msg->msg_namelen is non-zero, otherwise the connected peer.  The
 *      whole message is copied into a single skb which is queued on the
 *      receiver's receive queue; credentials and any passed files from the
 *      scm cookie travel with the skb.
 *
 *      Returns len on success or a negative errno (-EOPNOTSUPP for
 *      MSG_OOB, -ENOTCONN, -EMSGSIZE, -ECONNRESET, -EPERM, -ECONNREFUSED,
 *      -EPIPE, -EAGAIN, or an error from a helper).
 */
1261 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1262                               struct msghdr *msg, size_t len)
1263 {
1264         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1265         struct sock *sk = sock->sk;
1266         struct unix_sock *u = unix_sk(sk);
1267         struct sockaddr_un *sunaddr=msg->msg_name;
1268         struct sock *other = NULL;
1269         int namelen = 0; /* fake GCC */
1270         int err;
1271         unsigned hash;
1272         struct sk_buff *skb;
1273         long timeo;
1274         struct scm_cookie tmp_scm;
1275
             /* Use an on-stack cookie when the iocb does not carry one. */
1276         if (NULL == siocb->scm)
1277                 siocb->scm = &tmp_scm;
1278         err = scm_send(sock, msg, siocb->scm);
1279         if (err < 0)
1280                 return err;
1281
1282         err = -EOPNOTSUPP;
1283         if (msg->msg_flags&MSG_OOB)
1284                 goto out;
1285
1286         if (msg->msg_namelen) {
1287                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1288                 if (err < 0)
1289                         goto out;
1290                 namelen = err;
1291         } else {
                     /* No explicit destination: fall back to the connected peer. */
1292                 sunaddr = NULL;
1293                 err = -ENOTCONN;
1294                 other = unix_peer_get(sk);
1295                 if (!other)
1296                         goto out;
1297         }
1298
1299         if (test_bit(SOCK_PASSCRED, &sock->flags)
1300                 && !u->addr && (err = unix_autobind(sock)) != 0)
1301                 goto out;
1302
1303         err = -EMSGSIZE;
1304         if (len > sk->sk_sndbuf - 32)
1305                 goto out;
1306
1307         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1308         if (skb==NULL)
1309                 goto out;
1310
1311         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1312         if (siocb->scm->fp)
1313                 unix_attach_fds(siocb->scm, skb);
1314
1315         skb->h.raw = skb->data;
1316         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1317         if (err)
1318                 goto out_free;
1319
1320         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1321
1322 restart:
             /* other is NULL on first entry with an address, or after a dead peer. */
1323         if (!other) {
1324                 err = -ECONNRESET;
1325                 if (sunaddr == NULL)
1326                         goto out_free;
1327
1328                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1329                                         hash, &err);
1330                 if (other==NULL)
1331                         goto out_free;
1332         }
1333
1334         unix_state_rlock(other);
1335         err = -EPERM;
1336         if (!unix_may_send(sk, other))
1337                 goto out_unlock;
1338
1339         if (sock_flag(other, SOCK_DEAD)) {
1340                 /*
1341                  *      Check with 1003.1g - what should
1342                  *      datagram error
1343                  */
1344                 unix_state_runlock(other);
1345                 sock_put(other);
1346
                     /*
                      * If the dead socket was our connected peer, disconnect
                      * from it and fail with -ECONNREFUSED; otherwise look
                      * the address up again.
                      */
1347                 err = 0;
1348                 unix_state_wlock(sk);
1349                 if (unix_peer(sk) == other) {
1350                         unix_peer(sk)=NULL;
1351                         unix_state_wunlock(sk);
1352
1353                         unix_dgram_disconnected(sk, other);
1354                         sock_put(other);
1355                         err = -ECONNREFUSED;
1356                 } else {
1357                         unix_state_wunlock(sk);
1358                 }
1359
1360                 other = NULL;
1361                 if (err)
1362                         goto out_free;
1363                 goto restart;
1364         }
1365
1366         err = -EPIPE;
1367         if (other->sk_shutdown & RCV_SHUTDOWN)
1368                 goto out_unlock;
1369
1370         if (sk->sk_type != SOCK_SEQPACKET) {
1371                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1372                 if (err)
1373                         goto out_unlock;
1374         }
1375
             /* The queue limit is not applied when the receiver is connected back to us. */
1376         if (unix_peer(other) != sk &&
1377             (skb_queue_len(&other->sk_receive_queue) >
1378              other->sk_max_ack_backlog)) {
1379                 if (!timeo) {
1380                         err = -EAGAIN;
1381                         goto out_unlock;
1382                 }
1383
                     /* unix_wait_for_peer() releases the read lock on other. */
1384                 timeo = unix_wait_for_peer(other, timeo);
1385
1386                 err = sock_intr_errno(timeo);
1387                 if (signal_pending(current))
1388                         goto out_free;
1389
1390                 goto restart;
1391         }
1392
1393         skb_queue_tail(&other->sk_receive_queue, skb);
1394         unix_state_runlock(other);
1395         other->sk_data_ready(other, len);
1396         sock_put(other);
1397         scm_destroy(siocb->scm);
1398         return len;
1399
     /* Error exit with the read lock on other held. */
1400 out_unlock:
1401         unix_state_runlock(other);
     /* Error exit once the skb exists. */
1402 out_free:
1403         kfree_skb(skb);
     /* Common error exit: drop the peer reference and the scm cookie. */
1404 out:
1405         if (other)
1406                 sock_put(other);
1407         scm_destroy(siocb->scm);
1408         return err;
1409 }
1410
1411                 
/*
 *      Send data on a connected stream socket.
 *
 *      The message is cut into skbs of at most sk_sndbuf / 2 - 64 bytes
 *      (and at most SKB_MAX_ALLOC), each of which is queued on the peer's
 *      receive queue.  A destination address is not accepted.
 *
 *      Returns the number of bytes queued if any were, otherwise a
 *      negative errno (-EOPNOTSUPP, -EISCONN, -ENOTCONN, -EPIPE, or an
 *      error from a helper).  SIGPIPE is raised on a broken pipe when
 *      nothing was sent and MSG_NOSIGNAL is not set.
 */
1412 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1413                                struct msghdr *msg, size_t len)
1414 {
1415         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1416         struct sock *sk = sock->sk;
1417         struct sock *other = NULL;
1418         struct sockaddr_un *sunaddr=msg->msg_name;
1419         int err,size;
1420         struct sk_buff *skb;
1421         int sent=0;
1422         struct scm_cookie tmp_scm;
1423
             /* Use an on-stack cookie when the iocb does not carry one. */
1424         if (NULL == siocb->scm)
1425                 siocb->scm = &tmp_scm;
1426         err = scm_send(sock, msg, siocb->scm);
1427         if (err < 0)
1428                 return err;
1429
1430         err = -EOPNOTSUPP;
1431         if (msg->msg_flags&MSG_OOB)
1432                 goto out_err;
1433
1434         if (msg->msg_namelen) {
1435                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1436                 goto out_err;
1437         } else {
1438                 sunaddr = NULL;
1439                 err = -ENOTCONN;
1440                 other = unix_peer_get(sk);
1441                 if (!other)
1442                         goto out_err;
1443         }
1444
1445         if (sk->sk_shutdown & SEND_SHUTDOWN)
1446                 goto pipe_err;
1447
1448         while(sent < len)
1449         {
1450                 /*
1451                  *      Optimisation for the fact that under 0.01% of X messages typically
1452                  *      need breaking up.
1453                  */
1454
1455                 size=len-sent;
1456
1457                 /* Keep two messages in the pipe so it schedules better */
1458                 if (size > sk->sk_sndbuf / 2 - 64)
1459                         size = sk->sk_sndbuf / 2 - 64;
1460
1461                 if (size > SKB_MAX_ALLOC)
1462                         size = SKB_MAX_ALLOC;
1463                         
1464                 /*
1465                  *      Grab a buffer
1466                  */
1467                  
1468                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1469
1470                 if (skb==NULL)
1471                         goto out_err;
1472
1473                 /*
1474                  *      If you pass two values to the sock_alloc_send_skb
1475                  *      it tries to grab the large buffer with GFP_NOFS
1476                  *      (which can fail easily), and if it fails grab the
1477                  *      fallback size buffer which is under a page and will
1478                  *      succeed. [Alan]
1479                  */
1480                 size = min_t(int, size, skb_tailroom(skb));
1481
                     /*
                      * unix_attach_fds() clears scm->fp, so only the first
                      * skb of the message carries the passed files.
                      */
1482                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1483                 if (siocb->scm->fp)
1484                         unix_attach_fds(siocb->scm, skb);
1485
1486                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1487                         kfree_skb(skb);
1488                         goto out_err;
1489                 }
1490
1491                 unix_state_rlock(other);
1492
1493                 if (sock_flag(other, SOCK_DEAD) ||
1494                     (other->sk_shutdown & RCV_SHUTDOWN))
1495                         goto pipe_err_free;
1496
1497                 skb_queue_tail(&other->sk_receive_queue, skb);
1498                 unix_state_runlock(other);
1499                 other->sk_data_ready(other, size);
1500                 sent+=size;
1501         }
1502         sock_put(other);
1503
1504         scm_destroy(siocb->scm);
1505         siocb->scm = NULL;
1506
1507         return sent;
1508
     /* Peer went away while we hold its read lock and an unsent skb. */
1509 pipe_err_free:
1510         unix_state_runlock(other);
1511         kfree_skb(skb);
     /* Broken pipe: signal only when nothing at all was sent. */
1512 pipe_err:
1513         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1514                 send_sig(SIGPIPE,current,0);
1515         err = -EPIPE;
     /* Common error exit; a partial send still reports the byte count. */
1516 out_err:
1517         if (other)
1518                 sock_put(other);
1519         scm_destroy(siocb->scm);
1520         siocb->scm = NULL;
1521         return sent ? : err;
1522 }
1523
1524 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1525                                   struct msghdr *msg, size_t len)
1526 {
1527         int err;
1528         struct sock *sk = sock->sk;
1529         
1530         err = sock_error(sk);
1531         if (err)
1532                 return err;
1533
1534         if (sk->sk_state != TCP_ESTABLISHED)
1535                 return -ENOTCONN;
1536
1537         if (msg->msg_namelen)
1538                 msg->msg_namelen = 0;
1539
1540         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1541 }
1542                                                                                             
1543 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1544 {
1545         struct unix_sock *u = unix_sk(sk);
1546
1547         msg->msg_namelen = 0;
1548         if (u->addr) {
1549                 msg->msg_namelen = u->addr->len;
1550                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1551         }
1552 }
1553
1554 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1555                               struct msghdr *msg, size_t size,
1556                               int flags)
1557 {
1558         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1559         struct scm_cookie tmp_scm;
1560         struct sock *sk = sock->sk;
1561         struct unix_sock *u = unix_sk(sk);
1562         int noblock = flags & MSG_DONTWAIT;
1563         struct sk_buff *skb;
1564         int err;
1565
1566         err = -EOPNOTSUPP;
1567         if (flags&MSG_OOB)
1568                 goto out;
1569
1570         msg->msg_namelen = 0;
1571
1572         down(&u->readsem);
1573
1574         skb = skb_recv_datagram(sk, flags, noblock, &err);
1575         if (!skb)
1576                 goto out_unlock;
1577
1578         wake_up_interruptible(&u->peer_wait);
1579
1580         if (msg->msg_name)
1581                 unix_copy_addr(msg, skb->sk);
1582
1583         if (size > skb->len)
1584                 size = skb->len;
1585         else if (size < skb->len)
1586                 msg->msg_flags |= MSG_TRUNC;
1587
1588         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1589         if (err)
1590                 goto out_free;
1591
1592         if (!siocb->scm) {
1593                 siocb->scm = &tmp_scm;
1594                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1595         }
1596         siocb->scm->creds = *UNIXCREDS(skb);
1597
1598         if (!(flags & MSG_PEEK))
1599         {
1600                 if (UNIXCB(skb).fp)
1601                         unix_detach_fds(siocb->scm, skb);
1602         }
1603         else 
1604         {
1605                 /* It is questionable: on PEEK we could:
1606                    - do not return fds - good, but too simple 8)
1607                    - return fds, and do not return them on read (old strategy,
1608                      apparently wrong)
1609                    - clone fds (I chose it for now, it is the most universal
1610                      solution)
1611                 
1612                    POSIX 1003.1g does not actually define this clearly
1613                    at all. POSIX 1003.1g doesn't define a lot of things
1614                    clearly however!                  
1615                    
1616                 */
1617                 if (UNIXCB(skb).fp)
1618                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1619         }
1620         err = size;
1621
1622         scm_recv(sock, msg, siocb->scm, flags);
1623
1624 out_free:
1625         skb_free_datagram(sk,skb);
1626 out_unlock:
1627         up(&u->readsem);
1628 out:
1629         return err;
1630 }
1631
1632 /*
1633  *      Sleep until data has arrive. But check for races..
1634  */
1635  
1636 static long unix_stream_data_wait(struct sock * sk, long timeo)
1637 {
1638         DEFINE_WAIT(wait);
1639
1640         unix_state_rlock(sk);
1641
1642         for (;;) {
1643                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1644
1645                 if (skb_queue_len(&sk->sk_receive_queue) ||
1646                     sk->sk_err ||
1647                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1648                     signal_pending(current) ||
1649                     !timeo)
1650                         break;
1651
1652                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1653                 unix_state_runlock(sk);
1654                 timeo = schedule_timeout(timeo);
1655                 unix_state_rlock(sk);
1656                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1657         }
1658
1659         finish_wait(sk->sk_sleep, &wait);
1660         unix_state_runlock(sk);
1661         return timeo;
1662 }
1663
1664
1665
/*
 *      Receive data from a connected stream socket.
 *
 *      skbs are taken off the receive queue one at a time and copied to
 *      the iovec until size bytes were copied, the queue is empty with at
 *      least the low-water target satisfied, or a boundary is hit
 *      (different sender credentials, passed files, MSG_PEEK).  Partially
 *      consumed or unusable skbs are put back at the head of the queue.
 *      The read semaphore is held while the queue is manipulated and
 *      released while sleeping for more data.
 *
 *      Returns the number of bytes copied if non-zero, otherwise err
 *      (-EINVAL when not connected, -EOPNOTSUPP for MSG_OOB, -EAGAIN,
 *      a pending socket error, or the result of sock_intr_errno()).
 *      A failing copy with nothing copied yet returns -EFAULT.
 */
1666 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1667                                struct msghdr *msg, size_t size,
1668                                int flags)
1669 {
1670         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1671         struct scm_cookie tmp_scm;
1672         struct sock *sk = sock->sk;
1673         struct unix_sock *u = unix_sk(sk);
1674         struct sockaddr_un *sunaddr=msg->msg_name;
1675         int copied = 0;
1676         int check_creds = 0;
1677         int target;
1678         int err = 0;
1679         long timeo;
1680
1681         err = -EINVAL;
1682         if (sk->sk_state != TCP_ESTABLISHED)
1683                 goto out;
1684
1685         err = -EOPNOTSUPP;
1686         if (flags&MSG_OOB)
1687                 goto out;
1688
1689         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1690         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1691
1692         msg->msg_namelen = 0;
1693
1694         /* Lock the socket to prevent queue disordering
1695          * while sleeps in memcpy_tomsg
1696          */
1697
             /* Use a zeroed on-stack cookie when the iocb does not carry one. */
1698         if (!siocb->scm) {
1699                 siocb->scm = &tmp_scm;
1700                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1701         }
1702
1703         down(&u->readsem);
1704
1705         do
1706         {
1707                 int chunk;
1708                 struct sk_buff *skb;
1709
1710                 skb = skb_dequeue(&sk->sk_receive_queue);
1711                 if (skb==NULL)
1712                 {
                             /* Queue empty: done if the low-water target is met. */
1713                         if (copied >= target)
1714                                 break;
1715
1716                         /*
1717                          *      POSIX 1003.1g mandates this order.
1718                          */
1719                          
1720                         if ((err = sock_error(sk)) != 0)
1721                                 break;
1722                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1723                                 break;
1724                         err = -EAGAIN;
1725                         if (!timeo)
1726                                 break;
                             /* Do not hold the semaphore while sleeping. */
1727                         up(&u->readsem);
1728
1729                         timeo = unix_stream_data_wait(sk, timeo);
1730
                             /* The semaphore is not held here, hence the jump past up(). */
1731                         if (signal_pending(current)) {
1732                                 err = sock_intr_errno(timeo);
1733                                 goto out;
1734                         }
1735                         down(&u->readsem);
1736                         continue;
1737                 }
1738
1739                 if (check_creds) {
1740                         /* Never glue messages from different writers */
1741                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1742                                 skb_queue_head(&sk->sk_receive_queue, skb);
1743                                 break;
1744                         }
1745                 } else {
1746                         /* Copy credentials */
1747                         siocb->scm->creds = *UNIXCREDS(skb);
1748                         check_creds = 1;
1749                 }
1750
1751                 /* Copy address just once */
1752                 if (sunaddr)
1753                 {
1754                         unix_copy_addr(msg, skb->sk);
1755                         sunaddr = NULL;
1756                 }
1757
1758                 chunk = min_t(unsigned int, skb->len, size);
1759                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
                             /* Copy failed: keep the data queued for a later read. */
1760                         skb_queue_head(&sk->sk_receive_queue, skb);
1761                         if (copied == 0)
1762                                 copied = -EFAULT;
1763                         break;
1764                 }
1765                 copied += chunk;
1766                 size -= chunk;
1767
1768                 /* Mark read part of skb as used */
1769                 if (!(flags & MSG_PEEK))
1770                 {
1771                         skb_pull(skb, chunk);
1772
1773                         if (UNIXCB(skb).fp)
1774                                 unix_detach_fds(siocb->scm, skb);
1775
1776                         /* put the skb back if we didn't use it up.. */
1777                         if (skb->len)
1778                         {
1779                                 skb_queue_head(&sk->sk_receive_queue, skb);
1780                                 break;
1781                         }
1782
1783                         kfree_skb(skb);
1784
                             /* Stop after an skb that carried passed files. */
1785                         if (siocb->scm->fp)
1786                                 break;
1787                 }
1788                 else
1789                 {
1790                         /* It is questionable, see note in unix_dgram_recvmsg.
1791                          */
1792                         if (UNIXCB(skb).fp)
1793                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1794
1795                         /* put message back and return */
1796                         skb_queue_head(&sk->sk_receive_queue, skb);
1797                         break;
1798                 }
1799         } while (size);
1800
1801         up(&u->readsem);
1802         scm_recv(sock, msg, siocb->scm, flags);
1803 out:
1804         return copied ? : err;
1805 }
1806
1807 static int unix_shutdown(struct socket *sock, int mode)
1808 {
1809         struct sock *sk = sock->sk;
1810         struct sock *other;
1811
1812         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1813
1814         if (mode) {
1815                 unix_state_wlock(sk);
1816                 sk->sk_shutdown |= mode;
1817                 other=unix_peer(sk);
1818                 if (other)
1819                         sock_hold(other);
1820                 unix_state_wunlock(sk);
1821                 sk->sk_state_change(sk);
1822
1823                 if (other &&
1824                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1825
1826                         int peer_mode = 0;
1827
1828                         if (mode&RCV_SHUTDOWN)
1829                                 peer_mode |= SEND_SHUTDOWN;
1830                         if (mode&SEND_SHUTDOWN)
1831                                 peer_mode |= RCV_SHUTDOWN;
1832                         unix_state_wlock(other);
1833                         other->sk_shutdown |= peer_mode;
1834                         unix_state_wunlock(other);
1835                         other->sk_state_change(other);
1836                         read_lock(&other->sk_callback_lock);
1837                         if (peer_mode == SHUTDOWN_MASK)
1838                                 sk_wake_async(other,1,POLL_HUP);
1839                         else if (peer_mode & RCV_SHUTDOWN)
1840                                 sk_wake_async(other,1,POLL_IN);
1841                         read_unlock(&other->sk_callback_lock);
1842                 }
1843                 if (other)
1844                         sock_put(other);
1845         }
1846         return 0;
1847 }
1848
1849 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1850 {
1851         struct sock *sk = sock->sk;
1852         long amount=0;
1853         int err;
1854
1855         switch(cmd)
1856         {
1857                 case SIOCOUTQ:
1858                         amount = atomic_read(&sk->sk_wmem_alloc);
1859                         err = put_user(amount, (int __user *)arg);
1860                         break;
1861                 case SIOCINQ:
1862                 {
1863                         struct sk_buff *skb;
1864
1865                         if (sk->sk_state == TCP_LISTEN) {
1866                                 err = -EINVAL;
1867                                 break;
1868                         }
1869
1870                         spin_lock(&sk->sk_receive_queue.lock);
1871                         if (sk->sk_type == SOCK_STREAM ||
1872                             sk->sk_type == SOCK_SEQPACKET) {
1873                                 skb_queue_walk(&sk->sk_receive_queue, skb)
1874                                         amount += skb->len;
1875                         } else {
1876                                 skb = skb_peek(&sk->sk_receive_queue);
1877                                 if (skb)
1878                                         amount=skb->len;
1879                         }
1880                         spin_unlock(&sk->sk_receive_queue.lock);
1881                         err = put_user(amount, (int __user *)arg);
1882                         break;
1883                 }
1884
1885                 default:
1886                         err = dev_ioctl(cmd, (void __user *)arg);
1887                         break;
1888         }
1889         return err;
1890 }
1891
1892 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1893 {
1894         struct sock *sk = sock->sk;
1895         unsigned int mask;
1896
1897         poll_wait(file, sk->sk_sleep, wait);
1898         mask = 0;
1899
1900         /* exceptional events? */
1901         if (sk->sk_err)
1902                 mask |= POLLERR;
1903         if (sk->sk_shutdown == SHUTDOWN_MASK)
1904                 mask |= POLLHUP;
1905
1906         /* readable? */
1907         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1908             (sk->sk_shutdown & RCV_SHUTDOWN))
1909                 mask |= POLLIN | POLLRDNORM;
1910
1911         /* Connection-based need to check for termination and startup */
1912         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1913                 mask |= POLLHUP;
1914
1915         /*
1916          * we set writable also when the other side has shut down the
1917          * connection. This prevents stuck sockets.
1918          */
1919         if (unix_writable(sk))
1920                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1921
1922         return mask;
1923 }
1924
1925
1926 #ifdef CONFIG_PROC_FS
1927 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1928 {
1929         loff_t off = 0;
1930         struct sock *s;
1931
1932         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1933                 if (off == pos) 
1934                         return s;
1935                 ++off;
1936         }
1937         return NULL;
1938 }
1939
1940
1941 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1942 {
1943         read_lock(&unix_table_lock);
1944         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1945 }
1946
1947 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1948 {
1949         ++*pos;
1950
1951         if (v == (void *)1) 
1952                 return first_unix_socket(seq->private);
1953         return next_unix_socket(seq->private, v);
1954 }
1955
1956 static void unix_seq_stop(struct seq_file *seq, void *v)
1957 {
1958         read_unlock(&unix_table_lock);
1959 }
1960
1961 static int unix_seq_show(struct seq_file *seq, void *v)
1962 {
1963         
1964         if (v == (void *)1)
1965                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1966                          "Inode Path\n");
1967         else {
1968                 struct sock *s = v;
1969                 struct unix_sock *u = unix_sk(s);
1970                 unix_state_rlock(s);
1971
1972                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1973                         s,
1974                         atomic_read(&s->sk_refcnt),
1975                         0,
1976                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1977                         s->sk_type,
1978                         s->sk_socket ?
1979                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1980                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1981                         sock_i_ino(s));
1982
1983                 if (u->addr) {
1984                         int i, len;
1985                         seq_putc(seq, ' ');
1986
1987                         i = 0;
1988                         len = u->addr->len - sizeof(short);
1989                         if (!UNIX_ABSTRACT(s))
1990                                 len--;
1991                         else {
1992                                 seq_putc(seq, '@');
1993                                 i++;
1994                         }
1995                         for ( ; i < len; i++)
1996                                 seq_putc(seq, u->addr->name->sun_path[i]);
1997                 }
1998                 unix_state_runlock(s);
1999                 seq_putc(seq, '\n');
2000         }
2001
2002         return 0;
2003 }
2004
2005 static struct seq_operations unix_seq_ops = {
2006         .start  = unix_seq_start,
2007         .next   = unix_seq_next,
2008         .stop   = unix_seq_stop,
2009         .show   = unix_seq_show,
2010 };
2011
2012
2013 static int unix_seq_open(struct inode *inode, struct file *file)
2014 {
2015         struct seq_file *seq;
2016         int rc = -ENOMEM;
2017         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
2018
2019         if (!iter)
2020                 goto out;
2021
2022         rc = seq_open(file, &unix_seq_ops);
2023         if (rc)
2024                 goto out_kfree;
2025
2026         seq          = file->private_data;
2027         seq->private = iter;
2028         *iter = 0;
2029 out:
2030         return rc;
2031 out_kfree:
2032         kfree(iter);
2033         goto out;
2034 }
2035
2036 static struct file_operations unix_seq_fops = {
2037         .owner          = THIS_MODULE,
2038         .open           = unix_seq_open,
2039         .read           = seq_read,
2040         .llseek         = seq_lseek,
2041         .release        = seq_release_private,
2042 };
2043
2044 #endif
2045
2046 static struct net_proto_family unix_family_ops = {
2047         .family = PF_UNIX,
2048         .create = unix_create,
2049         .owner  = THIS_MODULE,
2050 };
2051
/*
 *	sysctl hooks: defined elsewhere when CONFIG_SYSCTL is set, otherwise
 *	empty stubs so that init/exit can call them unconditionally.
 */
#ifdef CONFIG_SYSCTL
void unix_sysctl_register(void);
void unix_sysctl_unregister(void);
#else
static inline void unix_sysctl_register(void)
{
}

static inline void unix_sysctl_unregister(void)
{
}
#endif
2059
2060 static int __init af_unix_init(void)
2061 {
2062         int rc = -1;
2063         struct sk_buff *dummy_skb;
2064
2065         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2066                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2067                 goto out;
2068         }
2069
2070         rc = proto_register(&unix_proto, 1);
2071         if (rc != 0) {
2072                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2073                        __FUNCTION__);
2074                 goto out;
2075         }
2076
2077         sock_register(&unix_family_ops);
2078 #ifdef CONFIG_PROC_FS
2079         proc_net_fops_create("unix", 0, &unix_seq_fops);
2080 #endif
2081         unix_sysctl_register();
2082 out:
2083         return rc;
2084 }
2085
2086 static void __exit af_unix_exit(void)
2087 {
2088         sock_unregister(PF_UNIX);
2089         unix_sysctl_unregister();
2090         proc_net_remove("unix");
2091         proto_unregister(&unix_proto);
2092 }
2093
2094 module_init(af_unix_init);
2095 module_exit(af_unix_exit);
2096
2097 MODULE_LICENSE("GPL");
2098 MODULE_ALIAS_NETPROTO(PF_UNIX);