net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
  32  *           Alexey Kuznetosv   :       Repaired (I hope) bugs introduces
  33  *                                      by above two patches.
  34  *           Andrea Arcangeli   :       If possible we block in connect(2)
  35  *                                      if the max backlog of the listen socket
  36  *                                      is been reached. This won't break
  37  *                                      old apps and it will avoid huge amount
  38  *                                      of socks hashed (this for unix_gc()
  39  *                                      performances reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #include <linux/module.h>
  84 #include <linux/kernel.h>
  85 #include <linux/signal.h>
  86 #include <linux/sched.h>
  87 #include <linux/errno.h>
  88 #include <linux/string.h>
  89 #include <linux/stat.h>
  90 #include <linux/dcache.h>
  91 #include <linux/namei.h>
  92 #include <linux/socket.h>
  93 #include <linux/un.h>
  94 #include <linux/fcntl.h>
  95 #include <linux/termios.h>
  96 #include <linux/sockios.h>
  97 #include <linux/net.h>
  98 #include <linux/in.h>
  99 #include <linux/fs.h>
 100 #include <linux/slab.h>
 101 #include <asm/uaccess.h>
 102 #include <linux/skbuff.h>
 103 #include <linux/netdevice.h>
 104 #include <net/net_namespace.h>
 105 #include <net/sock.h>
 106 #include <net/tcp_states.h>
 107 #include <net/af_unix.h>
 108 #include <linux/proc_fs.h>
 109 #include <linux/seq_file.h>
 110 #include <net/scm.h>
 111 #include <linux/init.h>
 112 #include <linux/poll.h>
 113 #include <linux/rtnetlink.h>
 114 #include <linux/mount.h>
 115 #include <net/checksum.h>
 116 #include <linux/security.h>
 117
 118 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 119 EXPORT_SYMBOL_GPL(unix_socket_table);
 120 DEFINE_SPINLOCK(unix_table_lock);
 121 EXPORT_SYMBOL_GPL(unix_table_lock);
 122 static atomic_long_t unix_nr_socks;
 123
 124
 125 static struct hlist_head *unix_sockets_unbound(void *addr)
 126 {
 127         unsigned long hash = (unsigned long)addr;
 128
 129         hash ^= hash >> 16;
 130         hash ^= hash >> 8;
 131         hash %= UNIX_HASH_SIZE;
 132         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 133 }
 134
 135 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 136
 137 #ifdef CONFIG_SECURITY_NETWORK
 138 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 139 {
 140         memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
 141 }
 142
 143 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 144 {
 145         scm->secid = *UNIXSID(skb);
 146 }
 147 #else
 148 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 149 { }
 150
 151 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 152 { }
 153 #endif /* CONFIG_SECURITY_NETWORK */
 154
 155 /*
 156  *  SMP locking strategy:
 157  *    hash table is protected with spinlock unix_table_lock
 158  *    each socket state is protected by separate spin lock.
 159  */
 160
 161 static inline unsigned int unix_hash_fold(__wsum n)
 162 {
 163         unsigned int hash = (__force unsigned int)n;
 164
 165         hash ^= hash>>16;
 166         hash ^= hash>>8;
 167         return hash&(UNIX_HASH_SIZE-1);
 168 }
 169
 170 #define unix_peer(sk) (unix_sk(sk)->peer)
 171
 172 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 173 {
 174         return unix_peer(osk) == sk;
 175 }
 176
 177 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 178 {
 179         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 180 }
 181
 182 static inline int unix_recvq_full(struct sock const *sk)
 183 {
 184         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 185 }
 186
 187 struct sock *unix_peer_get(struct sock *s)
 188 {
 189         struct sock *peer;
 190
 191         unix_state_lock(s);
 192         peer = unix_peer(s);
 193         if (peer)
 194                 sock_hold(peer);
 195         unix_state_unlock(s);
 196         return peer;
 197 }
 198 EXPORT_SYMBOL_GPL(unix_peer_get);
 199
 200 static inline void unix_release_addr(struct unix_address *addr)
 201 {
 202         if (atomic_dec_and_test(&addr->refcnt))
 203                 kfree(addr);
 204 }
 205
 206 /*
 207  *      Check unix socket name:
 208  *              - should be not zero length.
 209  *              - if started by not zero, should be NULL terminated (FS object)
 210  *              - if started by zero, it is abstract name.
 211  */
 212
 213 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 214 {
 215         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 216                 return -EINVAL;
 217         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 218                 return -EINVAL;
 219         if (sunaddr->sun_path[0]) {
 220                 /*
 221                  * This may look like an off by one error but it is a bit more
 222                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 223                  * sun_path[108] doesn't as such exist.  However in kernel space
 224                  * we are guaranteed that it is a valid memory location in our
 225                  * kernel address buffer.
 226                  */
 227                 ((char *)sunaddr)[len] = 0;
 228                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 229                 return len;
 230         }
 231
 232         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 233         return len;
 234 }
 235
 236 static void __unix_remove_socket(struct sock *sk)
 237 {
 238         sk_del_node_init(sk);
 239 }
 240
 241 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 242 {
 243         WARN_ON(!sk_unhashed(sk));
 244         sk_add_node(sk, list);
 245 }
 246
 247 static inline void unix_remove_socket(struct sock *sk)
 248 {
 249         spin_lock(&unix_table_lock);
 250         __unix_remove_socket(sk);
 251         spin_unlock(&unix_table_lock);
 252 }
 253
 254 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 255 {
 256         spin_lock(&unix_table_lock);
 257         __unix_insert_socket(list, sk);
 258         spin_unlock(&unix_table_lock);
 259 }
 260
 261 static struct sock *__unix_find_socket_byname(struct net *net,
 262                                               struct sockaddr_un *sunname,
 263                                               int len, int type, unsigned int hash)
 264 {
 265         struct sock *s;
 266
 267         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 268                 struct unix_sock *u = unix_sk(s);
 269
 270                 if (!net_eq(sock_net(s), net))
 271                         continue;
 272
 273                 if (u->addr->len == len &&
 274                     !memcmp(u->addr->name, sunname, len))
 275                         goto found;
 276         }
 277         s = NULL;
 278 found:
 279         return s;
 280 }
 281
 282 static inline struct sock *unix_find_socket_byname(struct net *net,
 283                                                    struct sockaddr_un *sunname,
 284                                                    int len, int type,
 285                                                    unsigned int hash)
 286 {
 287         struct sock *s;
 288
 289         spin_lock(&unix_table_lock);
 290         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 291         if (s)
 292                 sock_hold(s);
 293         spin_unlock(&unix_table_lock);
 294         return s;
 295 }
 296
 297 static struct sock *unix_find_socket_byinode(struct inode *i)
 298 {
 299         struct sock *s;
 300
 301         spin_lock(&unix_table_lock);
 302         sk_for_each(s,
 303                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 304                 struct dentry *dentry = unix_sk(s)->path.dentry;
 305
 306                 if (dentry && dentry->d_inode == i) {
 307                         sock_hold(s);
 308                         goto found;
 309                 }
 310         }
 311         s = NULL;
 312 found:
 313         spin_unlock(&unix_table_lock);
 314         return s;
 315 }
 316
 317 static inline int unix_writable(struct sock *sk)
 318 {
 319         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 320 }
 321
 322 static void unix_write_space(struct sock *sk)
 323 {
 324         struct socket_wq *wq;
 325
 326         rcu_read_lock();
 327         if (unix_writable(sk)) {
 328                 wq = rcu_dereference(sk->sk_wq);
 329                 if (wq_has_sleeper(wq))
 330                         wake_up_interruptible_sync_poll(&wq->wait,
 331                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 332                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 333         }
 334         rcu_read_unlock();
 335 }
 336
 337 /* When dgram socket disconnects (or changes its peer), we clear its receive
 338  * queue of packets arrived from previous peer. First, it allows to do
 339  * flow control based only on wmem_alloc; second, sk connected to peer
 340  * may receive messages only from that peer. */
 341 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 342 {
 343         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 344                 skb_queue_purge(&sk->sk_receive_queue);
 345                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 346
 347                 /* If one link of bidirectional dgram pipe is disconnected,
 348                  * we signal error. Messages are lost. Do not make this,
 349                  * when peer was not connected to us.
 350                  */
 351                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 352                         other->sk_err = ECONNRESET;
 353                         other->sk_error_report(other);
 354                 }
 355         }
 356 }
 357
 358 static void unix_sock_destructor(struct sock *sk)
 359 {
 360         struct unix_sock *u = unix_sk(sk);
 361
 362         skb_queue_purge(&sk->sk_receive_queue);
 363
 364         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 365         WARN_ON(!sk_unhashed(sk));
 366         WARN_ON(sk->sk_socket);
 367         if (!sock_flag(sk, SOCK_DEAD)) {
 368                 printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
 369                 return;
 370         }
 371
 372         if (u->addr)
 373                 unix_release_addr(u->addr);
 374
 375         atomic_long_dec(&unix_nr_socks);
 376         local_bh_disable();
 377         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 378         local_bh_enable();
 379 #ifdef UNIX_REFCNT_DEBUG
 380         printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
 381                 atomic_long_read(&unix_nr_socks));
 382 #endif
 383 }
 384
 385 static int unix_release_sock(struct sock *sk, int embrion)
 386 {
 387         struct unix_sock *u = unix_sk(sk);
 388         struct path path;
 389         struct sock *skpair;
 390         struct sk_buff *skb;
 391         int state;
 392
 393         unix_remove_socket(sk);
 394
 395         /* Clear state */
 396         unix_state_lock(sk);
 397         sock_orphan(sk);
 398         sk->sk_shutdown = SHUTDOWN_MASK;
 399         path         = u->path;
 400         u->path.dentry = NULL;
 401         u->path.mnt = NULL;
 402         state = sk->sk_state;
 403         sk->sk_state = TCP_CLOSE;
 404         unix_state_unlock(sk);
 405
 406         wake_up_interruptible_all(&u->peer_wait);
 407
 408         skpair = unix_peer(sk);
 409
 410         if (skpair != NULL) {
 411                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 412                         unix_state_lock(skpair);
 413                         /* No more writes */
 414                         skpair->sk_shutdown = SHUTDOWN_MASK;
 415                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 416                                 skpair->sk_err = ECONNRESET;
 417                         unix_state_unlock(skpair);
 418                         skpair->sk_state_change(skpair);
 419                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 420                 }
 421                 sock_put(skpair); /* It may now die */
 422                 unix_peer(sk) = NULL;
 423         }
 424
 425         /* Try to flush out this socket. Throw out buffers at least */
 426
 427         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 428                 if (state == TCP_LISTEN)
 429                         unix_release_sock(skb->sk, 1);
 430                 /* passed fds are erased in the kfree_skb hook        */
 431                 kfree_skb(skb);
 432         }
 433
 434         if (path.dentry)
 435                 path_put(&path);
 436
 437         sock_put(sk);
 438
 439         /* ---- Socket is dead now and most probably destroyed ---- */
 440
 441         /*
 442          * Fixme: BSD difference: In BSD all sockets connected to us get
 443          *        ECONNRESET and we die on the spot. In Linux we behave
 444          *        like files and pipes do and wait for the last
 445          *        dereference.
 446          *
 447          * Can't we simply set sock->err?
 448          *
 449          *        What the above comment does talk about? --ANK(980817)
 450          */
 451
 452         if (unix_tot_inflight)
 453                 unix_gc();              /* Garbage collect fds */
 454
 455         return 0;
 456 }
 457
 458 static void init_peercred(struct sock *sk)
 459 {
 460         put_pid(sk->sk_peer_pid);
 461         if (sk->sk_peer_cred)
 462                 put_cred(sk->sk_peer_cred);
 463         sk->sk_peer_pid  = get_pid(task_tgid(current));
 464         sk->sk_peer_cred = get_current_cred();
 465 }
 466
 467 static void copy_peercred(struct sock *sk, struct sock *peersk)
 468 {
 469         put_pid(sk->sk_peer_pid);
 470         if (sk->sk_peer_cred)
 471                 put_cred(sk->sk_peer_cred);
 472         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 473         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 474 }
 475
 476 static int unix_listen(struct socket *sock, int backlog)
 477 {
 478         int err;
 479         struct sock *sk = sock->sk;
 480         struct unix_sock *u = unix_sk(sk);
 481         struct pid *old_pid = NULL;
 482
 483         err = -EOPNOTSUPP;
 484         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 485                 goto out;       /* Only stream/seqpacket sockets accept */
 486         err = -EINVAL;
 487         if (!u->addr)
 488                 goto out;       /* No listens on an unbound socket */
 489         unix_state_lock(sk);
 490         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 491                 goto out_unlock;
 492         if (backlog > sk->sk_max_ack_backlog)
 493                 wake_up_interruptible_all(&u->peer_wait);
 494         sk->sk_max_ack_backlog  = backlog;
 495         sk->sk_state            = TCP_LISTEN;
 496         /* set credentials so connect can copy them */
 497         init_peercred(sk);
 498         err = 0;
 499
 500 out_unlock:
 501         unix_state_unlock(sk);
 502         put_pid(old_pid);
 503 out:
 504         return err;
 505 }
 506
 507 static int unix_release(struct socket *);
 508 static int unix_bind(struct socket *, struct sockaddr *, int);
 509 static int unix_stream_connect(struct socket *, struct sockaddr *,
 510                                int addr_len, int flags);
 511 static int unix_socketpair(struct socket *, struct socket *);
 512 static int unix_accept(struct socket *, struct socket *, int);
 513 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 514 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 515 static unsigned int unix_dgram_poll(struct file *, struct socket *,
 516                                     poll_table *);
 517 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 518 static int unix_shutdown(struct socket *, int);
 519 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
 520                                struct msghdr *, size_t);
 521 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
 522                                struct msghdr *, size_t, int);
 523 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
 524                               struct msghdr *, size_t);
 525 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
 526                               struct msghdr *, size_t, int);
 527 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 528                               int, int);
 529 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
 530                                   struct msghdr *, size_t);
 531 static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
 532                                   struct msghdr *, size_t, int);
 533
 534 static void unix_set_peek_off(struct sock *sk, int val)
 535 {
 536         struct unix_sock *u = unix_sk(sk);
 537
 538         mutex_lock(&u->readlock);
 539         sk->sk_peek_off = val;
 540         mutex_unlock(&u->readlock);
 541 }
 542
 543
 544 static const struct proto_ops unix_stream_ops = {
 545         .family =       PF_UNIX,
 546         .owner =        THIS_MODULE,
 547         .release =      unix_release,
 548         .bind =         unix_bind,
 549         .connect =      unix_stream_connect,
 550         .socketpair =   unix_socketpair,
 551         .accept =       unix_accept,
 552         .getname =      unix_getname,
 553         .poll =         unix_poll,
 554         .ioctl =        unix_ioctl,
 555         .listen =       unix_listen,
 556         .shutdown =     unix_shutdown,
 557         .setsockopt =   sock_no_setsockopt,
 558         .getsockopt =   sock_no_getsockopt,
 559         .sendmsg =      unix_stream_sendmsg,
 560         .recvmsg =      unix_stream_recvmsg,
 561         .mmap =         sock_no_mmap,
 562         .sendpage =     sock_no_sendpage,
 563         .set_peek_off = unix_set_peek_off,
 564 };
 565
 566 static const struct proto_ops unix_dgram_ops = {
 567         .family =       PF_UNIX,
 568         .owner =        THIS_MODULE,
 569         .release =      unix_release,
 570         .bind =         unix_bind,
 571         .connect =      unix_dgram_connect,
 572         .socketpair =   unix_socketpair,
 573         .accept =       sock_no_accept,
 574         .getname =      unix_getname,
 575         .poll =         unix_dgram_poll,
 576         .ioctl =        unix_ioctl,
 577         .listen =       sock_no_listen,
 578         .shutdown =     unix_shutdown,
 579         .setsockopt =   sock_no_setsockopt,
 580         .getsockopt =   sock_no_getsockopt,
 581         .sendmsg =      unix_dgram_sendmsg,
 582         .recvmsg =      unix_dgram_recvmsg,
 583         .mmap =         sock_no_mmap,
 584         .sendpage =     sock_no_sendpage,
 585         .set_peek_off = unix_set_peek_off,
 586 };
 587
 588 static const struct proto_ops unix_seqpacket_ops = {
 589         .family =       PF_UNIX,
 590         .owner =        THIS_MODULE,
 591         .release =      unix_release,
 592         .bind =         unix_bind,
 593         .connect =      unix_stream_connect,
 594         .socketpair =   unix_socketpair,
 595         .accept =       unix_accept,
 596         .getname =      unix_getname,
 597         .poll =         unix_dgram_poll,
 598         .ioctl =        unix_ioctl,
 599         .listen =       unix_listen,
 600         .shutdown =     unix_shutdown,
 601         .setsockopt =   sock_no_setsockopt,
 602         .getsockopt =   sock_no_getsockopt,
 603         .sendmsg =      unix_seqpacket_sendmsg,
 604         .recvmsg =      unix_seqpacket_recvmsg,
 605         .mmap =         sock_no_mmap,
 606         .sendpage =     sock_no_sendpage,
 607         .set_peek_off = unix_set_peek_off,
 608 };
 609
 610 static struct proto unix_proto = {
 611         .name                   = "UNIX",
 612         .owner                  = THIS_MODULE,
 613         .obj_size               = sizeof(struct unix_sock),
 614 };
 615
 616 /*
 617  * AF_UNIX sockets do not interact with hardware, hence they
 618  * dont trigger interrupts - so it's safe for them to have
 619  * bh-unsafe locking for their sk_receive_queue.lock. Split off
 620  * this special lock-class by reinitializing the spinlock key:
 621  */
 622 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 623
 624 static struct sock *unix_create1(struct net *net, struct socket *sock)
 625 {
 626         struct sock *sk = NULL;
 627         struct unix_sock *u;
 628
 629         atomic_long_inc(&unix_nr_socks);
 630         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 631                 goto out;
 632
 633         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
 634         if (!sk)
 635                 goto out;
 636
 637         sock_init_data(sock, sk);
 638         lockdep_set_class(&sk->sk_receive_queue.lock,
 639                                 &af_unix_sk_receive_queue_lock_key);
 640
 641         sk->sk_write_space      = unix_write_space;
 642         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 643         sk->sk_destruct         = unix_sock_destructor;
 644         u         = unix_sk(sk);
 645         u->path.dentry = NULL;
 646         u->path.mnt = NULL;
 647         spin_lock_init(&u->lock);
 648         atomic_long_set(&u->inflight, 0);
 649         INIT_LIST_HEAD(&u->link);
 650         mutex_init(&u->readlock); /* single task reading lock */
 651         init_waitqueue_head(&u->peer_wait);
 652         unix_insert_socket(unix_sockets_unbound(sk), sk);
 653 out:
 654         if (sk == NULL)
 655                 atomic_long_dec(&unix_nr_socks);
 656         else {
 657                 local_bh_disable();
 658                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 659                 local_bh_enable();
 660         }
 661         return sk;
 662 }
 663
 664 static int unix_create(struct net *net, struct socket *sock, int protocol,
 665                        int kern)
 666 {
 667         if (protocol && protocol != PF_UNIX)
 668                 return -EPROTONOSUPPORT;
 669
 670         sock->state = SS_UNCONNECTED;
 671
 672         switch (sock->type) {
 673         case SOCK_STREAM:
 674                 sock->ops = &unix_stream_ops;
 675                 break;
 676                 /*
 677                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 678                  *      nothing uses it.
 679                  */
 680         case SOCK_RAW:
 681                 sock->type = SOCK_DGRAM;
 682         case SOCK_DGRAM:
 683                 sock->ops = &unix_dgram_ops;
 684                 break;
 685         case SOCK_SEQPACKET:
 686                 sock->ops = &unix_seqpacket_ops;
 687                 break;
 688         default:
 689                 return -ESOCKTNOSUPPORT;
 690         }
 691
 692         return unix_create1(net, sock) ? 0 : -ENOMEM;
 693 }
 694
 695 static int unix_release(struct socket *sock)
 696 {
 697         struct sock *sk = sock->sk;
 698
 699         if (!sk)
 700                 return 0;
 701
 702         sock->sk = NULL;
 703
 704         return unix_release_sock(sk, 0);
 705 }
 706
 707 static int unix_autobind(struct socket *sock)
 708 {
 709         struct sock *sk = sock->sk;
 710         struct net *net = sock_net(sk);
 711         struct unix_sock *u = unix_sk(sk);
 712         static u32 ordernum = 1;
 713         struct unix_address *addr;
 714         int err;
 715         unsigned int retries = 0;
 716
 717         mutex_lock(&u->readlock);
 718
 719         err = 0;
 720         if (u->addr)
 721                 goto out;
 722
 723         err = -ENOMEM;
 724         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 725         if (!addr)
 726                 goto out;
 727
 728         addr->name->sun_family = AF_UNIX;
 729         atomic_set(&addr->refcnt, 1);
 730
 731 retry:
 732         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 733         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 734
 735         spin_lock(&unix_table_lock);
 736         ordernum = (ordernum+1)&0xFFFFF;
 737
 738         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 739                                       addr->hash)) {
 740                 spin_unlock(&unix_table_lock);
 741                 /*
 742                  * __unix_find_socket_byname() may take long time if many names
 743                  * are already in use.
 744                  */
 745                 cond_resched();
 746                 /* Give up if all names seems to be in use. */
 747                 if (retries++ == 0xFFFFF) {
 748                         err = -ENOSPC;
 749                         kfree(addr);
 750                         goto out;
 751                 }
 752                 goto retry;
 753         }
 754         addr->hash ^= sk->sk_type;
 755
 756         __unix_remove_socket(sk);
 757         u->addr = addr;
 758         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 759         spin_unlock(&unix_table_lock);
 760         err = 0;
 761
 762 out:    mutex_unlock(&u->readlock);
 763         return err;
 764 }
 765
 766 static struct sock *unix_find_other(struct net *net,
 767                                     struct sockaddr_un *sunname, int len,
 768                                     int type, unsigned int hash, int *error)
 769 {
 770         struct sock *u;
 771         struct path path;
 772         int err = 0;
 773
 774         if (sunname->sun_path[0]) {
 775                 struct inode *inode;
 776                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 777                 if (err)
 778                         goto fail;
 779                 inode = path.dentry->d_inode;
 780                 err = inode_permission(inode, MAY_WRITE);
 781                 if (err)
 782                         goto put_fail;
 783
 784                 err = -ECONNREFUSED;
 785                 if (!S_ISSOCK(inode->i_mode))
 786                         goto put_fail;
 787                 u = unix_find_socket_byinode(inode);
 788                 if (!u)
 789                         goto put_fail;
 790
 791                 if (u->sk_type == type)
 792                         touch_atime(&path);
 793
 794                 path_put(&path);
 795
 796                 err = -EPROTOTYPE;
 797                 if (u->sk_type != type) {
 798                         sock_put(u);
 799                         goto fail;
 800                 }
 801         } else {
 802                 err = -ECONNREFUSED;
 803                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 804                 if (u) {
 805                         struct dentry *dentry;
 806                         dentry = unix_sk(u)->path.dentry;
 807                         if (dentry)
 808                                 touch_atime(&unix_sk(u)->path);
 809                 } else
 810                         goto fail;
 811         }
 812         return u;
 813
 814 put_fail:
 815         path_put(&path);
 816 fail:
 817         *error = err;
 818         return NULL;
 819 }
 820
 821 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 822 {
 823         struct dentry *dentry;
 824         struct path path;
 825         int err = 0;
 826         /*
 827          * Get the parent directory, calculate the hash for last
 828          * component.
 829          */
 830         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 831         err = PTR_ERR(dentry);
 832         if (IS_ERR(dentry))
 833                 return err;
 834
 835         /*
 836          * All right, let's create it.
 837          */
 838         err = security_path_mknod(&path, dentry, mode, 0);
 839         if (!err) {
 840                 err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
 841                 if (!err) {
 842                         res->mnt = mntget(path.mnt);
 843                         res->dentry = dget(dentry);
 844                 }
 845         }
 846         done_path_create(&path, dentry);
 847         return err;
 848 }
 849
 850 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 851 {
 852         struct sock *sk = sock->sk;
 853         struct net *net = sock_net(sk);
 854         struct unix_sock *u = unix_sk(sk);
 855         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 856         char *sun_path = sunaddr->sun_path;
 857         int err;
 858         unsigned int hash;
 859         struct unix_address *addr;
 860         struct hlist_head *list;
 861
 862         err = -EINVAL;
 863         if (sunaddr->sun_family != AF_UNIX)
 864                 goto out;
 865
 866         if (addr_len == sizeof(short)) {
 867                 err = unix_autobind(sock);
 868                 goto out;
 869         }
 870
 871         err = unix_mkname(sunaddr, addr_len, &hash);
 872         if (err < 0)
 873                 goto out;
 874         addr_len = err;
 875
 876         mutex_lock(&u->readlock);
 877
 878         err = -EINVAL;
 879         if (u->addr)
 880                 goto out_up;
 881
 882         err = -ENOMEM;
 883         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 884         if (!addr)
 885                 goto out_up;
 886
 887         memcpy(addr->name, sunaddr, addr_len);
 888         addr->len = addr_len;
 889         addr->hash = hash ^ sk->sk_type;
 890         atomic_set(&addr->refcnt, 1);
 891
 892         if (sun_path[0]) {
 893                 struct path path;
 894                 umode_t mode = S_IFSOCK |
 895                        (SOCK_INODE(sock)->i_mode & ~current_umask());
 896                 err = unix_mknod(sun_path, mode, &path);
 897                 if (err) {
 898                         if (err == -EEXIST)
 899                                 err = -EADDRINUSE;
 900                         unix_release_addr(addr);
 901                         goto out_up;
 902                 }
 903                 addr->hash = UNIX_HASH_SIZE;
 904                 hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
 905                 spin_lock(&unix_table_lock);
 906                 u->path = path;
 907                 list = &unix_socket_table[hash];
 908         } else {
 909                 spin_lock(&unix_table_lock);
 910                 err = -EADDRINUSE;
 911                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
 912                                               sk->sk_type, hash)) {
 913                         unix_release_addr(addr);
 914                         goto out_unlock;
 915                 }
 916
 917                 list = &unix_socket_table[addr->hash];
 918         }
 919
 920         err = 0;
 921         __unix_remove_socket(sk);
 922         u->addr = addr;
 923         __unix_insert_socket(list, sk);
 924
 925 out_unlock:
 926         spin_unlock(&unix_table_lock);
 927 out_up:
 928         mutex_unlock(&u->readlock);
 929 out:
 930         return err;
 931 }
 932
 933 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 934 {
 935         if (unlikely(sk1 == sk2) || !sk2) {
 936                 unix_state_lock(sk1);
 937                 return;
 938         }
 939         if (sk1 < sk2) {
 940                 unix_state_lock(sk1);
 941                 unix_state_lock_nested(sk2);
 942         } else {
 943                 unix_state_lock(sk2);
 944                 unix_state_lock_nested(sk1);
 945         }
 946 }
 947
 948 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
 949 {
 950         if (unlikely(sk1 == sk2) || !sk2) {
 951                 unix_state_unlock(sk1);
 952                 return;
 953         }
 954         unix_state_unlock(sk1);
 955         unix_state_unlock(sk2);
 956 }
 957
 958 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 959                               int alen, int flags)
 960 {
 961         struct sock *sk = sock->sk;
 962         struct net *net = sock_net(sk);
 963         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 964         struct sock *other;
 965         unsigned int hash;
 966         int err;
 967
 968         if (addr->sa_family != AF_UNSPEC) {
 969                 err = unix_mkname(sunaddr, alen, &hash);
 970                 if (err < 0)
 971                         goto out;
 972                 alen = err;
 973
 974                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
 975                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
 976                         goto out;
 977
 978 restart:
 979                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
 980                 if (!other)
 981                         goto out;
 982
 983                 unix_state_double_lock(sk, other);
 984
 985                 /* Apparently VFS overslept socket death. Retry. */
 986                 if (sock_flag(other, SOCK_DEAD)) {
 987                         unix_state_double_unlock(sk, other);
 988                         sock_put(other);
 989                         goto restart;
 990                 }
 991
 992                 err = -EPERM;
 993                 if (!unix_may_send(sk, other))
 994                         goto out_unlock;
 995
 996                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
 997                 if (err)
 998                         goto out_unlock;
 999
1000         } else {
1001                 /*
1002                  *      1003.1g breaking connected state with AF_UNSPEC
1003                  */
1004                 other = NULL;
1005                 unix_state_double_lock(sk, other);
1006         }
1007
1008         /*
1009          * If it was connected, reconnect.
1010          */
1011         if (unix_peer(sk)) {
1012                 struct sock *old_peer = unix_peer(sk);
1013                 unix_peer(sk) = other;
1014                 unix_state_double_unlock(sk, other);
1015
1016                 if (other != old_peer)
1017                         unix_dgram_disconnected(sk, old_peer);
1018                 sock_put(old_peer);
1019         } else {
1020                 unix_peer(sk) = other;
1021                 unix_state_double_unlock(sk, other);
1022         }
1023         return 0;
1024
1025 out_unlock:
1026         unix_state_double_unlock(sk, other);
1027         sock_put(other);
1028 out:
1029         return err;
1030 }
1031
1032 static long unix_wait_for_peer(struct sock *other, long timeo)
1033 {
1034         struct unix_sock *u = unix_sk(other);
1035         int sched;
1036         DEFINE_WAIT(wait);
1037
1038         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1039
1040         sched = !sock_flag(other, SOCK_DEAD) &&
1041                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1042                 unix_recvq_full(other);
1043
1044         unix_state_unlock(other);
1045
1046         if (sched)
1047                 timeo = schedule_timeout(timeo);
1048
1049         finish_wait(&u->peer_wait, &wait);
1050         return timeo;
1051 }
1052
1053 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1054                                int addr_len, int flags)
1055 {
1056         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1057         struct sock *sk = sock->sk;
1058         struct net *net = sock_net(sk);
1059         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1060         struct sock *newsk = NULL;
1061         struct sock *other = NULL;
1062         struct sk_buff *skb = NULL;
1063         unsigned int hash;
1064         int st;
1065         int err;
1066         long timeo;
1067
1068         err = unix_mkname(sunaddr, addr_len, &hash);
1069         if (err < 0)
1070                 goto out;
1071         addr_len = err;
1072
1073         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1074             (err = unix_autobind(sock)) != 0)
1075                 goto out;
1076
1077         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1078
1079         /* First of all allocate resources.
1080            If we will make it after state is locked,
1081            we will have to recheck all again in any case.
1082          */
1083
1084         err = -ENOMEM;
1085
1086         /* create new sock for complete connection */
1087         newsk = unix_create1(sock_net(sk), NULL);
1088         if (newsk == NULL)
1089                 goto out;
1090
1091         /* Allocate skb for sending to listening sock */
1092         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1093         if (skb == NULL)
1094                 goto out;
1095
1096 restart:
1097         /*  Find listening sock. */
1098         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1099         if (!other)
1100                 goto out;
1101
1102         /* Latch state of peer */
1103         unix_state_lock(other);
1104
1105         /* Apparently VFS overslept socket death. Retry. */
1106         if (sock_flag(other, SOCK_DEAD)) {
1107                 unix_state_unlock(other);
1108                 sock_put(other);
1109                 goto restart;
1110         }
1111
1112         err = -ECONNREFUSED;
1113         if (other->sk_state != TCP_LISTEN)
1114                 goto out_unlock;
1115         if (other->sk_shutdown & RCV_SHUTDOWN)
1116                 goto out_unlock;
1117
1118         if (unix_recvq_full(other)) {
1119                 err = -EAGAIN;
1120                 if (!timeo)
1121                         goto out_unlock;
1122
1123                 timeo = unix_wait_for_peer(other, timeo);
1124
1125                 err = sock_intr_errno(timeo);
1126                 if (signal_pending(current))
1127                         goto out;
1128                 sock_put(other);
1129                 goto restart;
1130         }
1131
1132         /* Latch our state.
1133
1134            It is tricky place. We need to grab our state lock and cannot
1135            drop lock on peer. It is dangerous because deadlock is
1136            possible. Connect to self case and simultaneous
1137            attempt to connect are eliminated by checking socket
1138            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1139            check this before attempt to grab lock.
1140
1141            Well, and we have to recheck the state after socket locked.
1142          */
1143         st = sk->sk_state;
1144
1145         switch (st) {
1146         case TCP_CLOSE:
1147                 /* This is ok... continue with connect */
1148                 break;
1149         case TCP_ESTABLISHED:
1150                 /* Socket is already connected */
1151                 err = -EISCONN;
1152                 goto out_unlock;
1153         default:
1154                 err = -EINVAL;
1155                 goto out_unlock;
1156         }
1157
1158         unix_state_lock_nested(sk);
1159
1160         if (sk->sk_state != st) {
1161                 unix_state_unlock(sk);
1162                 unix_state_unlock(other);
1163                 sock_put(other);
1164                 goto restart;
1165         }
1166
1167         err = security_unix_stream_connect(sk, other, newsk);
1168         if (err) {
1169                 unix_state_unlock(sk);
1170                 goto out_unlock;
1171         }
1172
1173         /* The way is open! Fastly set all the necessary fields... */
1174
1175         sock_hold(sk);
1176         unix_peer(newsk)        = sk;
1177         newsk->sk_state         = TCP_ESTABLISHED;
1178         newsk->sk_type          = sk->sk_type;
1179         init_peercred(newsk);
1180         newu = unix_sk(newsk);
1181         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1182         otheru = unix_sk(other);
1183
1184         /* copy address information from listening to new sock*/
1185         if (otheru->addr) {
1186                 atomic_inc(&otheru->addr->refcnt);
1187                 newu->addr = otheru->addr;
1188         }
1189         if (otheru->path.dentry) {
1190                 path_get(&otheru->path);
1191                 newu->path = otheru->path;
1192         }
1193
1194         /* Set credentials */
1195         copy_peercred(sk, other);
1196
1197         sock->state     = SS_CONNECTED;
1198         sk->sk_state    = TCP_ESTABLISHED;
1199         sock_hold(newsk);
1200
1201         smp_mb__after_atomic_inc();     /* sock_hold() does an atomic_inc() */
1202         unix_peer(sk)   = newsk;
1203
1204         unix_state_unlock(sk);
1205
1206         /* take ten and and send info to listening sock */
1207         spin_lock(&other->sk_receive_queue.lock);
1208         __skb_queue_tail(&other->sk_receive_queue, skb);
1209         spin_unlock(&other->sk_receive_queue.lock);
1210         unix_state_unlock(other);
1211         other->sk_data_ready(other, 0);
1212         sock_put(other);
1213         return 0;
1214
1215 out_unlock:
1216         if (other)
1217                 unix_state_unlock(other);
1218
1219 out:
1220         kfree_skb(skb);
1221         if (newsk)
1222                 unix_release_sock(newsk, 0);
1223         if (other)
1224                 sock_put(other);
1225         return err;
1226 }
1227
1228 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1229 {
1230         struct sock *ska = socka->sk, *skb = sockb->sk;
1231
1232         /* Join our sockets back to back */
1233         sock_hold(ska);
1234         sock_hold(skb);
1235         unix_peer(ska) = skb;
1236         unix_peer(skb) = ska;
1237         init_peercred(ska);
1238         init_peercred(skb);
1239
1240         if (ska->sk_type != SOCK_DGRAM) {
1241                 ska->sk_state = TCP_ESTABLISHED;
1242                 skb->sk_state = TCP_ESTABLISHED;
1243                 socka->state  = SS_CONNECTED;
1244                 sockb->state  = SS_CONNECTED;
1245         }
1246         return 0;
1247 }
1248
1249 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1250 {
1251         struct sock *sk = sock->sk;
1252         struct sock *tsk;
1253         struct sk_buff *skb;
1254         int err;
1255
1256         err = -EOPNOTSUPP;
1257         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1258                 goto out;
1259
1260         err = -EINVAL;
1261         if (sk->sk_state != TCP_LISTEN)
1262                 goto out;
1263
1264         /* If socket state is TCP_LISTEN it cannot change (for now...),
1265          * so that no locks are necessary.
1266          */
1267
1268         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1269         if (!skb) {
1270                 /* This means receive shutdown. */
1271                 if (err == 0)
1272                         err = -EINVAL;
1273                 goto out;
1274         }
1275
1276         tsk = skb->sk;
1277         skb_free_datagram(sk, skb);
1278         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1279
1280         /* attach accepted sock to socket */
1281         unix_state_lock(tsk);
1282         newsock->state = SS_CONNECTED;
1283         sock_graft(tsk, newsock);
1284         unix_state_unlock(tsk);
1285         return 0;
1286
1287 out:
1288         return err;
1289 }
1290
1291
1292 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1293 {
1294         struct sock *sk = sock->sk;
1295         struct unix_sock *u;
1296         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1297         int err = 0;
1298
1299         if (peer) {
1300                 sk = unix_peer_get(sk);
1301
1302                 err = -ENOTCONN;
1303                 if (!sk)
1304                         goto out;
1305                 err = 0;
1306         } else {
1307                 sock_hold(sk);
1308         }
1309
1310         u = unix_sk(sk);
1311         unix_state_lock(sk);
1312         if (!u->addr) {
1313                 sunaddr->sun_family = AF_UNIX;
1314                 sunaddr->sun_path[0] = 0;
1315                 *uaddr_len = sizeof(short);
1316         } else {
1317                 struct unix_address *addr = u->addr;
1318
1319                 *uaddr_len = addr->len;
1320                 memcpy(sunaddr, addr->name, *uaddr_len);
1321         }
1322         unix_state_unlock(sk);
1323         sock_put(sk);
1324 out:
1325         return err;
1326 }
1327
1328 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1329 {
1330         int i;
1331
1332         scm->fp = UNIXCB(skb).fp;
1333         UNIXCB(skb).fp = NULL;
1334
1335         for (i = scm->fp->count-1; i >= 0; i--)
1336                 unix_notinflight(scm->fp->fp[i]);
1337 }
1338
1339 static void unix_destruct_scm(struct sk_buff *skb)
1340 {
1341         struct scm_cookie scm;
1342         memset(&scm, 0, sizeof(scm));
1343         scm.pid  = UNIXCB(skb).pid;
1344         scm.cred = UNIXCB(skb).cred;
1345         if (UNIXCB(skb).fp)
1346                 unix_detach_fds(&scm, skb);
1347
1348         /* Alas, it calls VFS */
1349         /* So fscking what? fput() had been SMP-safe since the last Summer */
1350         scm_destroy(&scm);
1351         sock_wfree(skb);
1352 }
1353
1354 #define MAX_RECURSION_LEVEL 4
1355
1356 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1357 {
1358         int i;
1359         unsigned char max_level = 0;
1360         int unix_sock_count = 0;
1361
1362         for (i = scm->fp->count - 1; i >= 0; i--) {
1363                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1364
1365                 if (sk) {
1366                         unix_sock_count++;
1367                         max_level = max(max_level,
1368                                         unix_sk(sk)->recursion_level);
1369                 }
1370         }
1371         if (unlikely(max_level > MAX_RECURSION_LEVEL))
1372                 return -ETOOMANYREFS;
1373
1374         /*
1375          * Need to duplicate file references for the sake of garbage
1376          * collection.  Otherwise a socket in the fps might become a
1377          * candidate for GC while the skb is not yet queued.
1378          */
1379         UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1380         if (!UNIXCB(skb).fp)
1381                 return -ENOMEM;
1382
1383         if (unix_sock_count) {
1384                 for (i = scm->fp->count - 1; i >= 0; i--)
1385                         unix_inflight(scm->fp->fp[i]);
1386         }
1387         return max_level;
1388 }
1389
1390 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1391 {
1392         int err = 0;
1393
1394         UNIXCB(skb).pid  = get_pid(scm->pid);
1395         if (scm->cred)
1396                 UNIXCB(skb).cred = get_cred(scm->cred);
1397         UNIXCB(skb).fp = NULL;
1398         if (scm->fp && send_fds)
1399                 err = unix_attach_fds(scm, skb);
1400
1401         skb->destructor = unix_destruct_scm;
1402         return err;
1403 }
1404
1405 /*
1406  * Some apps rely on write() giving SCM_CREDENTIALS
1407  * We include credentials if source or destination socket
1408  * asserted SOCK_PASSCRED.
1409  */
1410 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1411                             const struct sock *other)
1412 {
1413         if (UNIXCB(skb).cred)
1414                 return;
1415         if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1416             !other->sk_socket ||
1417             test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1418                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1419                 UNIXCB(skb).cred = get_current_cred();
1420         }
1421 }
1422
1423 /*
1424  *      Send AF_UNIX data.
1425  */
1426
1427 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1428                               struct msghdr *msg, size_t len)
1429 {
1430         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1431         struct sock *sk = sock->sk;
1432         struct net *net = sock_net(sk);
1433         struct unix_sock *u = unix_sk(sk);
1434         struct sockaddr_un *sunaddr = msg->msg_name;
1435         struct sock *other = NULL;
1436         int namelen = 0; /* fake GCC */
1437         int err;
1438         unsigned int hash;
1439         struct sk_buff *skb;
1440         long timeo;
1441         struct scm_cookie tmp_scm;
1442         int max_level;
1443         int data_len = 0;
1444
1445         if (NULL == siocb->scm)
1446                 siocb->scm = &tmp_scm;
1447         wait_for_unix_gc();
1448         err = scm_send(sock, msg, siocb->scm, false);
1449         if (err < 0)
1450                 return err;
1451
1452         err = -EOPNOTSUPP;
1453         if (msg->msg_flags&MSG_OOB)
1454                 goto out;
1455
1456         if (msg->msg_namelen) {
1457                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1458                 if (err < 0)
1459                         goto out;
1460                 namelen = err;
1461         } else {
1462                 sunaddr = NULL;
1463                 err = -ENOTCONN;
1464                 other = unix_peer_get(sk);
1465                 if (!other)
1466                         goto out;
1467         }
1468
1469         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1470             && (err = unix_autobind(sock)) != 0)
1471                 goto out;
1472
1473         err = -EMSGSIZE;
1474         if (len > sk->sk_sndbuf - 32)
1475                 goto out;
1476
1477         if (len > SKB_MAX_ALLOC)
1478                 data_len = min_t(size_t,
1479                                  len - SKB_MAX_ALLOC,
1480                                  MAX_SKB_FRAGS * PAGE_SIZE);
1481
1482         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1483                                    msg->msg_flags & MSG_DONTWAIT, &err);
1484         if (skb == NULL)
1485                 goto out;
1486
1487         err = unix_scm_to_skb(siocb->scm, skb, true);
1488         if (err < 0)
1489                 goto out_free;
1490         max_level = err + 1;
1491         unix_get_secdata(siocb->scm, skb);
1492
1493         skb_put(skb, len - data_len);
1494         skb->data_len = data_len;
1495         skb->len = len;
1496         err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
1497         if (err)
1498                 goto out_free;
1499
1500         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1501
1502 restart:
1503         if (!other) {
1504                 err = -ECONNRESET;
1505                 if (sunaddr == NULL)
1506                         goto out_free;
1507
1508                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1509                                         hash, &err);
1510                 if (other == NULL)
1511                         goto out_free;
1512         }
1513
1514         if (sk_filter(other, skb) < 0) {
1515                 /* Toss the packet but do not return any error to the sender */
1516                 err = len;
1517                 goto out_free;
1518         }
1519
1520         unix_state_lock(other);
1521         err = -EPERM;
1522         if (!unix_may_send(sk, other))
1523                 goto out_unlock;
1524
1525         if (sock_flag(other, SOCK_DEAD)) {
1526                 /*
1527                  *      Check with 1003.1g - what should
1528                  *      datagram error
1529                  */
1530                 unix_state_unlock(other);
1531                 sock_put(other);
1532
1533                 err = 0;
1534                 unix_state_lock(sk);
1535                 if (unix_peer(sk) == other) {
1536                         unix_peer(sk) = NULL;
1537                         unix_state_unlock(sk);
1538
1539                         unix_dgram_disconnected(sk, other);
1540                         sock_put(other);
1541                         err = -ECONNREFUSED;
1542                 } else {
1543                         unix_state_unlock(sk);
1544                 }
1545
1546                 other = NULL;
1547                 if (err)
1548                         goto out_free;
1549                 goto restart;
1550         }
1551
1552         err = -EPIPE;
1553         if (other->sk_shutdown & RCV_SHUTDOWN)
1554                 goto out_unlock;
1555
1556         if (sk->sk_type != SOCK_SEQPACKET) {
1557                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1558                 if (err)
1559                         goto out_unlock;
1560         }
1561
1562         if (unix_peer(other) != sk && unix_recvq_full(other)) {
1563                 if (!timeo) {
1564                         err = -EAGAIN;
1565                         goto out_unlock;
1566                 }
1567
1568                 timeo = unix_wait_for_peer(other, timeo);
1569
1570                 err = sock_intr_errno(timeo);
1571                 if (signal_pending(current))
1572                         goto out_free;
1573
1574                 goto restart;
1575         }
1576
1577         if (sock_flag(other, SOCK_RCVTSTAMP))
1578                 __net_timestamp(skb);
1579         maybe_add_creds(skb, sock, other);
1580         skb_queue_tail(&other->sk_receive_queue, skb);
1581         if (max_level > unix_sk(other)->recursion_level)
1582                 unix_sk(other)->recursion_level = max_level;
1583         unix_state_unlock(other);
1584         other->sk_data_ready(other, len);
1585         sock_put(other);
1586         scm_destroy(siocb->scm);
1587         return len;
1588
1589 out_unlock:
1590         unix_state_unlock(other);
1591 out_free:
1592         kfree_skb(skb);
1593 out:
1594         if (other)
1595                 sock_put(other);
1596         scm_destroy(siocb->scm);
1597         return err;
1598 }
1599
1600
1601 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1602                                struct msghdr *msg, size_t len)
1603 {
1604         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1605         struct sock *sk = sock->sk;
1606         struct sock *other = NULL;
1607         int err, size;
1608         struct sk_buff *skb;
1609         int sent = 0;
1610         struct scm_cookie tmp_scm;
1611         bool fds_sent = false;
1612         int max_level;
1613
1614         if (NULL == siocb->scm)
1615                 siocb->scm = &tmp_scm;
1616         wait_for_unix_gc();
1617         err = scm_send(sock, msg, siocb->scm, false);
1618         if (err < 0)
1619                 return err;
1620
1621         err = -EOPNOTSUPP;
1622         if (msg->msg_flags&MSG_OOB)
1623                 goto out_err;
1624
1625         if (msg->msg_namelen) {
1626                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1627                 goto out_err;
1628         } else {
1629                 err = -ENOTCONN;
1630                 other = unix_peer(sk);
1631                 if (!other)
1632                         goto out_err;
1633         }
1634
1635         if (sk->sk_shutdown & SEND_SHUTDOWN)
1636                 goto pipe_err;
1637
1638         while (sent < len) {
1639                 /*
1640                  *      Optimisation for the fact that under 0.01% of X
1641                  *      messages typically need breaking up.
1642                  */
1643
1644                 size = len-sent;
1645
1646                 /* Keep two messages in the pipe so it schedules better */
1647                 if (size > ((sk->sk_sndbuf >> 1) - 64))
1648                         size = (sk->sk_sndbuf >> 1) - 64;
1649
1650                 if (size > SKB_MAX_ALLOC)
1651                         size = SKB_MAX_ALLOC;
1652
1653                 /*
1654                  *      Grab a buffer
1655                  */
1656
1657                 skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
1658                                           &err);
1659
1660                 if (skb == NULL)
1661                         goto out_err;
1662
1663                 /*
1664                  *      If you pass two values to the sock_alloc_send_skb
1665                  *      it tries to grab the large buffer with GFP_NOFS
1666                  *      (which can fail easily), and if it fails grab the
1667                  *      fallback size buffer which is under a page and will
1668                  *      succeed. [Alan]
1669                  */
1670                 size = min_t(int, size, skb_tailroom(skb));
1671
1672
1673                 /* Only send the fds in the first buffer */
1674                 err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1675                 if (err < 0) {
1676                         kfree_skb(skb);
1677                         goto out_err;
1678                 }
1679                 max_level = err + 1;
1680                 fds_sent = true;
1681
1682                 err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
1683                 if (err) {
1684                         kfree_skb(skb);
1685                         goto out_err;
1686                 }
1687
1688                 unix_state_lock(other);
1689
1690                 if (sock_flag(other, SOCK_DEAD) ||
1691                     (other->sk_shutdown & RCV_SHUTDOWN))
1692                         goto pipe_err_free;
1693
1694                 maybe_add_creds(skb, sock, other);
1695                 skb_queue_tail(&other->sk_receive_queue, skb);
1696                 if (max_level > unix_sk(other)->recursion_level)
1697                         unix_sk(other)->recursion_level = max_level;
1698                 unix_state_unlock(other);
1699                 other->sk_data_ready(other, size);
1700                 sent += size;
1701         }
1702
1703         scm_destroy(siocb->scm);
1704         siocb->scm = NULL;
1705
1706         return sent;
1707
1708 pipe_err_free:
1709         unix_state_unlock(other);
1710         kfree_skb(skb);
1711 pipe_err:
1712         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1713                 send_sig(SIGPIPE, current, 0);
1714         err = -EPIPE;
1715 out_err:
1716         scm_destroy(siocb->scm);
1717         siocb->scm = NULL;
1718         return sent ? : err;
1719 }
1720
1721 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1722                                   struct msghdr *msg, size_t len)
1723 {
1724         int err;
1725         struct sock *sk = sock->sk;
1726
1727         err = sock_error(sk);
1728         if (err)
1729                 return err;
1730
1731         if (sk->sk_state != TCP_ESTABLISHED)
1732                 return -ENOTCONN;
1733
1734         if (msg->msg_namelen)
1735                 msg->msg_namelen = 0;
1736
1737         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1738 }
1739
1740 static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1741                               struct msghdr *msg, size_t size,
1742                               int flags)
1743 {
1744         struct sock *sk = sock->sk;
1745
1746         if (sk->sk_state != TCP_ESTABLISHED)
1747                 return -ENOTCONN;
1748
1749         return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1750 }
1751
1752 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1753 {
1754         struct unix_sock *u = unix_sk(sk);
1755
1756         msg->msg_namelen = 0;
1757         if (u->addr) {
1758                 msg->msg_namelen = u->addr->len;
1759                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1760         }
1761 }
1762
1763 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1764                               struct msghdr *msg, size_t size,
1765                               int flags)
1766 {
1767         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1768         struct scm_cookie tmp_scm;
1769         struct sock *sk = sock->sk;
1770         struct unix_sock *u = unix_sk(sk);
1771         int noblock = flags & MSG_DONTWAIT;
1772         struct sk_buff *skb;
1773         int err;
1774         int peeked, skip;
1775
1776         err = -EOPNOTSUPP;
1777         if (flags&MSG_OOB)
1778                 goto out;
1779
1780         msg->msg_namelen = 0;
1781
1782         err = mutex_lock_interruptible(&u->readlock);
1783         if (err) {
1784                 err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1785                 goto out;
1786         }
1787
1788         skip = sk_peek_offset(sk, flags);
1789
1790         skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1791         if (!skb) {
1792                 unix_state_lock(sk);
1793                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1794                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1795                     (sk->sk_shutdown & RCV_SHUTDOWN))
1796                         err = 0;
1797                 unix_state_unlock(sk);
1798                 goto out_unlock;
1799         }
1800
1801         wake_up_interruptible_sync_poll(&u->peer_wait,
1802                                         POLLOUT | POLLWRNORM | POLLWRBAND);
1803
1804         if (msg->msg_name)
1805                 unix_copy_addr(msg, skb->sk);
1806
1807         if (size > skb->len - skip)
1808                 size = skb->len - skip;
1809         else if (size < skb->len - skip)
1810                 msg->msg_flags |= MSG_TRUNC;
1811
1812         err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1813         if (err)
1814                 goto out_free;
1815
1816         if (sock_flag(sk, SOCK_RCVTSTAMP))
1817                 __sock_recv_timestamp(msg, sk, skb);
1818
1819         if (!siocb->scm) {
1820                 siocb->scm = &tmp_scm;
1821                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1822         }
1823         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
1824         unix_set_secdata(siocb->scm, skb);
1825
1826         if (!(flags & MSG_PEEK)) {
1827                 if (UNIXCB(skb).fp)
1828                         unix_detach_fds(siocb->scm, skb);
1829
1830                 sk_peek_offset_bwd(sk, skb->len);
1831         } else {
1832                 /* It is questionable: on PEEK we could:
1833                    - do not return fds - good, but too simple 8)
1834                    - return fds, and do not return them on read (old strategy,
1835                      apparently wrong)
1836                    - clone fds (I chose it for now, it is the most universal
1837                      solution)
1838
1839                    POSIX 1003.1g does not actually define this clearly
1840                    at all. POSIX 1003.1g doesn't define a lot of things
1841                    clearly however!
1842
1843                 */
1844
1845                 sk_peek_offset_fwd(sk, size);
1846
1847                 if (UNIXCB(skb).fp)
1848                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1849         }
1850         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1851
1852         scm_recv(sock, msg, siocb->scm, flags);
1853
1854 out_free:
1855         skb_free_datagram(sk, skb);
1856 out_unlock:
1857         mutex_unlock(&u->readlock);
1858 out:
1859         return err;
1860 }
1861
1862 /*
1863  *      Sleep until data has arrive. But check for races..
1864  */
1865
1866 static long unix_stream_data_wait(struct sock *sk, long timeo)
1867 {
1868         DEFINE_WAIT(wait);
1869
1870         unix_state_lock(sk);
1871
1872         for (;;) {
1873                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1874
1875                 if (!skb_queue_empty(&sk->sk_receive_queue) ||
1876                     sk->sk_err ||
1877                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1878                     signal_pending(current) ||
1879                     !timeo)
1880                         break;
1881
1882                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1883                 unix_state_unlock(sk);
1884                 timeo = schedule_timeout(timeo);
1885                 unix_state_lock(sk);
1886                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1887         }
1888
1889         finish_wait(sk_sleep(sk), &wait);
1890         unix_state_unlock(sk);
1891         return timeo;
1892 }
1893
1894
1895
1896 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1897                                struct msghdr *msg, size_t size,
1898                                int flags)
1899 {
1900         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1901         struct scm_cookie tmp_scm;
1902         struct sock *sk = sock->sk;
1903         struct unix_sock *u = unix_sk(sk);
1904         struct sockaddr_un *sunaddr = msg->msg_name;
1905         int copied = 0;
1906         int check_creds = 0;
1907         int target;
1908         int err = 0;
1909         long timeo;
1910         int skip;
1911
1912         err = -EINVAL;
1913         if (sk->sk_state != TCP_ESTABLISHED)
1914                 goto out;
1915
1916         err = -EOPNOTSUPP;
1917         if (flags&MSG_OOB)
1918                 goto out;
1919
1920         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1921         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1922
1923         msg->msg_namelen = 0;
1924
1925         /* Lock the socket to prevent queue disordering
1926          * while sleeps in memcpy_tomsg
1927          */
1928
1929         if (!siocb->scm) {
1930                 siocb->scm = &tmp_scm;
1931                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1932         }
1933
1934         err = mutex_lock_interruptible(&u->readlock);
1935         if (err) {
1936                 err = sock_intr_errno(timeo);
1937                 goto out;
1938         }
1939
1940         skip = sk_peek_offset(sk, flags);
1941
1942         do {
1943                 int chunk;
1944                 struct sk_buff *skb;
1945
1946                 unix_state_lock(sk);
1947                 skb = skb_peek(&sk->sk_receive_queue);
1948 again:
1949                 if (skb == NULL) {
1950                         unix_sk(sk)->recursion_level = 0;
1951                         if (copied >= target)
1952                                 goto unlock;
1953
1954                         /*
1955                          *      POSIX 1003.1g mandates this order.
1956                          */
1957
1958                         err = sock_error(sk);
1959                         if (err)
1960                                 goto unlock;
1961                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1962                                 goto unlock;
1963
1964                         unix_state_unlock(sk);
1965                         err = -EAGAIN;
1966                         if (!timeo)
1967                                 break;
1968                         mutex_unlock(&u->readlock);
1969
1970                         timeo = unix_stream_data_wait(sk, timeo);
1971
1972                         if (signal_pending(current)
1973                             ||  mutex_lock_interruptible(&u->readlock)) {
1974                                 err = sock_intr_errno(timeo);
1975                                 goto out;
1976                         }
1977
1978                         continue;
1979  unlock:
1980                         unix_state_unlock(sk);
1981                         break;
1982                 }
1983
1984                 if (skip >= skb->len) {
1985                         skip -= skb->len;
1986                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
1987                         goto again;
1988                 }
1989
1990                 unix_state_unlock(sk);
1991
1992                 if (check_creds) {
1993                         /* Never glue messages from different writers */
1994                         if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1995                             (UNIXCB(skb).cred != siocb->scm->cred))
1996                                 break;
1997                 } else {
1998                         /* Copy credentials */
1999                         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
2000                         check_creds = 1;
2001                 }
2002
2003                 /* Copy address just once */
2004                 if (sunaddr) {
2005                         unix_copy_addr(msg, skb->sk);
2006                         sunaddr = NULL;
2007                 }
2008
2009                 chunk = min_t(unsigned int, skb->len - skip, size);
2010                 if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) {
2011                         if (copied == 0)
2012                                 copied = -EFAULT;
2013                         break;
2014                 }
2015                 copied += chunk;
2016                 size -= chunk;
2017
2018                 /* Mark read part of skb as used */
2019                 if (!(flags & MSG_PEEK)) {
2020                         skb_pull(skb, chunk);
2021
2022                         sk_peek_offset_bwd(sk, chunk);
2023
2024                         if (UNIXCB(skb).fp)
2025                                 unix_detach_fds(siocb->scm, skb);
2026
2027                         if (skb->len)
2028                                 break;
2029
2030                         skb_unlink(skb, &sk->sk_receive_queue);
2031                         consume_skb(skb);
2032
2033                         if (siocb->scm->fp)
2034                                 break;
2035                 } else {
2036                         /* It is questionable, see note in unix_dgram_recvmsg.
2037                          */
2038                         if (UNIXCB(skb).fp)
2039                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2040
2041                         sk_peek_offset_fwd(sk, chunk);
2042
2043                         break;
2044                 }
2045         } while (size);
2046
2047         mutex_unlock(&u->readlock);
2048         scm_recv(sock, msg, siocb->scm, flags);
2049 out:
2050         return copied ? : err;
2051 }
2052
2053 static int unix_shutdown(struct socket *sock, int mode)
2054 {
2055         struct sock *sk = sock->sk;
2056         struct sock *other;
2057
2058         if (mode < SHUT_RD || mode > SHUT_RDWR)
2059                 return -EINVAL;
2060         /* This maps:
2061          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2062          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2063          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2064          */
2065         ++mode;
2066
2067         unix_state_lock(sk);
2068         sk->sk_shutdown |= mode;
2069         other = unix_peer(sk);
2070         if (other)
2071                 sock_hold(other);
2072         unix_state_unlock(sk);
2073         sk->sk_state_change(sk);
2074
2075         if (other &&
2076                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2077
2078                 int peer_mode = 0;
2079
2080                 if (mode&RCV_SHUTDOWN)
2081                         peer_mode |= SEND_SHUTDOWN;
2082                 if (mode&SEND_SHUTDOWN)
2083                         peer_mode |= RCV_SHUTDOWN;
2084                 unix_state_lock(other);
2085                 other->sk_shutdown |= peer_mode;
2086                 unix_state_unlock(other);
2087                 other->sk_state_change(other);
2088                 if (peer_mode == SHUTDOWN_MASK)
2089                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2090                 else if (peer_mode & RCV_SHUTDOWN)
2091                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2092         }
2093         if (other)
2094                 sock_put(other);
2095
2096         return 0;
2097 }
2098
2099 long unix_inq_len(struct sock *sk)
2100 {
2101         struct sk_buff *skb;
2102         long amount = 0;
2103
2104         if (sk->sk_state == TCP_LISTEN)
2105                 return -EINVAL;
2106
2107         spin_lock(&sk->sk_receive_queue.lock);
2108         if (sk->sk_type == SOCK_STREAM ||
2109             sk->sk_type == SOCK_SEQPACKET) {
2110                 skb_queue_walk(&sk->sk_receive_queue, skb)
2111                         amount += skb->len;
2112         } else {
2113                 skb = skb_peek(&sk->sk_receive_queue);
2114                 if (skb)
2115                         amount = skb->len;
2116         }
2117         spin_unlock(&sk->sk_receive_queue.lock);
2118
2119         return amount;
2120 }
2121 EXPORT_SYMBOL_GPL(unix_inq_len);
2122
2123 long unix_outq_len(struct sock *sk)
2124 {
2125         return sk_wmem_alloc_get(sk);
2126 }
2127 EXPORT_SYMBOL_GPL(unix_outq_len);
2128
2129 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2130 {
2131         struct sock *sk = sock->sk;
2132         long amount = 0;
2133         int err;
2134
2135         switch (cmd) {
2136         case SIOCOUTQ:
2137                 amount = unix_outq_len(sk);
2138                 err = put_user(amount, (int __user *)arg);
2139                 break;
2140         case SIOCINQ:
2141                 amount = unix_inq_len(sk);
2142                 if (amount < 0)
2143                         err = amount;
2144                 else
2145                         err = put_user(amount, (int __user *)arg);
2146                 break;
2147         default:
2148                 err = -ENOIOCTLCMD;
2149                 break;
2150         }
2151         return err;
2152 }
2153
2154 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2155 {
2156         struct sock *sk = sock->sk;
2157         unsigned int mask;
2158
2159         sock_poll_wait(file, sk_sleep(sk), wait);
2160         mask = 0;
2161
2162         /* exceptional events? */
2163         if (sk->sk_err)
2164                 mask |= POLLERR;
2165         if (sk->sk_shutdown == SHUTDOWN_MASK)
2166                 mask |= POLLHUP;
2167         if (sk->sk_shutdown & RCV_SHUTDOWN)
2168                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2169
2170         /* readable? */
2171         if (!skb_queue_empty(&sk->sk_receive_queue))
2172                 mask |= POLLIN | POLLRDNORM;
2173
2174         /* Connection-based need to check for termination and startup */
2175         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2176             sk->sk_state == TCP_CLOSE)
2177                 mask |= POLLHUP;
2178
2179         /*
2180          * we set writable also when the other side has shut down the
2181          * connection. This prevents stuck sockets.
2182          */
2183         if (unix_writable(sk))
2184                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2185
2186         return mask;
2187 }
2188
2189 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2190                                     poll_table *wait)
2191 {
2192         struct sock *sk = sock->sk, *other;
2193         unsigned int mask, writable;
2194
2195         sock_poll_wait(file, sk_sleep(sk), wait);
2196         mask = 0;
2197
2198         /* exceptional events? */
2199         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2200                 mask |= POLLERR;
2201         if (sk->sk_shutdown & RCV_SHUTDOWN)
2202                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2203         if (sk->sk_shutdown == SHUTDOWN_MASK)
2204                 mask |= POLLHUP;
2205
2206         /* readable? */
2207         if (!skb_queue_empty(&sk->sk_receive_queue))
2208                 mask |= POLLIN | POLLRDNORM;
2209
2210         /* Connection-based need to check for termination and startup */
2211         if (sk->sk_type == SOCK_SEQPACKET) {
2212                 if (sk->sk_state == TCP_CLOSE)
2213                         mask |= POLLHUP;
2214                 /* connection hasn't started yet? */
2215                 if (sk->sk_state == TCP_SYN_SENT)
2216                         return mask;
2217         }
2218
2219         /* No write status requested, avoid expensive OUT tests. */
2220         if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2221                 return mask;
2222
2223         writable = unix_writable(sk);
2224         other = unix_peer_get(sk);
2225         if (other) {
2226                 if (unix_peer(other) != sk) {
2227                         sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2228                         if (unix_recvq_full(other))
2229                                 writable = 0;
2230                 }
2231                 sock_put(other);
2232         }
2233
2234         if (writable)
2235                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2236         else
2237                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2238
2239         return mask;
2240 }
2241
2242 #ifdef CONFIG_PROC_FS
2243
2244 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2245
2246 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2247 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2248 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2249
2250 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2251 {
2252         unsigned long offset = get_offset(*pos);
2253         unsigned long bucket = get_bucket(*pos);
2254         struct sock *sk;
2255         unsigned long count = 0;
2256
2257         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2258                 if (sock_net(sk) != seq_file_net(seq))
2259                         continue;
2260                 if (++count == offset)
2261                         break;
2262         }
2263
2264         return sk;
2265 }
2266
2267 static struct sock *unix_next_socket(struct seq_file *seq,
2268                                      struct sock *sk,
2269                                      loff_t *pos)
2270 {
2271         unsigned long bucket;
2272
2273         while (sk > (struct sock *)SEQ_START_TOKEN) {
2274                 sk = sk_next(sk);
2275                 if (!sk)
2276                         goto next_bucket;
2277                 if (sock_net(sk) == seq_file_net(seq))
2278                         return sk;
2279         }
2280
2281         do {
2282                 sk = unix_from_bucket(seq, pos);
2283                 if (sk)
2284                         return sk;
2285
2286 next_bucket:
2287                 bucket = get_bucket(*pos) + 1;
2288                 *pos = set_bucket_offset(bucket, 1);
2289         } while (bucket < ARRAY_SIZE(unix_socket_table));
2290
2291         return NULL;
2292 }
2293
2294 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2295         __acquires(unix_table_lock)
2296 {
2297         spin_lock(&unix_table_lock);
2298
2299         if (!*pos)
2300                 return SEQ_START_TOKEN;
2301
2302         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2303                 return NULL;
2304
2305         return unix_next_socket(seq, NULL, pos);
2306 }
2307
2308 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2309 {
2310         ++*pos;
2311         return unix_next_socket(seq, v, pos);
2312 }
2313
2314 static void unix_seq_stop(struct seq_file *seq, void *v)
2315         __releases(unix_table_lock)
2316 {
2317         spin_unlock(&unix_table_lock);
2318 }
2319
2320 static int unix_seq_show(struct seq_file *seq, void *v)
2321 {
2322
2323         if (v == SEQ_START_TOKEN)
2324                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2325                          "Inode Path\n");
2326         else {
2327                 struct sock *s = v;
2328                 struct unix_sock *u = unix_sk(s);
2329                 unix_state_lock(s);
2330
2331                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2332                         s,
2333                         atomic_read(&s->sk_refcnt),
2334                         0,
2335                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2336                         s->sk_type,
2337                         s->sk_socket ?
2338                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2339                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2340                         sock_i_ino(s));
2341
2342                 if (u->addr) {
2343                         int i, len;
2344                         seq_putc(seq, ' ');
2345
2346                         i = 0;
2347                         len = u->addr->len - sizeof(short);
2348                         if (!UNIX_ABSTRACT(s))
2349                                 len--;
2350                         else {
2351                                 seq_putc(seq, '@');
2352                                 i++;
2353                         }
2354                         for ( ; i < len; i++)
2355                                 seq_putc(seq, u->addr->name->sun_path[i]);
2356                 }
2357                 unix_state_unlock(s);
2358                 seq_putc(seq, '\n');
2359         }
2360
2361         return 0;
2362 }
2363
2364 static const struct seq_operations unix_seq_ops = {
2365         .start  = unix_seq_start,
2366         .next   = unix_seq_next,
2367         .stop   = unix_seq_stop,
2368         .show   = unix_seq_show,
2369 };
2370
2371 static int unix_seq_open(struct inode *inode, struct file *file)
2372 {
2373         return seq_open_net(inode, file, &unix_seq_ops,
2374                             sizeof(struct seq_net_private));
2375 }
2376
2377 static const struct file_operations unix_seq_fops = {
2378         .owner          = THIS_MODULE,
2379         .open           = unix_seq_open,
2380         .read           = seq_read,
2381         .llseek         = seq_lseek,
2382         .release        = seq_release_net,
2383 };
2384
2385 #endif
2386
2387 static const struct net_proto_family unix_family_ops = {
2388         .family = PF_UNIX,
2389         .create = unix_create,
2390         .owner  = THIS_MODULE,
2391 };
2392
2393
2394 static int __net_init unix_net_init(struct net *net)
2395 {
2396         int error = -ENOMEM;
2397
2398         net->unx.sysctl_max_dgram_qlen = 10;
2399         if (unix_sysctl_register(net))
2400                 goto out;
2401
2402 #ifdef CONFIG_PROC_FS
2403         if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2404                 unix_sysctl_unregister(net);
2405                 goto out;
2406         }
2407 #endif
2408         error = 0;
2409 out:
2410         return error;
2411 }
2412
2413 static void __net_exit unix_net_exit(struct net *net)
2414 {
2415         unix_sysctl_unregister(net);
2416         remove_proc_entry("unix", net->proc_net);
2417 }
2418
2419 static struct pernet_operations unix_net_ops = {
2420         .init = unix_net_init,
2421         .exit = unix_net_exit,
2422 };
2423
2424 static int __init af_unix_init(void)
2425 {
2426         int rc = -1;
2427
2428         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2429
2430         rc = proto_register(&unix_proto, 1);
2431         if (rc != 0) {
2432                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2433                        __func__);
2434                 goto out;
2435         }
2436
2437         sock_register(&unix_family_ops);
2438         register_pernet_subsys(&unix_net_ops);
2439 out:
2440         return rc;
2441 }
2442
2443 static void __exit af_unix_exit(void)
2444 {
2445         sock_unregister(PF_UNIX);
2446         proto_unregister(&unix_proto);
2447         unregister_pernet_subsys(&unix_net_ops);
2448 }
2449
2450 /* Earlier than device_initcall() so that other drivers invoking
2451    request_module() don't end up in a loop when modprobe tries
2452    to use a UNIX socket. But later than subsys_initcall() because
2453    we depend on stuff initialised there */
2454 fs_initcall(af_unix_init);
2455 module_exit(af_unix_exit);
2456
2457 MODULE_LICENSE("GPL");
2458 MODULE_ALIAS_NETPROTO(PF_UNIX);