net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/timer.h>
 106 #include <linux/string.h>
 107 #include <linux/sockios.h>
 108 #include <linux/net.h>
 109 #include <linux/mm.h>
 110 #include <linux/slab.h>
 111 #include <linux/interrupt.h>
 112 #include <linux/poll.h>
 113 #include <linux/tcp.h>
 114 #include <linux/init.h>
 115 #include <linux/highmem.h>
 116 #include <linux/user_namespace.h>
 117 #include <linux/static_key.h>
 118 #include <linux/memcontrol.h>
 119 #include <linux/prefetch.h>
 120
 121 #include <asm/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138
 139 #include <trace/events/sock.h>
 140
 141 #ifdef CONFIG_INET
 142 #include <net/tcp.h>
 143 #endif
 144
 145 #include <net/busy_poll.h>
 146
 147 static DEFINE_MUTEX(proto_list_mutex);
 148 static LIST_HEAD(proto_list);
 149
 150 /**
 151  * sk_ns_capable - General socket capability test
 152  * @sk: Socket to use a capability on or through
 153  * @user_ns: The user namespace of the capability to use
 154  * @cap: The capability to use
 155  *
 156  * Test to see if the opener of the socket had when the socket was
 157  * created and the current process has the capability @cap in the user
 158  * namespace @user_ns.
 159  */
 160 bool sk_ns_capable(const struct sock *sk,
 161                    struct user_namespace *user_ns, int cap)
 162 {
 163         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164                 ns_capable(user_ns, cap);
 165 }
 166 EXPORT_SYMBOL(sk_ns_capable);
 167
 168 /**
 169  * sk_capable - Socket global capability test
 170  * @sk: Socket to use a capability on or through
 171  * @cap: The global capability to use
 172  *
 173  * Test to see if the opener of the socket had when the socket was
 174  * created and the current process has the capability @cap in all user
 175  * namespaces.
 176  */
 177 bool sk_capable(const struct sock *sk, int cap)
 178 {
 179         return sk_ns_capable(sk, &init_user_ns, cap);
 180 }
 181 EXPORT_SYMBOL(sk_capable);
 182
 183 /**
 184  * sk_net_capable - Network namespace socket capability test
 185  * @sk: Socket to use a capability on or through
 186  * @cap: The capability to use
 187  *
 188  * Test to see if the opener of the socket had when the socket was created
 189  * and the current process has the capability @cap over the network namespace
 190  * the socket is a member of.
 191  */
 192 bool sk_net_capable(const struct sock *sk, int cap)
 193 {
 194         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195 }
 196 EXPORT_SYMBOL(sk_net_capable);
 197
 198 /*
 199  * Each address family might have different locking rules, so we have
 200  * one slock key per address family:
 201  */
 202 static struct lock_class_key af_family_keys[AF_MAX];
 203 static struct lock_class_key af_family_slock_keys[AF_MAX];
 204
 205 /*
 206  * Make lock validator output more readable. (we pre-construct these
 207  * strings build-time, so that runtime initialization of socket
 208  * locks is fast):
 209  */
 210 static const char *const af_family_key_strings[AF_MAX+1] = {
 211   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 212   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 213   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 214   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 215   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 216   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 217   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 218   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 219   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 220   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 221   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 222   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 223   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 224   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
 225   "sk_lock-AF_MAX"
 226 };
 227 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 228   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 229   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 230   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 231   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 232   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 233   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 234   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 235   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 236   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 237   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 238   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 239   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 240   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 241   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
 242   "slock-AF_MAX"
 243 };
 244 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 245   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 246   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 247   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 248   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 249   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 250   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 251   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 252   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 253   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 254   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 255   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 256   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 257   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 258   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
 259   "clock-AF_MAX"
 260 };
 261
 262 /*
 263  * sk_callback_lock locking rules are per-address-family,
 264  * so split the lock classes by using a per-AF key:
 265  */
 266 static struct lock_class_key af_callback_keys[AF_MAX];
 267
 268 /* Take into consideration the size of the struct sk_buff overhead in the
 269  * determination of these values, since that is non-constant across
 270  * platforms.  This makes socket queueing behavior and performance
 271  * not depend upon such differences.
 272  */
 273 #define _SK_MEM_PACKETS         256
 274 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 275 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 276 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 277
 278 /* Run time adjustable parameters. */
 279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 280 EXPORT_SYMBOL(sysctl_wmem_max);
 281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 282 EXPORT_SYMBOL(sysctl_rmem_max);
 283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 285
 286 /* Maximal space eaten by iovec or ancillary data plus some space */
 287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 288 EXPORT_SYMBOL(sysctl_optmem_max);
 289
 290 int sysctl_tstamp_allow_data __read_mostly = 1;
 291
 292 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 293 EXPORT_SYMBOL_GPL(memalloc_socks);
 294
 295 /**
 296  * sk_set_memalloc - sets %SOCK_MEMALLOC
 297  * @sk: socket to set it on
 298  *
 299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 300  * It's the responsibility of the admin to adjust min_free_kbytes
 301  * to meet the requirements
 302  */
 303 void sk_set_memalloc(struct sock *sk)
 304 {
 305         sock_set_flag(sk, SOCK_MEMALLOC);
 306         sk->sk_allocation |= __GFP_MEMALLOC;
 307         static_key_slow_inc(&memalloc_socks);
 308 }
 309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 310
 311 void sk_clear_memalloc(struct sock *sk)
 312 {
 313         sock_reset_flag(sk, SOCK_MEMALLOC);
 314         sk->sk_allocation &= ~__GFP_MEMALLOC;
 315         static_key_slow_dec(&memalloc_socks);
 316
 317         /*
 318          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 319          * progress of swapping. SOCK_MEMALLOC may be cleared while
 320          * it has rmem allocations due to the last swapfile being deactivated
 321          * but there is a risk that the socket is unusable due to exceeding
 322          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 323          */
 324         sk_mem_reclaim(sk);
 325 }
 326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 327
 328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 329 {
 330         int ret;
 331         unsigned long pflags = current->flags;
 332
 333         /* these should have been dropped before queueing */
 334         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 335
 336         current->flags |= PF_MEMALLOC;
 337         ret = sk->sk_backlog_rcv(sk, skb);
 338         tsk_restore_flags(current, pflags, PF_MEMALLOC);
 339
 340         return ret;
 341 }
 342 EXPORT_SYMBOL(__sk_backlog_rcv);
 343
 344 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 345 {
 346         struct timeval tv;
 347
 348         if (optlen < sizeof(tv))
 349                 return -EINVAL;
 350         if (copy_from_user(&tv, optval, sizeof(tv)))
 351                 return -EFAULT;
 352         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 353                 return -EDOM;
 354
 355         if (tv.tv_sec < 0) {
 356                 static int warned __read_mostly;
 357
 358                 *timeo_p = 0;
 359                 if (warned < 10 && net_ratelimit()) {
 360                         warned++;
 361                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 362                                 __func__, current->comm, task_pid_nr(current));
 363                 }
 364                 return 0;
 365         }
 366         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 367         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 368                 return 0;
 369         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 370                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 371         return 0;
 372 }
 373
 374 static void sock_warn_obsolete_bsdism(const char *name)
 375 {
 376         static int warned;
 377         static char warncomm[TASK_COMM_LEN];
 378         if (strcmp(warncomm, current->comm) && warned < 5) {
 379                 strcpy(warncomm,  current->comm);
 380                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 381                         warncomm, name);
 382                 warned++;
 383         }
 384 }
 385
 386 static bool sock_needs_netstamp(const struct sock *sk)
 387 {
 388         switch (sk->sk_family) {
 389         case AF_UNSPEC:
 390         case AF_UNIX:
 391                 return false;
 392         default:
 393                 return true;
 394         }
 395 }
 396
 397 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 398 {
 399         if (sk->sk_flags & flags) {
 400                 sk->sk_flags &= ~flags;
 401                 if (sock_needs_netstamp(sk) &&
 402                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 403                         net_disable_timestamp();
 404         }
 405 }
 406
 407
 408 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 409 {
 410         unsigned long flags;
 411         struct sk_buff_head *list = &sk->sk_receive_queue;
 412
 413         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 414                 atomic_inc(&sk->sk_drops);
 415                 trace_sock_rcvqueue_full(sk, skb);
 416                 return -ENOMEM;
 417         }
 418
 419         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 420                 atomic_inc(&sk->sk_drops);
 421                 return -ENOBUFS;
 422         }
 423
 424         skb->dev = NULL;
 425         skb_set_owner_r(skb, sk);
 426
 427         /* we escape from rcu protected region, make sure we dont leak
 428          * a norefcounted dst
 429          */
 430         skb_dst_force(skb);
 431
 432         spin_lock_irqsave(&list->lock, flags);
 433         sock_skb_set_dropcount(sk, skb);
 434         __skb_queue_tail(list, skb);
 435         spin_unlock_irqrestore(&list->lock, flags);
 436
 437         if (!sock_flag(sk, SOCK_DEAD))
 438                 sk->sk_data_ready(sk);
 439         return 0;
 440 }
 441 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 442
 443 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 444 {
 445         int err;
 446
 447         err = sk_filter(sk, skb);
 448         if (err)
 449                 return err;
 450
 451         return __sock_queue_rcv_skb(sk, skb);
 452 }
 453 EXPORT_SYMBOL(sock_queue_rcv_skb);
 454
 455 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 456 {
 457         int rc = NET_RX_SUCCESS;
 458
 459         if (sk_filter(sk, skb))
 460                 goto discard_and_relse;
 461
 462         skb->dev = NULL;
 463
 464         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 465                 atomic_inc(&sk->sk_drops);
 466                 goto discard_and_relse;
 467         }
 468         if (nested)
 469                 bh_lock_sock_nested(sk);
 470         else
 471                 bh_lock_sock(sk);
 472         if (!sock_owned_by_user(sk)) {
 473                 /*
 474                  * trylock + unlock semantics:
 475                  */
 476                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 477
 478                 rc = sk_backlog_rcv(sk, skb);
 479
 480                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 481         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 482                 bh_unlock_sock(sk);
 483                 atomic_inc(&sk->sk_drops);
 484                 goto discard_and_relse;
 485         }
 486
 487         bh_unlock_sock(sk);
 488 out:
 489         sock_put(sk);
 490         return rc;
 491 discard_and_relse:
 492         kfree_skb(skb);
 493         goto out;
 494 }
 495 EXPORT_SYMBOL(sk_receive_skb);
 496
 497 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 498 {
 499         struct dst_entry *dst = __sk_dst_get(sk);
 500
 501         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 502                 sk_tx_queue_clear(sk);
 503                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 504                 dst_release(dst);
 505                 return NULL;
 506         }
 507
 508         return dst;
 509 }
 510 EXPORT_SYMBOL(__sk_dst_check);
 511
 512 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 513 {
 514         struct dst_entry *dst = sk_dst_get(sk);
 515
 516         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 517                 sk_dst_reset(sk);
 518                 dst_release(dst);
 519                 return NULL;
 520         }
 521
 522         return dst;
 523 }
 524 EXPORT_SYMBOL(sk_dst_check);
 525
 526 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 527                                 int optlen)
 528 {
 529         int ret = -ENOPROTOOPT;
 530 #ifdef CONFIG_NETDEVICES
 531         struct net *net = sock_net(sk);
 532         char devname[IFNAMSIZ];
 533         int index;
 534
 535         /* Sorry... */
 536         ret = -EPERM;
 537         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 538                 goto out;
 539
 540         ret = -EINVAL;
 541         if (optlen < 0)
 542                 goto out;
 543
 544         /* Bind this socket to a particular device like "eth0",
 545          * as specified in the passed interface name. If the
 546          * name is "" or the option length is zero the socket
 547          * is not bound.
 548          */
 549         if (optlen > IFNAMSIZ - 1)
 550                 optlen = IFNAMSIZ - 1;
 551         memset(devname, 0, sizeof(devname));
 552
 553         ret = -EFAULT;
 554         if (copy_from_user(devname, optval, optlen))
 555                 goto out;
 556
 557         index = 0;
 558         if (devname[0] != '\0') {
 559                 struct net_device *dev;
 560
 561                 rcu_read_lock();
 562                 dev = dev_get_by_name_rcu(net, devname);
 563                 if (dev)
 564                         index = dev->ifindex;
 565                 rcu_read_unlock();
 566                 ret = -ENODEV;
 567                 if (!dev)
 568                         goto out;
 569         }
 570
 571         lock_sock(sk);
 572         sk->sk_bound_dev_if = index;
 573         sk_dst_reset(sk);
 574         release_sock(sk);
 575
 576         ret = 0;
 577
 578 out:
 579 #endif
 580
 581         return ret;
 582 }
 583
 584 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 585                                 int __user *optlen, int len)
 586 {
 587         int ret = -ENOPROTOOPT;
 588 #ifdef CONFIG_NETDEVICES
 589         struct net *net = sock_net(sk);
 590         char devname[IFNAMSIZ];
 591
 592         if (sk->sk_bound_dev_if == 0) {
 593                 len = 0;
 594                 goto zero;
 595         }
 596
 597         ret = -EINVAL;
 598         if (len < IFNAMSIZ)
 599                 goto out;
 600
 601         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 602         if (ret)
 603                 goto out;
 604
 605         len = strlen(devname) + 1;
 606
 607         ret = -EFAULT;
 608         if (copy_to_user(optval, devname, len))
 609                 goto out;
 610
 611 zero:
 612         ret = -EFAULT;
 613         if (put_user(len, optlen))
 614                 goto out;
 615
 616         ret = 0;
 617
 618 out:
 619 #endif
 620
 621         return ret;
 622 }
 623
 624 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 625 {
 626         if (valbool)
 627                 sock_set_flag(sk, bit);
 628         else
 629                 sock_reset_flag(sk, bit);
 630 }
 631
 632 bool sk_mc_loop(struct sock *sk)
 633 {
 634         if (dev_recursion_level())
 635                 return false;
 636         if (!sk)
 637                 return true;
 638         switch (sk->sk_family) {
 639         case AF_INET:
 640                 return inet_sk(sk)->mc_loop;
 641 #if IS_ENABLED(CONFIG_IPV6)
 642         case AF_INET6:
 643                 return inet6_sk(sk)->mc_loop;
 644 #endif
 645         }
 646         WARN_ON(1);
 647         return true;
 648 }
 649 EXPORT_SYMBOL(sk_mc_loop);
 650
 651 /*
 652  *      This is meant for all protocols to use and covers goings on
 653  *      at the socket level. Everything here is generic.
 654  */
 655
 656 int sock_setsockopt(struct socket *sock, int level, int optname,
 657                     char __user *optval, unsigned int optlen)
 658 {
 659         struct sock *sk = sock->sk;
 660         int val;
 661         int valbool;
 662         struct linger ling;
 663         int ret = 0;
 664
 665         /*
 666          *      Options without arguments
 667          */
 668
 669         if (optname == SO_BINDTODEVICE)
 670                 return sock_setbindtodevice(sk, optval, optlen);
 671
 672         if (optlen < sizeof(int))
 673                 return -EINVAL;
 674
 675         if (get_user(val, (int __user *)optval))
 676                 return -EFAULT;
 677
 678         valbool = val ? 1 : 0;
 679
 680         lock_sock(sk);
 681
 682         switch (optname) {
 683         case SO_DEBUG:
 684                 if (val && !capable(CAP_NET_ADMIN))
 685                         ret = -EACCES;
 686                 else
 687                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 688                 break;
 689         case SO_REUSEADDR:
 690                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 691                 break;
 692         case SO_REUSEPORT:
 693                 sk->sk_reuseport = valbool;
 694                 break;
 695         case SO_TYPE:
 696         case SO_PROTOCOL:
 697         case SO_DOMAIN:
 698         case SO_ERROR:
 699                 ret = -ENOPROTOOPT;
 700                 break;
 701         case SO_DONTROUTE:
 702                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 703                 break;
 704         case SO_BROADCAST:
 705                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 706                 break;
 707         case SO_SNDBUF:
 708                 /* Don't error on this BSD doesn't and if you think
 709                  * about it this is right. Otherwise apps have to
 710                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 711                  * are treated in BSD as hints
 712                  */
 713                 val = min_t(u32, val, sysctl_wmem_max);
 714 set_sndbuf:
 715                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 716                 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 717                 /* Wake up sending tasks if we upped the value. */
 718                 sk->sk_write_space(sk);
 719                 break;
 720
 721         case SO_SNDBUFFORCE:
 722                 if (!capable(CAP_NET_ADMIN)) {
 723                         ret = -EPERM;
 724                         break;
 725                 }
 726                 goto set_sndbuf;
 727
 728         case SO_RCVBUF:
 729                 /* Don't error on this BSD doesn't and if you think
 730                  * about it this is right. Otherwise apps have to
 731                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 732                  * are treated in BSD as hints
 733                  */
 734                 val = min_t(u32, val, sysctl_rmem_max);
 735 set_rcvbuf:
 736                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 737                 /*
 738                  * We double it on the way in to account for
 739                  * "struct sk_buff" etc. overhead.   Applications
 740                  * assume that the SO_RCVBUF setting they make will
 741                  * allow that much actual data to be received on that
 742                  * socket.
 743                  *
 744                  * Applications are unaware that "struct sk_buff" and
 745                  * other overheads allocate from the receive buffer
 746                  * during socket buffer allocation.
 747                  *
 748                  * And after considering the possible alternatives,
 749                  * returning the value we actually used in getsockopt
 750                  * is the most desirable behavior.
 751                  */
 752                 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 753                 break;
 754
 755         case SO_RCVBUFFORCE:
 756                 if (!capable(CAP_NET_ADMIN)) {
 757                         ret = -EPERM;
 758                         break;
 759                 }
 760                 goto set_rcvbuf;
 761
 762         case SO_KEEPALIVE:
 763 #ifdef CONFIG_INET
 764                 if (sk->sk_protocol == IPPROTO_TCP &&
 765                     sk->sk_type == SOCK_STREAM)
 766                         tcp_set_keepalive(sk, valbool);
 767 #endif
 768                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 769                 break;
 770
 771         case SO_OOBINLINE:
 772                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 773                 break;
 774
 775         case SO_NO_CHECK:
 776                 sk->sk_no_check_tx = valbool;
 777                 break;
 778
 779         case SO_PRIORITY:
 780                 if ((val >= 0 && val <= 6) ||
 781                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 782                         sk->sk_priority = val;
 783                 else
 784                         ret = -EPERM;
 785                 break;
 786
 787         case SO_LINGER:
 788                 if (optlen < sizeof(ling)) {
 789                         ret = -EINVAL;  /* 1003.1g */
 790                         break;
 791                 }
 792                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 793                         ret = -EFAULT;
 794                         break;
 795                 }
 796                 if (!ling.l_onoff)
 797                         sock_reset_flag(sk, SOCK_LINGER);
 798                 else {
 799 #if (BITS_PER_LONG == 32)
 800                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 801                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 802                         else
 803 #endif
 804                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 805                         sock_set_flag(sk, SOCK_LINGER);
 806                 }
 807                 break;
 808
 809         case SO_BSDCOMPAT:
 810                 sock_warn_obsolete_bsdism("setsockopt");
 811                 break;
 812
 813         case SO_PASSCRED:
 814                 if (valbool)
 815                         set_bit(SOCK_PASSCRED, &sock->flags);
 816                 else
 817                         clear_bit(SOCK_PASSCRED, &sock->flags);
 818                 break;
 819
 820         case SO_TIMESTAMP:
 821         case SO_TIMESTAMPNS:
 822                 if (valbool)  {
 823                         if (optname == SO_TIMESTAMP)
 824                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 825                         else
 826                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 827                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 828                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 829                 } else {
 830                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 831                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 832                 }
 833                 break;
 834
 835         case SO_TIMESTAMPING:
 836                 if (val & ~SOF_TIMESTAMPING_MASK) {
 837                         ret = -EINVAL;
 838                         break;
 839                 }
 840
 841                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 842                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 843                         if (sk->sk_protocol == IPPROTO_TCP &&
 844                             sk->sk_type == SOCK_STREAM) {
 845                                 if ((1 << sk->sk_state) &
 846                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 847                                         ret = -EINVAL;
 848                                         break;
 849                                 }
 850                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 851                         } else {
 852                                 sk->sk_tskey = 0;
 853                         }
 854                 }
 855                 sk->sk_tsflags = val;
 856                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 857                         sock_enable_timestamp(sk,
 858                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 859                 else
 860                         sock_disable_timestamp(sk,
 861                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 862                 break;
 863
 864         case SO_RCVLOWAT:
 865                 if (val < 0)
 866                         val = INT_MAX;
 867                 sk->sk_rcvlowat = val ? : 1;
 868                 break;
 869
 870         case SO_RCVTIMEO:
 871                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 872                 break;
 873
 874         case SO_SNDTIMEO:
 875                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 876                 break;
 877
 878         case SO_ATTACH_FILTER:
 879                 ret = -EINVAL;
 880                 if (optlen == sizeof(struct sock_fprog)) {
 881                         struct sock_fprog fprog;
 882
 883                         ret = -EFAULT;
 884                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 885                                 break;
 886
 887                         ret = sk_attach_filter(&fprog, sk);
 888                 }
 889                 break;
 890
 891         case SO_ATTACH_BPF:
 892                 ret = -EINVAL;
 893                 if (optlen == sizeof(u32)) {
 894                         u32 ufd;
 895
 896                         ret = -EFAULT;
 897                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 898                                 break;
 899
 900                         ret = sk_attach_bpf(ufd, sk);
 901                 }
 902                 break;
 903
 904         case SO_ATTACH_REUSEPORT_CBPF:
 905                 ret = -EINVAL;
 906                 if (optlen == sizeof(struct sock_fprog)) {
 907                         struct sock_fprog fprog;
 908
 909                         ret = -EFAULT;
 910                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 911                                 break;
 912
 913                         ret = sk_reuseport_attach_filter(&fprog, sk);
 914                 }
 915                 break;
 916
 917         case SO_ATTACH_REUSEPORT_EBPF:
 918                 ret = -EINVAL;
 919                 if (optlen == sizeof(u32)) {
 920                         u32 ufd;
 921
 922                         ret = -EFAULT;
 923                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 924                                 break;
 925
 926                         ret = sk_reuseport_attach_bpf(ufd, sk);
 927                 }
 928                 break;
 929
 930         case SO_DETACH_FILTER:
 931                 ret = sk_detach_filter(sk);
 932                 break;
 933
 934         case SO_LOCK_FILTER:
 935                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 936                         ret = -EPERM;
 937                 else
 938                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 939                 break;
 940
 941         case SO_PASSSEC:
 942                 if (valbool)
 943                         set_bit(SOCK_PASSSEC, &sock->flags);
 944                 else
 945                         clear_bit(SOCK_PASSSEC, &sock->flags);
 946                 break;
 947         case SO_MARK:
 948                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 949                         ret = -EPERM;
 950                 else
 951                         sk->sk_mark = val;
 952                 break;
 953
 954         case SO_RXQ_OVFL:
 955                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 956                 break;
 957
 958         case SO_WIFI_STATUS:
 959                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 960                 break;
 961
 962         case SO_PEEK_OFF:
 963                 if (sock->ops->set_peek_off)
 964                         ret = sock->ops->set_peek_off(sk, val);
 965                 else
 966                         ret = -EOPNOTSUPP;
 967                 break;
 968
 969         case SO_NOFCS:
 970                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 971                 break;
 972
 973         case SO_SELECT_ERR_QUEUE:
 974                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 975                 break;
 976
 977 #ifdef CONFIG_NET_RX_BUSY_POLL
 978         case SO_BUSY_POLL:
 979                 /* allow unprivileged users to decrease the value */
 980                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 981                         ret = -EPERM;
 982                 else {
 983                         if (val < 0)
 984                                 ret = -EINVAL;
 985                         else
 986                                 sk->sk_ll_usec = val;
 987                 }
 988                 break;
 989 #endif
 990
 991         case SO_MAX_PACING_RATE:
 992                 sk->sk_max_pacing_rate = val;
 993                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 994                                          sk->sk_max_pacing_rate);
 995                 break;
 996
 997         case SO_INCOMING_CPU:
 998                 sk->sk_incoming_cpu = val;
 999                 break;
1000
1001         case SO_CNX_ADVICE:
1002                 if (val == 1)
1003                         dst_negative_advice(sk);
1004                 break;
1005         default:
1006                 ret = -ENOPROTOOPT;
1007                 break;
1008         }
1009         release_sock(sk);
1010         return ret;
1011 }
1012 EXPORT_SYMBOL(sock_setsockopt);
1013
1014
1015 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1016                           struct ucred *ucred)
1017 {
1018         ucred->pid = pid_vnr(pid);
1019         ucred->uid = ucred->gid = -1;
1020         if (cred) {
1021                 struct user_namespace *current_ns = current_user_ns();
1022
1023                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1024                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1025         }
1026 }
1027
1028 int sock_getsockopt(struct socket *sock, int level, int optname,
1029                     char __user *optval, int __user *optlen)
1030 {
1031         struct sock *sk = sock->sk;
1032
1033         union {
1034                 int val;
1035                 struct linger ling;
1036                 struct timeval tm;
1037         } v;
1038
1039         int lv = sizeof(int);
1040         int len;
1041
1042         if (get_user(len, optlen))
1043                 return -EFAULT;
1044         if (len < 0)
1045                 return -EINVAL;
1046
1047         memset(&v, 0, sizeof(v));
1048
1049         switch (optname) {
1050         case SO_DEBUG:
1051                 v.val = sock_flag(sk, SOCK_DBG);
1052                 break;
1053
1054         case SO_DONTROUTE:
1055                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1056                 break;
1057
1058         case SO_BROADCAST:
1059                 v.val = sock_flag(sk, SOCK_BROADCAST);
1060                 break;
1061
1062         case SO_SNDBUF:
1063                 v.val = sk->sk_sndbuf;
1064                 break;
1065
1066         case SO_RCVBUF:
1067                 v.val = sk->sk_rcvbuf;
1068                 break;
1069
1070         case SO_REUSEADDR:
1071                 v.val = sk->sk_reuse;
1072                 break;
1073
1074         case SO_REUSEPORT:
1075                 v.val = sk->sk_reuseport;
1076                 break;
1077
1078         case SO_KEEPALIVE:
1079                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1080                 break;
1081
1082         case SO_TYPE:
1083                 v.val = sk->sk_type;
1084                 break;
1085
1086         case SO_PROTOCOL:
1087                 v.val = sk->sk_protocol;
1088                 break;
1089
1090         case SO_DOMAIN:
1091                 v.val = sk->sk_family;
1092                 break;
1093
1094         case SO_ERROR:
1095                 v.val = -sock_error(sk);
1096                 if (v.val == 0)
1097                         v.val = xchg(&sk->sk_err_soft, 0);
1098                 break;
1099
1100         case SO_OOBINLINE:
1101                 v.val = sock_flag(sk, SOCK_URGINLINE);
1102                 break;
1103
1104         case SO_NO_CHECK:
1105                 v.val = sk->sk_no_check_tx;
1106                 break;
1107
1108         case SO_PRIORITY:
1109                 v.val = sk->sk_priority;
1110                 break;
1111
1112         case SO_LINGER:
1113                 lv              = sizeof(v.ling);
1114                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1115                 v.ling.l_linger = sk->sk_lingertime / HZ;
1116                 break;
1117
1118         case SO_BSDCOMPAT:
1119                 sock_warn_obsolete_bsdism("getsockopt");
1120                 break;
1121
1122         case SO_TIMESTAMP:
1123                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1124                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1125                 break;
1126
1127         case SO_TIMESTAMPNS:
1128                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1129                 break;
1130
1131         case SO_TIMESTAMPING:
1132                 v.val = sk->sk_tsflags;
1133                 break;
1134
1135         case SO_RCVTIMEO:
1136                 lv = sizeof(struct timeval);
1137                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1138                         v.tm.tv_sec = 0;
1139                         v.tm.tv_usec = 0;
1140                 } else {
1141                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1142                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1143                 }
1144                 break;
1145
1146         case SO_SNDTIMEO:
1147                 lv = sizeof(struct timeval);
1148                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1149                         v.tm.tv_sec = 0;
1150                         v.tm.tv_usec = 0;
1151                 } else {
1152                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1153                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1154                 }
1155                 break;
1156
1157         case SO_RCVLOWAT:
1158                 v.val = sk->sk_rcvlowat;
1159                 break;
1160
1161         case SO_SNDLOWAT:
1162                 v.val = 1;
1163                 break;
1164
1165         case SO_PASSCRED:
1166                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1167                 break;
1168
1169         case SO_PEERCRED:
1170         {
1171                 struct ucred peercred;
1172                 if (len > sizeof(peercred))
1173                         len = sizeof(peercred);
1174                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1175                 if (copy_to_user(optval, &peercred, len))
1176                         return -EFAULT;
1177                 goto lenout;
1178         }
1179
1180         case SO_PEERNAME:
1181         {
1182                 char address[128];
1183
1184                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1185                         return -ENOTCONN;
1186                 if (lv < len)
1187                         return -EINVAL;
1188                 if (copy_to_user(optval, address, len))
1189                         return -EFAULT;
1190                 goto lenout;
1191         }
1192
1193         /* Dubious BSD thing... Probably nobody even uses it, but
1194          * the UNIX standard wants it for whatever reason... -DaveM
1195          */
1196         case SO_ACCEPTCONN:
1197                 v.val = sk->sk_state == TCP_LISTEN;
1198                 break;
1199
1200         case SO_PASSSEC:
1201                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1202                 break;
1203
1204         case SO_PEERSEC:
1205                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1206
1207         case SO_MARK:
1208                 v.val = sk->sk_mark;
1209                 break;
1210
1211         case SO_RXQ_OVFL:
1212                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1213                 break;
1214
1215         case SO_WIFI_STATUS:
1216                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1217                 break;
1218
1219         case SO_PEEK_OFF:
1220                 if (!sock->ops->set_peek_off)
1221                         return -EOPNOTSUPP;
1222
1223                 v.val = sk->sk_peek_off;
1224                 break;
1225         case SO_NOFCS:
1226                 v.val = sock_flag(sk, SOCK_NOFCS);
1227                 break;
1228
1229         case SO_BINDTODEVICE:
1230                 return sock_getbindtodevice(sk, optval, optlen, len);
1231
1232         case SO_GET_FILTER:
1233                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1234                 if (len < 0)
1235                         return len;
1236
1237                 goto lenout;
1238
1239         case SO_LOCK_FILTER:
1240                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1241                 break;
1242
1243         case SO_BPF_EXTENSIONS:
1244                 v.val = bpf_tell_extensions();
1245                 break;
1246
1247         case SO_SELECT_ERR_QUEUE:
1248                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1249                 break;
1250
1251 #ifdef CONFIG_NET_RX_BUSY_POLL
1252         case SO_BUSY_POLL:
1253                 v.val = sk->sk_ll_usec;
1254                 break;
1255 #endif
1256
1257         case SO_MAX_PACING_RATE:
1258                 v.val = sk->sk_max_pacing_rate;
1259                 break;
1260
1261         case SO_INCOMING_CPU:
1262                 v.val = sk->sk_incoming_cpu;
1263                 break;
1264
1265         default:
1266                 /* We implement the SO_SNDLOWAT etc to not be settable
1267                  * (1003.1g 7).
1268                  */
1269                 return -ENOPROTOOPT;
1270         }
1271
1272         if (len > lv)
1273                 len = lv;
1274         if (copy_to_user(optval, &v, len))
1275                 return -EFAULT;
1276 lenout:
1277         if (put_user(len, optlen))
1278                 return -EFAULT;
1279         return 0;
1280 }
1281
1282 /*
1283  * Initialize an sk_lock.
1284  *
1285  * (We also register the sk_lock with the lock validator.)
1286  */
1287 static inline void sock_lock_init(struct sock *sk)
1288 {
1289         sock_lock_init_class_and_name(sk,
1290                         af_family_slock_key_strings[sk->sk_family],
1291                         af_family_slock_keys + sk->sk_family,
1292                         af_family_key_strings[sk->sk_family],
1293                         af_family_keys + sk->sk_family);
1294 }
1295
1296 /*
1297  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1298  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1299  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1300  */
1301 static void sock_copy(struct sock *nsk, const struct sock *osk)
1302 {
1303 #ifdef CONFIG_SECURITY_NETWORK
1304         void *sptr = nsk->sk_security;
1305 #endif
1306         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1307
1308         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1309                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1310
1311 #ifdef CONFIG_SECURITY_NETWORK
1312         nsk->sk_security = sptr;
1313         security_sk_clone(osk, nsk);
1314 #endif
1315 }
1316
1317 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1318 {
1319         unsigned long nulls1, nulls2;
1320
1321         nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1322         nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1323         if (nulls1 > nulls2)
1324                 swap(nulls1, nulls2);
1325
1326         if (nulls1 != 0)
1327                 memset((char *)sk, 0, nulls1);
1328         memset((char *)sk + nulls1 + sizeof(void *), 0,
1329                nulls2 - nulls1 - sizeof(void *));
1330         memset((char *)sk + nulls2 + sizeof(void *), 0,
1331                size - nulls2 - sizeof(void *));
1332 }
1333 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1334
1335 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1336                 int family)
1337 {
1338         struct sock *sk;
1339         struct kmem_cache *slab;
1340
1341         slab = prot->slab;
1342         if (slab != NULL) {
1343                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1344                 if (!sk)
1345                         return sk;
1346                 if (priority & __GFP_ZERO) {
1347                         if (prot->clear_sk)
1348                                 prot->clear_sk(sk, prot->obj_size);
1349                         else
1350                                 sk_prot_clear_nulls(sk, prot->obj_size);
1351                 }
1352         } else
1353                 sk = kmalloc(prot->obj_size, priority);
1354
1355         if (sk != NULL) {
1356                 kmemcheck_annotate_bitfield(sk, flags);
1357
1358                 if (security_sk_alloc(sk, family, priority))
1359                         goto out_free;
1360
1361                 if (!try_module_get(prot->owner))
1362                         goto out_free_sec;
1363                 sk_tx_queue_clear(sk);
1364                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1365         }
1366
1367         return sk;
1368
1369 out_free_sec:
1370         security_sk_free(sk);
1371 out_free:
1372         if (slab != NULL)
1373                 kmem_cache_free(slab, sk);
1374         else
1375                 kfree(sk);
1376         return NULL;
1377 }
1378
1379 static void sk_prot_free(struct proto *prot, struct sock *sk)
1380 {
1381         struct kmem_cache *slab;
1382         struct module *owner;
1383
1384         owner = prot->owner;
1385         slab = prot->slab;
1386
1387         cgroup_sk_free(&sk->sk_cgrp_data);
1388         security_sk_free(sk);
1389         if (slab != NULL)
1390                 kmem_cache_free(slab, sk);
1391         else
1392                 kfree(sk);
1393         module_put(owner);
1394 }
1395
1396 /**
1397  *      sk_alloc - All socket objects are allocated here
1398  *      @net: the applicable net namespace
1399  *      @family: protocol family
1400  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1401  *      @prot: struct proto associated with this new sock instance
1402  *      @kern: is this to be a kernel socket?
1403  */
1404 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1405                       struct proto *prot, int kern)
1406 {
1407         struct sock *sk;
1408
1409         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1410         if (sk) {
1411                 sk->sk_family = family;
1412                 /*
1413                  * See comment in struct sock definition to understand
1414                  * why we need sk_prot_creator -acme
1415                  */
1416                 sk->sk_prot = sk->sk_prot_creator = prot;
1417                 sock_lock_init(sk);
1418                 sk->sk_net_refcnt = kern ? 0 : 1;
1419                 if (likely(sk->sk_net_refcnt))
1420                         get_net(net);
1421                 sock_net_set(sk, net);
1422                 atomic_set(&sk->sk_wmem_alloc, 1);
1423
1424                 sock_update_classid(&sk->sk_cgrp_data);
1425                 sock_update_netprioidx(&sk->sk_cgrp_data);
1426         }
1427
1428         return sk;
1429 }
1430 EXPORT_SYMBOL(sk_alloc);
1431
1432 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1433  * grace period. This is the case for UDP sockets and TCP listeners.
1434  */
1435 static void __sk_destruct(struct rcu_head *head)
1436 {
1437         struct sock *sk = container_of(head, struct sock, sk_rcu);
1438         struct sk_filter *filter;
1439
1440         if (sk->sk_destruct)
1441                 sk->sk_destruct(sk);
1442
1443         filter = rcu_dereference_check(sk->sk_filter,
1444                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1445         if (filter) {
1446                 sk_filter_uncharge(sk, filter);
1447                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1448         }
1449         if (rcu_access_pointer(sk->sk_reuseport_cb))
1450                 reuseport_detach_sock(sk);
1451
1452         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1453
1454         if (atomic_read(&sk->sk_omem_alloc))
1455                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1456                          __func__, atomic_read(&sk->sk_omem_alloc));
1457
1458         if (sk->sk_peer_cred)
1459                 put_cred(sk->sk_peer_cred);
1460         put_pid(sk->sk_peer_pid);
1461         if (likely(sk->sk_net_refcnt))
1462                 put_net(sock_net(sk));
1463         sk_prot_free(sk->sk_prot_creator, sk);
1464 }
1465
1466 void sk_destruct(struct sock *sk)
1467 {
1468         if (sock_flag(sk, SOCK_RCU_FREE))
1469                 call_rcu(&sk->sk_rcu, __sk_destruct);
1470         else
1471                 __sk_destruct(&sk->sk_rcu);
1472 }
1473
1474 static void __sk_free(struct sock *sk)
1475 {
1476         if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1477                 sock_diag_broadcast_destroy(sk);
1478         else
1479                 sk_destruct(sk);
1480 }
1481
1482 void sk_free(struct sock *sk)
1483 {
1484         /*
1485          * We subtract one from sk_wmem_alloc and can know if
1486          * some packets are still in some tx queue.
1487          * If not null, sock_wfree() will call __sk_free(sk) later
1488          */
1489         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1490                 __sk_free(sk);
1491 }
1492 EXPORT_SYMBOL(sk_free);
1493
1494 /**
1495  *      sk_clone_lock - clone a socket, and lock its clone
1496  *      @sk: the socket to clone
1497  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1498  *
1499  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1500  */
1501 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1502 {
1503         struct sock *newsk;
1504         bool is_charged = true;
1505
1506         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1507         if (newsk != NULL) {
1508                 struct sk_filter *filter;
1509
1510                 sock_copy(newsk, sk);
1511
1512                 /* SANITY */
1513                 if (likely(newsk->sk_net_refcnt))
1514                         get_net(sock_net(newsk));
1515                 sk_node_init(&newsk->sk_node);
1516                 sock_lock_init(newsk);
1517                 bh_lock_sock(newsk);
1518                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1519                 newsk->sk_backlog.len = 0;
1520
1521                 atomic_set(&newsk->sk_rmem_alloc, 0);
1522                 /*
1523                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1524                  */
1525                 atomic_set(&newsk->sk_wmem_alloc, 1);
1526                 atomic_set(&newsk->sk_omem_alloc, 0);
1527                 skb_queue_head_init(&newsk->sk_receive_queue);
1528                 skb_queue_head_init(&newsk->sk_write_queue);
1529
1530                 rwlock_init(&newsk->sk_callback_lock);
1531                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1532                                 af_callback_keys + newsk->sk_family,
1533                                 af_family_clock_key_strings[newsk->sk_family]);
1534
1535                 newsk->sk_dst_cache     = NULL;
1536                 newsk->sk_wmem_queued   = 0;
1537                 newsk->sk_forward_alloc = 0;
1538                 atomic_set(&newsk->sk_drops, 0);
1539                 newsk->sk_send_head     = NULL;
1540                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1541
1542                 sock_reset_flag(newsk, SOCK_DONE);
1543                 skb_queue_head_init(&newsk->sk_error_queue);
1544
1545                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1546                 if (filter != NULL)
1547                         /* though it's an empty new sock, the charging may fail
1548                          * if sysctl_optmem_max was changed between creation of
1549                          * original socket and cloning
1550                          */
1551                         is_charged = sk_filter_charge(newsk, filter);
1552
1553                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1554                         /* It is still raw copy of parent, so invalidate
1555                          * destructor and make plain sk_free() */
1556                         newsk->sk_destruct = NULL;
1557                         bh_unlock_sock(newsk);
1558                         sk_free(newsk);
1559                         newsk = NULL;
1560                         goto out;
1561                 }
1562                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1563
1564                 newsk->sk_err      = 0;
1565                 newsk->sk_priority = 0;
1566                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1567                 atomic64_set(&newsk->sk_cookie, 0);
1568                 /*
1569                  * Before updating sk_refcnt, we must commit prior changes to memory
1570                  * (Documentation/RCU/rculist_nulls.txt for details)
1571                  */
1572                 smp_wmb();
1573                 atomic_set(&newsk->sk_refcnt, 2);
1574
1575                 /*
1576                  * Increment the counter in the same struct proto as the master
1577                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1578                  * is the same as sk->sk_prot->socks, as this field was copied
1579                  * with memcpy).
1580                  *
1581                  * This _changes_ the previous behaviour, where
1582                  * tcp_create_openreq_child always was incrementing the
1583                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1584                  * to be taken into account in all callers. -acme
1585                  */
1586                 sk_refcnt_debug_inc(newsk);
1587                 sk_set_socket(newsk, NULL);
1588                 newsk->sk_wq = NULL;
1589
1590                 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1591                         sock_update_memcg(newsk);
1592
1593                 if (newsk->sk_prot->sockets_allocated)
1594                         sk_sockets_allocated_inc(newsk);
1595
1596                 if (sock_needs_netstamp(sk) &&
1597                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1598                         net_enable_timestamp();
1599         }
1600 out:
1601         return newsk;
1602 }
1603 EXPORT_SYMBOL_GPL(sk_clone_lock);
1604
1605 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1606 {
1607         u32 max_segs = 1;
1608
1609         sk_dst_set(sk, dst);
1610         sk->sk_route_caps = dst->dev->features;
1611         if (sk->sk_route_caps & NETIF_F_GSO)
1612                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1613         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1614         if (sk_can_gso(sk)) {
1615                 if (dst->header_len) {
1616                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1617                 } else {
1618                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1619                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1620                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1621                 }
1622         }
1623         sk->sk_gso_max_segs = max_segs;
1624 }
1625 EXPORT_SYMBOL_GPL(sk_setup_caps);
1626
1627 /*
1628  *      Simple resource managers for sockets.
1629  */
1630
1631
1632 /*
1633  * Write buffer destructor automatically called from kfree_skb.
1634  */
1635 void sock_wfree(struct sk_buff *skb)
1636 {
1637         struct sock *sk = skb->sk;
1638         unsigned int len = skb->truesize;
1639
1640         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1641                 /*
1642                  * Keep a reference on sk_wmem_alloc, this will be released
1643                  * after sk_write_space() call
1644                  */
1645                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1646                 sk->sk_write_space(sk);
1647                 len = 1;
1648         }
1649         /*
1650          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1651          * could not do because of in-flight packets
1652          */
1653         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1654                 __sk_free(sk);
1655 }
1656 EXPORT_SYMBOL(sock_wfree);
1657
1658 /* This variant of sock_wfree() is used by TCP,
1659  * since it sets SOCK_USE_WRITE_QUEUE.
1660  */
1661 void __sock_wfree(struct sk_buff *skb)
1662 {
1663         struct sock *sk = skb->sk;
1664
1665         if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1666                 __sk_free(sk);
1667 }
1668
1669 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1670 {
1671         skb_orphan(skb);
1672         skb->sk = sk;
1673 #ifdef CONFIG_INET
1674         if (unlikely(!sk_fullsock(sk))) {
1675                 skb->destructor = sock_edemux;
1676                 sock_hold(sk);
1677                 return;
1678         }
1679 #endif
1680         skb->destructor = sock_wfree;
1681         skb_set_hash_from_sk(skb, sk);
1682         /*
1683          * We used to take a refcount on sk, but following operation
1684          * is enough to guarantee sk_free() wont free this sock until
1685          * all in-flight packets are completed
1686          */
1687         atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1688 }
1689 EXPORT_SYMBOL(skb_set_owner_w);
1690
1691 /* This helper is used by netem, as it can hold packets in its
1692  * delay queue. We want to allow the owner socket to send more
1693  * packets, as if they were already TX completed by a typical driver.
1694  * But we also want to keep skb->sk set because some packet schedulers
1695  * rely on it (sch_fq for example). So we set skb->truesize to a small
1696  * amount (1) and decrease sk_wmem_alloc accordingly.
1697  */
1698 void skb_orphan_partial(struct sk_buff *skb)
1699 {
1700         /* If this skb is a TCP pure ACK or already went here,
1701          * we have nothing to do. 2 is already a very small truesize.
1702          */
1703         if (skb->truesize <= 2)
1704                 return;
1705
1706         /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1707          * so we do not completely orphan skb, but transfert all
1708          * accounted bytes but one, to avoid unexpected reorders.
1709          */
1710         if (skb->destructor == sock_wfree
1711 #ifdef CONFIG_INET
1712             || skb->destructor == tcp_wfree
1713 #endif
1714                 ) {
1715                 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1716                 skb->truesize = 1;
1717         } else {
1718                 skb_orphan(skb);
1719         }
1720 }
1721 EXPORT_SYMBOL(skb_orphan_partial);
1722
1723 /*
1724  * Read buffer destructor automatically called from kfree_skb.
1725  */
1726 void sock_rfree(struct sk_buff *skb)
1727 {
1728         struct sock *sk = skb->sk;
1729         unsigned int len = skb->truesize;
1730
1731         atomic_sub(len, &sk->sk_rmem_alloc);
1732         sk_mem_uncharge(sk, len);
1733 }
1734 EXPORT_SYMBOL(sock_rfree);
1735
1736 /*
1737  * Buffer destructor for skbs that are not used directly in read or write
1738  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1739  */
1740 void sock_efree(struct sk_buff *skb)
1741 {
1742         sock_put(skb->sk);
1743 }
1744 EXPORT_SYMBOL(sock_efree);
1745
1746 kuid_t sock_i_uid(struct sock *sk)
1747 {
1748         kuid_t uid;
1749
1750         read_lock_bh(&sk->sk_callback_lock);
1751         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1752         read_unlock_bh(&sk->sk_callback_lock);
1753         return uid;
1754 }
1755 EXPORT_SYMBOL(sock_i_uid);
1756
1757 unsigned long sock_i_ino(struct sock *sk)
1758 {
1759         unsigned long ino;
1760
1761         read_lock_bh(&sk->sk_callback_lock);
1762         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1763         read_unlock_bh(&sk->sk_callback_lock);
1764         return ino;
1765 }
1766 EXPORT_SYMBOL(sock_i_ino);
1767
1768 /*
1769  * Allocate a skb from the socket's send buffer.
1770  */
1771 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1772                              gfp_t priority)
1773 {
1774         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1775                 struct sk_buff *skb = alloc_skb(size, priority);
1776                 if (skb) {
1777                         skb_set_owner_w(skb, sk);
1778                         return skb;
1779                 }
1780         }
1781         return NULL;
1782 }
1783 EXPORT_SYMBOL(sock_wmalloc);
1784
1785 /*
1786  * Allocate a memory block from the socket's option memory buffer.
1787  */
1788 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1789 {
1790         if ((unsigned int)size <= sysctl_optmem_max &&
1791             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1792                 void *mem;
1793                 /* First do the add, to avoid the race if kmalloc
1794                  * might sleep.
1795                  */
1796                 atomic_add(size, &sk->sk_omem_alloc);
1797                 mem = kmalloc(size, priority);
1798                 if (mem)
1799                         return mem;
1800                 atomic_sub(size, &sk->sk_omem_alloc);
1801         }
1802         return NULL;
1803 }
1804 EXPORT_SYMBOL(sock_kmalloc);
1805
1806 /* Free an option memory block. Note, we actually want the inline
1807  * here as this allows gcc to detect the nullify and fold away the
1808  * condition entirely.
1809  */
1810 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1811                                   const bool nullify)
1812 {
1813         if (WARN_ON_ONCE(!mem))
1814                 return;
1815         if (nullify)
1816                 kzfree(mem);
1817         else
1818                 kfree(mem);
1819         atomic_sub(size, &sk->sk_omem_alloc);
1820 }
1821
1822 void sock_kfree_s(struct sock *sk, void *mem, int size)
1823 {
1824         __sock_kfree_s(sk, mem, size, false);
1825 }
1826 EXPORT_SYMBOL(sock_kfree_s);
1827
1828 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1829 {
1830         __sock_kfree_s(sk, mem, size, true);
1831 }
1832 EXPORT_SYMBOL(sock_kzfree_s);
1833
1834 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1835    I think, these locks should be removed for datagram sockets.
1836  */
1837 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1838 {
1839         DEFINE_WAIT(wait);
1840
1841         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1842         for (;;) {
1843                 if (!timeo)
1844                         break;
1845                 if (signal_pending(current))
1846                         break;
1847                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1848                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1849                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1850                         break;
1851                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1852                         break;
1853                 if (sk->sk_err)
1854                         break;
1855                 timeo = schedule_timeout(timeo);
1856         }
1857         finish_wait(sk_sleep(sk), &wait);
1858         return timeo;
1859 }
1860
1861
1862 /*
1863  *      Generic send/receive buffer handlers
1864  */
1865
1866 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1867                                      unsigned long data_len, int noblock,
1868                                      int *errcode, int max_page_order)
1869 {
1870         struct sk_buff *skb;
1871         long timeo;
1872         int err;
1873
1874         timeo = sock_sndtimeo(sk, noblock);
1875         for (;;) {
1876                 err = sock_error(sk);
1877                 if (err != 0)
1878                         goto failure;
1879
1880                 err = -EPIPE;
1881                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1882                         goto failure;
1883
1884                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1885                         break;
1886
1887                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1888                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1889                 err = -EAGAIN;
1890                 if (!timeo)
1891                         goto failure;
1892                 if (signal_pending(current))
1893                         goto interrupted;
1894                 timeo = sock_wait_for_wmem(sk, timeo);
1895         }
1896         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1897                                    errcode, sk->sk_allocation);
1898         if (skb)
1899                 skb_set_owner_w(skb, sk);
1900         return skb;
1901
1902 interrupted:
1903         err = sock_intr_errno(timeo);
1904 failure:
1905         *errcode = err;
1906         return NULL;
1907 }
1908 EXPORT_SYMBOL(sock_alloc_send_pskb);
1909
1910 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1911                                     int noblock, int *errcode)
1912 {
1913         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1914 }
1915 EXPORT_SYMBOL(sock_alloc_send_skb);
1916
1917 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1918                      struct sockcm_cookie *sockc)
1919 {
1920         u32 tsflags;
1921
1922         switch (cmsg->cmsg_type) {
1923         case SO_MARK:
1924                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1925                         return -EPERM;
1926                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1927                         return -EINVAL;
1928                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1929                 break;
1930         case SO_TIMESTAMPING:
1931                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1932                         return -EINVAL;
1933
1934                 tsflags = *(u32 *)CMSG_DATA(cmsg);
1935                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1936                         return -EINVAL;
1937
1938                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1939                 sockc->tsflags |= tsflags;
1940                 break;
1941         default:
1942                 return -EINVAL;
1943         }
1944         return 0;
1945 }
1946 EXPORT_SYMBOL(__sock_cmsg_send);
1947
1948 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1949                    struct sockcm_cookie *sockc)
1950 {
1951         struct cmsghdr *cmsg;
1952         int ret;
1953
1954         for_each_cmsghdr(cmsg, msg) {
1955                 if (!CMSG_OK(msg, cmsg))
1956                         return -EINVAL;
1957                 if (cmsg->cmsg_level != SOL_SOCKET)
1958                         continue;
1959                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1960                 if (ret)
1961                         return ret;
1962         }
1963         return 0;
1964 }
1965 EXPORT_SYMBOL(sock_cmsg_send);
1966
1967 /* On 32bit arches, an skb frag is limited to 2^15 */
1968 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
1969
1970 /**
1971  * skb_page_frag_refill - check that a page_frag contains enough room
1972  * @sz: minimum size of the fragment we want to get
1973  * @pfrag: pointer to page_frag
1974  * @gfp: priority for memory allocation
1975  *
1976  * Note: While this allocator tries to use high order pages, there is
1977  * no guarantee that allocations succeed. Therefore, @sz MUST be
1978  * less or equal than PAGE_SIZE.
1979  */
1980 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1981 {
1982         if (pfrag->page) {
1983                 if (page_ref_count(pfrag->page) == 1) {
1984                         pfrag->offset = 0;
1985                         return true;
1986                 }
1987                 if (pfrag->offset + sz <= pfrag->size)
1988                         return true;
1989                 put_page(pfrag->page);
1990         }
1991
1992         pfrag->offset = 0;
1993         if (SKB_FRAG_PAGE_ORDER) {
1994                 /* Avoid direct reclaim but allow kswapd to wake */
1995                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1996                                           __GFP_COMP | __GFP_NOWARN |
1997                                           __GFP_NORETRY,
1998                                           SKB_FRAG_PAGE_ORDER);
1999                 if (likely(pfrag->page)) {
2000                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2001                         return true;
2002                 }
2003         }
2004         pfrag->page = alloc_page(gfp);
2005         if (likely(pfrag->page)) {
2006                 pfrag->size = PAGE_SIZE;
2007                 return true;
2008         }
2009         return false;
2010 }
2011 EXPORT_SYMBOL(skb_page_frag_refill);
2012
2013 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2014 {
2015         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2016                 return true;
2017
2018         sk_enter_memory_pressure(sk);
2019         sk_stream_moderate_sndbuf(sk);
2020         return false;
2021 }
2022 EXPORT_SYMBOL(sk_page_frag_refill);
2023
2024 static void __lock_sock(struct sock *sk)
2025         __releases(&sk->sk_lock.slock)
2026         __acquires(&sk->sk_lock.slock)
2027 {
2028         DEFINE_WAIT(wait);
2029
2030         for (;;) {
2031                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2032                                         TASK_UNINTERRUPTIBLE);
2033                 spin_unlock_bh(&sk->sk_lock.slock);
2034                 schedule();
2035                 spin_lock_bh(&sk->sk_lock.slock);
2036                 if (!sock_owned_by_user(sk))
2037                         break;
2038         }
2039         finish_wait(&sk->sk_lock.wq, &wait);
2040 }
2041
2042 static void __release_sock(struct sock *sk)
2043         __releases(&sk->sk_lock.slock)
2044         __acquires(&sk->sk_lock.slock)
2045 {
2046         struct sk_buff *skb, *next;
2047
2048         while ((skb = sk->sk_backlog.head) != NULL) {
2049                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2050
2051                 spin_unlock_bh(&sk->sk_lock.slock);
2052
2053                 do {
2054                         next = skb->next;
2055                         prefetch(next);
2056                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2057                         skb->next = NULL;
2058                         sk_backlog_rcv(sk, skb);
2059
2060                         cond_resched();
2061
2062                         skb = next;
2063                 } while (skb != NULL);
2064
2065                 spin_lock_bh(&sk->sk_lock.slock);
2066         }
2067
2068         /*
2069          * Doing the zeroing here guarantee we can not loop forever
2070          * while a wild producer attempts to flood us.
2071          */
2072         sk->sk_backlog.len = 0;
2073 }
2074
2075 void __sk_flush_backlog(struct sock *sk)
2076 {
2077         spin_lock_bh(&sk->sk_lock.slock);
2078         __release_sock(sk);
2079         spin_unlock_bh(&sk->sk_lock.slock);
2080 }
2081
2082 /**
2083  * sk_wait_data - wait for data to arrive at sk_receive_queue
2084  * @sk:    sock to wait on
2085  * @timeo: for how long
2086  * @skb:   last skb seen on sk_receive_queue
2087  *
2088  * Now socket state including sk->sk_err is changed only under lock,
2089  * hence we may omit checks after joining wait queue.
2090  * We check receive queue before schedule() only as optimization;
2091  * it is very likely that release_sock() added new data.
2092  */
2093 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2094 {
2095         int rc;
2096         DEFINE_WAIT(wait);
2097
2098         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2099         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2100         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2101         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2102         finish_wait(sk_sleep(sk), &wait);
2103         return rc;
2104 }
2105 EXPORT_SYMBOL(sk_wait_data);
2106
2107 /**
2108  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2109  *      @sk: socket
2110  *      @size: memory size to allocate
2111  *      @kind: allocation type
2112  *
2113  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2114  *      rmem allocation. This function assumes that protocols which have
2115  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2116  */
2117 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2118 {
2119         struct proto *prot = sk->sk_prot;
2120         int amt = sk_mem_pages(size);
2121         long allocated;
2122
2123         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2124
2125         allocated = sk_memory_allocated_add(sk, amt);
2126
2127         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2128             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2129                 goto suppress_allocation;
2130
2131         /* Under limit. */
2132         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2133                 sk_leave_memory_pressure(sk);
2134                 return 1;
2135         }
2136
2137         /* Under pressure. */
2138         if (allocated > sk_prot_mem_limits(sk, 1))
2139                 sk_enter_memory_pressure(sk);
2140
2141         /* Over hard limit. */
2142         if (allocated > sk_prot_mem_limits(sk, 2))
2143                 goto suppress_allocation;
2144
2145         /* guarantee minimum buffer size under pressure */
2146         if (kind == SK_MEM_RECV) {
2147                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2148                         return 1;
2149
2150         } else { /* SK_MEM_SEND */
2151                 if (sk->sk_type == SOCK_STREAM) {
2152                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2153                                 return 1;
2154                 } else if (atomic_read(&sk->sk_wmem_alloc) <
2155                            prot->sysctl_wmem[0])
2156                                 return 1;
2157         }
2158
2159         if (sk_has_memory_pressure(sk)) {
2160                 int alloc;
2161
2162                 if (!sk_under_memory_pressure(sk))
2163                         return 1;
2164                 alloc = sk_sockets_allocated_read_positive(sk);
2165                 if (sk_prot_mem_limits(sk, 2) > alloc *
2166                     sk_mem_pages(sk->sk_wmem_queued +
2167                                  atomic_read(&sk->sk_rmem_alloc) +
2168                                  sk->sk_forward_alloc))
2169                         return 1;
2170         }
2171
2172 suppress_allocation:
2173
2174         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2175                 sk_stream_moderate_sndbuf(sk);
2176
2177                 /* Fail only if socket is _under_ its sndbuf.
2178                  * In this case we cannot block, so that we have to fail.
2179                  */
2180                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2181                         return 1;
2182         }
2183
2184         trace_sock_exceed_buf_limit(sk, prot, allocated);
2185
2186         /* Alas. Undo changes. */
2187         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2188
2189         sk_memory_allocated_sub(sk, amt);
2190
2191         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2192                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2193
2194         return 0;
2195 }
2196 EXPORT_SYMBOL(__sk_mem_schedule);
2197
2198 /**
2199  *      __sk_mem_reclaim - reclaim memory_allocated
2200  *      @sk: socket
2201  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2202  */
2203 void __sk_mem_reclaim(struct sock *sk, int amount)
2204 {
2205         amount >>= SK_MEM_QUANTUM_SHIFT;
2206         sk_memory_allocated_sub(sk, amount);
2207         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2208
2209         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2210                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2211
2212         if (sk_under_memory_pressure(sk) &&
2213             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2214                 sk_leave_memory_pressure(sk);
2215 }
2216 EXPORT_SYMBOL(__sk_mem_reclaim);
2217
2218 int sk_set_peek_off(struct sock *sk, int val)
2219 {
2220         if (val < 0)
2221                 return -EINVAL;
2222
2223         sk->sk_peek_off = val;
2224         return 0;
2225 }
2226 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2227
2228 /*
2229  * Set of default routines for initialising struct proto_ops when
2230  * the protocol does not support a particular function. In certain
2231  * cases where it makes no sense for a protocol to have a "do nothing"
2232  * function, some default processing is provided.
2233  */
2234
2235 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2236 {
2237         return -EOPNOTSUPP;
2238 }
2239 EXPORT_SYMBOL(sock_no_bind);
2240
2241 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2242                     int len, int flags)
2243 {
2244         return -EOPNOTSUPP;
2245 }
2246 EXPORT_SYMBOL(sock_no_connect);
2247
2248 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2249 {
2250         return -EOPNOTSUPP;
2251 }
2252 EXPORT_SYMBOL(sock_no_socketpair);
2253
2254 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2255 {
2256         return -EOPNOTSUPP;
2257 }
2258 EXPORT_SYMBOL(sock_no_accept);
2259
2260 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2261                     int *len, int peer)
2262 {
2263         return -EOPNOTSUPP;
2264 }
2265 EXPORT_SYMBOL(sock_no_getname);
2266
2267 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2268 {
2269         return 0;
2270 }
2271 EXPORT_SYMBOL(sock_no_poll);
2272
2273 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2274 {
2275         return -EOPNOTSUPP;
2276 }
2277 EXPORT_SYMBOL(sock_no_ioctl);
2278
2279 int sock_no_listen(struct socket *sock, int backlog)
2280 {
2281         return -EOPNOTSUPP;
2282 }
2283 EXPORT_SYMBOL(sock_no_listen);
2284
2285 int sock_no_shutdown(struct socket *sock, int how)
2286 {
2287         return -EOPNOTSUPP;
2288 }
2289 EXPORT_SYMBOL(sock_no_shutdown);
2290
2291 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2292                     char __user *optval, unsigned int optlen)
2293 {
2294         return -EOPNOTSUPP;
2295 }
2296 EXPORT_SYMBOL(sock_no_setsockopt);
2297
2298 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2299                     char __user *optval, int __user *optlen)
2300 {
2301         return -EOPNOTSUPP;
2302 }
2303 EXPORT_SYMBOL(sock_no_getsockopt);
2304
2305 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2306 {
2307         return -EOPNOTSUPP;
2308 }
2309 EXPORT_SYMBOL(sock_no_sendmsg);
2310
2311 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2312                     int flags)
2313 {
2314         return -EOPNOTSUPP;
2315 }
2316 EXPORT_SYMBOL(sock_no_recvmsg);
2317
2318 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2319 {
2320         /* Mirror missing mmap method error code */
2321         return -ENODEV;
2322 }
2323 EXPORT_SYMBOL(sock_no_mmap);
2324
2325 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2326 {
2327         ssize_t res;
2328         struct msghdr msg = {.msg_flags = flags};
2329         struct kvec iov;
2330         char *kaddr = kmap(page);
2331         iov.iov_base = kaddr + offset;
2332         iov.iov_len = size;
2333         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2334         kunmap(page);
2335         return res;
2336 }
2337 EXPORT_SYMBOL(sock_no_sendpage);
2338
2339 /*
2340  *      Default Socket Callbacks
2341  */
2342
2343 static void sock_def_wakeup(struct sock *sk)
2344 {
2345         struct socket_wq *wq;
2346
2347         rcu_read_lock();
2348         wq = rcu_dereference(sk->sk_wq);
2349         if (skwq_has_sleeper(wq))
2350                 wake_up_interruptible_all(&wq->wait);
2351         rcu_read_unlock();
2352 }
2353
2354 static void sock_def_error_report(struct sock *sk)
2355 {
2356         struct socket_wq *wq;
2357
2358         rcu_read_lock();
2359         wq = rcu_dereference(sk->sk_wq);
2360         if (skwq_has_sleeper(wq))
2361                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2362         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2363         rcu_read_unlock();
2364 }
2365
2366 static void sock_def_readable(struct sock *sk)
2367 {
2368         struct socket_wq *wq;
2369
2370         rcu_read_lock();
2371         wq = rcu_dereference(sk->sk_wq);
2372         if (skwq_has_sleeper(wq))
2373                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2374                                                 POLLRDNORM | POLLRDBAND);
2375         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2376         rcu_read_unlock();
2377 }
2378
2379 static void sock_def_write_space(struct sock *sk)
2380 {
2381         struct socket_wq *wq;
2382
2383         rcu_read_lock();
2384
2385         /* Do not wake up a writer until he can make "significant"
2386          * progress.  --DaveM
2387          */
2388         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2389                 wq = rcu_dereference(sk->sk_wq);
2390                 if (skwq_has_sleeper(wq))
2391                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2392                                                 POLLWRNORM | POLLWRBAND);
2393
2394                 /* Should agree with poll, otherwise some programs break */
2395                 if (sock_writeable(sk))
2396                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2397         }
2398
2399         rcu_read_unlock();
2400 }
2401
2402 static void sock_def_destruct(struct sock *sk)
2403 {
2404 }
2405
2406 void sk_send_sigurg(struct sock *sk)
2407 {
2408         if (sk->sk_socket && sk->sk_socket->file)
2409                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2410                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2411 }
2412 EXPORT_SYMBOL(sk_send_sigurg);
2413
2414 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2415                     unsigned long expires)
2416 {
2417         if (!mod_timer(timer, expires))
2418                 sock_hold(sk);
2419 }
2420 EXPORT_SYMBOL(sk_reset_timer);
2421
2422 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2423 {
2424         if (del_timer(timer))
2425                 __sock_put(sk);
2426 }
2427 EXPORT_SYMBOL(sk_stop_timer);
2428
2429 void sock_init_data(struct socket *sock, struct sock *sk)
2430 {
2431         skb_queue_head_init(&sk->sk_receive_queue);
2432         skb_queue_head_init(&sk->sk_write_queue);
2433         skb_queue_head_init(&sk->sk_error_queue);
2434
2435         sk->sk_send_head        =       NULL;
2436
2437         init_timer(&sk->sk_timer);
2438
2439         sk->sk_allocation       =       GFP_KERNEL;
2440         sk->sk_rcvbuf           =       sysctl_rmem_default;
2441         sk->sk_sndbuf           =       sysctl_wmem_default;
2442         sk->sk_state            =       TCP_CLOSE;
2443         sk_set_socket(sk, sock);
2444
2445         sock_set_flag(sk, SOCK_ZAPPED);
2446
2447         if (sock) {
2448                 sk->sk_type     =       sock->type;
2449                 sk->sk_wq       =       sock->wq;
2450                 sock->sk        =       sk;
2451         } else
2452                 sk->sk_wq       =       NULL;
2453
2454         rwlock_init(&sk->sk_callback_lock);
2455         lockdep_set_class_and_name(&sk->sk_callback_lock,
2456                         af_callback_keys + sk->sk_family,
2457                         af_family_clock_key_strings[sk->sk_family]);
2458
2459         sk->sk_state_change     =       sock_def_wakeup;
2460         sk->sk_data_ready       =       sock_def_readable;
2461         sk->sk_write_space      =       sock_def_write_space;
2462         sk->sk_error_report     =       sock_def_error_report;
2463         sk->sk_destruct         =       sock_def_destruct;
2464
2465         sk->sk_frag.page        =       NULL;
2466         sk->sk_frag.offset      =       0;
2467         sk->sk_peek_off         =       -1;
2468
2469         sk->sk_peer_pid         =       NULL;
2470         sk->sk_peer_cred        =       NULL;
2471         sk->sk_write_pending    =       0;
2472         sk->sk_rcvlowat         =       1;
2473         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2474         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2475
2476         sk->sk_stamp = ktime_set(-1L, 0);
2477
2478 #ifdef CONFIG_NET_RX_BUSY_POLL
2479         sk->sk_napi_id          =       0;
2480         sk->sk_ll_usec          =       sysctl_net_busy_read;
2481 #endif
2482
2483         sk->sk_max_pacing_rate = ~0U;
2484         sk->sk_pacing_rate = ~0U;
2485         sk->sk_incoming_cpu = -1;
2486         /*
2487          * Before updating sk_refcnt, we must commit prior changes to memory
2488          * (Documentation/RCU/rculist_nulls.txt for details)
2489          */
2490         smp_wmb();
2491         atomic_set(&sk->sk_refcnt, 1);
2492         atomic_set(&sk->sk_drops, 0);
2493 }
2494 EXPORT_SYMBOL(sock_init_data);
2495
2496 void lock_sock_nested(struct sock *sk, int subclass)
2497 {
2498         might_sleep();
2499         spin_lock_bh(&sk->sk_lock.slock);
2500         if (sk->sk_lock.owned)
2501                 __lock_sock(sk);
2502         sk->sk_lock.owned = 1;
2503         spin_unlock(&sk->sk_lock.slock);
2504         /*
2505          * The sk_lock has mutex_lock() semantics here:
2506          */
2507         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2508         local_bh_enable();
2509 }
2510 EXPORT_SYMBOL(lock_sock_nested);
2511
2512 void release_sock(struct sock *sk)
2513 {
2514         spin_lock_bh(&sk->sk_lock.slock);
2515         if (sk->sk_backlog.tail)
2516                 __release_sock(sk);
2517
2518         /* Warning : release_cb() might need to release sk ownership,
2519          * ie call sock_release_ownership(sk) before us.
2520          */
2521         if (sk->sk_prot->release_cb)
2522                 sk->sk_prot->release_cb(sk);
2523
2524         sock_release_ownership(sk);
2525         if (waitqueue_active(&sk->sk_lock.wq))
2526                 wake_up(&sk->sk_lock.wq);
2527         spin_unlock_bh(&sk->sk_lock.slock);
2528 }
2529 EXPORT_SYMBOL(release_sock);
2530
2531 /**
2532  * lock_sock_fast - fast version of lock_sock
2533  * @sk: socket
2534  *
2535  * This version should be used for very small section, where process wont block
2536  * return false if fast path is taken
2537  *   sk_lock.slock locked, owned = 0, BH disabled
2538  * return true if slow path is taken
2539  *   sk_lock.slock unlocked, owned = 1, BH enabled
2540  */
2541 bool lock_sock_fast(struct sock *sk)
2542 {
2543         might_sleep();
2544         spin_lock_bh(&sk->sk_lock.slock);
2545
2546         if (!sk->sk_lock.owned)
2547                 /*
2548                  * Note : We must disable BH
2549                  */
2550                 return false;
2551
2552         __lock_sock(sk);
2553         sk->sk_lock.owned = 1;
2554         spin_unlock(&sk->sk_lock.slock);
2555         /*
2556          * The sk_lock has mutex_lock() semantics here:
2557          */
2558         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2559         local_bh_enable();
2560         return true;
2561 }
2562 EXPORT_SYMBOL(lock_sock_fast);
2563
2564 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2565 {
2566         struct timeval tv;
2567         if (!sock_flag(sk, SOCK_TIMESTAMP))
2568                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2569         tv = ktime_to_timeval(sk->sk_stamp);
2570         if (tv.tv_sec == -1)
2571                 return -ENOENT;
2572         if (tv.tv_sec == 0) {
2573                 sk->sk_stamp = ktime_get_real();
2574                 tv = ktime_to_timeval(sk->sk_stamp);
2575         }
2576         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2577 }
2578 EXPORT_SYMBOL(sock_get_timestamp);
2579
2580 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2581 {
2582         struct timespec ts;
2583         if (!sock_flag(sk, SOCK_TIMESTAMP))
2584                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2585         ts = ktime_to_timespec(sk->sk_stamp);
2586         if (ts.tv_sec == -1)
2587                 return -ENOENT;
2588         if (ts.tv_sec == 0) {
2589                 sk->sk_stamp = ktime_get_real();
2590                 ts = ktime_to_timespec(sk->sk_stamp);
2591         }
2592         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2593 }
2594 EXPORT_SYMBOL(sock_get_timestampns);
2595
2596 void sock_enable_timestamp(struct sock *sk, int flag)
2597 {
2598         if (!sock_flag(sk, flag)) {
2599                 unsigned long previous_flags = sk->sk_flags;
2600
2601                 sock_set_flag(sk, flag);
2602                 /*
2603                  * we just set one of the two flags which require net
2604                  * time stamping, but time stamping might have been on
2605                  * already because of the other one
2606                  */
2607                 if (sock_needs_netstamp(sk) &&
2608                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2609                         net_enable_timestamp();
2610         }
2611 }
2612
2613 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2614                        int level, int type)
2615 {
2616         struct sock_exterr_skb *serr;
2617         struct sk_buff *skb;
2618         int copied, err;
2619
2620         err = -EAGAIN;
2621         skb = sock_dequeue_err_skb(sk);
2622         if (skb == NULL)
2623                 goto out;
2624
2625         copied = skb->len;
2626         if (copied > len) {
2627                 msg->msg_flags |= MSG_TRUNC;
2628                 copied = len;
2629         }
2630         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2631         if (err)
2632                 goto out_free_skb;
2633
2634         sock_recv_timestamp(msg, sk, skb);
2635
2636         serr = SKB_EXT_ERR(skb);
2637         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2638
2639         msg->msg_flags |= MSG_ERRQUEUE;
2640         err = copied;
2641
2642 out_free_skb:
2643         kfree_skb(skb);
2644 out:
2645         return err;
2646 }
2647 EXPORT_SYMBOL(sock_recv_errqueue);
2648
2649 /*
2650  *      Get a socket option on an socket.
2651  *
2652  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2653  *      asynchronous errors should be reported by getsockopt. We assume
2654  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2655  */
2656 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2657                            char __user *optval, int __user *optlen)
2658 {
2659         struct sock *sk = sock->sk;
2660
2661         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2662 }
2663 EXPORT_SYMBOL(sock_common_getsockopt);
2664
2665 #ifdef CONFIG_COMPAT
2666 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2667                                   char __user *optval, int __user *optlen)
2668 {
2669         struct sock *sk = sock->sk;
2670
2671         if (sk->sk_prot->compat_getsockopt != NULL)
2672                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2673                                                       optval, optlen);
2674         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2675 }
2676 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2677 #endif
2678
2679 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2680                         int flags)
2681 {
2682         struct sock *sk = sock->sk;
2683         int addr_len = 0;
2684         int err;
2685
2686         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2687                                    flags & ~MSG_DONTWAIT, &addr_len);
2688         if (err >= 0)
2689                 msg->msg_namelen = addr_len;
2690         return err;
2691 }
2692 EXPORT_SYMBOL(sock_common_recvmsg);
2693
2694 /*
2695  *      Set socket options on an inet socket.
2696  */
2697 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2698                            char __user *optval, unsigned int optlen)
2699 {
2700         struct sock *sk = sock->sk;
2701
2702         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2703 }
2704 EXPORT_SYMBOL(sock_common_setsockopt);
2705
2706 #ifdef CONFIG_COMPAT
2707 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2708                                   char __user *optval, unsigned int optlen)
2709 {
2710         struct sock *sk = sock->sk;
2711
2712         if (sk->sk_prot->compat_setsockopt != NULL)
2713                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2714                                                       optval, optlen);
2715         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2716 }
2717 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2718 #endif
2719
2720 void sk_common_release(struct sock *sk)
2721 {
2722         if (sk->sk_prot->destroy)
2723                 sk->sk_prot->destroy(sk);
2724
2725         /*
2726          * Observation: when sock_common_release is called, processes have
2727          * no access to socket. But net still has.
2728          * Step one, detach it from networking:
2729          *
2730          * A. Remove from hash tables.
2731          */
2732
2733         sk->sk_prot->unhash(sk);
2734
2735         /*
2736          * In this point socket cannot receive new packets, but it is possible
2737          * that some packets are in flight because some CPU runs receiver and
2738          * did hash table lookup before we unhashed socket. They will achieve
2739          * receive queue and will be purged by socket destructor.
2740          *
2741          * Also we still have packets pending on receive queue and probably,
2742          * our own packets waiting in device queues. sock_destroy will drain
2743          * receive queue, but transmitted packets will delay socket destruction
2744          * until the last reference will be released.
2745          */
2746
2747         sock_orphan(sk);
2748
2749         xfrm_sk_free_policy(sk);
2750
2751         sk_refcnt_debug_release(sk);
2752
2753         if (sk->sk_frag.page) {
2754                 put_page(sk->sk_frag.page);
2755                 sk->sk_frag.page = NULL;
2756         }
2757
2758         sock_put(sk);
2759 }
2760 EXPORT_SYMBOL(sk_common_release);
2761
2762 #ifdef CONFIG_PROC_FS
2763 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2764 struct prot_inuse {
2765         int val[PROTO_INUSE_NR];
2766 };
2767
2768 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2769
2770 #ifdef CONFIG_NET_NS
2771 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2772 {
2773         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2774 }
2775 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2776
2777 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2778 {
2779         int cpu, idx = prot->inuse_idx;
2780         int res = 0;
2781
2782         for_each_possible_cpu(cpu)
2783                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2784
2785         return res >= 0 ? res : 0;
2786 }
2787 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2788
2789 static int __net_init sock_inuse_init_net(struct net *net)
2790 {
2791         net->core.inuse = alloc_percpu(struct prot_inuse);
2792         return net->core.inuse ? 0 : -ENOMEM;
2793 }
2794
2795 static void __net_exit sock_inuse_exit_net(struct net *net)
2796 {
2797         free_percpu(net->core.inuse);
2798 }
2799
2800 static struct pernet_operations net_inuse_ops = {
2801         .init = sock_inuse_init_net,
2802         .exit = sock_inuse_exit_net,
2803 };
2804
2805 static __init int net_inuse_init(void)
2806 {
2807         if (register_pernet_subsys(&net_inuse_ops))
2808                 panic("Cannot initialize net inuse counters");
2809
2810         return 0;
2811 }
2812
2813 core_initcall(net_inuse_init);
2814 #else
2815 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2816
2817 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2818 {
2819         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2820 }
2821 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2822
2823 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2824 {
2825         int cpu, idx = prot->inuse_idx;
2826         int res = 0;
2827
2828         for_each_possible_cpu(cpu)
2829                 res += per_cpu(prot_inuse, cpu).val[idx];
2830
2831         return res >= 0 ? res : 0;
2832 }
2833 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2834 #endif
2835
2836 static void assign_proto_idx(struct proto *prot)
2837 {
2838         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2839
2840         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2841                 pr_err("PROTO_INUSE_NR exhausted\n");
2842                 return;
2843         }
2844
2845         set_bit(prot->inuse_idx, proto_inuse_idx);
2846 }
2847
2848 static void release_proto_idx(struct proto *prot)
2849 {
2850         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2851                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2852 }
2853 #else
2854 static inline void assign_proto_idx(struct proto *prot)
2855 {
2856 }
2857
2858 static inline void release_proto_idx(struct proto *prot)
2859 {
2860 }
2861 #endif
2862
2863 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2864 {
2865         if (!rsk_prot)
2866                 return;
2867         kfree(rsk_prot->slab_name);
2868         rsk_prot->slab_name = NULL;
2869         kmem_cache_destroy(rsk_prot->slab);
2870         rsk_prot->slab = NULL;
2871 }
2872
2873 static int req_prot_init(const struct proto *prot)
2874 {
2875         struct request_sock_ops *rsk_prot = prot->rsk_prot;
2876
2877         if (!rsk_prot)
2878                 return 0;
2879
2880         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2881                                         prot->name);
2882         if (!rsk_prot->slab_name)
2883                 return -ENOMEM;
2884
2885         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2886                                            rsk_prot->obj_size, 0,
2887                                            prot->slab_flags, NULL);
2888
2889         if (!rsk_prot->slab) {
2890                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2891                         prot->name);
2892                 return -ENOMEM;
2893         }
2894         return 0;
2895 }
2896
2897 int proto_register(struct proto *prot, int alloc_slab)
2898 {
2899         if (alloc_slab) {
2900                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2901                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2902                                         NULL);
2903
2904                 if (prot->slab == NULL) {
2905                         pr_crit("%s: Can't create sock SLAB cache!\n",
2906                                 prot->name);
2907                         goto out;
2908                 }
2909
2910                 if (req_prot_init(prot))
2911                         goto out_free_request_sock_slab;
2912
2913                 if (prot->twsk_prot != NULL) {
2914                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2915
2916                         if (prot->twsk_prot->twsk_slab_name == NULL)
2917                                 goto out_free_request_sock_slab;
2918
2919                         prot->twsk_prot->twsk_slab =
2920                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2921                                                   prot->twsk_prot->twsk_obj_size,
2922                                                   0,
2923                                                   prot->slab_flags,
2924                                                   NULL);
2925                         if (prot->twsk_prot->twsk_slab == NULL)
2926                                 goto out_free_timewait_sock_slab_name;
2927                 }
2928         }
2929
2930         mutex_lock(&proto_list_mutex);
2931         list_add(&prot->node, &proto_list);
2932         assign_proto_idx(prot);
2933         mutex_unlock(&proto_list_mutex);
2934         return 0;
2935
2936 out_free_timewait_sock_slab_name:
2937         kfree(prot->twsk_prot->twsk_slab_name);
2938 out_free_request_sock_slab:
2939         req_prot_cleanup(prot->rsk_prot);
2940
2941         kmem_cache_destroy(prot->slab);
2942         prot->slab = NULL;
2943 out:
2944         return -ENOBUFS;
2945 }
2946 EXPORT_SYMBOL(proto_register);
2947
2948 void proto_unregister(struct proto *prot)
2949 {
2950         mutex_lock(&proto_list_mutex);
2951         release_proto_idx(prot);
2952         list_del(&prot->node);
2953         mutex_unlock(&proto_list_mutex);
2954
2955         kmem_cache_destroy(prot->slab);
2956         prot->slab = NULL;
2957
2958         req_prot_cleanup(prot->rsk_prot);
2959
2960         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2961                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2962                 kfree(prot->twsk_prot->twsk_slab_name);
2963                 prot->twsk_prot->twsk_slab = NULL;
2964         }
2965 }
2966 EXPORT_SYMBOL(proto_unregister);
2967
2968 #ifdef CONFIG_PROC_FS
2969 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2970         __acquires(proto_list_mutex)
2971 {
2972         mutex_lock(&proto_list_mutex);
2973         return seq_list_start_head(&proto_list, *pos);
2974 }
2975
2976 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2977 {
2978         return seq_list_next(v, &proto_list, pos);
2979 }
2980
2981 static void proto_seq_stop(struct seq_file *seq, void *v)
2982         __releases(proto_list_mutex)
2983 {
2984         mutex_unlock(&proto_list_mutex);
2985 }
2986
2987 static char proto_method_implemented(const void *method)
2988 {
2989         return method == NULL ? 'n' : 'y';
2990 }
2991 static long sock_prot_memory_allocated(struct proto *proto)
2992 {
2993         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2994 }
2995
2996 static char *sock_prot_memory_pressure(struct proto *proto)
2997 {
2998         return proto->memory_pressure != NULL ?
2999         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3000 }
3001
3002 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3003 {
3004
3005         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3006                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3007                    proto->name,
3008                    proto->obj_size,
3009                    sock_prot_inuse_get(seq_file_net(seq), proto),
3010                    sock_prot_memory_allocated(proto),
3011                    sock_prot_memory_pressure(proto),
3012                    proto->max_header,
3013                    proto->slab == NULL ? "no" : "yes",
3014                    module_name(proto->owner),
3015                    proto_method_implemented(proto->close),
3016                    proto_method_implemented(proto->connect),
3017                    proto_method_implemented(proto->disconnect),
3018                    proto_method_implemented(proto->accept),
3019                    proto_method_implemented(proto->ioctl),
3020                    proto_method_implemented(proto->init),
3021                    proto_method_implemented(proto->destroy),
3022                    proto_method_implemented(proto->shutdown),
3023                    proto_method_implemented(proto->setsockopt),
3024                    proto_method_implemented(proto->getsockopt),
3025                    proto_method_implemented(proto->sendmsg),
3026                    proto_method_implemented(proto->recvmsg),
3027                    proto_method_implemented(proto->sendpage),
3028                    proto_method_implemented(proto->bind),
3029                    proto_method_implemented(proto->backlog_rcv),
3030                    proto_method_implemented(proto->hash),
3031                    proto_method_implemented(proto->unhash),
3032                    proto_method_implemented(proto->get_port),
3033                    proto_method_implemented(proto->enter_memory_pressure));
3034 }
3035
3036 static int proto_seq_show(struct seq_file *seq, void *v)
3037 {
3038         if (v == &proto_list)
3039                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3040                            "protocol",
3041                            "size",
3042                            "sockets",
3043                            "memory",
3044                            "press",
3045                            "maxhdr",
3046                            "slab",
3047                            "module",
3048                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3049         else
3050                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3051         return 0;
3052 }
3053
3054 static const struct seq_operations proto_seq_ops = {
3055         .start  = proto_seq_start,
3056         .next   = proto_seq_next,
3057         .stop   = proto_seq_stop,
3058         .show   = proto_seq_show,
3059 };
3060
3061 static int proto_seq_open(struct inode *inode, struct file *file)
3062 {
3063         return seq_open_net(inode, file, &proto_seq_ops,
3064                             sizeof(struct seq_net_private));
3065 }
3066
3067 static const struct file_operations proto_seq_fops = {
3068         .owner          = THIS_MODULE,
3069         .open           = proto_seq_open,
3070         .read           = seq_read,
3071         .llseek         = seq_lseek,
3072         .release        = seq_release_net,
3073 };
3074
3075 static __net_init int proto_init_net(struct net *net)
3076 {
3077         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3078                 return -ENOMEM;
3079
3080         return 0;
3081 }
3082
3083 static __net_exit void proto_exit_net(struct net *net)
3084 {
3085         remove_proc_entry("protocols", net->proc_net);
3086 }
3087
3088
3089 static __net_initdata struct pernet_operations proto_net_ops = {
3090         .init = proto_init_net,
3091         .exit = proto_exit_net,
3092 };
3093
3094 static int __init proto_init(void)
3095 {
3096         return register_pernet_subsys(&proto_net_ops);
3097 }
3098
3099 subsys_initcall(proto_init);
3100
3101 #endif /* PROC_FS */