/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/busy_poll.h>
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has it in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);
/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has it in all user
 * namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);
/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has it over the network
 * namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
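
/*
 * Usage sketch (illustrative only, not part of this file): a protocol
 * gating a privileged operation on a socket. example_may_set_mark() is a
 * hypothetical helper, not a kernel API.
 *
 *	static bool example_may_set_mark(const struct sock *sk)
 *	{
 *		return sk_net_capable(sk, CAP_NET_ADMIN);
 *	}
 *
 * sk_capable() checks against &init_user_ns, sk_ns_capable() against a
 * caller-chosen namespace, and sk_net_capable() against the user
 * namespace owning the socket's network namespace.
 */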
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
	"sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
	"sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
	"sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
	"sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
	"sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
	"sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
	"sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
	"sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
	"sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
	"sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
	"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
	"sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
	"sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	"slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
	"slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
	"slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
	"slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
	"slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
	"slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
	"slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
	"slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
	"slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
	"slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
	"slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
	"slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
	"slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
	"slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	"clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
	"clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
	"clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
	"clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
	"clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
	"clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
	"clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
	"clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
	"clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
	"clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
	"clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
	"clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
	"clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
	"clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
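
/* Worked example (approximate): SKB_TRUESIZE(256) is 256 bytes of payload
 * plus the aligned struct sk_buff and skb_shared_info overhead, so on a
 * typical 64-bit build it comes to several hundred bytes. With
 * _SK_MEM_PACKETS = 256, SK_WMEM_MAX/SK_RMEM_MAX therefore default to on
 * the order of 200 KB -- room for 256 small packets regardless of the
 * per-arch size of the sk_buff metadata.
 */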
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);
/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned long pflags = current->flags;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	current->flags |= PF_MEMALLOC;
	ret = sk->sk_backlog_rcv(sk, skb);
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
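
/* Worked example of the conversion above, assuming HZ = 1000: a timeout of
 * { .tv_sec = 1, .tv_usec = 500 } yields 1 * 1000 jiffies for the seconds
 * plus (500 + 999) / 1000 = 1 jiffy for the microseconds, i.e. 1001
 * jiffies; any sub-tick remainder rounds up, never down to zero.
 */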
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}
static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we don't leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);
static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif
	return ret;
}
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif
	return ret;
}
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}
bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);
/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't return an error on this; BSD doesn't, and if you
		 * think about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't return an error on this; BSD doesn't, and if you
		 * think about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.  Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;
	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}
		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;
	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
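
/*
 * Usage sketch from user space (illustrative only): SO_RCVBUF values are
 * doubled on the way in to cover struct sk_buff overhead, so getsockopt()
 * reports back roughly twice the requested size.
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	// val is now about 131072, clamped by sysctl_rmem_max
 */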
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
			!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;

	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;
		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}
/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}
void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}
/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt))
			get_net(net);
		sock_net_set(sk, net);
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);
/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_destruct(struct sock *sk)
{
	if (sock_flag(sk, SOCK_RCU_FREE))
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}
static void __sk_free(struct sock *sk)
{
	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		if (likely(newsk->sk_net_refcnt))
			get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);

		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		atomic_set(&newsk->sk_drops, 0);
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
			/* It is still raw copy of parent, so invalidate
			 * destructor and make plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}
		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		atomic64_set(&newsk->sk_cookie, 0);
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (mem_cgroup_sockets_enabled && sk->sk_memcg)
			sock_update_memcg(newsk);

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() wont free this sock until
	 * all in-flight packets are completed
	 */
	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
void skb_orphan_partial(struct sk_buff *skb)
{
	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
	 * so we do not completely orphan skb, but transfer all
	 * accounted bytes but one, to avoid unexpected reorders.
	 */
	if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
	    || skb->destructor == tcp_wfree
#endif
		) {
		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
		skb->truesize = 1;
	} else {
		skb_orphan(skb);
	}
}
EXPORT_SYMBOL(skb_orphan_partial);
/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);
kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);
/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);
/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);
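
/*
 * Usage sketch (illustrative only): option memory is charged against
 * sk_omem_alloc, so it must be returned with sock_kfree_s() (or
 * sock_kzfree_s() for sensitive data) using the same size.
 *
 *	void *opt = sock_kmalloc(sk, size, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, size);
 */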
/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kzfree(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think, these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}
/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);
/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less or equal than PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);
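
/*
 * Usage sketch (illustrative only): a typical caller reserves room, copies
 * payload in, then advances the offset only once the copy has succeeded.
 *
 *	if (!skb_page_frag_refill(copy, pfrag, GFP_KERNEL))
 *		return -ENOMEM;
 *	memcpy(page_address(pfrag->page) + pfrag->offset, from, copy);
 *	pfrag->offset += copy;
 */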
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);

	/*
	 * Doing the zeroing here guarantee we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

	allocated = sk_memory_allocated_add(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

	sk_memory_allocated_sub(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
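
/* Worked example (illustrative): with SK_MEM_QUANTUM equal to PAGE_SIZE
 * (4096 on most arches), a request of size 3000 gives sk_mem_pages(3000)
 * = 1, so sk_forward_alloc grows by 4096 and one quantum is charged to
 * the protocol's memory_allocated; a request of 5000 would charge two.
 */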
/**
 *	__sk_mem_reclaim - reclaim memory_allocated
 *	@sk: socket
 *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= SK_MEM_QUANTUM_SHIFT;
	sk_memory_allocated_sub(sk, amount);
	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
						POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

static void sock_def_destruct(struct sock *sk)
{
}
void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
	} else
		sk->sk_wq	=	NULL;

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid		=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Returns false if the fast path is taken:
 *   sk_lock.slock locked, owned = 0, BH disabled
 * Returns true if the slow path is taken:
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH; the fast path returns
		 * with slock still held.
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
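
/* Illustrative sketch, not part of this file: lock_sock_fast() must be
 * paired with unlock_sock_fast(), which needs the returned bool to know
 * whether to undo the fast path (spin unlock + BH enable) or the slow
 * path (a full release_sock()). Hypothetical function name.
 */
static void example_fast_section(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* ... a very short critical section that never blocks ... */

	unlock_sock_fast(sk, slow);
}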
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);
int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);
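
/* Illustrative sketch, not part of this file: protocols commonly reach the
 * two helpers above from their ioctl method when userspace issues
 * SIOCGSTAMP/SIOCGSTAMPNS. Hypothetical handler.
 */
static int example_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}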
void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
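
/* Illustrative sketch, not part of this file: a recvmsg implementation can
 * service MSG_ERRQUEUE with the helper above before touching its normal
 * receive queue. The cmsg level/type are per-family; SOL_IP/IP_RECVERR is
 * just one plausible choice. Hypothetical function.
 */
static int example_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int noblock, int flags, int *addr_len)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);

	/* ... normal receive path ... */
	return -EAGAIN;
}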
/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);
/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
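
/* Illustrative sketch, not part of this file: families that keep their
 * get/setsockopt logic in struct proto typically plug the generic helpers
 * above straight into their proto_ops table. Hypothetical ops table;
 * remaining handlers elided.
 */
static const struct proto_ops example_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.recvmsg	   = sock_common_recvmsg,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};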
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network still does.
	 * Step one, detach it from networking:
	 *
	 * A. Remove it from the hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed
	 * the socket. They will reach the receive queue and will be purged
	 * by the socket destructor.
	 *
	 * Also, we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
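
/* Illustrative sketch, not part of this file: a protocol's close method
 * usually finishes protocol-private teardown and then delegates the
 * unhash/orphan/put sequence to sk_common_release(). Hypothetical function.
 */
static void example_close(struct sock *sk, long timeout)
{
	/* ... flush queues, cancel timers, etc. ... */
	sk_common_release(sk);
}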
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif
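
/* Illustrative sketch, not part of this file: protocols account sockets in
 * these per-cpu counters from their hash/unhash methods (callers disable
 * preemption, e.g. by holding a lock with BHs off); /proc/net/protocols
 * reports the per-netns sums. Hypothetical functions.
 */
static void example_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup table under its lock ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
	/* ... remove sk from the lookup table under its lock ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}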
static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}
static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   prot->slab_flags, NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}
int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	req_prot_cleanup(prot->rsk_prot);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
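
/* Illustrative sketch, not part of this file: a protocol module pairs
 * proto_register() in its init path with proto_unregister() on exit.
 * Passing alloc_slab == 1 asks for a kmem cache of obj_size sockets.
 * All example_* names are hypothetical.
 */
static struct proto example_registered_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),
};

static int __init example_module_init(void)
{
	return proto_register(&example_registered_proto, 1);
}

static void __exit example_module_exit(void)
{
	proto_unregister(&example_registered_proto);
}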
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}
static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}
static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}
static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};
static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */