netvm: propagate page->pfmemalloc from skb_alloc_page to skb
net/core/sock.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

#ifdef CONFIG_MEMCG_KMEM
int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	struct proto *proto;
	int ret = 0;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry(proto, &proto_list, node) {
		if (proto->init_cgroup) {
			ret = proto->init_cgroup(memcg, ss);
			if (ret)
				goto out;
		}
	}

	mutex_unlock(&proto_list_mutex);
	return ret;
out:
	list_for_each_entry_continue_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
	return ret;
}

void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
{
	struct proto *proto;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
}
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
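
/*
 * Example (illustrative sketch, not part of this file): a kernel socket
 * that must keep making forward progress under memory pressure -- e.g.
 * the transport used by swap-over-NFS -- would be flagged once at setup
 * time.  The helper name xs_setup_memalloc() below is hypothetical.
 *
 *	static void xs_setup_memalloc(struct socket *sock)
 *	{
 *		sk_set_memalloc(sock->sk);	// sets SOCK_MEMALLOC and ORs
 *						// __GFP_MEMALLOC into
 *						// sk->sk_allocation
 *	}
 *
 * The flag is cleared again with sk_clear_memalloc() before the socket
 * returns to ordinary use, keeping the static key count balanced.
 */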

#if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
#if !defined(CONFIG_NETPRIO_CGROUP)
int net_prio_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_prio_subsys_id);
#endif
#endif

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
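
/*
 * Example (illustrative, from the userspace side): SO_RCVTIMEO and
 * SO_SNDTIMEO feed the struct timeval below through sock_set_timeout(),
 * which rounds the microseconds up to whole jiffies.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	// 2.5s becomes 2*HZ + ceil(500000 / (1000000/HZ)) jiffies;
 *	// {0, 0} means "wait forever" (MAX_SCHEDULE_TIMEOUT).
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */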

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue. Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	/* We escape from an RCU-protected region here; make sure we
	 * don't leak a non-refcounted dst.
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
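
/*
 * Example (illustrative sketch): a datagram protocol's receive handler
 * typically hands each skb to the owning socket this way and frees it
 * itself on failure.  my_proto_rcv() and my_lookup_sock() are
 * hypothetical names used only for illustration.
 *
 *	static int my_proto_rcv(struct sk_buff *skb)
 *	{
 *		struct sock *sk = my_lookup_sock(skb);
 *
 *		if (!sk || sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);	// -ENOMEM/-ENOBUFS: drop was counted
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */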

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
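
/*
 * Example (illustrative, from the userspace side): because SO_RCVBUF is
 * doubled on the way in to cover struct sk_buff overhead, reading the
 * option back returns the value actually in force.
 *
 *	int req = 65536, eff;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *	// eff is now 131072 (2 * req), subject to the sysctl_rmem_max cap
 */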


void cred_to_ucred(struct pid *pid, const struct cred *cred,
		   struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid(current_ns, cred->euid);
		ucred->gid = from_kgid(current_ns, cred->egid);
	}
}
EXPORT_SYMBOL_GPL(cred_to_ucred);

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

/*
 * Caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
 * nodes unmodified. Special care is taken when initializing the object to zero.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
	if (offsetof(struct sock, sk_node.next) != 0)
		memset(sk, 0, offsetof(struct sock, sk_node.next));
	memset(&sk->sk_node.pprev, 0,
	       size - offsetof(struct sock, sk_node.pprev));
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

#ifdef CONFIG_CGROUPS
void sock_update_classid(struct sock *sk)
{
	u32 classid;

	rcu_read_lock();  /* doing current task, which cannot vanish. */
	classid = task_cls_classid(current);
	rcu_read_unlock();
	if (classid && classid != sk->sk_classid)
		sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);

void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
{
	if (in_interrupt())
		return;

	sk->sk_cgrp_prioidx = task_netprioidx(task);
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(sk);
		sock_update_netprioidx(sk, current);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);
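
/*
 * Example (illustrative sketch): a protocol family's ->create() hook
 * typically allocates its sock with sk_alloc() and then runs generic
 * initialisation.  my_proto and my_family_create() are hypothetical
 * names used only for illustration.
 *
 *	static int my_family_create(struct net *net, struct socket *sock,
 *				    int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);	// generic field setup
 *		return 0;
 *	}
 */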

static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc, which tells us whether
	 * some packets are still in some tx queue.
	 * If the count does not drop to zero, sock_wfree() will call
	 * __sk_free(sk) later.
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * The last sock_put should drop the reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking a reference to a stopping namespace
 * is not an option.
 * Take a reference to the socket to remove it from the hash _alive_, and
 * after that destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

static void sk_update_clone(const struct sock *sk, struct sock *newsk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
		sock_update_memcg(newsk);
}

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		spin_lock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still a raw copy of the parent, so invalidate
			 * the destructor and do a plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		sk_update_clone(sk, newsk);

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
	if (totalram_pages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (totalram_pages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

void sock_edemux(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
EXPORT_SYMBOL(sock_kfree_s);
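
/*
 * Example (illustrative sketch): per-socket option state is allocated
 * and freed in matched pairs so sk_omem_alloc stays balanced against
 * sysctl_optmem_max.  The struct name my_opts is hypothetical.
 *
 *	struct my_opts *opts;
 *
 *	opts = sock_kmalloc(sk, sizeof(*opts), GFP_KERNEL);
 *	if (!opts)
 *		return -ENOBUFS;	// over optmem_max, or kmalloc failed
 *	...
 *	sock_kfree_s(sk, opts, sizeof(*opts));	// size must match the alloc
 */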

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;
	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;

	err = -EMSGSIZE;
	if (npages > MAX_SKB_FRAGS)
		goto failure;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					__skb_fill_page_desc(skb, i,
							page, 0,
							(data_len >= PAGE_SIZE ?
							 PAGE_SIZE :
							 data_len));
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
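
/*
 * Example (illustrative sketch): a datagram sendmsg() implementation
 * usually obtains its skb through this helper, which blocks (subject to
 * SO_SNDTIMEO) until sk_wmem_alloc drops below sk_sndbuf.  "reserve" is
 * an illustrative headroom variable.
 *
 *	int err;
 *	struct sk_buff *skb;
 *
 *	skb = sock_alloc_send_skb(sk, len + reserve,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;	// -EAGAIN, -EPIPE, -EINTR, ...
 *	// copy the payload, build headers, then hand off to the device
 */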

static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
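
/*
 * Example (illustrative sketch): a blocking recvmsg() loop built on
 * sk_wait_data().  The timeout is consumed across iterations, so a
 * SO_RCVTIMEO of 2s bounds the whole wait, not each wakeup.
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);	// called with sk locked
 *	}
 */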

/**
 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @kind: allocation type
 *
 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 * rmem allocation. This function assumes that protocols which have
 * memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;
	int parent_status = UNDER_LIMIT;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

	allocated = sk_memory_allocated_add(sk, amt, &parent_status);

	/* Under limit. */
	if (parent_status == UNDER_LIMIT &&
	    allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. (we or our parents) */
	if ((parent_status > SOFT_LIMIT) ||
	    allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit (we or our parents) */
	if ((parent_status == OVER_LIMIT) ||
	    (allocated > sk_prot_mem_limits(sk, 2)))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

	sk_memory_allocated_sub(sk, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
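
/*
 * Illustrative sketch (not part of sock.c): protocols rarely call
 * __sk_mem_schedule() directly. The sk_wmem_schedule()/sk_rmem_schedule()
 * wrappers in include/net/sock.h first try to satisfy the request from
 * the per-socket prepayment (sk_forward_alloc) and fall back to the
 * global accounting only when that runs out. A minimal model of the
 * send-side wrapper ("example_wmem_schedule" is hypothetical):
 */
static inline int example_wmem_schedule(struct sock *sk, int size)
{
	if (!sk_has_account(sk))	/* protocol does no memory accounting */
		return 1;
	return size <= sk->sk_forward_alloc ||	/* covered by the prepayment */
	       __sk_mem_schedule(sk, size, SK_MEM_SEND);
}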

/**
 * __sk_mem_reclaim - reclaim memory_allocated
 * @sk: socket
 */
void __sk_mem_reclaim(struct sock *sk)
{
	sk_memory_allocated_sub(sk,
				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
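
/*
 * Worked example (illustrative, not from the original source): on a
 * system with 4 KiB pages, SK_MEM_QUANTUM is 4096 and
 * SK_MEM_QUANTUM_SHIFT is 12. A socket holding sk_forward_alloc = 10000
 * returns 10000 >> 12 = 2 quanta (8192 bytes) to the protocol's global
 * pool, and the mask keeps 10000 & 4095 = 1808 bytes prepaid for future
 * allocations.
 */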


/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
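
/*
 * Illustrative sketch (not part of sock.c): a protocol fills the
 * proto_ops slots it does not support with the sock_no_* stubs above,
 * so the corresponding system calls fail cleanly with -EOPNOTSUPP.
 * The example_* methods are hypothetical and their definitions are
 * omitted; PF_PACKET stands in for the protocol's real family.
 */
static const struct proto_ops example_dgram_ops = {
	.family		= PF_PACKET,
	.owner		= THIS_MODULE,
	.release	= example_release,
	.bind		= example_bind,
	.connect	= sock_no_connect,	/* unconnected datagrams only */
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= example_getname,
	.poll		= datagram_poll,
	.ioctl		= example_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= example_sendmsg,
	.recvmsg	= example_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};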

/*
 * Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

static void sock_def_readable(struct sock *sk, int len)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress. --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
						POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}
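
/*
 * Illustrative sketch (not part of sock.c): the lockless wq_has_sleeper()
 * test in the callbacks above is only safe because the sleeping side
 * issues a matching memory barrier. A protocol's poll() method follows
 * this pattern, where sock_poll_wait() provides the barrier that pairs
 * with the one inside wq_has_sleeper(); "example_poll" is hypothetical.
 */
static unsigned int example_poll(struct file *file, struct socket *sock,
				 poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask = 0;

	/* Register on the wait queue; implies a memory barrier. */
	sock_poll_wait(file, sk_sleep(sk), wait);

	/* Only after the barrier is it safe to test the conditions. */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;
	if (sock_writeable(sk))
		mask |= POLLOUT | POLLWRNORM;
	return mask;
}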

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
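
/*
 * Illustrative sketch (not part of sock.c): sk_reset_timer() takes a
 * reference on the sock when it arms an idle timer (mod_timer() returned
 * 0), and sk_stop_timer() drops it only when it actually deleted a
 * pending timer, so the sock cannot be freed while the timer might still
 * fire. A handler that does not re-arm must drop the reference itself.
 * The example_* names are hypothetical.
 */
static void example_timer_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	/* ... protocol timeout processing ... */
	bh_unlock_sock(sk);
	sock_put(sk);	/* pairs with sock_hold() in sk_reset_timer() */
}

static void example_arm_timer(struct sock *sk)
{
	/* assumes setup_timer(&sk->sk_timer, example_timer_handler,
	 * (unsigned long)sk) was done at init time */
	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}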

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
	} else
		sk->sk_wq = NULL;

	spin_lock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_sndmsg_page = NULL;
	sk->sk_sndmsg_off = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
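
/*
 * Illustrative sketch (not part of sock.c): a protocol's socket-creation
 * path calls sock_init_data() for the generic defaults, then overrides
 * only the callbacks it cares about. The example_* names are
 * hypothetical.
 */
static void example_data_ready(struct sock *sk, int len)
{
	/* notify a protocol-private consumer instead of the default wakeup */
}

static void example_init_sock(struct socket *sock, struct sock *sk)
{
	sock_init_data(sock, sk);

	sk->sk_data_ready = example_data_ready;	/* replaces sock_def_readable */
	sk->sk_allocation = GFP_ATOMIC;		/* sketch: used in atomic context */
}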

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sk->sk_lock.owned = 0;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block. It returns false if the fast path was taken:
 *   sk_lock.slock locked, owned = 0, BH disabled
 * and true if the slow path was taken:
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
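
/*
 * Illustrative sketch (not part of sock.c): lock_sock_fast() pairs with
 * unlock_sock_fast(), which releases whichever form of the lock was
 * actually taken. It is useful when the critical section is only a few
 * loads and stores; "example_read_drops" is hypothetical.
 */
static int example_read_drops(struct sock *sk)
{
	bool slow;
	int drops;

	slow = lock_sock_fast(sk);
	drops = atomic_read(&sk->sk_drops);
	unlock_sock_fast(sk, slow);
	return drops;
}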

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);
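
/*
 * Illustrative sketch (not part of sock.c): these two helpers back the
 * SIOCGSTAMP and SIOCGSTAMPNS ioctls. From user space, the receive time
 * of the last packet can be read as follows:
 *
 *	struct timeval tv;
 *
 *	if (ioctl(sockfd, SIOCGSTAMP, &tv) == 0)
 *		printf("last packet: %ld.%06ld\n",
 *		       (long)tv.tv_sec, (long)tv.tv_usec);
 *
 * -ENOENT is returned when no packet has been received yet: sk_stamp
 * still holds the -1 sentinel that sock_init_data() stored.
 */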

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

/*
 * Get a socket option on a socket.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 * Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight, because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed
	 * the socket. They will reach the receive queue and be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif
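
/*
 * Illustrative sketch (not part of sock.c): protocols bump these per-cpu
 * counters as sockets enter and leave their lookup tables, and the sums
 * surface in /proc/net/protocols. "example_hash"/"example_unhash" are
 * hypothetical ->hash()/->unhash() methods of a struct proto.
 */
static void example_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup table ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
	/* ... remove sk from the lookup table ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}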

static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				pr_crit("%s: Can't create request sock SLAB cache!\n",
					prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	if (prot->rsk_prot)
		kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
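
/*
 * Illustrative sketch (not part of sock.c): a minimal protocol registers
 * its struct proto at module init so proto_register() can create the
 * backing slab cache and assign a /proc/net/protocols slot. The
 * example_* names are hypothetical.
 */
struct example_sock {
	struct sock sk;		/* struct sock must come first */
	/* protocol-private state follows */
};

static struct proto example_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct example_sock),
};

static int __init example_init(void)
{
	return proto_register(&example_proto, 1);	/* 1 => allocate a slab */
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}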

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(prot->rsk_prot->slab_name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	proc_net_remove(net, "protocols");
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */