netvm: allow the use of __GFP_MEMALLOC by specific sockets
[deliverable/linux.git] / net / core / sock.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

#ifdef CONFIG_MEMCG_KMEM
int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	struct proto *proto;
	int ret = 0;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry(proto, &proto_list, node) {
		if (proto->init_cgroup) {
			ret = proto->init_cgroup(memcg, ss);
			if (ret)
				goto out;
		}
	}

	mutex_unlock(&proto_list_mutex);
	return ret;
out:
	list_for_each_entry_continue_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
	return ret;
}

void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
{
	struct proto *proto;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
}
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

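/*
 * A minimal usage sketch, assuming a swap-over-network style kernel user;
 * the socket setup shown here (sock_create_kern() arguments, helper name)
 * is an illustration and not taken from this file. Marking the transport
 * socket with sk_set_memalloc() lets its allocations dip into the
 * __GFP_MEMALLOC emergency reserves.
 */
static int example_open_swap_transport(struct socket **sockp)
{
	struct socket *sock;
	int err;

	err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0)
		return err;

	sk_set_memalloc(sock->sk);	/* allow __GFP_MEMALLOC for this socket */
	*sockp = sock;
	return 0;
}
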
#if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
#if !defined(CONFIG_NETPRIO_CGROUP)
int net_prio_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_prio_subsys_id);
#endif
#endif

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}

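/*
 * A userspace usage sketch (assumed application code, not part of this
 * file): sock_set_timeout() above is what ultimately services a plain
 * setsockopt(SO_RCVTIMEO) call like this one, rounding tv_usec up to a
 * whole number of jiffies on the kernel side.
 */
#include <sys/socket.h>
#include <sys/time.h>

static int example_set_recv_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };	/* 2.5s */

	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}
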
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue. Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	/* we escape from rcu protected region, make sure we dont leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
408
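/*
 * A minimal caller sketch (assumed protocol code, not taken from this
 * file): a receive handler hands a fully built skb to
 * sock_queue_rcv_skb() and must free the skb itself when queueing fails.
 */
static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sock_queue_rcv_skb(sk, skb);
	if (err < 0) {
		/* -ENOMEM: receive buffer full, -ENOBUFS: no rmem quota */
		kfree_skb(skb);
		return err;
	}
	return 0;
}
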
58a5a7b9 409int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
f0088a50
DV
410{
411 int rc = NET_RX_SUCCESS;
412
fda9ef5d 413 if (sk_filter(sk, skb))
f0088a50
DV
414 goto discard_and_relse;
415
416 skb->dev = NULL;
417
f545a38f 418 if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
c377411f
ED
419 atomic_inc(&sk->sk_drops);
420 goto discard_and_relse;
421 }
58a5a7b9
ACM
422 if (nested)
423 bh_lock_sock_nested(sk);
424 else
425 bh_lock_sock(sk);
a5b5bb9a
IM
426 if (!sock_owned_by_user(sk)) {
427 /*
428 * trylock + unlock semantics:
429 */
430 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
431
c57943a1 432 rc = sk_backlog_rcv(sk, skb);
a5b5bb9a
IM
433
434 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
f545a38f 435 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
8eae939f
ZY
436 bh_unlock_sock(sk);
437 atomic_inc(&sk->sk_drops);
438 goto discard_and_relse;
439 }
440
f0088a50
DV
441 bh_unlock_sock(sk);
442out:
443 sock_put(sk);
444 return rc;
445discard_and_relse:
446 kfree_skb(skb);
447 goto out;
448}
449EXPORT_SYMBOL(sk_receive_skb);
450
ea94ff3b
KK
451void sk_reset_txq(struct sock *sk)
452{
453 sk_tx_queue_clear(sk);
454}
455EXPORT_SYMBOL(sk_reset_txq);
456
f0088a50
DV
457struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
458{
b6c6712a 459 struct dst_entry *dst = __sk_dst_get(sk);
f0088a50
DV
460
461 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
e022f0b4 462 sk_tx_queue_clear(sk);
a9b3cd7f 463 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
f0088a50
DV
464 dst_release(dst);
465 return NULL;
466 }
467
468 return dst;
469}
470EXPORT_SYMBOL(__sk_dst_check);
471
472struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
473{
474 struct dst_entry *dst = sk_dst_get(sk);
475
476 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
477 sk_dst_reset(sk);
478 dst_release(dst);
479 return NULL;
480 }
481
482 return dst;
483}
484EXPORT_SYMBOL(sk_dst_check);
485
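/*
 * A usage sketch, under the assumption of a protocol that caches a route
 * on the socket: callers of sk_dst_check() typically revalidate before
 * each transmit and fall back to a fresh lookup when the dst has gone
 * obsolete. example_reroute() is an assumed protocol-specific helper.
 */
static struct dst_entry *example_get_valid_dst(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_check(sk, cookie);

	if (!dst)
		dst = example_reroute(sk);	/* assumed route lookup helper */
	return dst;
}
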
4878809f
DM
486static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
487{
488 int ret = -ENOPROTOOPT;
489#ifdef CONFIG_NETDEVICES
3b1e0a65 490 struct net *net = sock_net(sk);
4878809f
DM
491 char devname[IFNAMSIZ];
492 int index;
493
494 /* Sorry... */
495 ret = -EPERM;
496 if (!capable(CAP_NET_RAW))
497 goto out;
498
499 ret = -EINVAL;
500 if (optlen < 0)
501 goto out;
502
503 /* Bind this socket to a particular device like "eth0",
504 * as specified in the passed interface name. If the
505 * name is "" or the option length is zero the socket
506 * is not bound.
507 */
508 if (optlen > IFNAMSIZ - 1)
509 optlen = IFNAMSIZ - 1;
510 memset(devname, 0, sizeof(devname));
511
512 ret = -EFAULT;
513 if (copy_from_user(devname, optval, optlen))
514 goto out;
515
000ba2e4
DM
516 index = 0;
517 if (devname[0] != '\0') {
bf8e56bf 518 struct net_device *dev;
4878809f 519
bf8e56bf
ED
520 rcu_read_lock();
521 dev = dev_get_by_name_rcu(net, devname);
522 if (dev)
523 index = dev->ifindex;
524 rcu_read_unlock();
4878809f
DM
525 ret = -ENODEV;
526 if (!dev)
527 goto out;
4878809f
DM
528 }
529
530 lock_sock(sk);
531 sk->sk_bound_dev_if = index;
532 sk_dst_reset(sk);
533 release_sock(sk);
534
535 ret = 0;
536
537out:
538#endif
539
540 return ret;
541}
542
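/*
 * A userspace usage sketch (assumed application code, not part of this
 * file): sock_bindtodevice() above services setsockopt(SO_BINDTODEVICE).
 * Passing an empty name with length zero removes the binding again;
 * CAP_NET_RAW is required either way.
 */
#include <string.h>
#include <sys/socket.h>

static int example_bind_to_eth0(int fd)
{
	const char ifname[] = "eth0";

	return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
			  ifname, strlen(ifname));
}
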
c0ef877b
PE
543static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
544{
545 if (valbool)
546 sock_set_flag(sk, bit);
547 else
548 sock_reset_flag(sk, bit);
549}
550
1da177e4
LT
551/*
552 * This is meant for all protocols to use and covers goings on
553 * at the socket level. Everything here is generic.
554 */
555
556int sock_setsockopt(struct socket *sock, int level, int optname,
b7058842 557 char __user *optval, unsigned int optlen)
1da177e4 558{
2a91525c 559 struct sock *sk = sock->sk;
1da177e4
LT
560 int val;
561 int valbool;
562 struct linger ling;
563 int ret = 0;
4ec93edb 564
1da177e4
LT
565 /*
566 * Options without arguments
567 */
568
4878809f
DM
569 if (optname == SO_BINDTODEVICE)
570 return sock_bindtodevice(sk, optval, optlen);
571
e71a4783
SH
572 if (optlen < sizeof(int))
573 return -EINVAL;
4ec93edb 574
1da177e4
LT
575 if (get_user(val, (int __user *)optval))
576 return -EFAULT;
4ec93edb 577
2a91525c 578 valbool = val ? 1 : 0;
1da177e4
LT
579
580 lock_sock(sk);
581
2a91525c 582 switch (optname) {
e71a4783 583 case SO_DEBUG:
2a91525c 584 if (val && !capable(CAP_NET_ADMIN))
e71a4783 585 ret = -EACCES;
2a91525c 586 else
c0ef877b 587 sock_valbool_flag(sk, SOCK_DBG, valbool);
e71a4783
SH
588 break;
589 case SO_REUSEADDR:
4a17fd52 590 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
e71a4783
SH
591 break;
592 case SO_TYPE:
49c794e9 593 case SO_PROTOCOL:
0d6038ee 594 case SO_DOMAIN:
e71a4783
SH
595 case SO_ERROR:
596 ret = -ENOPROTOOPT;
597 break;
598 case SO_DONTROUTE:
c0ef877b 599 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
e71a4783
SH
600 break;
601 case SO_BROADCAST:
602 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
603 break;
604 case SO_SNDBUF:
605 /* Don't error on this BSD doesn't and if you think
82981930
ED
606 * about it this is right. Otherwise apps have to
607 * play 'guess the biggest size' games. RCVBUF/SNDBUF
608 * are treated in BSD as hints
609 */
610 val = min_t(u32, val, sysctl_wmem_max);
b0573dea 611set_sndbuf:
e71a4783 612 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
82981930
ED
613 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
614 /* Wake up sending tasks if we upped the value. */
e71a4783
SH
615 sk->sk_write_space(sk);
616 break;
1da177e4 617
e71a4783
SH
618 case SO_SNDBUFFORCE:
619 if (!capable(CAP_NET_ADMIN)) {
620 ret = -EPERM;
621 break;
622 }
623 goto set_sndbuf;
b0573dea 624
e71a4783
SH
625 case SO_RCVBUF:
626 /* Don't error on this BSD doesn't and if you think
82981930
ED
627 * about it this is right. Otherwise apps have to
628 * play 'guess the biggest size' games. RCVBUF/SNDBUF
629 * are treated in BSD as hints
630 */
631 val = min_t(u32, val, sysctl_rmem_max);
b0573dea 632set_rcvbuf:
e71a4783
SH
633 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
634 /*
635 * We double it on the way in to account for
636 * "struct sk_buff" etc. overhead. Applications
637 * assume that the SO_RCVBUF setting they make will
638 * allow that much actual data to be received on that
639 * socket.
640 *
641 * Applications are unaware that "struct sk_buff" and
642 * other overheads allocate from the receive buffer
643 * during socket buffer allocation.
644 *
645 * And after considering the possible alternatives,
646 * returning the value we actually used in getsockopt
647 * is the most desirable behavior.
648 */
82981930 649 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
e71a4783
SH
650 break;
651
652 case SO_RCVBUFFORCE:
653 if (!capable(CAP_NET_ADMIN)) {
654 ret = -EPERM;
1da177e4 655 break;
e71a4783
SH
656 }
657 goto set_rcvbuf;
1da177e4 658
e71a4783 659 case SO_KEEPALIVE:
1da177e4 660#ifdef CONFIG_INET
e71a4783
SH
661 if (sk->sk_protocol == IPPROTO_TCP)
662 tcp_set_keepalive(sk, valbool);
1da177e4 663#endif
e71a4783
SH
664 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
665 break;
666
667 case SO_OOBINLINE:
668 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
669 break;
670
671 case SO_NO_CHECK:
672 sk->sk_no_check = valbool;
673 break;
674
675 case SO_PRIORITY:
676 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
677 sk->sk_priority = val;
678 else
679 ret = -EPERM;
680 break;
681
682 case SO_LINGER:
683 if (optlen < sizeof(ling)) {
684 ret = -EINVAL; /* 1003.1g */
1da177e4 685 break;
e71a4783 686 }
2a91525c 687 if (copy_from_user(&ling, optval, sizeof(ling))) {
e71a4783 688 ret = -EFAULT;
1da177e4 689 break;
e71a4783
SH
690 }
691 if (!ling.l_onoff)
692 sock_reset_flag(sk, SOCK_LINGER);
693 else {
1da177e4 694#if (BITS_PER_LONG == 32)
e71a4783
SH
695 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
696 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1da177e4 697 else
e71a4783
SH
698#endif
699 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
700 sock_set_flag(sk, SOCK_LINGER);
701 }
702 break;
703
704 case SO_BSDCOMPAT:
705 sock_warn_obsolete_bsdism("setsockopt");
706 break;
707
708 case SO_PASSCRED:
709 if (valbool)
710 set_bit(SOCK_PASSCRED, &sock->flags);
711 else
712 clear_bit(SOCK_PASSCRED, &sock->flags);
713 break;
714
715 case SO_TIMESTAMP:
92f37fd2 716 case SO_TIMESTAMPNS:
e71a4783 717 if (valbool) {
92f37fd2
ED
718 if (optname == SO_TIMESTAMP)
719 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
720 else
721 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
e71a4783 722 sock_set_flag(sk, SOCK_RCVTSTAMP);
20d49473 723 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
92f37fd2 724 } else {
e71a4783 725 sock_reset_flag(sk, SOCK_RCVTSTAMP);
92f37fd2
ED
726 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
727 }
e71a4783
SH
728 break;
729
20d49473
PO
730 case SO_TIMESTAMPING:
731 if (val & ~SOF_TIMESTAMPING_MASK) {
f249fb78 732 ret = -EINVAL;
20d49473
PO
733 break;
734 }
735 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
736 val & SOF_TIMESTAMPING_TX_HARDWARE);
737 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
738 val & SOF_TIMESTAMPING_TX_SOFTWARE);
739 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
740 val & SOF_TIMESTAMPING_RX_HARDWARE);
741 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
742 sock_enable_timestamp(sk,
743 SOCK_TIMESTAMPING_RX_SOFTWARE);
744 else
745 sock_disable_timestamp(sk,
08e29af3 746 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
20d49473
PO
747 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
748 val & SOF_TIMESTAMPING_SOFTWARE);
749 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
750 val & SOF_TIMESTAMPING_SYS_HARDWARE);
751 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
752 val & SOF_TIMESTAMPING_RAW_HARDWARE);
753 break;
754
e71a4783
SH
755 case SO_RCVLOWAT:
756 if (val < 0)
757 val = INT_MAX;
758 sk->sk_rcvlowat = val ? : 1;
759 break;
760
761 case SO_RCVTIMEO:
762 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
763 break;
764
765 case SO_SNDTIMEO:
766 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
767 break;
1da177e4 768
e71a4783
SH
769 case SO_ATTACH_FILTER:
770 ret = -EINVAL;
771 if (optlen == sizeof(struct sock_fprog)) {
772 struct sock_fprog fprog;
1da177e4 773
e71a4783
SH
774 ret = -EFAULT;
775 if (copy_from_user(&fprog, optval, sizeof(fprog)))
1da177e4 776 break;
e71a4783
SH
777
778 ret = sk_attach_filter(&fprog, sk);
779 }
780 break;
781
782 case SO_DETACH_FILTER:
55b33325 783 ret = sk_detach_filter(sk);
e71a4783 784 break;
1da177e4 785
e71a4783
SH
786 case SO_PASSSEC:
787 if (valbool)
788 set_bit(SOCK_PASSSEC, &sock->flags);
789 else
790 clear_bit(SOCK_PASSSEC, &sock->flags);
791 break;
4a19ec58
LAT
792 case SO_MARK:
793 if (!capable(CAP_NET_ADMIN))
794 ret = -EPERM;
2a91525c 795 else
4a19ec58 796 sk->sk_mark = val;
4a19ec58 797 break;
877ce7c1 798
1da177e4
LT
799 /* We implement the SO_SNDLOWAT etc to
800 not be settable (1003.1g 5.3) */
3b885787 801 case SO_RXQ_OVFL:
8083f0fc 802 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
3b885787 803 break;
6e3e939f
JB
804
805 case SO_WIFI_STATUS:
806 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
807 break;
808
ef64a54f
PE
809 case SO_PEEK_OFF:
810 if (sock->ops->set_peek_off)
811 sock->ops->set_peek_off(sk, val);
812 else
813 ret = -EOPNOTSUPP;
814 break;
3bdc0eba
BG
815
816 case SO_NOFCS:
817 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
818 break;
819
e71a4783
SH
820 default:
821 ret = -ENOPROTOOPT;
822 break;
4ec93edb 823 }
1da177e4
LT
824 release_sock(sk);
825 return ret;
826}
2a91525c 827EXPORT_SYMBOL(sock_setsockopt);
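/*
 * A userspace sketch (assumed application code, not part of this file):
 * as the SO_RCVBUF comment in sock_setsockopt() explains, the kernel
 * doubles the requested value to account for struct sk_buff overhead,
 * and getsockopt() reports that doubled (and clamped) figure back.
 */
#include <stdio.h>
#include <sys/socket.h>

static void example_show_rcvbuf_doubling(int fd)
{
	int req = 128 * 1024, got = 0;
	socklen_t len = sizeof(got);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
	printf("asked for %d, kernel accounts %d\n", req, got);	/* roughly 2*req */
}
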
1da177e4
LT
828
829
3f551f94
EB
830void cred_to_ucred(struct pid *pid, const struct cred *cred,
831 struct ucred *ucred)
832{
833 ucred->pid = pid_vnr(pid);
834 ucred->uid = ucred->gid = -1;
835 if (cred) {
836 struct user_namespace *current_ns = current_user_ns();
837
76b6db01
EB
838 ucred->uid = from_kuid(current_ns, cred->euid);
839 ucred->gid = from_kgid(current_ns, cred->egid);
3f551f94
EB
840 }
841}
3924773a 842EXPORT_SYMBOL_GPL(cred_to_ucred);
3f551f94 843
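/*
 * A userspace sketch (assumed application code, not part of this file):
 * cred_to_ucred() above fills the struct ucred that a connected AF_UNIX
 * peer reads back with getsockopt(SO_PEERCRED); glibc exposes struct
 * ucred under _GNU_SOURCE.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/socket.h>

static void example_show_peer_creds(int unix_fd)
{
	struct ucred peer;
	socklen_t len = sizeof(peer);

	if (getsockopt(unix_fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
		printf("peer pid=%d uid=%d gid=%d\n", peer.pid, peer.uid, peer.gid);
}
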
1da177e4
LT
844int sock_getsockopt(struct socket *sock, int level, int optname,
845 char __user *optval, int __user *optlen)
846{
847 struct sock *sk = sock->sk;
4ec93edb 848
e71a4783 849 union {
4ec93edb
YH
850 int val;
851 struct linger ling;
1da177e4
LT
852 struct timeval tm;
853 } v;
4ec93edb 854
4d0392be 855 int lv = sizeof(int);
1da177e4 856 int len;
4ec93edb 857
e71a4783 858 if (get_user(len, optlen))
4ec93edb 859 return -EFAULT;
e71a4783 860 if (len < 0)
1da177e4 861 return -EINVAL;
4ec93edb 862
50fee1de 863 memset(&v, 0, sizeof(v));
df0bca04 864
2a91525c 865 switch (optname) {
e71a4783
SH
866 case SO_DEBUG:
867 v.val = sock_flag(sk, SOCK_DBG);
868 break;
869
870 case SO_DONTROUTE:
871 v.val = sock_flag(sk, SOCK_LOCALROUTE);
872 break;
873
874 case SO_BROADCAST:
1b23a5df 875 v.val = sock_flag(sk, SOCK_BROADCAST);
e71a4783
SH
876 break;
877
878 case SO_SNDBUF:
879 v.val = sk->sk_sndbuf;
880 break;
881
882 case SO_RCVBUF:
883 v.val = sk->sk_rcvbuf;
884 break;
885
886 case SO_REUSEADDR:
887 v.val = sk->sk_reuse;
888 break;
889
890 case SO_KEEPALIVE:
1b23a5df 891 v.val = sock_flag(sk, SOCK_KEEPOPEN);
e71a4783
SH
892 break;
893
894 case SO_TYPE:
895 v.val = sk->sk_type;
896 break;
897
49c794e9
JE
898 case SO_PROTOCOL:
899 v.val = sk->sk_protocol;
900 break;
901
0d6038ee
JE
902 case SO_DOMAIN:
903 v.val = sk->sk_family;
904 break;
905
e71a4783
SH
906 case SO_ERROR:
907 v.val = -sock_error(sk);
2a91525c 908 if (v.val == 0)
e71a4783
SH
909 v.val = xchg(&sk->sk_err_soft, 0);
910 break;
911
912 case SO_OOBINLINE:
1b23a5df 913 v.val = sock_flag(sk, SOCK_URGINLINE);
e71a4783
SH
914 break;
915
916 case SO_NO_CHECK:
917 v.val = sk->sk_no_check;
918 break;
919
920 case SO_PRIORITY:
921 v.val = sk->sk_priority;
922 break;
923
924 case SO_LINGER:
925 lv = sizeof(v.ling);
1b23a5df 926 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
e71a4783
SH
927 v.ling.l_linger = sk->sk_lingertime / HZ;
928 break;
929
930 case SO_BSDCOMPAT:
931 sock_warn_obsolete_bsdism("getsockopt");
932 break;
933
934 case SO_TIMESTAMP:
92f37fd2
ED
935 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
936 !sock_flag(sk, SOCK_RCVTSTAMPNS);
937 break;
938
939 case SO_TIMESTAMPNS:
940 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
e71a4783
SH
941 break;
942
20d49473
PO
943 case SO_TIMESTAMPING:
944 v.val = 0;
945 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
946 v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
947 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
948 v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
949 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
950 v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
951 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
952 v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
953 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
954 v.val |= SOF_TIMESTAMPING_SOFTWARE;
955 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
956 v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
957 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
958 v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
959 break;
960
e71a4783 961 case SO_RCVTIMEO:
2a91525c 962 lv = sizeof(struct timeval);
e71a4783
SH
963 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
964 v.tm.tv_sec = 0;
965 v.tm.tv_usec = 0;
966 } else {
967 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
968 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
969 }
970 break;
971
972 case SO_SNDTIMEO:
2a91525c 973 lv = sizeof(struct timeval);
e71a4783
SH
974 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
975 v.tm.tv_sec = 0;
976 v.tm.tv_usec = 0;
977 } else {
978 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
979 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
980 }
981 break;
1da177e4 982
e71a4783
SH
983 case SO_RCVLOWAT:
984 v.val = sk->sk_rcvlowat;
985 break;
1da177e4 986
e71a4783 987 case SO_SNDLOWAT:
2a91525c 988 v.val = 1;
e71a4783 989 break;
1da177e4 990
e71a4783 991 case SO_PASSCRED:
82981930 992 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
e71a4783 993 break;
1da177e4 994
e71a4783 995 case SO_PEERCRED:
109f6e39
EB
996 {
997 struct ucred peercred;
998 if (len > sizeof(peercred))
999 len = sizeof(peercred);
1000 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1001 if (copy_to_user(optval, &peercred, len))
e71a4783
SH
1002 return -EFAULT;
1003 goto lenout;
109f6e39 1004 }
1da177e4 1005
e71a4783
SH
1006 case SO_PEERNAME:
1007 {
1008 char address[128];
1009
1010 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1011 return -ENOTCONN;
1012 if (lv < len)
1013 return -EINVAL;
1014 if (copy_to_user(optval, address, len))
1015 return -EFAULT;
1016 goto lenout;
1017 }
1da177e4 1018
e71a4783
SH
1019 /* Dubious BSD thing... Probably nobody even uses it, but
1020 * the UNIX standard wants it for whatever reason... -DaveM
1021 */
1022 case SO_ACCEPTCONN:
1023 v.val = sk->sk_state == TCP_LISTEN;
1024 break;
1da177e4 1025
e71a4783 1026 case SO_PASSSEC:
82981930 1027 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
e71a4783 1028 break;
877ce7c1 1029
e71a4783
SH
1030 case SO_PEERSEC:
1031 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1da177e4 1032
4a19ec58
LAT
1033 case SO_MARK:
1034 v.val = sk->sk_mark;
1035 break;
1036
3b885787 1037 case SO_RXQ_OVFL:
1b23a5df 1038 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
3b885787
NH
1039 break;
1040
6e3e939f 1041 case SO_WIFI_STATUS:
1b23a5df 1042 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
6e3e939f
JB
1043 break;
1044
ef64a54f
PE
1045 case SO_PEEK_OFF:
1046 if (!sock->ops->set_peek_off)
1047 return -EOPNOTSUPP;
1048
1049 v.val = sk->sk_peek_off;
1050 break;
bc2f7996 1051 case SO_NOFCS:
1b23a5df 1052 v.val = sock_flag(sk, SOCK_NOFCS);
bc2f7996 1053 break;
e71a4783
SH
1054 default:
1055 return -ENOPROTOOPT;
1da177e4 1056 }
e71a4783 1057
1da177e4
LT
1058 if (len > lv)
1059 len = lv;
1060 if (copy_to_user(optval, &v, len))
1061 return -EFAULT;
1062lenout:
4ec93edb
YH
1063 if (put_user(len, optlen))
1064 return -EFAULT;
1065 return 0;
1da177e4
LT
1066}
1067
a5b5bb9a
IM
1068/*
1069 * Initialize an sk_lock.
1070 *
1071 * (We also register the sk_lock with the lock validator.)
1072 */
b6f99a21 1073static inline void sock_lock_init(struct sock *sk)
a5b5bb9a 1074{
ed07536e
PZ
1075 sock_lock_init_class_and_name(sk,
1076 af_family_slock_key_strings[sk->sk_family],
1077 af_family_slock_keys + sk->sk_family,
1078 af_family_key_strings[sk->sk_family],
1079 af_family_keys + sk->sk_family);
a5b5bb9a
IM
1080}
1081
4dc6dc71
ED
1082/*
1083 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1084 * even temporarily, because of RCU lookups. sk_node should also be left as is.
68835aba 1085 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
4dc6dc71 1086 */
f1a6c4da
PE
1087static void sock_copy(struct sock *nsk, const struct sock *osk)
1088{
1089#ifdef CONFIG_SECURITY_NETWORK
1090 void *sptr = nsk->sk_security;
1091#endif
68835aba
ED
1092 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1093
1094 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1095 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1096
f1a6c4da
PE
1097#ifdef CONFIG_SECURITY_NETWORK
1098 nsk->sk_security = sptr;
1099 security_sk_clone(osk, nsk);
1100#endif
1101}
1102
fcbdf09d
OP
1103/*
1104 * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1105 * un-modified. Special care is taken when initializing object to zero.
1106 */
1107static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1108{
1109 if (offsetof(struct sock, sk_node.next) != 0)
1110 memset(sk, 0, offsetof(struct sock, sk_node.next));
1111 memset(&sk->sk_node.pprev, 0,
1112 size - offsetof(struct sock, sk_node.pprev));
1113}
1114
1115void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1116{
1117 unsigned long nulls1, nulls2;
1118
1119 nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1120 nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1121 if (nulls1 > nulls2)
1122 swap(nulls1, nulls2);
1123
1124 if (nulls1 != 0)
1125 memset((char *)sk, 0, nulls1);
1126 memset((char *)sk + nulls1 + sizeof(void *), 0,
1127 nulls2 - nulls1 - sizeof(void *));
1128 memset((char *)sk + nulls2 + sizeof(void *), 0,
1129 size - nulls2 - sizeof(void *));
1130}
1131EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1132
2e4afe7b
PE
1133static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1134 int family)
c308c1b2
PE
1135{
1136 struct sock *sk;
1137 struct kmem_cache *slab;
1138
1139 slab = prot->slab;
e912b114
ED
1140 if (slab != NULL) {
1141 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1142 if (!sk)
1143 return sk;
1144 if (priority & __GFP_ZERO) {
fcbdf09d
OP
1145 if (prot->clear_sk)
1146 prot->clear_sk(sk, prot->obj_size);
1147 else
1148 sk_prot_clear_nulls(sk, prot->obj_size);
e912b114 1149 }
fcbdf09d 1150 } else
c308c1b2
PE
1151 sk = kmalloc(prot->obj_size, priority);
1152
2e4afe7b 1153 if (sk != NULL) {
a98b65a3
VN
1154 kmemcheck_annotate_bitfield(sk, flags);
1155
2e4afe7b
PE
1156 if (security_sk_alloc(sk, family, priority))
1157 goto out_free;
1158
1159 if (!try_module_get(prot->owner))
1160 goto out_free_sec;
e022f0b4 1161 sk_tx_queue_clear(sk);
2e4afe7b
PE
1162 }
1163
c308c1b2 1164 return sk;
2e4afe7b
PE
1165
1166out_free_sec:
1167 security_sk_free(sk);
1168out_free:
1169 if (slab != NULL)
1170 kmem_cache_free(slab, sk);
1171 else
1172 kfree(sk);
1173 return NULL;
c308c1b2
PE
1174}
1175
1176static void sk_prot_free(struct proto *prot, struct sock *sk)
1177{
1178 struct kmem_cache *slab;
2e4afe7b 1179 struct module *owner;
c308c1b2 1180
2e4afe7b 1181 owner = prot->owner;
c308c1b2 1182 slab = prot->slab;
2e4afe7b
PE
1183
1184 security_sk_free(sk);
c308c1b2
PE
1185 if (slab != NULL)
1186 kmem_cache_free(slab, sk);
1187 else
1188 kfree(sk);
2e4afe7b 1189 module_put(owner);
c308c1b2
PE
1190}
1191
f8451725
HX
1192#ifdef CONFIG_CGROUPS
1193void sock_update_classid(struct sock *sk)
1194{
1144182a 1195 u32 classid;
f8451725 1196
1144182a
PM
1197 rcu_read_lock(); /* doing current task, which cannot vanish. */
1198 classid = task_cls_classid(current);
1199 rcu_read_unlock();
f8451725
HX
1200 if (classid && classid != sk->sk_classid)
1201 sk->sk_classid = classid;
1202}
82862742 1203EXPORT_SYMBOL(sock_update_classid);
5bc1421e 1204
406a3c63 1205void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
5bc1421e 1206{
5bc1421e
NH
1207 if (in_interrupt())
1208 return;
2b73bc65 1209
406a3c63 1210 sk->sk_cgrp_prioidx = task_netprioidx(task);
5bc1421e
NH
1211}
1212EXPORT_SYMBOL_GPL(sock_update_netprioidx);
f8451725
HX
1213#endif
1214
1da177e4
LT
1215/**
1216 * sk_alloc - All socket objects are allocated here
c4ea43c5 1217 * @net: the applicable net namespace
4dc3b16b
PP
1218 * @family: protocol family
1219 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1220 * @prot: struct proto associated with this new sock instance
1da177e4 1221 */
1b8d7ae4 1222struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
6257ff21 1223 struct proto *prot)
1da177e4 1224{
c308c1b2 1225 struct sock *sk;
1da177e4 1226
154adbc8 1227 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1da177e4 1228 if (sk) {
154adbc8
PE
1229 sk->sk_family = family;
1230 /*
1231 * See comment in struct sock definition to understand
1232 * why we need sk_prot_creator -acme
1233 */
1234 sk->sk_prot = sk->sk_prot_creator = prot;
1235 sock_lock_init(sk);
3b1e0a65 1236 sock_net_set(sk, get_net(net));
d66ee058 1237 atomic_set(&sk->sk_wmem_alloc, 1);
f8451725
HX
1238
1239 sock_update_classid(sk);
406a3c63 1240 sock_update_netprioidx(sk, current);
1da177e4 1241 }
a79af59e 1242
2e4afe7b 1243 return sk;
1da177e4 1244}
2a91525c 1245EXPORT_SYMBOL(sk_alloc);
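/*
 * A minimal sketch, assuming a protocol family's ->create() hook: it
 * typically obtains its sock with sk_alloc() and then runs the generic
 * initialisation. "example_proto" is an assumed struct proto used only
 * for illustration.
 */
static int example_family_create(struct net *net, struct socket *sock,
				 int protocol, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &example_proto);
	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);	/* generic sock initialisation */
	sk->sk_protocol = protocol;
	return 0;
}
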
1da177e4 1246
2b85a34e 1247static void __sk_free(struct sock *sk)
1da177e4
LT
1248{
1249 struct sk_filter *filter;
1da177e4
LT
1250
1251 if (sk->sk_destruct)
1252 sk->sk_destruct(sk);
1253
a898def2
PM
1254 filter = rcu_dereference_check(sk->sk_filter,
1255 atomic_read(&sk->sk_wmem_alloc) == 0);
1da177e4 1256 if (filter) {
309dd5fc 1257 sk_filter_uncharge(sk, filter);
a9b3cd7f 1258 RCU_INIT_POINTER(sk->sk_filter, NULL);
1da177e4
LT
1259 }
1260
08e29af3 1261 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1da177e4
LT
1262
1263 if (atomic_read(&sk->sk_omem_alloc))
e005d193
JP
1264 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1265 __func__, atomic_read(&sk->sk_omem_alloc));
1da177e4 1266
109f6e39
EB
1267 if (sk->sk_peer_cred)
1268 put_cred(sk->sk_peer_cred);
1269 put_pid(sk->sk_peer_pid);
3b1e0a65 1270 put_net(sock_net(sk));
c308c1b2 1271 sk_prot_free(sk->sk_prot_creator, sk);
1da177e4 1272}
2b85a34e
ED
1273
1274void sk_free(struct sock *sk)
1275{
1276 /*
25985edc 1277 * We subtract one from sk_wmem_alloc and can know if
2b85a34e
ED
1278 * some packets are still in some tx queue.
1279 * If not null, sock_wfree() will call __sk_free(sk) later
1280 */
1281 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1282 __sk_free(sk);
1283}
2a91525c 1284EXPORT_SYMBOL(sk_free);
1da177e4 1285
edf02087 1286/*
25985edc
LDM
1287 * Last sock_put should drop reference to sk->sk_net. It has already
1288 * been dropped in sk_change_net. Taking reference to stopping namespace
edf02087 1289 * is not an option.
25985edc 1290 * Take reference to a socket to remove it from hash _alive_ and after that
edf02087
DL
1291 * destroy it in the context of init_net.
1292 */
1293void sk_release_kernel(struct sock *sk)
1294{
1295 if (sk == NULL || sk->sk_socket == NULL)
1296 return;
1297
1298 sock_hold(sk);
1299 sock_release(sk->sk_socket);
65a18ec5 1300 release_net(sock_net(sk));
3b1e0a65 1301 sock_net_set(sk, get_net(&init_net));
edf02087
DL
1302 sock_put(sk);
1303}
45af1754 1304EXPORT_SYMBOL(sk_release_kernel);
edf02087 1305
475f1b52
SR
1306static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1307{
1308 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1309 sock_update_memcg(newsk);
1310}
1311
e56c57d0
ED
1312/**
1313 * sk_clone_lock - clone a socket, and lock its clone
1314 * @sk: the socket to clone
1315 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1316 *
1317 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1318 */
1319struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
87d11ceb 1320{
8fd1d178 1321 struct sock *newsk;
87d11ceb 1322
8fd1d178 1323 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
87d11ceb
ACM
1324 if (newsk != NULL) {
1325 struct sk_filter *filter;
1326
892c141e 1327 sock_copy(newsk, sk);
87d11ceb
ACM
1328
1329 /* SANITY */
3b1e0a65 1330 get_net(sock_net(newsk));
87d11ceb
ACM
1331 sk_node_init(&newsk->sk_node);
1332 sock_lock_init(newsk);
1333 bh_lock_sock(newsk);
fa438ccf 1334 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
8eae939f 1335 newsk->sk_backlog.len = 0;
87d11ceb
ACM
1336
1337 atomic_set(&newsk->sk_rmem_alloc, 0);
2b85a34e
ED
1338 /*
1339 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1340 */
1341 atomic_set(&newsk->sk_wmem_alloc, 1);
87d11ceb
ACM
1342 atomic_set(&newsk->sk_omem_alloc, 0);
1343 skb_queue_head_init(&newsk->sk_receive_queue);
1344 skb_queue_head_init(&newsk->sk_write_queue);
97fc2f08
CL
1345#ifdef CONFIG_NET_DMA
1346 skb_queue_head_init(&newsk->sk_async_wait_queue);
1347#endif
87d11ceb 1348
b6c6712a 1349 spin_lock_init(&newsk->sk_dst_lock);
87d11ceb 1350 rwlock_init(&newsk->sk_callback_lock);
443aef0e
PZ
1351 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1352 af_callback_keys + newsk->sk_family,
1353 af_family_clock_key_strings[newsk->sk_family]);
87d11ceb
ACM
1354
1355 newsk->sk_dst_cache = NULL;
1356 newsk->sk_wmem_queued = 0;
1357 newsk->sk_forward_alloc = 0;
1358 newsk->sk_send_head = NULL;
87d11ceb
ACM
1359 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1360
1361 sock_reset_flag(newsk, SOCK_DONE);
1362 skb_queue_head_init(&newsk->sk_error_queue);
1363
0d7da9dd 1364 filter = rcu_dereference_protected(newsk->sk_filter, 1);
87d11ceb
ACM
1365 if (filter != NULL)
1366 sk_filter_charge(newsk, filter);
1367
1368 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1369 /* It is still raw copy of parent, so invalidate
1370 * destructor and make plain sk_free() */
1371 newsk->sk_destruct = NULL;
b0691c8e 1372 bh_unlock_sock(newsk);
87d11ceb
ACM
1373 sk_free(newsk);
1374 newsk = NULL;
1375 goto out;
1376 }
1377
1378 newsk->sk_err = 0;
1379 newsk->sk_priority = 0;
4dc6dc71
ED
1380 /*
1381 * Before updating sk_refcnt, we must commit prior changes to memory
1382 * (Documentation/RCU/rculist_nulls.txt for details)
1383 */
1384 smp_wmb();
87d11ceb
ACM
1385 atomic_set(&newsk->sk_refcnt, 2);
1386
1387 /*
1388 * Increment the counter in the same struct proto as the master
1389 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1390 * is the same as sk->sk_prot->socks, as this field was copied
1391 * with memcpy).
1392 *
1393 * This _changes_ the previous behaviour, where
1394 * tcp_create_openreq_child always was incrementing the
1395 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1396 * to be taken into account in all callers. -acme
1397 */
1398 sk_refcnt_debug_inc(newsk);
972692e0 1399 sk_set_socket(newsk, NULL);
43815482 1400 newsk->sk_wq = NULL;
87d11ceb 1401
f3f511e1
GC
1402 sk_update_clone(sk, newsk);
1403
87d11ceb 1404 if (newsk->sk_prot->sockets_allocated)
180d8cd9 1405 sk_sockets_allocated_inc(newsk);
704da560 1406
08e29af3 1407 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
704da560 1408 net_enable_timestamp();
87d11ceb
ACM
1409 }
1410out:
1411 return newsk;
1412}
e56c57d0 1413EXPORT_SYMBOL_GPL(sk_clone_lock);
87d11ceb 1414
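/*
 * A caller-pattern sketch (assumed protocol code, not taken from this
 * file): sk_clone_lock() returns the child locked with bh_lock_sock(),
 * so even an error path in the caller must unlock it, as the kernel-doc
 * above notes.
 */
static struct sock *example_clone_for_accept(const struct sock *parent)
{
	struct sock *child = sk_clone_lock(parent, GFP_ATOMIC);

	if (!child)
		return NULL;

	/* protocol-specific child initialisation would go here */

	bh_unlock_sock(child);
	return child;
}
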
9958089a
AK
1415void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1416{
1417 __sk_dst_set(sk, dst);
1418 sk->sk_route_caps = dst->dev->features;
1419 if (sk->sk_route_caps & NETIF_F_GSO)
4fcd6b99 1420 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
a465419b 1421 sk->sk_route_caps &= ~sk->sk_route_nocaps;
9958089a 1422 if (sk_can_gso(sk)) {
82cc1a7a 1423 if (dst->header_len) {
9958089a 1424 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
82cc1a7a 1425 } else {
9958089a 1426 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
82cc1a7a
PWJ
1427 sk->sk_gso_max_size = dst->dev->gso_max_size;
1428 }
9958089a
AK
1429 }
1430}
1431EXPORT_SYMBOL_GPL(sk_setup_caps);
1432
1da177e4
LT
1433void __init sk_init(void)
1434{
4481374c 1435 if (totalram_pages <= 4096) {
1da177e4
LT
1436 sysctl_wmem_max = 32767;
1437 sysctl_rmem_max = 32767;
1438 sysctl_wmem_default = 32767;
1439 sysctl_rmem_default = 32767;
4481374c 1440 } else if (totalram_pages >= 131072) {
1da177e4
LT
1441 sysctl_wmem_max = 131071;
1442 sysctl_rmem_max = 131071;
1443 }
1444}
1445
1446/*
1447 * Simple resource managers for sockets.
1448 */
1449
1450
4ec93edb
YH
1451/*
1452 * Write buffer destructor automatically called from kfree_skb.
1da177e4
LT
1453 */
1454void sock_wfree(struct sk_buff *skb)
1455{
1456 struct sock *sk = skb->sk;
d99927f4 1457 unsigned int len = skb->truesize;
1da177e4 1458
d99927f4
ED
1459 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1460 /*
1461 * Keep a reference on sk_wmem_alloc, this will be released
1462 * after sk_write_space() call
1463 */
1464 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1da177e4 1465 sk->sk_write_space(sk);
d99927f4
ED
1466 len = 1;
1467 }
2b85a34e 1468 /*
d99927f4
ED
1469 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1470 * could not do because of in-flight packets
2b85a34e 1471 */
d99927f4 1472 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
2b85a34e 1473 __sk_free(sk);
1da177e4 1474}
2a91525c 1475EXPORT_SYMBOL(sock_wfree);
1da177e4 1476
4ec93edb
YH
1477/*
1478 * Read buffer destructor automatically called from kfree_skb.
1da177e4
LT
1479 */
1480void sock_rfree(struct sk_buff *skb)
1481{
1482 struct sock *sk = skb->sk;
d361fd59 1483 unsigned int len = skb->truesize;
1da177e4 1484
d361fd59
ED
1485 atomic_sub(len, &sk->sk_rmem_alloc);
1486 sk_mem_uncharge(sk, len);
1da177e4 1487}
2a91525c 1488EXPORT_SYMBOL(sock_rfree);
1da177e4 1489
41063e9d
DM
1490void sock_edemux(struct sk_buff *skb)
1491{
1492 sock_put(skb->sk);
1493}
1494EXPORT_SYMBOL(sock_edemux);
1da177e4
LT
1495
1496int sock_i_uid(struct sock *sk)
1497{
1498 int uid;
1499
f064af1e 1500 read_lock_bh(&sk->sk_callback_lock);
1da177e4 1501 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
f064af1e 1502 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
1503 return uid;
1504}
2a91525c 1505EXPORT_SYMBOL(sock_i_uid);
1da177e4
LT
1506
1507unsigned long sock_i_ino(struct sock *sk)
1508{
1509 unsigned long ino;
1510
f064af1e 1511 read_lock_bh(&sk->sk_callback_lock);
1da177e4 1512 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
f064af1e 1513 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
1514 return ino;
1515}
2a91525c 1516EXPORT_SYMBOL(sock_i_ino);
1da177e4
LT
1517
1518/*
1519 * Allocate a skb from the socket's send buffer.
1520 */
86a76caf 1521struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
dd0fc66f 1522 gfp_t priority)
1da177e4
LT
1523{
1524 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
2a91525c 1525 struct sk_buff *skb = alloc_skb(size, priority);
1da177e4
LT
1526 if (skb) {
1527 skb_set_owner_w(skb, sk);
1528 return skb;
1529 }
1530 }
1531 return NULL;
1532}
2a91525c 1533EXPORT_SYMBOL(sock_wmalloc);
1da177e4
LT
1534
1535/*
1536 * Allocate a skb from the socket's receive buffer.
4ec93edb 1537 */
86a76caf 1538struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
dd0fc66f 1539 gfp_t priority)
1da177e4
LT
1540{
1541 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1542 struct sk_buff *skb = alloc_skb(size, priority);
1543 if (skb) {
1544 skb_set_owner_r(skb, sk);
1545 return skb;
1546 }
1547 }
1548 return NULL;
1549}
1550
4ec93edb 1551/*
1da177e4 1552 * Allocate a memory block from the socket's option memory buffer.
4ec93edb 1553 */
dd0fc66f 1554void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1da177e4 1555{
95c96174 1556 if ((unsigned int)size <= sysctl_optmem_max &&
1da177e4
LT
1557 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1558 void *mem;
1559 /* First do the add, to avoid the race if kmalloc
4ec93edb 1560 * might sleep.
1da177e4
LT
1561 */
1562 atomic_add(size, &sk->sk_omem_alloc);
1563 mem = kmalloc(size, priority);
1564 if (mem)
1565 return mem;
1566 atomic_sub(size, &sk->sk_omem_alloc);
1567 }
1568 return NULL;
1569}
2a91525c 1570EXPORT_SYMBOL(sock_kmalloc);
1da177e4
LT
1571
1572/*
1573 * Free an option memory block.
1574 */
1575void sock_kfree_s(struct sock *sk, void *mem, int size)
1576{
1577 kfree(mem);
1578 atomic_sub(size, &sk->sk_omem_alloc);
1579}
2a91525c 1580EXPORT_SYMBOL(sock_kfree_s);
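/*
 * A minimal pairing sketch (assumed protocol code): option memory from
 * sock_kmalloc() is charged to sk_omem_alloc, so it must be released
 * with sock_kfree_s() using the same size, never with plain kfree().
 */
static int example_store_option(struct sock *sk, const void *data, int len)
{
	void *copy = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!copy)
		return -ENOBUFS;
	memcpy(copy, data, len);
	/* ... use the copy ... */
	sock_kfree_s(sk, copy, len);	/* uncharges sk_omem_alloc */
	return 0;
}
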
1da177e4
LT
1581
1582/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1583 I think, these locks should be removed for datagram sockets.
1584 */
2a91525c 1585static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
1586{
1587 DEFINE_WAIT(wait);
1588
1589 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1590 for (;;) {
1591 if (!timeo)
1592 break;
1593 if (signal_pending(current))
1594 break;
1595 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 1596 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1da177e4
LT
1597 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1598 break;
1599 if (sk->sk_shutdown & SEND_SHUTDOWN)
1600 break;
1601 if (sk->sk_err)
1602 break;
1603 timeo = schedule_timeout(timeo);
1604 }
aa395145 1605 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
1606 return timeo;
1607}
1608
1609
1610/*
1611 * Generic send/receive buffer handlers
1612 */
1613
4cc7f68d
HX
1614struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1615 unsigned long data_len, int noblock,
1616 int *errcode)
1da177e4
LT
1617{
1618 struct sk_buff *skb;
7d877f3b 1619 gfp_t gfp_mask;
1da177e4
LT
1620 long timeo;
1621 int err;
cc9b17ad
JW
1622 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1623
1624 err = -EMSGSIZE;
1625 if (npages > MAX_SKB_FRAGS)
1626 goto failure;
1da177e4
LT
1627
1628 gfp_mask = sk->sk_allocation;
1629 if (gfp_mask & __GFP_WAIT)
1630 gfp_mask |= __GFP_REPEAT;
1631
1632 timeo = sock_sndtimeo(sk, noblock);
1633 while (1) {
1634 err = sock_error(sk);
1635 if (err != 0)
1636 goto failure;
1637
1638 err = -EPIPE;
1639 if (sk->sk_shutdown & SEND_SHUTDOWN)
1640 goto failure;
1641
1642 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
db38c179 1643 skb = alloc_skb(header_len, gfp_mask);
1da177e4 1644 if (skb) {
1da177e4
LT
1645 int i;
1646
1647 /* No pages, we're done... */
1648 if (!data_len)
1649 break;
1650
1da177e4
LT
1651 skb->truesize += data_len;
1652 skb_shinfo(skb)->nr_frags = npages;
1653 for (i = 0; i < npages; i++) {
1654 struct page *page;
1da177e4
LT
1655
1656 page = alloc_pages(sk->sk_allocation, 0);
1657 if (!page) {
1658 err = -ENOBUFS;
1659 skb_shinfo(skb)->nr_frags = i;
1660 kfree_skb(skb);
1661 goto failure;
1662 }
1663
ea2ab693
IC
1664 __skb_fill_page_desc(skb, i,
1665 page, 0,
1666 (data_len >= PAGE_SIZE ?
1667 PAGE_SIZE :
1668 data_len));
1da177e4
LT
1669 data_len -= PAGE_SIZE;
1670 }
1671
1672 /* Full success... */
1673 break;
1674 }
1675 err = -ENOBUFS;
1676 goto failure;
1677 }
1678 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1679 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1680 err = -EAGAIN;
1681 if (!timeo)
1682 goto failure;
1683 if (signal_pending(current))
1684 goto interrupted;
1685 timeo = sock_wait_for_wmem(sk, timeo);
1686 }
1687
1688 skb_set_owner_w(skb, sk);
1689 return skb;
1690
1691interrupted:
1692 err = sock_intr_errno(timeo);
1693failure:
1694 *errcode = err;
1695 return NULL;
1696}
4cc7f68d 1697EXPORT_SYMBOL(sock_alloc_send_pskb);
1da177e4 1698
4ec93edb 1699struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1da177e4
LT
1700 int noblock, int *errcode)
1701{
1702 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1703}
2a91525c 1704EXPORT_SYMBOL(sock_alloc_send_skb);
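/*
 * A sendmsg-path sketch (assumed protocol code, not taken from this
 * file): a datagram sender reserves headroom, lets sock_alloc_send_skb()
 * block (or not, per MSG_DONTWAIT) against the send buffer, then copies
 * the payload in. The header size here is an assumption.
 */
static struct sk_buff *example_build_dgram(struct sock *sk, size_t len,
					   int noblock, int *errp)
{
	struct sk_buff *skb;
	int hlen = 128;		/* assumed worst-case header space */

	skb = sock_alloc_send_skb(sk, hlen + len, noblock, errp);
	if (!skb)
		return NULL;

	skb_reserve(skb, hlen);
	/* copying user data and building protocol headers would follow */
	return skb;
}
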
1da177e4
LT
1705
1706static void __lock_sock(struct sock *sk)
f39234d6
NK
1707 __releases(&sk->sk_lock.slock)
1708 __acquires(&sk->sk_lock.slock)
1da177e4
LT
1709{
1710 DEFINE_WAIT(wait);
1711
e71a4783 1712 for (;;) {
1da177e4
LT
1713 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1714 TASK_UNINTERRUPTIBLE);
1715 spin_unlock_bh(&sk->sk_lock.slock);
1716 schedule();
1717 spin_lock_bh(&sk->sk_lock.slock);
e71a4783 1718 if (!sock_owned_by_user(sk))
1da177e4
LT
1719 break;
1720 }
1721 finish_wait(&sk->sk_lock.wq, &wait);
1722}
1723
1724static void __release_sock(struct sock *sk)
f39234d6
NK
1725 __releases(&sk->sk_lock.slock)
1726 __acquires(&sk->sk_lock.slock)
1da177e4
LT
1727{
1728 struct sk_buff *skb = sk->sk_backlog.head;
1729
1730 do {
1731 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1732 bh_unlock_sock(sk);
1733
1734 do {
1735 struct sk_buff *next = skb->next;
1736
e4cbb02a 1737 prefetch(next);
7fee226a 1738 WARN_ON_ONCE(skb_dst_is_noref(skb));
1da177e4 1739 skb->next = NULL;
c57943a1 1740 sk_backlog_rcv(sk, skb);
1da177e4
LT
1741
1742 /*
1743 * We are in process context here with softirqs
1744 * disabled, use cond_resched_softirq() to preempt.
1745 * This is safe to do because we've taken the backlog
1746 * queue private:
1747 */
1748 cond_resched_softirq();
1749
1750 skb = next;
1751 } while (skb != NULL);
1752
1753 bh_lock_sock(sk);
e71a4783 1754 } while ((skb = sk->sk_backlog.head) != NULL);
8eae939f
ZY
1755
1756 /*
1757 * Doing the zeroing here guarantees we cannot loop forever
1758 * while a wild producer attempts to flood us.
1759 */
1760 sk->sk_backlog.len = 0;
1da177e4
LT
1761}
1762
1763/**
1764 * sk_wait_data - wait for data to arrive at sk_receive_queue
4dc3b16b
PP
1765 * @sk: sock to wait on
1766 * @timeo: for how long
1da177e4
LT
1767 *
1768 * Now socket state including sk->sk_err is changed only under lock,
1769 * hence we may omit checks after joining wait queue.
1770 * We check receive queue before schedule() only as optimization;
1771 * it is very likely that release_sock() added new data.
1772 */
1773int sk_wait_data(struct sock *sk, long *timeo)
1774{
1775 int rc;
1776 DEFINE_WAIT(wait);
1777
aa395145 1778 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1da177e4
LT
1779 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1780 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1781 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
aa395145 1782 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
1783 return rc;
1784}
1da177e4
LT
1785EXPORT_SYMBOL(sk_wait_data);
1786
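/*
 * A recvmsg-loop sketch (assumed protocol code, shown with the socket
 * lock already held as sk_wait_data() expects): wait while the receive
 * queue is empty, bailing out on timeout or a pending signal.
 */
static struct sk_buff *example_wait_for_packet(struct sock *sk, long *timeo,
					       int *errp)
{
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!*timeo) {
			*errp = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*errp = sock_intr_errno(*timeo);
			return NULL;
		}
		sk_wait_data(sk, timeo);
	}
	return skb;
}
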
3ab224be
HA
1787/**
1788 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1789 * @sk: socket
1790 * @size: memory size to allocate
1791 * @kind: allocation type
1792 *
1793 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1794 * rmem allocation. This function assumes that protocols which have
1795 * memory_pressure use sk_wmem_queued as write buffer accounting.
1796 */
1797int __sk_mem_schedule(struct sock *sk, int size, int kind)
1798{
1799 struct proto *prot = sk->sk_prot;
1800 int amt = sk_mem_pages(size);
8d987e5c 1801 long allocated;
e1aab161 1802 int parent_status = UNDER_LIMIT;
1803
1804 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
180d8cd9 1805
e1aab161 1806 allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1807
1808 /* Under limit. */
1809 if (parent_status == UNDER_LIMIT &&
1810 allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 1811 sk_leave_memory_pressure(sk);
1812 return 1;
1813 }
1814
1815 /* Under pressure. (we or our parents) */
1816 if ((parent_status > SOFT_LIMIT) ||
1817 allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 1818 sk_enter_memory_pressure(sk);
3ab224be 1819
1820 /* Over hard limit (we or our parents) */
1821 if ((parent_status == OVER_LIMIT) ||
1822 (allocated > sk_prot_mem_limits(sk, 2)))
1823 goto suppress_allocation;
1824
1825 /* guarantee minimum buffer size under pressure */
1826 if (kind == SK_MEM_RECV) {
1827 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1828 return 1;
180d8cd9 1829
1830 } else { /* SK_MEM_SEND */
1831 if (sk->sk_type == SOCK_STREAM) {
1832 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1833 return 1;
1834 } else if (atomic_read(&sk->sk_wmem_alloc) <
1835 prot->sysctl_wmem[0])
1836 return 1;
1837 }
1838
180d8cd9 1839 if (sk_has_memory_pressure(sk)) {
1840 int alloc;
1841
180d8cd9 1842 if (!sk_under_memory_pressure(sk))
1748376b 1843 return 1;
1844 alloc = sk_sockets_allocated_read_positive(sk);
1845 if (sk_prot_mem_limits(sk, 2) > alloc *
1846 sk_mem_pages(sk->sk_wmem_queued +
1847 atomic_read(&sk->sk_rmem_alloc) +
1848 sk->sk_forward_alloc))
1849 return 1;
1850 }
1851
1852suppress_allocation:
1853
1854 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1855 sk_stream_moderate_sndbuf(sk);
1856
1857 /* Fail only if socket is _under_ its sndbuf.
 1858 * In this case we cannot block, so we have to fail.
1859 */
1860 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1861 return 1;
1862 }
1863
1864 trace_sock_exceed_buf_limit(sk, prot, allocated);
1865
1866 /* Alas. Undo changes. */
1867 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
180d8cd9 1868
0e90b31f 1869 sk_memory_allocated_sub(sk, amt);
180d8cd9 1870
1871 return 0;
1872}
1873EXPORT_SYMBOL(__sk_mem_schedule);
1874
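/*
 * Example (illustrative sketch, not part of this file): charging transmit
 * memory before queueing data.  Protocols normally go through the
 * sk_wmem_schedule()/sk_mem_charge() wrappers in include/net/sock.h;
 * sk_wmem_schedule() succeeds if the bytes already fit in sk_forward_alloc
 * or if __sk_mem_schedule(..., SK_MEM_SEND) can extend it.  The function
 * below is hypothetical.
 */
static int example_charge_send(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_wmem_schedule(sk, skb->truesize))
		return -ENOBUFS;	/* over the protocol's memory limits */

	sk_mem_charge(sk, skb->truesize);	/* consume sk_forward_alloc */
	return 0;
}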
1875/**
 1876 * __sk_mem_reclaim - reclaim memory_allocated
1877 * @sk: socket
1878 */
1879void __sk_mem_reclaim(struct sock *sk)
1880{
180d8cd9 1881 sk_memory_allocated_sub(sk,
0e90b31f 1882 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1883 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1884
1885 if (sk_under_memory_pressure(sk) &&
1886 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1887 sk_leave_memory_pressure(sk);
3ab224be 1888}
1889EXPORT_SYMBOL(__sk_mem_reclaim);
1890
1891
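/*
 * Example (illustrative sketch, not part of this file): the uncharge side.
 * sk_mem_uncharge() and sk_mem_reclaim() in include/net/sock.h return bytes
 * to sk_forward_alloc and, once at least one whole SK_MEM_QUANTUM has
 * accumulated, hand pages back to memory_allocated through
 * __sk_mem_reclaim().  The function below is hypothetical.
 */
static void example_uncharge(struct sock *sk, int bytes)
{
	sk_mem_uncharge(sk, bytes);	/* grow sk_forward_alloc again */
	sk_mem_reclaim(sk);		/* may call __sk_mem_reclaim() */
}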
1892/*
1893 * Set of default routines for initialising struct proto_ops when
1894 * the protocol does not support a particular function. In certain
1895 * cases where it makes no sense for a protocol to have a "do nothing"
1896 * function, some default processing is provided.
1897 */
1898
1899int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1900{
1901 return -EOPNOTSUPP;
1902}
2a91525c 1903EXPORT_SYMBOL(sock_no_bind);
1da177e4 1904
4ec93edb 1905int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1906 int len, int flags)
1907{
1908 return -EOPNOTSUPP;
1909}
2a91525c 1910EXPORT_SYMBOL(sock_no_connect);
1911
1912int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1913{
1914 return -EOPNOTSUPP;
1915}
2a91525c 1916EXPORT_SYMBOL(sock_no_socketpair);
1917
1918int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1919{
1920 return -EOPNOTSUPP;
1921}
2a91525c 1922EXPORT_SYMBOL(sock_no_accept);
1da177e4 1923
4ec93edb 1924int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1925 int *len, int peer)
1926{
1927 return -EOPNOTSUPP;
1928}
2a91525c 1929EXPORT_SYMBOL(sock_no_getname);
1da177e4 1930
2a91525c 1931unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1932{
1933 return 0;
1934}
2a91525c 1935EXPORT_SYMBOL(sock_no_poll);
1936
1937int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1938{
1939 return -EOPNOTSUPP;
1940}
2a91525c 1941EXPORT_SYMBOL(sock_no_ioctl);
1942
1943int sock_no_listen(struct socket *sock, int backlog)
1944{
1945 return -EOPNOTSUPP;
1946}
2a91525c 1947EXPORT_SYMBOL(sock_no_listen);
1948
1949int sock_no_shutdown(struct socket *sock, int how)
1950{
1951 return -EOPNOTSUPP;
1952}
2a91525c 1953EXPORT_SYMBOL(sock_no_shutdown);
1954
1955int sock_no_setsockopt(struct socket *sock, int level, int optname,
b7058842 1956 char __user *optval, unsigned int optlen)
1957{
1958 return -EOPNOTSUPP;
1959}
2a91525c 1960EXPORT_SYMBOL(sock_no_setsockopt);
1961
1962int sock_no_getsockopt(struct socket *sock, int level, int optname,
1963 char __user *optval, int __user *optlen)
1964{
1965 return -EOPNOTSUPP;
1966}
2a91525c 1967EXPORT_SYMBOL(sock_no_getsockopt);
1968
1969int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1970 size_t len)
1971{
1972 return -EOPNOTSUPP;
1973}
2a91525c 1974EXPORT_SYMBOL(sock_no_sendmsg);
1975
1976int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1977 size_t len, int flags)
1978{
1979 return -EOPNOTSUPP;
1980}
2a91525c 1981EXPORT_SYMBOL(sock_no_recvmsg);
1982
1983int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1984{
1985 /* Mirror missing mmap method error code */
1986 return -ENODEV;
1987}
2a91525c 1988EXPORT_SYMBOL(sock_no_mmap);
1989
1990ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1991{
1992 ssize_t res;
1993 struct msghdr msg = {.msg_flags = flags};
1994 struct kvec iov;
1995 char *kaddr = kmap(page);
1996 iov.iov_base = kaddr + offset;
1997 iov.iov_len = size;
1998 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1999 kunmap(page);
2000 return res;
2001}
2a91525c 2002EXPORT_SYMBOL(sock_no_sendpage);
2003
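/*
 * Example (illustrative sketch, not part of this file): a protocol family
 * wiring the sock_no_* stubs above into its proto_ops for operations it does
 * not support.  The ops structure and family value are hypothetical; a real
 * family fills in .bind, .connect, .sendmsg, .recvmsg, etc. with its own
 * handlers.
 */
static const struct proto_ops example_dgram_ops = {
	.family		= PF_UNSPEC,		/* placeholder */
	.owner		= THIS_MODULE,
	.listen		= sock_no_listen,
	.accept		= sock_no_accept,
	.socketpair	= sock_no_socketpair,
	.shutdown	= sock_no_shutdown,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};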
2004/*
2005 * Default Socket Callbacks
2006 */
2007
2008static void sock_def_wakeup(struct sock *sk)
2009{
2010 struct socket_wq *wq;
2011
2012 rcu_read_lock();
2013 wq = rcu_dereference(sk->sk_wq);
2014 if (wq_has_sleeper(wq))
2015 wake_up_interruptible_all(&wq->wait);
2016 rcu_read_unlock();
2017}
2018
2019static void sock_def_error_report(struct sock *sk)
2020{
2021 struct socket_wq *wq;
2022
2023 rcu_read_lock();
2024 wq = rcu_dereference(sk->sk_wq);
2025 if (wq_has_sleeper(wq))
2026 wake_up_interruptible_poll(&wq->wait, POLLERR);
8d8ad9d7 2027 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 2028 rcu_read_unlock();
2029}
2030
2031static void sock_def_readable(struct sock *sk, int len)
2032{
2033 struct socket_wq *wq;
2034
2035 rcu_read_lock();
2036 wq = rcu_dereference(sk->sk_wq);
2037 if (wq_has_sleeper(wq))
2c6607c6 2038 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
37e5540b 2039 POLLRDNORM | POLLRDBAND);
8d8ad9d7 2040 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 2041 rcu_read_unlock();
2042}
2043
2044static void sock_def_write_space(struct sock *sk)
2045{
2046 struct socket_wq *wq;
2047
2048 rcu_read_lock();
2049
2050 /* Do not wake up a writer until he can make "significant"
2051 * progress. --DaveM
2052 */
e71a4783 2053 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2054 wq = rcu_dereference(sk->sk_wq);
2055 if (wq_has_sleeper(wq))
2056 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
37e5540b 2057 POLLWRNORM | POLLWRBAND);
2058
2059 /* Should agree with poll, otherwise some programs break */
2060 if (sock_writeable(sk))
8d8ad9d7 2061 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2062 }
2063
43815482 2064 rcu_read_unlock();
2065}
2066
2067static void sock_def_destruct(struct sock *sk)
2068{
a51482bd 2069 kfree(sk->sk_protinfo);
2070}
2071
2072void sk_send_sigurg(struct sock *sk)
2073{
2074 if (sk->sk_socket && sk->sk_socket->file)
2075 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 2076 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 2077}
2a91525c 2078EXPORT_SYMBOL(sk_send_sigurg);
2079
2080void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2081 unsigned long expires)
2082{
2083 if (!mod_timer(timer, expires))
2084 sock_hold(sk);
2085}
2086EXPORT_SYMBOL(sk_reset_timer);
2087
2088void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2089{
2090 if (timer_pending(timer) && del_timer(timer))
2091 __sock_put(sk);
2092}
2093EXPORT_SYMBOL(sk_stop_timer);
2094
2095void sock_init_data(struct socket *sock, struct sock *sk)
2096{
2097 skb_queue_head_init(&sk->sk_receive_queue);
2098 skb_queue_head_init(&sk->sk_write_queue);
2099 skb_queue_head_init(&sk->sk_error_queue);
2100#ifdef CONFIG_NET_DMA
2101 skb_queue_head_init(&sk->sk_async_wait_queue);
2102#endif
2103
2104 sk->sk_send_head = NULL;
2105
2106 init_timer(&sk->sk_timer);
4ec93edb 2107
2108 sk->sk_allocation = GFP_KERNEL;
2109 sk->sk_rcvbuf = sysctl_rmem_default;
2110 sk->sk_sndbuf = sysctl_wmem_default;
2111 sk->sk_state = TCP_CLOSE;
972692e0 2112 sk_set_socket(sk, sock);
2113
2114 sock_set_flag(sk, SOCK_ZAPPED);
2115
e71a4783 2116 if (sock) {
1da177e4 2117 sk->sk_type = sock->type;
43815482 2118 sk->sk_wq = sock->wq;
2119 sock->sk = sk;
2120 } else
43815482 2121 sk->sk_wq = NULL;
1da177e4 2122
b6c6712a 2123 spin_lock_init(&sk->sk_dst_lock);
1da177e4 2124 rwlock_init(&sk->sk_callback_lock);
2125 lockdep_set_class_and_name(&sk->sk_callback_lock,
2126 af_callback_keys + sk->sk_family,
2127 af_family_clock_key_strings[sk->sk_family]);
2128
2129 sk->sk_state_change = sock_def_wakeup;
2130 sk->sk_data_ready = sock_def_readable;
2131 sk->sk_write_space = sock_def_write_space;
2132 sk->sk_error_report = sock_def_error_report;
2133 sk->sk_destruct = sock_def_destruct;
2134
2135 sk->sk_sndmsg_page = NULL;
2136 sk->sk_sndmsg_off = 0;
ef64a54f 2137 sk->sk_peek_off = -1;
1da177e4 2138
2139 sk->sk_peer_pid = NULL;
2140 sk->sk_peer_cred = NULL;
2141 sk->sk_write_pending = 0;
2142 sk->sk_rcvlowat = 1;
2143 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2144 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2145
f37f0afb 2146 sk->sk_stamp = ktime_set(-1L, 0);
1da177e4 2147
2148 /*
2149 * Before updating sk_refcnt, we must commit prior changes to memory
2150 * (Documentation/RCU/rculist_nulls.txt for details)
2151 */
2152 smp_wmb();
1da177e4 2153 atomic_set(&sk->sk_refcnt, 1);
33c732c3 2154 atomic_set(&sk->sk_drops, 0);
1da177e4 2155}
2a91525c 2156EXPORT_SYMBOL(sock_init_data);
1da177e4 2157
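/*
 * Example (illustrative sketch, not part of this file): the usual create
 * path.  A protocol allocates the sock with sk_alloc(), lets
 * sock_init_data() install the queues, timers and default callbacks above,
 * and then overrides whatever it needs.  Names and the override shown are
 * hypothetical.
 */
static int example_create(struct net *net, struct socket *sock,
			  struct proto *prot)
{
	struct sock *sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, prot);

	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);
	/* e.g. sk->sk_data_ready = example_data_ready;  (hypothetical) */
	sk->sk_allocation = GFP_ATOMIC;		/* illustrative override only */
	return 0;
}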
b5606c2d 2158void lock_sock_nested(struct sock *sk, int subclass)
2159{
2160 might_sleep();
a5b5bb9a 2161 spin_lock_bh(&sk->sk_lock.slock);
d2e9117c 2162 if (sk->sk_lock.owned)
1da177e4 2163 __lock_sock(sk);
d2e9117c 2164 sk->sk_lock.owned = 1;
2165 spin_unlock(&sk->sk_lock.slock);
2166 /*
2167 * The sk_lock has mutex_lock() semantics here:
2168 */
fcc70d5f 2169 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
a5b5bb9a 2170 local_bh_enable();
1da177e4 2171}
fcc70d5f 2172EXPORT_SYMBOL(lock_sock_nested);
1da177e4 2173
b5606c2d 2174void release_sock(struct sock *sk)
1da177e4 2175{
2176 /*
2177 * The sk_lock has mutex_unlock() semantics:
2178 */
2179 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2180
2181 spin_lock_bh(&sk->sk_lock.slock);
2182 if (sk->sk_backlog.tail)
2183 __release_sock(sk);
2184
2185 if (sk->sk_prot->release_cb)
2186 sk->sk_prot->release_cb(sk);
2187
d2e9117c 2188 sk->sk_lock.owned = 0;
2189 if (waitqueue_active(&sk->sk_lock.wq))
2190 wake_up(&sk->sk_lock.wq);
2191 spin_unlock_bh(&sk->sk_lock.slock);
2192}
2193EXPORT_SYMBOL(release_sock);
2194
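/*
 * Example (illustrative sketch, not part of this file): the process-context
 * locking pattern.  lock_sock() in include/net/sock.h expands to
 * lock_sock_nested(sk, 0); while the socket is "owned by user", softirq
 * receive paths queue packets on sk_backlog, which release_sock() drains via
 * __release_sock().  The state change shown is arbitrary.
 */
static void example_locked_update(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_rcvlowat = 1;	/* any state change needing the lock */
	release_sock(sk);
}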
2195/**
2196 * lock_sock_fast - fast version of lock_sock
2197 * @sk: socket
2198 *
 2199 * This version should be used for very small sections, where the process won't block.
 2200 * Returns false if the fast path is taken:
 2201 * sk_lock.slock locked, owned = 0, BH disabled
 2202 * Returns true if the slow path is taken:
 2203 * sk_lock.slock unlocked, owned = 1, BH enabled
2204 */
2205bool lock_sock_fast(struct sock *sk)
2206{
2207 might_sleep();
2208 spin_lock_bh(&sk->sk_lock.slock);
2209
2210 if (!sk->sk_lock.owned)
2211 /*
2212 * Note : We must disable BH
2213 */
2214 return false;
2215
2216 __lock_sock(sk);
2217 sk->sk_lock.owned = 1;
2218 spin_unlock(&sk->sk_lock.slock);
2219 /*
2220 * The sk_lock has mutex_lock() semantics here:
2221 */
2222 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2223 local_bh_enable();
2224 return true;
2225}
2226EXPORT_SYMBOL(lock_sock_fast);
2227
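/*
 * Example (illustrative sketch, not part of this file): pairing
 * lock_sock_fast() with unlock_sock_fast() from include/net/sock.h, which
 * releases either the spinlock or the full socket lock depending on which
 * path was taken.
 */
static int example_read_drops(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int drops = atomic_read(&sk->sk_drops);

	unlock_sock_fast(sk, slow);
	return drops;
}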
1da177e4 2228int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
4ec93edb 2229{
b7aa0bf7 2230 struct timeval tv;
1da177e4 2231 if (!sock_flag(sk, SOCK_TIMESTAMP))
20d49473 2232 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2233 tv = ktime_to_timeval(sk->sk_stamp);
2234 if (tv.tv_sec == -1)
1da177e4 2235 return -ENOENT;
2236 if (tv.tv_sec == 0) {
2237 sk->sk_stamp = ktime_get_real();
2238 tv = ktime_to_timeval(sk->sk_stamp);
2239 }
2240 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
4ec93edb 2241}
2242EXPORT_SYMBOL(sock_get_timestamp);
2243
2244int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2245{
2246 struct timespec ts;
2247 if (!sock_flag(sk, SOCK_TIMESTAMP))
20d49473 2248 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2249 ts = ktime_to_timespec(sk->sk_stamp);
2250 if (ts.tv_sec == -1)
2251 return -ENOENT;
2252 if (ts.tv_sec == 0) {
2253 sk->sk_stamp = ktime_get_real();
2254 ts = ktime_to_timespec(sk->sk_stamp);
2255 }
2256 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2257}
2258EXPORT_SYMBOL(sock_get_timestampns);
2259
20d49473 2260void sock_enable_timestamp(struct sock *sk, int flag)
4ec93edb 2261{
20d49473 2262 if (!sock_flag(sk, flag)) {
2263 unsigned long previous_flags = sk->sk_flags;
2264
2265 sock_set_flag(sk, flag);
2266 /*
2267 * we just set one of the two flags which require net
2268 * time stamping, but time stamping might have been on
2269 * already because of the other one
2270 */
08e29af3 2271 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
20d49473 2272 net_enable_timestamp();
2273 }
2274}
2275
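/*
 * Example (illustrative sketch, not part of this file): protocol ioctl
 * handlers commonly route SIOCGSTAMP/SIOCGSTAMPNS to the helpers above,
 * which also turn timestamping on lazily through sock_enable_timestamp().
 */
static int example_stamp_ioctl(struct sock *sk, unsigned int cmd,
			       unsigned long arg)
{
	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}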
2276/*
 2277 * Get a socket option on a socket.
2278 *
2279 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2280 * asynchronous errors should be reported by getsockopt. We assume
 2281 * this means if you specify SO_ERROR (otherwise what's the point of it).
2282 */
2283int sock_common_getsockopt(struct socket *sock, int level, int optname,
2284 char __user *optval, int __user *optlen)
2285{
2286 struct sock *sk = sock->sk;
2287
2288 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2289}
2290EXPORT_SYMBOL(sock_common_getsockopt);
2291
3fdadf7d 2292#ifdef CONFIG_COMPAT
2293int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2294 char __user *optval, int __user *optlen)
2295{
2296 struct sock *sk = sock->sk;
2297
1e51f951 2298 if (sk->sk_prot->compat_getsockopt != NULL)
2299 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2300 optval, optlen);
2301 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2302}
2303EXPORT_SYMBOL(compat_sock_common_getsockopt);
2304#endif
2305
2306int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2307 struct msghdr *msg, size_t size, int flags)
2308{
2309 struct sock *sk = sock->sk;
2310 int addr_len = 0;
2311 int err;
2312
2313 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2314 flags & ~MSG_DONTWAIT, &addr_len);
2315 if (err >= 0)
2316 msg->msg_namelen = addr_len;
2317 return err;
2318}
2319EXPORT_SYMBOL(sock_common_recvmsg);
2320
2321/*
2322 * Set socket options on an inet socket.
2323 */
2324int sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 2325 char __user *optval, unsigned int optlen)
2326{
2327 struct sock *sk = sock->sk;
2328
2329 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2330}
2331EXPORT_SYMBOL(sock_common_setsockopt);
2332
3fdadf7d 2333#ifdef CONFIG_COMPAT
543d9cfe 2334int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 2335 char __user *optval, unsigned int optlen)
2336{
2337 struct sock *sk = sock->sk;
2338
2339 if (sk->sk_prot->compat_setsockopt != NULL)
2340 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2341 optval, optlen);
2342 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2343}
2344EXPORT_SYMBOL(compat_sock_common_setsockopt);
2345#endif
2346
2347void sk_common_release(struct sock *sk)
2348{
2349 if (sk->sk_prot->destroy)
2350 sk->sk_prot->destroy(sk);
2351
2352 /*
 2353 * Observation: when sock_common_release is called, processes have
 2354 * no access to the socket, but the network still does.
2355 * Step one, detach it from networking:
2356 *
2357 * A. Remove from hash tables.
2358 */
2359
2360 sk->sk_prot->unhash(sk);
2361
2362 /*
 2363 * At this point the socket cannot receive new packets, but it is possible
 2364 * that some packets are still in flight because some CPU runs the receiver
 2365 * and did its hash table lookup before we unhashed the socket. They will
 2366 * reach the receive queue and be purged by the socket destructor.
 2367 *
 2368 * Also, we still have packets pending on the receive queue and probably
 2369 * our own packets waiting in device queues. sock_destroy will drain the
 2370 * receive queue, but transmitted packets will delay socket destruction
 2371 * until the last reference is released.
2372 */
2373
2374 sock_orphan(sk);
2375
2376 xfrm_sk_free_policy(sk);
2377
e6848976 2378 sk_refcnt_debug_release(sk);
2379 sock_put(sk);
2380}
2381EXPORT_SYMBOL(sk_common_release);
2382
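/*
 * Example (illustrative sketch, not part of this file): a protocol's
 * ->close() handler typically does its own teardown and then lets
 * sk_common_release() unhash, orphan and drop the reference.  The function
 * below is hypothetical.
 */
static void example_close(struct sock *sk, long timeout)
{
	/* purge queues, stop timers, free protocol state, ... */
	sk_common_release(sk);
}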
2383#ifdef CONFIG_PROC_FS
2384#define PROTO_INUSE_NR 64 /* should be enough for the first time */
2385struct prot_inuse {
2386 int val[PROTO_INUSE_NR];
2387};
2388
2389static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2390
2391#ifdef CONFIG_NET_NS
2392void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2393{
d6d9ca0f 2394 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2395}
2396EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2397
2398int sock_prot_inuse_get(struct net *net, struct proto *prot)
2399{
2400 int cpu, idx = prot->inuse_idx;
2401 int res = 0;
2402
2403 for_each_possible_cpu(cpu)
2404 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2405
2406 return res >= 0 ? res : 0;
2407}
2408EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2409
2c8c1e72 2410static int __net_init sock_inuse_init_net(struct net *net)
2411{
2412 net->core.inuse = alloc_percpu(struct prot_inuse);
2413 return net->core.inuse ? 0 : -ENOMEM;
2414}
2415
2c8c1e72 2416static void __net_exit sock_inuse_exit_net(struct net *net)
2417{
2418 free_percpu(net->core.inuse);
2419}
2420
2421static struct pernet_operations net_inuse_ops = {
2422 .init = sock_inuse_init_net,
2423 .exit = sock_inuse_exit_net,
2424};
2425
2426static __init int net_inuse_init(void)
2427{
2428 if (register_pernet_subsys(&net_inuse_ops))
2429 panic("Cannot initialize net inuse counters");
2430
2431 return 0;
2432}
2433
2434core_initcall(net_inuse_init);
2435#else
2436static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2437
c29a0bc4 2438void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1338d466 2439{
d6d9ca0f 2440 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2441}
2442EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2443
c29a0bc4 2444int sock_prot_inuse_get(struct net *net, struct proto *prot)
2445{
2446 int cpu, idx = prot->inuse_idx;
2447 int res = 0;
2448
2449 for_each_possible_cpu(cpu)
2450 res += per_cpu(prot_inuse, cpu).val[idx];
2451
2452 return res >= 0 ? res : 0;
2453}
2454EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
70ee1159 2455#endif
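/*
 * Example (illustrative sketch, not part of this file): hash/unhash
 * implementations keep the per-CPU counters above in sync, and
 * /proc/net/protocols reports them through sock_prot_inuse_get().
 * Lookup-table handling is elided.
 */
static void example_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup structures ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
	/* ... remove sk again ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}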
2456
2457static void assign_proto_idx(struct proto *prot)
2458{
2459 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2460
2461 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
e005d193 2462 pr_err("PROTO_INUSE_NR exhausted\n");
2463 return;
2464 }
2465
2466 set_bit(prot->inuse_idx, proto_inuse_idx);
2467}
2468
2469static void release_proto_idx(struct proto *prot)
2470{
2471 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2472 clear_bit(prot->inuse_idx, proto_inuse_idx);
2473}
2474#else
2475static inline void assign_proto_idx(struct proto *prot)
2476{
2477}
2478
2479static inline void release_proto_idx(struct proto *prot)
2480{
2481}
2482#endif
2483
2484int proto_register(struct proto *prot, int alloc_slab)
2485{
2486 if (alloc_slab) {
2487 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2488 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2489 NULL);
2490
2491 if (prot->slab == NULL) {
2492 pr_crit("%s: Can't create sock SLAB cache!\n",
2493 prot->name);
60e7663d 2494 goto out;
1da177e4 2495 }
2496
2497 if (prot->rsk_prot != NULL) {
faf23422 2498 prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
7e56b5d6 2499 if (prot->rsk_prot->slab_name == NULL)
2500 goto out_free_sock_slab;
2501
7e56b5d6 2502 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2e6599cb 2503 prot->rsk_prot->obj_size, 0,
20c2df83 2504 SLAB_HWCACHE_ALIGN, NULL);
2505
2506 if (prot->rsk_prot->slab == NULL) {
2507 pr_crit("%s: Can't create request sock SLAB cache!\n",
2508 prot->name);
2509 goto out_free_request_sock_slab_name;
2510 }
2511 }
8feaf0c0 2512
6d6ee43e 2513 if (prot->twsk_prot != NULL) {
faf23422 2514 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
8feaf0c0 2515
7e56b5d6 2516 if (prot->twsk_prot->twsk_slab_name == NULL)
2517 goto out_free_request_sock_slab;
2518
6d6ee43e 2519 prot->twsk_prot->twsk_slab =
7e56b5d6 2520 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
6d6ee43e 2521 prot->twsk_prot->twsk_obj_size,
2522 0,
2523 SLAB_HWCACHE_ALIGN |
2524 prot->slab_flags,
20c2df83 2525 NULL);
6d6ee43e 2526 if (prot->twsk_prot->twsk_slab == NULL)
2527 goto out_free_timewait_sock_slab_name;
2528 }
2529 }
2530
36b77a52 2531 mutex_lock(&proto_list_mutex);
1da177e4 2532 list_add(&prot->node, &proto_list);
13ff3d6f 2533 assign_proto_idx(prot);
36b77a52 2534 mutex_unlock(&proto_list_mutex);
2535 return 0;
2536
8feaf0c0 2537out_free_timewait_sock_slab_name:
7e56b5d6 2538 kfree(prot->twsk_prot->twsk_slab_name);
2539out_free_request_sock_slab:
2540 if (prot->rsk_prot && prot->rsk_prot->slab) {
2541 kmem_cache_destroy(prot->rsk_prot->slab);
2542 prot->rsk_prot->slab = NULL;
2543 }
2e6599cb 2544out_free_request_sock_slab_name:
2545 if (prot->rsk_prot)
2546 kfree(prot->rsk_prot->slab_name);
2547out_free_sock_slab:
2548 kmem_cache_destroy(prot->slab);
2549 prot->slab = NULL;
2550out:
2551 return -ENOBUFS;
1da177e4 2552}
2553EXPORT_SYMBOL(proto_register);
2554
2555void proto_unregister(struct proto *prot)
2556{
36b77a52 2557 mutex_lock(&proto_list_mutex);
13ff3d6f 2558 release_proto_idx(prot);
0a3f4358 2559 list_del(&prot->node);
36b77a52 2560 mutex_unlock(&proto_list_mutex);
2561
2562 if (prot->slab != NULL) {
2563 kmem_cache_destroy(prot->slab);
2564 prot->slab = NULL;
2565 }
2566
2e6599cb 2567 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2e6599cb 2568 kmem_cache_destroy(prot->rsk_prot->slab);
7e56b5d6 2569 kfree(prot->rsk_prot->slab_name);
2570 prot->rsk_prot->slab = NULL;
2571 }
2572
6d6ee43e 2573 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
6d6ee43e 2574 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
7e56b5d6 2575 kfree(prot->twsk_prot->twsk_slab_name);
6d6ee43e 2576 prot->twsk_prot->twsk_slab = NULL;
8feaf0c0 2577 }
1da177e4 2578}
2579EXPORT_SYMBOL(proto_unregister);
2580
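/*
 * Example (illustrative sketch, not part of this file): minimal module
 * wiring for proto_register()/proto_unregister().  The proto definition is
 * hypothetical; passing alloc_slab=1 asks proto_register() to create the
 * per-protocol kmem cache (and request/timewait caches when rsk_prot or
 * twsk_prot are set, which they are not here).
 */
static struct proto example_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),	/* usually a larger wrapper struct */
};

static int __init example_proto_init(void)
{
	return proto_register(&example_proto, 1);
}

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_proto);
}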
2581#ifdef CONFIG_PROC_FS
1da177e4 2582static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
36b77a52 2583 __acquires(proto_list_mutex)
1da177e4 2584{
36b77a52 2585 mutex_lock(&proto_list_mutex);
60f0438a 2586 return seq_list_start_head(&proto_list, *pos);
2587}
2588
2589static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2590{
60f0438a 2591 return seq_list_next(v, &proto_list, pos);
2592}
2593
2594static void proto_seq_stop(struct seq_file *seq, void *v)
36b77a52 2595 __releases(proto_list_mutex)
1da177e4 2596{
36b77a52 2597 mutex_unlock(&proto_list_mutex);
2598}
2599
2600static char proto_method_implemented(const void *method)
2601{
2602 return method == NULL ? 'n' : 'y';
2603}
2604static long sock_prot_memory_allocated(struct proto *proto)
2605{
cb75a36c 2606 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2607}
2608
2609static char *sock_prot_memory_pressure(struct proto *proto)
2610{
2611 return proto->memory_pressure != NULL ?
2612 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2613}
2614
2615static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2616{
180d8cd9 2617
8d987e5c 2618 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
2619 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2620 proto->name,
2621 proto->obj_size,
14e943db 2622 sock_prot_inuse_get(seq_file_net(seq), proto),
2623 sock_prot_memory_allocated(proto),
2624 sock_prot_memory_pressure(proto),
2625 proto->max_header,
2626 proto->slab == NULL ? "no" : "yes",
2627 module_name(proto->owner),
2628 proto_method_implemented(proto->close),
2629 proto_method_implemented(proto->connect),
2630 proto_method_implemented(proto->disconnect),
2631 proto_method_implemented(proto->accept),
2632 proto_method_implemented(proto->ioctl),
2633 proto_method_implemented(proto->init),
2634 proto_method_implemented(proto->destroy),
2635 proto_method_implemented(proto->shutdown),
2636 proto_method_implemented(proto->setsockopt),
2637 proto_method_implemented(proto->getsockopt),
2638 proto_method_implemented(proto->sendmsg),
2639 proto_method_implemented(proto->recvmsg),
2640 proto_method_implemented(proto->sendpage),
2641 proto_method_implemented(proto->bind),
2642 proto_method_implemented(proto->backlog_rcv),
2643 proto_method_implemented(proto->hash),
2644 proto_method_implemented(proto->unhash),
2645 proto_method_implemented(proto->get_port),
2646 proto_method_implemented(proto->enter_memory_pressure));
2647}
2648
2649static int proto_seq_show(struct seq_file *seq, void *v)
2650{
60f0438a 2651 if (v == &proto_list)
2652 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2653 "protocol",
2654 "size",
2655 "sockets",
2656 "memory",
2657 "press",
2658 "maxhdr",
2659 "slab",
2660 "module",
2661 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2662 else
60f0438a 2663 proto_seq_printf(seq, list_entry(v, struct proto, node));
2664 return 0;
2665}
2666
f690808e 2667static const struct seq_operations proto_seq_ops = {
2668 .start = proto_seq_start,
2669 .next = proto_seq_next,
2670 .stop = proto_seq_stop,
2671 .show = proto_seq_show,
2672};
2673
2674static int proto_seq_open(struct inode *inode, struct file *file)
2675{
2676 return seq_open_net(inode, file, &proto_seq_ops,
2677 sizeof(struct seq_net_private));
2678}
2679
9a32144e 2680static const struct file_operations proto_seq_fops = {
2681 .owner = THIS_MODULE,
2682 .open = proto_seq_open,
2683 .read = seq_read,
2684 .llseek = seq_lseek,
2685 .release = seq_release_net,
2686};
2687
2688static __net_init int proto_init_net(struct net *net)
2689{
2690 if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2691 return -ENOMEM;
2692
2693 return 0;
2694}
2695
2696static __net_exit void proto_exit_net(struct net *net)
2697{
2698 proc_net_remove(net, "protocols");
2699}
2700
2701
2702static __net_initdata struct pernet_operations proto_net_ops = {
2703 .init = proto_init_net,
2704 .exit = proto_exit_net,
2705};
2706
2707static int __init proto_init(void)
2708{
14e943db 2709 return register_pernet_subsys(&proto_net_ops);
2710}
2711
2712subsys_initcall(proto_init);
2713
2714#endif /* PROC_FS */