/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
	struct proto *proto;
	int ret = 0;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry(proto, &proto_list, node) {
		if (proto->init_cgroup) {
			ret = proto->init_cgroup(cgrp, ss);
			if (ret)
				goto out;
		}
	}

	mutex_unlock(&proto_list_mutex);
	return ret;
out:
	list_for_each_entry_continue_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(cgrp);
	mutex_unlock(&proto_list_mutex);
	return ret;
}

void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
{
	struct proto *proto;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(cgrp);
	mutex_unlock(&proto_list_mutex);
}
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
	"sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
	"sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
	"sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
	"sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
	"sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
	"sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
	"sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
	"sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
	"sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
	"sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
	"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
	"sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
	"sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	"slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
	"slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
	"slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
	"slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
	"slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
	"slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
	"slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
	"slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
	"slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
	"slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
	"slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
	"slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
	"slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
	"slock-AF_NFC"   , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	"clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
	"clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
	"clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
	"clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
	"clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
	"clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
	"clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
	"clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
	"clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
	"clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
	"clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
	"clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
	"clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
	"clock-AF_NFC"   , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

#if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
#if !defined(CONFIG_NETPRIO_CGROUP)
int net_prio_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_prio_subsys_id);
#endif
#endif

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
			       "tries to set negative timeout\n",
			       current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
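
/*
 * Illustrative arithmetic for the conversion above (not part of the
 * original file): with HZ == 100, a user timeout of
 * { .tv_sec = 1, .tv_usec = 500000 } becomes
 *
 *	1 * 100 + (500000 + (1000000/100 - 1)) / (1000000/100)
 *	  = 100 + 509999 / 10000 = 150 jiffies.
 *
 * The (1000000/HZ - 1) term rounds any fractional tick up to a whole
 * tick, so a requested timeout is never silently shortened.
 */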

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	/* we escape from rcu protected region, make sure we dont leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
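
/*
 * Illustrative caller sketch (assumed, not taken from this file): a
 * datagram protocol's receive path would typically do
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);		// the skb is not consumed on error
 *		return NET_RX_DROP;
 *	}
 *
 * -ENOMEM means sk_rmem_alloc already reached sk_rcvbuf; -ENOBUFS means
 * sk_rmem_schedule() refused the memory charge. Both paths bump sk_drops.
 */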

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}
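
/*
 * Illustrative userspace usage (assumed, not part of this file; "eth0"
 * is a placeholder device name):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 * The caller needs CAP_NET_RAW. Passing "" (or optlen == 0) unbinds the
 * socket again, since index stays 0 for an empty name.
 */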

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
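
/*
 * Illustrative userspace view of the SO_RCVBUF doubling above (assumed,
 * not part of this file):
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	// val now reads back as 131072 (assuming 65536 <= sysctl_rmem_max):
 *	// the kernel stored val * 2 to cover sk_buff overhead, and
 *	// getsockopt reports the value actually in use.
 */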


void cred_to_ucred(struct pid *pid, const struct cred *cred,
		   struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
	}
}
EXPORT_SYMBOL_GPL(cred_to_ucred);

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = !!sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = !!sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = !!sock_flag(sk, SOCK_NOFCS);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
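
/*
 * Illustrative userspace usage of the SO_PEERCRED case above (assumed,
 * not part of this file):
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("pid=%d uid=%d gid=%d\n", peer.pid, peer.uid, peer.gid);
 *
 * cred_to_ucred() translates the stored pid and credentials into the
 * caller's namespaces before they are copied out.
 */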

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

/*
 * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
 * un-modified. Special care is taken when initializing object to zero.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
	if (offsetof(struct sock, sk_node.next) != 0)
		memset(sk, 0, offsetof(struct sock, sk_node.next));
	memset(&sk->sk_node.pprev, 0,
	       size - offsetof(struct sock, sk_node.pprev));
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

#ifdef CONFIG_CGROUPS
void sock_update_classid(struct sock *sk)
{
	u32 classid;

	rcu_read_lock();  /* doing current task, which cannot vanish. */
	classid = task_cls_classid(current);
	rcu_read_unlock();
	if (classid && classid != sk->sk_classid)
		sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);

void sock_update_netprioidx(struct sock *sk)
{
	if (in_interrupt())
		return;

	sk->sk_cgrp_prioidx = task_netprioidx(current);
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(sk);
		sock_update_netprioidx(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * Last sock_put should drop reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking reference to stopping namespace
 * is not an option.
 * Take reference to a socket to remove it from hash _alive_ and after that
 * destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

static void sk_update_clone(const struct sock *sk, struct sock *newsk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
		sock_update_memcg(newsk);
}

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		spin_lock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still raw copy of parent, so invalidate
			 * destructor and make plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		sk_update_clone(sk, newsk);

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
	if (totalram_pages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (totalram_pages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
EXPORT_SYMBOL(sock_kfree_s);

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think, these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					__skb_fill_page_desc(skb, i,
							page, 0,
							(data_len >= PAGE_SIZE ?
							 PAGE_SIZE :
							 data_len));
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);

static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);

	/*
	 * Doing the zeroing here guarantee we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 * @sk:   socket
 * @size: memory size to allocate
 * @kind: allocation type
 *
 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 * rmem allocation. This function assumes that protocols which have
 * memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;
	int parent_status = UNDER_LIMIT;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

	allocated = sk_memory_allocated_add(sk, amt, &parent_status);

	/* Under limit. */
	if (parent_status == UNDER_LIMIT &&
	    allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. (we or our parents) */
	if ((parent_status > SOFT_LIMIT) ||
	    allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit (we or our parents) */
	if ((parent_status == OVER_LIMIT) ||
	    (allocated > sk_prot_mem_limits(sk, 2)))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
			return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

	sk_memory_allocated_sub(sk, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
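
/*
 * Callers normally reach this through the sk_wmem_schedule() and
 * sk_rmem_schedule() wrappers rather than directly; wrapper signatures
 * vary across kernel versions, so treat this as a hedged sketch of
 * charging an incoming skb to receive memory:
 *
 *	if (!sk_rmem_schedule(sk, skb->truesize)) {
 *		atomic_inc(&sk->sk_drops);	// over the hard limit
 *		kfree_skb(skb);
 *		return -ENOMEM;
 *	}
 *	skb_set_owner_r(skb, sk);	// accounts against sk_rmem_alloc
 */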

/**
 * __sk_mem_reclaim - reclaim memory_allocated
 * @sk: socket
 */
void __sk_mem_reclaim(struct sock *sk)
{
	sk_memory_allocated_sub(sk,
				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = { .msg_flags = flags };
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
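
/*
 * Illustrative wiring of these stubs (hedged; "hypothetical_dgram_ops"
 * and PF_MYFAMILY are made-up names): a family that does not support a
 * method points the proto_ops slot at the matching sock_no_* helper
 * instead of leaving it NULL.
 *
 *	static const struct proto_ops hypothetical_dgram_ops = {
 *		.family   = PF_MYFAMILY,	// hypothetical
 *		.owner    = THIS_MODULE,
 *		.bind     = sock_no_bind,
 *		.accept   = sock_no_accept,
 *		.listen   = sock_no_listen,
 *		.mmap     = sock_no_mmap,
 *		.sendpage = sock_no_sendpage,
 *		// real handlers for connect/sendmsg/recvmsg/...
 *	};
 */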

/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

static void sock_def_readable(struct sock *sk, int len)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (wq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
						POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}
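
/*
 * Protocols may replace these defaults after sock_init_data(); changes
 * are made under sk_callback_lock so concurrent readers see a coherent
 * callback.  Hedged sketch (my_data_ready/saved_ready are illustrative
 * names, not from this file):
 *
 *	write_lock_bh(&sk->sk_callback_lock);
 *	saved_ready = sk->sk_data_ready;
 *	sk->sk_data_ready = my_data_ready;	// custom notification hook
 *	write_unlock_bh(&sk->sk_callback_lock);
 */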

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
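
/*
 * These helpers pair a timer with a sock reference: arming a timer that
 * was not already pending takes a hold (so the sock cannot be freed
 * before the handler runs); stopping a pending timer drops it.  Hedged
 * usage sketch:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);  // may sock_hold()
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);                 // may __sock_put()
 *
 * A handler that does not re-arm is expected to drop the reference
 * itself when it finishes with the sock.
 */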

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
	} else
		sk->sk_wq = NULL;

	spin_lock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_sndmsg_page = NULL;
	sk->sk_sndmsg_off = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
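
/*
 * Typical caller (hedged sketch; my_family_create, my_proto, my_destruct
 * and PF_MYFAMILY are illustrative names): a family's create hook
 * allocates the sock and applies these defaults before its own tuning.
 *
 *	static int my_family_create(struct net *net, struct socket *sock,
 *				    int protocol, int kern)
 *	{
 *		struct sock *sk = sk_alloc(net, PF_MYFAMILY, GFP_KERNEL,
 *					   &my_proto);
 *		if (!sk)
 *			return -ENOMEM;
 *		sock_init_data(sock, sk);	// queues, timers, callbacks
 *		sk->sk_destruct = my_destruct;	// override a default
 *		return 0;
 *	}
 */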

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owned = 0;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 * Returns false if the fast path is taken:
 *   sk_lock.slock locked, owned = 0, BHs disabled.
 * Returns true if the slow path is taken:
 *   sk_lock.slock unlocked, owned = 1, BHs enabled.
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note: BHs stay disabled on the fast path.
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
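
/*
 * The boolean returned by lock_sock_fast() must be handed back to
 * unlock_sock_fast() so the matching unlock variant is used (spin
 * unlock + BH enable on the fast path, release_sock() on the slow
 * path).  Hedged usage sketch:
 *
 *	bool slow = lock_sock_fast(sk);
 *	// short, non-blocking critical section
 *	unlock_sock_fast(sk, slow);
 */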

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * We just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one.
		 */
		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}
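
/*
 * These back the SIOCGSTAMP/SIOCGSTAMPNS ioctls; a protocol ioctl
 * handler typically just forwards, along these lines (sketch):
 *
 *	case SIOCGSTAMP:
 *		return sock_get_timestamp(sk, (struct timeval __user *)arg);
 *	case SIOCGSTAMPNS:
 *		return sock_get_timestampns(sk, (struct timespec __user *)arg);
 */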

/*
 * Get a socket option on a socket.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 * Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still does.
	 *
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif

static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif
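
/*
 * Protocols bump these per-cpu counters as sockets enter and leave
 * their lookup tables; /proc/net/protocols sums them below.  Hedged
 * sketch of the usual pairing in a protocol's hash/unhash methods:
 *
 *	sk_add_node(sk, head);
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *	...
 *	if (sk_del_node_init(sk))
 *		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 */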

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	if (prot->rsk_prot)
		kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(prot->rsk_prot->slab_name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
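
/*
 * Hedged registration sketch (my_proto and struct my_sock are
 * illustrative names): a module fills in a struct proto and registers
 * it at init time; alloc_slab == 1 asks the core to create the sock
 * slab from .name/.obj_size.
 *
 *	static struct proto my_proto = {
 *		.name	  = "MYPROTO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct my_sock),
 *	};
 *
 *	err = proto_register(&my_proto, 1);	// module init
 *	...
 *	proto_unregister(&my_proto);		// module exit
 */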

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	       proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}
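
/*
 * Sample /proc/net/protocols line produced by the code above (layout
 * only; the numbers are illustrative, not real output):
 *
 *	protocol  size sockets  memory press maxhdr  slab module  cl co ...
 *	TCP       1680     382    1024   no     320   yes kernel   y  y ...
 */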

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	proc_net_remove(net, "protocols");
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */