/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink :	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154",
  "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154",
  "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154",
  "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms. This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

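/*
 * Worked example (illustrative only; the real value depends on the
 * platform's sizeof(struct sk_buff)): if struct sk_buff were 256 bytes,
 *
 *	_SK_MEM_OVERHEAD = 256 + 256             = 512 bytes
 *	SK_WMEM_MAX      = 512 * _SK_MEM_PACKETS = 512 * 256 = 131072
 *
 * i.e. the default limits budget for 256 packets of data-plus-overhead,
 * not for 128 KiB of payload.
 */
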
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
			       "tries to set negative timeout\n",
			       current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
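
/*
 * Conversion example (illustrative; assumes HZ == 100, i.e. 10000 us
 * per jiffy): a user timeout of { .tv_sec = 1, .tv_usec = 500000 }
 * becomes
 *
 *	1 * 100 + (500000 + 9999) / 10000 = 100 + 50 = 150 jiffies,
 *
 * with the microsecond part rounded *up*, so a non-zero user timeout
 * can never truncate to zero jiffies.
 */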

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

static void sock_disable_timestamp(struct sock *sk, int flag)
{
	if (sock_flag(sk, flag)) {
		sock_reset_flag(sk, flag);
		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
			net_disable_timestamp();
		}
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
	   number of warnings when compiling with -W --ANK
	 */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue. Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
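
/*
 * Note on the dropcount snapshot above (illustrative): each queued skb
 * records sk_drops at enqueue time, so a SO_RXQ_OVFL user can diff the
 * values seen on consecutive packets. E.g. if one skb carries
 * dropcount 2 and the next carries 5, three packets were dropped in
 * between the two that got through.
 */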

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk->sk_dst_cache;

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_cache = NULL;
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	if (devname[0] == '\0') {
		index = 0;
	} else {
		struct net_device *dev = dev_get_by_name(net, devname);

		ret = -ENODEV;
		if (!dev)
			goto out;

		index = dev->ifindex;
		dev_put(dev);
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}
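
/*
 * Userspace usage sketch (illustrative, not part of this file):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", 5);
 *
 * requires CAP_NET_RAW; passing an empty name (or optlen == 0) clears
 * sk_bound_dev_if and unbinds the socket again.
 */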

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this BSD doesn't and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_wmem_max)
			val = sysctl_wmem_max;
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		if ((val * 2) < SOCK_MIN_SNDBUF)
			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
		else
			sk->sk_sndbuf = val * 2;

		/*
		 *	Wake up sending tasks if we
		 *	upped the value.
		 */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		if ((val * 2) < SOCK_MIN_RCVBUF)
			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
		else
			sk->sk_rcvbuf = val * 2;
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       SOCK_TIMESTAMPING_RX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		if (valbool)
			sock_set_flag(sk, SOCK_RXQ_OVFL);
		else
			sock_reset_flag(sk, SOCK_RXQ_OVFL);
		break;
	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
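
/*
 * Buffer sizing example (illustrative): setsockopt(fd, SOL_SOCKET,
 * SO_RCVBUF, &(int){65536}, sizeof(int)) stores 131072 in sk_rcvbuf
 * because of the doubling above, and a subsequent getsockopt(SO_RCVBUF)
 * reports 131072 - the value actually used, overhead included.
 */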


int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = !!sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = !!sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERCRED:
		if (len > sizeof(sk->sk_peercred))
			len = sizeof(sk->sk_peercred);
		if (copy_to_user(optval, &sk->sk_peercred, len))
			return -EFAULT;
		goto lenout;

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) +
		     sizeof(osk->sk_tx_queue_mapping));
	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			/*
			 * caches using SLAB_DESTROY_BY_RCU should leave
			 * sk_node.next unmodified. Special care is taken
			 * when initializing object to zero.
			 */
			if (offsetof(struct sock, sk_node.next) != 0)
				memset(sk, 0, offsetof(struct sock, sk_node.next));
			memset(&sk->sk_node.pprev, 0,
			       prot->obj_size - offsetof(struct sock,
							 sk_node.pprev));
		}
	}
	else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}
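
/*
 * The split memset above zeroes everything *except* sk_node.next
 * (illustrative layout, assuming sk_node sits at the start of
 * struct sock):
 *
 *	[ sk_node.next | sk_node.pprev ... rest of struct sock ]
 *	  untouched      zeroed ------------------------------>
 *
 * so an RCU reader racing with this re-allocation never observes a
 * NULL next pointer in an SLAB_DESTROY_BY_RCU chain.
 */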

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		rcu_assign_pointer(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __func__, atomic_read(&sk->sk_omem_alloc));

	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can tell whether
	 * some packets are still in some tx queue.
	 * If not zero, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
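
/*
 * Lifecycle sketch (illustrative): sk_alloc() starts sk_wmem_alloc at 1,
 * so with two skbs still queued for transmit the counter is 1 plus their
 * truesize. sk_free() then drops it by one without reaching zero; the
 * final sock_wfree() of the last in-flight skb performs the actual
 * __sk_free().
 */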

/*
 * Last sock_put should drop reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking reference to stopping namespace
 * is not an option.
 * Take reference to a socket to remove it from hash _alive_ and after that
 * destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		rwlock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache = NULL;
		newsk->sk_wmem_queued = 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head = NULL;
		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = newsk->sk_filter;
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still raw copy of parent, so invalidate
			 * destructor and make plain sk_free() */
			newsk->sk_destruct = NULL;
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_sleep = NULL;

		if (newsk->sk_prot->sockets_allocated)
			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
	if (totalram_pages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (totalram_pages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
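
/*
 * Accounting sketch (illustrative): for an skb of truesize 2048 the
 * two-step path above first subtracts 2047, leaving one unit to pin the
 * sock while sk_write_space() runs, then subtracts the final 1; only if
 * that last decrement reaches zero does __sk_free() fire.
 */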

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_uncharge(skb->sk, skb->truesize);
}
EXPORT_SYMBOL(sock_rfree);


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);
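
/*
 * Why charge before kmalloc() (sketch): a sleeping kmalloc() could let
 * other allocations run in between, so sk_omem_alloc is bumped up front
 * to stay an honest upper bound while we block; concurrent callers then
 * see the pending allocation in the limit check above, and the charge
 * is rolled back if kmalloc() fails.
 */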

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
EXPORT_SYMBOL(sock_kfree_s);

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think, these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk->sk_sleep, &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);

static void __lock_sock(struct sock *sk)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk: sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk->sk_sleep, &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	int allocated;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
	allocated = atomic_add_return(amt, prot->memory_allocated);

	/* Under limit. */
	if (allocated <= prot->sysctl_mem[0]) {
		if (prot->memory_pressure && *prot->memory_pressure)
			*prot->memory_pressure = 0;
		return 1;
	}

	/* Under pressure. */
	if (allocated > prot->sysctl_mem[1])
		if (prot->enter_memory_pressure)
			prot->enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > prot->sysctl_mem[2])
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;
	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (prot->memory_pressure) {
		int alloc;

		if (!*prot->memory_pressure)
			return 1;
		alloc = percpu_counter_read_positive(prot->sockets_allocated);
		if (prot->sysctl_mem[2] > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
	atomic_sub(amt, prot->memory_allocated);
	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
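
/*
 * Quantum example (illustrative, assuming SK_MEM_QUANTUM == PAGE_SIZE
 * == 4096): scheduling size = 3000 gives amt = sk_mem_pages(3000) = 1,
 * so sk_forward_alloc grows by 4096 and memory_allocated by one page;
 * roughly another kilobyte of charges can then be served from
 * sk_forward_alloc without touching the global counter.
 */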

/**
 *	__sk_mem_reclaim - reclaim memory_allocated
 *	@sk: socket
 */
void __sk_mem_reclaim(struct sock *sk)
{
	struct proto *prot = sk->sk_prot;

	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
		   prot->memory_allocated);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (prot->memory_pressure && *prot->memory_pressure &&
	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
		*prot->memory_pressure = 0;
}
EXPORT_SYMBOL(__sk_mem_reclaim);


/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);

/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk_has_sleeper(sk))
		wake_up_interruptible_all(sk->sk_sleep);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk_has_sleeper(sk))
		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk_has_sleeper(sk))
		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		if (sk_has_sleeper(sk))
			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
						POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}
1831
1832void sk_send_sigurg(struct sock *sk)
1833{
1834 if (sk->sk_socket && sk->sk_socket->file)
1835 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 1836 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 1837}
2a91525c 1838EXPORT_SYMBOL(sk_send_sigurg);
1da177e4
LT

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
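
/*
 * Illustrative sketch (not part of this file): a protocol arming a
 * timer with the helpers above.  sk_reset_timer() takes a reference on
 * the socket only when the timer was not already pending; the handler
 * drops it again, and sk_stop_timer() drops it when it cancels a
 * pending timer, keeping hold/put balanced.  The foo_* names are
 * hypothetical.
 */
#if 0	/* example only */
static void foo_timer_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	/* ... protocol timeout work would go here ... */

	sock_put(sk);	/* balances the sock_hold() from sk_reset_timer() */
}

static void foo_arm_timer(struct sock *sk)
{
	setup_timer(&sk->sk_timer, foo_timer_handler, (unsigned long)sk);
	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}
#endif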

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_sleep = &sock->wait;
		sock->sk = sk;
	} else
		sk->sk_sleep = NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_sndmsg_page = NULL;
	sk->sk_sndmsg_off = 0;

	sk->sk_peercred.pid = 0;
	sk->sk_peercred.uid = -1;
	sk->sk_peercred.gid = -1;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
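
/*
 * Illustrative sketch (not part of this file): an address family's
 * create() hook typically allocates the sock, lets sock_init_data()
 * fill in the generic defaults above, then overrides what it needs.
 * PF_FOO, foo_proto and foo_destruct are hypothetical.
 */
#if 0	/* example only */
static int foo_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);		/* sk_refcnt is 1 on return */
	sk->sk_protocol = protocol;
	sk->sk_destruct = foo_destruct;		/* replace sock_def_destruct */
	return 0;
}
#endif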

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owned = 0;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
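
/*
 * Illustrative sketch (not part of this file): lock_sock() (the
 * subclass-0 wrapper around lock_sock_nested()) and release_sock()
 * bracket process-context code that mutates socket state; packets
 * arriving in the meantime sit on the backlog until __release_sock()
 * replays them above.  foo_shutdown is hypothetical.
 */
#if 0	/* example only */
static int foo_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);
	sk->sk_shutdown |= SHUTDOWN_MASK;	/* example state change */
	sk->sk_state_change(sk);
	release_sock(sk);
	return 0;
}
#endif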

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);
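
/*
 * Illustrative sketch (not part of this file): protocols usually
 * expose the two helpers above through their ioctl handler; foo_ioctl
 * is a hypothetical name.
 */
#if 0	/* example only */
static int foo_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}
#endif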

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		sock_set_flag(sk, flag);
		/*
		 * We just set one of the two flags that require net
		 * time stamping, but time stamping might already have
		 * been enabled because of the other one.
		 */
		if (!sock_flag(sk,
				flag == SOCK_TIMESTAMP ?
				SOCK_TIMESTAMPING_RX_SOFTWARE :
				SOCK_TIMESTAMP))
			net_enable_timestamp();
	}
}

/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network still does.
	 *
	 * Step one: detach it from networking.
	 *
	 * A. Remove it from the hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are still in flight because another
	 * CPU runs the receiver and did the hash table lookup before we
	 * unhashed the socket. Those packets will reach the receive queue
	 * and will be purged by the socket destructor.
	 *
	 * We may also still have packets pending on the receive queue and,
	 * probably, our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
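
/*
 * Illustrative sketch (not part of this file): a datagram protocol
 * with no teardown handshake can implement its close() as little more
 * than a call to sk_common_release(); foo_close is hypothetical.
 */
#if 0	/* example only */
static void foo_close(struct sock *sk, long timeout)
{
	sk_common_release(sk);
}
#endif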

static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	int cpu = smp_processor_id();
	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif
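
/*
 * Illustrative sketch (not part of this file): protocols update the
 * per-cpu counter from their hash/unhash callbacks, which run with the
 * lookup-table lock held (and thus with preemption disabled, which
 * sock_prot_inuse_add() relies on since it indexes the current CPU's
 * counter).  The foo_* names and foo_table are hypothetical.
 */
#if 0	/* example only */
static void foo_hash(struct sock *sk)
{
	/* ... insert sk into foo_table under its lock ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void foo_unhash(struct sock *sk)
{
	/* ... remove sk from foo_table under its lock ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
#endif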

static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			static const char mask[] = "request_sock_%s";

			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			static const char mask[] = "tw_sock_%s";

			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	write_unlock(&proto_list_lock);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);
	release_proto_idx(prot);
	list_del(&prot->node);
	write_unlock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(prot->rsk_prot->slab_name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
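
/*
 * Illustrative sketch (not part of this file): a minimal module-init
 * pairing of proto_register()/proto_unregister().  struct foo_sock and
 * the foo_* names are hypothetical; obj_size tells proto_register()
 * how large to make the sock slab objects.
 */
#if 0	/* example only */
struct foo_sock {
	struct sock sk;		/* must come first */
	/* protocol-private fields follow */
};

static struct proto foo_proto = {
	.name		= "FOO",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct foo_sock),
};

static int __init foo_init(void)
{
	return proto_register(&foo_proto, 1);	/* 1 => allocate the slab */
}

static void __exit foo_exit(void)
{
	proto_unregister(&foo_proto);
}
#endif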

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_lock)
{
	read_lock(&proto_list_lock);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_lock)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	proc_net_remove(net, "protocols");
}


static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */