1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
90 */
91
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120
121 #include <asm/uaccess.h>
122
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138
139 #include <trace/events/sock.h>
140
141 #ifdef CONFIG_INET
142 #include <net/tcp.h>
143 #endif
144
145 #include <net/busy_poll.h>
146
147 static DEFINE_MUTEX(proto_list_mutex);
148 static LIST_HEAD(proto_list);
149
150 /**
151 * sk_ns_capable - General socket capability test
152 * @sk: Socket to use a capability on or through
153 * @user_ns: The user namespace of the capability to use
154 * @cap: The capability to use
155 *
156 *	Test to see if the opener of the socket had the capability @cap in the
157 *	user namespace @user_ns when the socket was created, and that the
158 *	current process has that capability as well.
159 */
160 bool sk_ns_capable(const struct sock *sk,
161 struct user_namespace *user_ns, int cap)
162 {
163 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167
168 /**
169 * sk_capable - Socket global capability test
170 * @sk: Socket to use a capability on or through
171 * @cap: The global capability to use
172 *
173 *	Test to see if the opener of the socket had the capability @cap in all
174 *	user namespaces when the socket was created, and that the current
175 *	process has that capability as well.
176 */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182
183 /**
184 * sk_net_capable - Network namespace socket capability test
185 * @sk: Socket to use a capability on or through
186 * @cap: The capability to use
187 *
188 *	Test to see if the opener of the socket had the capability @cap over the
189 *	network namespace the socket is a member of when the socket was created,
190 *	and that the current process has that capability as well.
191 */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
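/*
 * Usage sketch (illustrative only): a protocol that wants to gate a
 * privileged operation on both the socket's opener and the current caller
 * can use one of the helpers above, e.g.:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * sk_capable() performs the same test against the initial user namespace,
 * and sk_ns_capable() against a caller-supplied one.
 */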
197
198 /*
199 * Each address family might have different locking rules, so we have
200 * one slock key per address family:
201 */
202 static struct lock_class_key af_family_keys[AF_MAX];
203 static struct lock_class_key af_family_slock_keys[AF_MAX];
204
205 /*
206 * Make lock validator output more readable. (we pre-construct these
207 * strings build-time, so that runtime initialization of socket
208 * locks is fast):
209 */
210 static const char *const af_family_key_strings[AF_MAX+1] = {
211 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
212 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
213 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
214 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
215 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
216 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
217 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
218 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
219 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
220 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
221 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
222 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
223 "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
224 "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX"
225 };
226 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
227 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
228 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
229 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
230 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
231 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
232 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
233 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
234 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
235 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
236 "slock-27" , "slock-28" , "slock-AF_CAN" ,
237 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
238 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
239 "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
240 "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
241 };
242 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
243 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
244 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
245 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
246 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
247 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
248 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
249 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
250 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
251 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
252 "clock-27" , "clock-28" , "clock-AF_CAN" ,
253 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
254 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
255 "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
256 "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX"
257 };
258
259 /*
260 * sk_callback_lock locking rules are per-address-family,
261 * so split the lock classes by using a per-AF key:
262 */
263 static struct lock_class_key af_callback_keys[AF_MAX];
264
265 /* Take the size of the struct sk_buff overhead into account when
266  * determining these values, since it is not constant across
267  * platforms. This keeps socket queueing behavior and performance
268  * independent of such differences.
269  */
270 #define _SK_MEM_PACKETS 256
271 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
272 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
273 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
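/*
 * Rough worked example (sizes are configuration dependent; the figures
 * below assume one common x86_64 build): SKB_TRUESIZE(256) is 256 bytes of
 * payload plus the cache-line-aligned sizes of struct sk_buff and struct
 * skb_shared_info, on the order of 256 + 256 + 320 = ~832 bytes per packet.
 * SK_WMEM_MAX / SK_RMEM_MAX therefore default to roughly 832 * 256 bytes,
 * i.e. a little over 200 KiB per socket on such a build.
 */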
274
275 /* Run time adjustable parameters. */
276 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
277 EXPORT_SYMBOL(sysctl_wmem_max);
278 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
279 EXPORT_SYMBOL(sysctl_rmem_max);
280 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
281 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
282
283 /* Maximal space eaten by iovec or ancillary data plus some space */
284 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
285 EXPORT_SYMBOL(sysctl_optmem_max);
286
287 int sysctl_tstamp_allow_data __read_mostly = 1;
288
289 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
290 EXPORT_SYMBOL_GPL(memalloc_socks);
291
292 /**
293 * sk_set_memalloc - sets %SOCK_MEMALLOC
294 * @sk: socket to set it on
295 *
296 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297 * It's the responsibility of the admin to adjust min_free_kbytes
298  * to meet the requirements.
299 */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 sock_set_flag(sk, SOCK_MEMALLOC);
303 sk->sk_allocation |= __GFP_MEMALLOC;
304 static_key_slow_inc(&memalloc_socks);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 sock_reset_flag(sk, SOCK_MEMALLOC);
311 sk->sk_allocation &= ~__GFP_MEMALLOC;
312 static_key_slow_dec(&memalloc_socks);
313
314 /*
315 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 * it has rmem allocations due to the last swapfile being deactivated
318 * but there is a risk that the socket is unusable due to exceeding
319 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 */
321 sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
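/*
 * Usage sketch (illustrative only): a transport that carries swap traffic,
 * such as swap over NFS, enables this on its underlying socket while a
 * swapfile is active and clears it again afterwards:
 *
 *	sk_set_memalloc(transport_sk);     when the swapfile is activated
 *	...
 *	sk_clear_memalloc(transport_sk);   when the last swapfile goes away
 *
 * transport_sk is a placeholder for whatever socket the transport owns.
 */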
324
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 int ret;
328 unsigned long pflags = current->flags;
329
330 /* these should have been dropped before queueing */
331 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332
333 current->flags |= PF_MEMALLOC;
334 ret = sk->sk_backlog_rcv(sk, skb);
335 tsk_restore_flags(current, pflags, PF_MEMALLOC);
336
337 return ret;
338 }
339 EXPORT_SYMBOL(__sk_backlog_rcv);
340
341 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
342 {
343 struct timeval tv;
344
345 if (optlen < sizeof(tv))
346 return -EINVAL;
347 if (copy_from_user(&tv, optval, sizeof(tv)))
348 return -EFAULT;
349 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
350 return -EDOM;
351
352 if (tv.tv_sec < 0) {
353 static int warned __read_mostly;
354
355 *timeo_p = 0;
356 if (warned < 10 && net_ratelimit()) {
357 warned++;
358 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
359 __func__, current->comm, task_pid_nr(current));
360 }
361 return 0;
362 }
363 *timeo_p = MAX_SCHEDULE_TIMEOUT;
364 if (tv.tv_sec == 0 && tv.tv_usec == 0)
365 return 0;
366 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
367 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
368 return 0;
369 }
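/*
 * Conversion example (assuming HZ == 1000): a user timeout of
 * { .tv_sec = 2, .tv_usec = 500000 } becomes
 *
 *	2 * HZ + (500000 + 999) / 1000 = 2000 + 500 = 2500 jiffies,
 *
 * i.e. fractional microseconds are rounded up to the next tick. A zero
 * timeval maps to MAX_SCHEDULE_TIMEOUT ("wait forever"), matching the
 * code above.
 */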
370
371 static void sock_warn_obsolete_bsdism(const char *name)
372 {
373 static int warned;
374 static char warncomm[TASK_COMM_LEN];
375 if (strcmp(warncomm, current->comm) && warned < 5) {
376 strcpy(warncomm, current->comm);
377 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
378 warncomm, name);
379 warned++;
380 }
381 }
382
383 static bool sock_needs_netstamp(const struct sock *sk)
384 {
385 switch (sk->sk_family) {
386 case AF_UNSPEC:
387 case AF_UNIX:
388 return false;
389 default:
390 return true;
391 }
392 }
393
394 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
395 {
396 if (sk->sk_flags & flags) {
397 sk->sk_flags &= ~flags;
398 if (sock_needs_netstamp(sk) &&
399 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
400 net_disable_timestamp();
401 }
402 }
403
404
405 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
406 {
407 int err;
408 unsigned long flags;
409 struct sk_buff_head *list = &sk->sk_receive_queue;
410
411 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
412 atomic_inc(&sk->sk_drops);
413 trace_sock_rcvqueue_full(sk, skb);
414 return -ENOMEM;
415 }
416
417 err = sk_filter(sk, skb);
418 if (err)
419 return err;
420
421 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
422 atomic_inc(&sk->sk_drops);
423 return -ENOBUFS;
424 }
425
426 skb->dev = NULL;
427 skb_set_owner_r(skb, sk);
428
429 	/* We escape from the RCU-protected region here, so make sure we
430 	 * don't leak a non-refcounted dst.
431 	 */
432 skb_dst_force(skb);
433
434 spin_lock_irqsave(&list->lock, flags);
435 sock_skb_set_dropcount(sk, skb);
436 __skb_queue_tail(list, skb);
437 spin_unlock_irqrestore(&list->lock, flags);
438
439 if (!sock_flag(sk, SOCK_DEAD))
440 sk->sk_data_ready(sk);
441 return 0;
442 }
443 EXPORT_SYMBOL(sock_queue_rcv_skb);
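/*
 * Usage sketch (illustrative only): a protocol's packet handler typically
 * queues to the owning socket like this, freeing the skb itself on failure:
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 *	return NET_RX_SUCCESS;
 */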
444
445 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
446 {
447 int rc = NET_RX_SUCCESS;
448
449 if (sk_filter(sk, skb))
450 goto discard_and_relse;
451
452 skb->dev = NULL;
453
454 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
455 atomic_inc(&sk->sk_drops);
456 goto discard_and_relse;
457 }
458 if (nested)
459 bh_lock_sock_nested(sk);
460 else
461 bh_lock_sock(sk);
462 if (!sock_owned_by_user(sk)) {
463 /*
464 * trylock + unlock semantics:
465 */
466 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
467
468 rc = sk_backlog_rcv(sk, skb);
469
470 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
471 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
472 bh_unlock_sock(sk);
473 atomic_inc(&sk->sk_drops);
474 goto discard_and_relse;
475 }
476
477 bh_unlock_sock(sk);
478 out:
479 sock_put(sk);
480 return rc;
481 discard_and_relse:
482 kfree_skb(skb);
483 goto out;
484 }
485 EXPORT_SYMBOL(sk_receive_skb);
486
487 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
488 {
489 struct dst_entry *dst = __sk_dst_get(sk);
490
491 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
492 sk_tx_queue_clear(sk);
493 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
494 dst_release(dst);
495 return NULL;
496 }
497
498 return dst;
499 }
500 EXPORT_SYMBOL(__sk_dst_check);
501
502 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
503 {
504 struct dst_entry *dst = sk_dst_get(sk);
505
506 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
507 sk_dst_reset(sk);
508 dst_release(dst);
509 return NULL;
510 }
511
512 return dst;
513 }
514 EXPORT_SYMBOL(sk_dst_check);
515
516 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
517 int optlen)
518 {
519 int ret = -ENOPROTOOPT;
520 #ifdef CONFIG_NETDEVICES
521 struct net *net = sock_net(sk);
522 char devname[IFNAMSIZ];
523 int index;
524
525 /* Sorry... */
526 ret = -EPERM;
527 if (!ns_capable(net->user_ns, CAP_NET_RAW))
528 goto out;
529
530 ret = -EINVAL;
531 if (optlen < 0)
532 goto out;
533
534 /* Bind this socket to a particular device like "eth0",
535 * as specified in the passed interface name. If the
536 * name is "" or the option length is zero the socket
537 * is not bound.
538 */
539 if (optlen > IFNAMSIZ - 1)
540 optlen = IFNAMSIZ - 1;
541 memset(devname, 0, sizeof(devname));
542
543 ret = -EFAULT;
544 if (copy_from_user(devname, optval, optlen))
545 goto out;
546
547 index = 0;
548 if (devname[0] != '\0') {
549 struct net_device *dev;
550
551 rcu_read_lock();
552 dev = dev_get_by_name_rcu(net, devname);
553 if (dev)
554 index = dev->ifindex;
555 rcu_read_unlock();
556 ret = -ENODEV;
557 if (!dev)
558 goto out;
559 }
560
561 lock_sock(sk);
562 sk->sk_bound_dev_if = index;
563 sk_dst_reset(sk);
564 release_sock(sk);
565
566 ret = 0;
567
568 out:
569 #endif
570
571 return ret;
572 }
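/*
 * From user space (illustrative only; requires CAP_NET_RAW in the socket's
 * network namespace):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 * binds the socket to eth0, while passing an empty name or a zero option
 * length removes the binding again, as described above.
 */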
573
574 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
575 int __user *optlen, int len)
576 {
577 int ret = -ENOPROTOOPT;
578 #ifdef CONFIG_NETDEVICES
579 struct net *net = sock_net(sk);
580 char devname[IFNAMSIZ];
581
582 if (sk->sk_bound_dev_if == 0) {
583 len = 0;
584 goto zero;
585 }
586
587 ret = -EINVAL;
588 if (len < IFNAMSIZ)
589 goto out;
590
591 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
592 if (ret)
593 goto out;
594
595 len = strlen(devname) + 1;
596
597 ret = -EFAULT;
598 if (copy_to_user(optval, devname, len))
599 goto out;
600
601 zero:
602 ret = -EFAULT;
603 if (put_user(len, optlen))
604 goto out;
605
606 ret = 0;
607
608 out:
609 #endif
610
611 return ret;
612 }
613
614 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
615 {
616 if (valbool)
617 sock_set_flag(sk, bit);
618 else
619 sock_reset_flag(sk, bit);
620 }
621
622 bool sk_mc_loop(struct sock *sk)
623 {
624 if (dev_recursion_level())
625 return false;
626 if (!sk)
627 return true;
628 switch (sk->sk_family) {
629 case AF_INET:
630 return inet_sk(sk)->mc_loop;
631 #if IS_ENABLED(CONFIG_IPV6)
632 case AF_INET6:
633 return inet6_sk(sk)->mc_loop;
634 #endif
635 }
636 WARN_ON(1);
637 return true;
638 }
639 EXPORT_SYMBOL(sk_mc_loop);
640
641 /*
642 * This is meant for all protocols to use and covers goings on
643 * at the socket level. Everything here is generic.
644 */
645
646 int sock_setsockopt(struct socket *sock, int level, int optname,
647 char __user *optval, unsigned int optlen)
648 {
649 struct sock *sk = sock->sk;
650 int val;
651 int valbool;
652 struct linger ling;
653 int ret = 0;
654
655 /*
656 * Options without arguments
657 */
658
659 if (optname == SO_BINDTODEVICE)
660 return sock_setbindtodevice(sk, optval, optlen);
661
662 if (optlen < sizeof(int))
663 return -EINVAL;
664
665 if (get_user(val, (int __user *)optval))
666 return -EFAULT;
667
668 valbool = val ? 1 : 0;
669
670 lock_sock(sk);
671
672 switch (optname) {
673 case SO_DEBUG:
674 if (val && !capable(CAP_NET_ADMIN))
675 ret = -EACCES;
676 else
677 sock_valbool_flag(sk, SOCK_DBG, valbool);
678 break;
679 case SO_REUSEADDR:
680 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
681 break;
682 case SO_REUSEPORT:
683 sk->sk_reuseport = valbool;
684 break;
685 case SO_TYPE:
686 case SO_PROTOCOL:
687 case SO_DOMAIN:
688 case SO_ERROR:
689 ret = -ENOPROTOOPT;
690 break;
691 case SO_DONTROUTE:
692 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
693 break;
694 case SO_BROADCAST:
695 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
696 break;
697 case SO_SNDBUF:
698 		/* Don't return an error on this; BSD doesn't, and if you
699 		 * think about it this is right. Otherwise apps have to
700 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
701 		 * are treated in BSD as hints.
702 		 */
703 val = min_t(u32, val, sysctl_wmem_max);
704 set_sndbuf:
705 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
706 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
707 /* Wake up sending tasks if we upped the value. */
708 sk->sk_write_space(sk);
709 break;
710
711 case SO_SNDBUFFORCE:
712 if (!capable(CAP_NET_ADMIN)) {
713 ret = -EPERM;
714 break;
715 }
716 goto set_sndbuf;
717
718 case SO_RCVBUF:
719 		/* Don't return an error on this; BSD doesn't, and if you
720 		 * think about it this is right. Otherwise apps have to
721 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
722 		 * are treated in BSD as hints.
723 		 */
724 val = min_t(u32, val, sysctl_rmem_max);
725 set_rcvbuf:
726 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
727 /*
728 * We double it on the way in to account for
729 * "struct sk_buff" etc. overhead. Applications
730 * assume that the SO_RCVBUF setting they make will
731 * allow that much actual data to be received on that
732 * socket.
733 *
734 * Applications are unaware that "struct sk_buff" and
735 * other overheads allocate from the receive buffer
736 * during socket buffer allocation.
737 *
738 * And after considering the possible alternatives,
739 * returning the value we actually used in getsockopt
740 * is the most desirable behavior.
741 */
742 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
743 break;
744
745 case SO_RCVBUFFORCE:
746 if (!capable(CAP_NET_ADMIN)) {
747 ret = -EPERM;
748 break;
749 }
750 goto set_rcvbuf;
751
752 case SO_KEEPALIVE:
753 #ifdef CONFIG_INET
754 if (sk->sk_protocol == IPPROTO_TCP &&
755 sk->sk_type == SOCK_STREAM)
756 tcp_set_keepalive(sk, valbool);
757 #endif
758 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
759 break;
760
761 case SO_OOBINLINE:
762 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
763 break;
764
765 case SO_NO_CHECK:
766 sk->sk_no_check_tx = valbool;
767 break;
768
769 case SO_PRIORITY:
770 if ((val >= 0 && val <= 6) ||
771 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
772 sk->sk_priority = val;
773 else
774 ret = -EPERM;
775 break;
776
777 case SO_LINGER:
778 if (optlen < sizeof(ling)) {
779 ret = -EINVAL; /* 1003.1g */
780 break;
781 }
782 if (copy_from_user(&ling, optval, sizeof(ling))) {
783 ret = -EFAULT;
784 break;
785 }
786 if (!ling.l_onoff)
787 sock_reset_flag(sk, SOCK_LINGER);
788 else {
789 #if (BITS_PER_LONG == 32)
790 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
791 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
792 else
793 #endif
794 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
795 sock_set_flag(sk, SOCK_LINGER);
796 }
797 break;
798
799 case SO_BSDCOMPAT:
800 sock_warn_obsolete_bsdism("setsockopt");
801 break;
802
803 case SO_PASSCRED:
804 if (valbool)
805 set_bit(SOCK_PASSCRED, &sock->flags);
806 else
807 clear_bit(SOCK_PASSCRED, &sock->flags);
808 break;
809
810 case SO_TIMESTAMP:
811 case SO_TIMESTAMPNS:
812 if (valbool) {
813 if (optname == SO_TIMESTAMP)
814 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
815 else
816 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
817 sock_set_flag(sk, SOCK_RCVTSTAMP);
818 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
819 } else {
820 sock_reset_flag(sk, SOCK_RCVTSTAMP);
821 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
822 }
823 break;
824
825 case SO_TIMESTAMPING:
826 if (val & ~SOF_TIMESTAMPING_MASK) {
827 ret = -EINVAL;
828 break;
829 }
830
831 if (val & SOF_TIMESTAMPING_OPT_ID &&
832 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
833 if (sk->sk_protocol == IPPROTO_TCP &&
834 sk->sk_type == SOCK_STREAM) {
835 if (sk->sk_state != TCP_ESTABLISHED) {
836 ret = -EINVAL;
837 break;
838 }
839 sk->sk_tskey = tcp_sk(sk)->snd_una;
840 } else {
841 sk->sk_tskey = 0;
842 }
843 }
844 sk->sk_tsflags = val;
845 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
846 sock_enable_timestamp(sk,
847 SOCK_TIMESTAMPING_RX_SOFTWARE);
848 else
849 sock_disable_timestamp(sk,
850 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
851 break;
852
853 case SO_RCVLOWAT:
854 if (val < 0)
855 val = INT_MAX;
856 sk->sk_rcvlowat = val ? : 1;
857 break;
858
859 case SO_RCVTIMEO:
860 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
861 break;
862
863 case SO_SNDTIMEO:
864 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
865 break;
866
867 case SO_ATTACH_FILTER:
868 ret = -EINVAL;
869 if (optlen == sizeof(struct sock_fprog)) {
870 struct sock_fprog fprog;
871
872 ret = -EFAULT;
873 if (copy_from_user(&fprog, optval, sizeof(fprog)))
874 break;
875
876 ret = sk_attach_filter(&fprog, sk);
877 }
878 break;
879
880 case SO_ATTACH_BPF:
881 ret = -EINVAL;
882 if (optlen == sizeof(u32)) {
883 u32 ufd;
884
885 ret = -EFAULT;
886 if (copy_from_user(&ufd, optval, sizeof(ufd)))
887 break;
888
889 ret = sk_attach_bpf(ufd, sk);
890 }
891 break;
892
893 case SO_ATTACH_REUSEPORT_CBPF:
894 ret = -EINVAL;
895 if (optlen == sizeof(struct sock_fprog)) {
896 struct sock_fprog fprog;
897
898 ret = -EFAULT;
899 if (copy_from_user(&fprog, optval, sizeof(fprog)))
900 break;
901
902 ret = sk_reuseport_attach_filter(&fprog, sk);
903 }
904 break;
905
906 case SO_ATTACH_REUSEPORT_EBPF:
907 ret = -EINVAL;
908 if (optlen == sizeof(u32)) {
909 u32 ufd;
910
911 ret = -EFAULT;
912 if (copy_from_user(&ufd, optval, sizeof(ufd)))
913 break;
914
915 ret = sk_reuseport_attach_bpf(ufd, sk);
916 }
917 break;
918
919 case SO_DETACH_FILTER:
920 ret = sk_detach_filter(sk);
921 break;
922
923 case SO_LOCK_FILTER:
924 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
925 ret = -EPERM;
926 else
927 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
928 break;
929
930 case SO_PASSSEC:
931 if (valbool)
932 set_bit(SOCK_PASSSEC, &sock->flags);
933 else
934 clear_bit(SOCK_PASSSEC, &sock->flags);
935 break;
936 case SO_MARK:
937 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
938 ret = -EPERM;
939 else
940 sk->sk_mark = val;
941 break;
942
943 case SO_RXQ_OVFL:
944 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
945 break;
946
947 case SO_WIFI_STATUS:
948 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
949 break;
950
951 case SO_PEEK_OFF:
952 if (sock->ops->set_peek_off)
953 ret = sock->ops->set_peek_off(sk, val);
954 else
955 ret = -EOPNOTSUPP;
956 break;
957
958 case SO_NOFCS:
959 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
960 break;
961
962 case SO_SELECT_ERR_QUEUE:
963 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
964 break;
965
966 #ifdef CONFIG_NET_RX_BUSY_POLL
967 case SO_BUSY_POLL:
968 /* allow unprivileged users to decrease the value */
969 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
970 ret = -EPERM;
971 else {
972 if (val < 0)
973 ret = -EINVAL;
974 else
975 sk->sk_ll_usec = val;
976 }
977 break;
978 #endif
979
980 case SO_MAX_PACING_RATE:
981 sk->sk_max_pacing_rate = val;
982 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
983 sk->sk_max_pacing_rate);
984 break;
985
986 case SO_INCOMING_CPU:
987 sk->sk_incoming_cpu = val;
988 break;
989
990 default:
991 ret = -ENOPROTOOPT;
992 break;
993 }
994 release_sock(sk);
995 return ret;
996 }
997 EXPORT_SYMBOL(sock_setsockopt);
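/*
 * User-space view of two of the options above (illustrative only, and
 * assuming net.core.rmem_max is at least 64 KiB):
 *
 *	int val = 65536;
 *	struct linger ling = { .l_onoff = 1, .l_linger = 5 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
 *
 * After the SO_RCVBUF call, getsockopt(SO_RCVBUF) reports 131072, i.e.
 * twice the requested value, because of the doubling explained in the
 * SO_RCVBUF case. The SO_LINGER call makes close() linger for up to five
 * seconds while unsent data drains.
 */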
998
999
1000 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1001 struct ucred *ucred)
1002 {
1003 ucred->pid = pid_vnr(pid);
1004 ucred->uid = ucred->gid = -1;
1005 if (cred) {
1006 struct user_namespace *current_ns = current_user_ns();
1007
1008 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1009 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1010 }
1011 }
1012
1013 int sock_getsockopt(struct socket *sock, int level, int optname,
1014 char __user *optval, int __user *optlen)
1015 {
1016 struct sock *sk = sock->sk;
1017
1018 union {
1019 int val;
1020 struct linger ling;
1021 struct timeval tm;
1022 } v;
1023
1024 int lv = sizeof(int);
1025 int len;
1026
1027 if (get_user(len, optlen))
1028 return -EFAULT;
1029 if (len < 0)
1030 return -EINVAL;
1031
1032 memset(&v, 0, sizeof(v));
1033
1034 switch (optname) {
1035 case SO_DEBUG:
1036 v.val = sock_flag(sk, SOCK_DBG);
1037 break;
1038
1039 case SO_DONTROUTE:
1040 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1041 break;
1042
1043 case SO_BROADCAST:
1044 v.val = sock_flag(sk, SOCK_BROADCAST);
1045 break;
1046
1047 case SO_SNDBUF:
1048 v.val = sk->sk_sndbuf;
1049 break;
1050
1051 case SO_RCVBUF:
1052 v.val = sk->sk_rcvbuf;
1053 break;
1054
1055 case SO_REUSEADDR:
1056 v.val = sk->sk_reuse;
1057 break;
1058
1059 case SO_REUSEPORT:
1060 v.val = sk->sk_reuseport;
1061 break;
1062
1063 case SO_KEEPALIVE:
1064 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1065 break;
1066
1067 case SO_TYPE:
1068 v.val = sk->sk_type;
1069 break;
1070
1071 case SO_PROTOCOL:
1072 v.val = sk->sk_protocol;
1073 break;
1074
1075 case SO_DOMAIN:
1076 v.val = sk->sk_family;
1077 break;
1078
1079 case SO_ERROR:
1080 v.val = -sock_error(sk);
1081 if (v.val == 0)
1082 v.val = xchg(&sk->sk_err_soft, 0);
1083 break;
1084
1085 case SO_OOBINLINE:
1086 v.val = sock_flag(sk, SOCK_URGINLINE);
1087 break;
1088
1089 case SO_NO_CHECK:
1090 v.val = sk->sk_no_check_tx;
1091 break;
1092
1093 case SO_PRIORITY:
1094 v.val = sk->sk_priority;
1095 break;
1096
1097 case SO_LINGER:
1098 lv = sizeof(v.ling);
1099 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1100 v.ling.l_linger = sk->sk_lingertime / HZ;
1101 break;
1102
1103 case SO_BSDCOMPAT:
1104 sock_warn_obsolete_bsdism("getsockopt");
1105 break;
1106
1107 case SO_TIMESTAMP:
1108 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1109 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1110 break;
1111
1112 case SO_TIMESTAMPNS:
1113 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1114 break;
1115
1116 case SO_TIMESTAMPING:
1117 v.val = sk->sk_tsflags;
1118 break;
1119
1120 case SO_RCVTIMEO:
1121 lv = sizeof(struct timeval);
1122 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1123 v.tm.tv_sec = 0;
1124 v.tm.tv_usec = 0;
1125 } else {
1126 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1127 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1128 }
1129 break;
1130
1131 case SO_SNDTIMEO:
1132 lv = sizeof(struct timeval);
1133 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1134 v.tm.tv_sec = 0;
1135 v.tm.tv_usec = 0;
1136 } else {
1137 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1138 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1139 }
1140 break;
1141
1142 case SO_RCVLOWAT:
1143 v.val = sk->sk_rcvlowat;
1144 break;
1145
1146 case SO_SNDLOWAT:
1147 v.val = 1;
1148 break;
1149
1150 case SO_PASSCRED:
1151 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1152 break;
1153
1154 case SO_PEERCRED:
1155 {
1156 struct ucred peercred;
1157 if (len > sizeof(peercred))
1158 len = sizeof(peercred);
1159 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1160 if (copy_to_user(optval, &peercred, len))
1161 return -EFAULT;
1162 goto lenout;
1163 }
1164
1165 case SO_PEERNAME:
1166 {
1167 char address[128];
1168
1169 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1170 return -ENOTCONN;
1171 if (lv < len)
1172 return -EINVAL;
1173 if (copy_to_user(optval, address, len))
1174 return -EFAULT;
1175 goto lenout;
1176 }
1177
1178 /* Dubious BSD thing... Probably nobody even uses it, but
1179 * the UNIX standard wants it for whatever reason... -DaveM
1180 */
1181 case SO_ACCEPTCONN:
1182 v.val = sk->sk_state == TCP_LISTEN;
1183 break;
1184
1185 case SO_PASSSEC:
1186 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1187 break;
1188
1189 case SO_PEERSEC:
1190 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1191
1192 case SO_MARK:
1193 v.val = sk->sk_mark;
1194 break;
1195
1196 case SO_RXQ_OVFL:
1197 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1198 break;
1199
1200 case SO_WIFI_STATUS:
1201 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1202 break;
1203
1204 case SO_PEEK_OFF:
1205 if (!sock->ops->set_peek_off)
1206 return -EOPNOTSUPP;
1207
1208 v.val = sk->sk_peek_off;
1209 break;
1210 case SO_NOFCS:
1211 v.val = sock_flag(sk, SOCK_NOFCS);
1212 break;
1213
1214 case SO_BINDTODEVICE:
1215 return sock_getbindtodevice(sk, optval, optlen, len);
1216
1217 case SO_GET_FILTER:
1218 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1219 if (len < 0)
1220 return len;
1221
1222 goto lenout;
1223
1224 case SO_LOCK_FILTER:
1225 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1226 break;
1227
1228 case SO_BPF_EXTENSIONS:
1229 v.val = bpf_tell_extensions();
1230 break;
1231
1232 case SO_SELECT_ERR_QUEUE:
1233 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1234 break;
1235
1236 #ifdef CONFIG_NET_RX_BUSY_POLL
1237 case SO_BUSY_POLL:
1238 v.val = sk->sk_ll_usec;
1239 break;
1240 #endif
1241
1242 case SO_MAX_PACING_RATE:
1243 v.val = sk->sk_max_pacing_rate;
1244 break;
1245
1246 case SO_INCOMING_CPU:
1247 v.val = sk->sk_incoming_cpu;
1248 break;
1249
1250 default:
1251 /* We implement the SO_SNDLOWAT etc to not be settable
1252 * (1003.1g 7).
1253 */
1254 return -ENOPROTOOPT;
1255 }
1256
1257 if (len > lv)
1258 len = lv;
1259 if (copy_to_user(optval, &v, len))
1260 return -EFAULT;
1261 lenout:
1262 if (put_user(len, optlen))
1263 return -EFAULT;
1264 return 0;
1265 }
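/*
 * User-space view (illustrative only): on a connected AF_UNIX socket the
 * SO_PEERCRED case above is what serves
 *
 *	struct ucred cr;
 *	socklen_t len = sizeof(cr);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len);
 *
 * returning the peer's pid/uid/gid as translated into the caller's pid and
 * user namespaces by cred_to_ucred().
 */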
1266
1267 /*
1268 * Initialize an sk_lock.
1269 *
1270 * (We also register the sk_lock with the lock validator.)
1271 */
1272 static inline void sock_lock_init(struct sock *sk)
1273 {
1274 sock_lock_init_class_and_name(sk,
1275 af_family_slock_key_strings[sk->sk_family],
1276 af_family_slock_keys + sk->sk_family,
1277 af_family_key_strings[sk->sk_family],
1278 af_family_keys + sk->sk_family);
1279 }
1280
1281 /*
1282  * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
1283  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1284  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1285 */
1286 static void sock_copy(struct sock *nsk, const struct sock *osk)
1287 {
1288 #ifdef CONFIG_SECURITY_NETWORK
1289 void *sptr = nsk->sk_security;
1290 #endif
1291 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1292
1293 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1294 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1295
1296 #ifdef CONFIG_SECURITY_NETWORK
1297 nsk->sk_security = sptr;
1298 security_sk_clone(osk, nsk);
1299 #endif
1300 }
1301
1302 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1303 {
1304 unsigned long nulls1, nulls2;
1305
1306 nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1307 nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1308 if (nulls1 > nulls2)
1309 swap(nulls1, nulls2);
1310
1311 if (nulls1 != 0)
1312 memset((char *)sk, 0, nulls1);
1313 memset((char *)sk + nulls1 + sizeof(void *), 0,
1314 nulls2 - nulls1 - sizeof(void *));
1315 memset((char *)sk + nulls2 + sizeof(void *), 0,
1316 size - nulls2 - sizeof(void *));
1317 }
1318 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1319
1320 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1321 int family)
1322 {
1323 struct sock *sk;
1324 struct kmem_cache *slab;
1325
1326 slab = prot->slab;
1327 if (slab != NULL) {
1328 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1329 if (!sk)
1330 return sk;
1331 if (priority & __GFP_ZERO) {
1332 if (prot->clear_sk)
1333 prot->clear_sk(sk, prot->obj_size);
1334 else
1335 sk_prot_clear_nulls(sk, prot->obj_size);
1336 }
1337 } else
1338 sk = kmalloc(prot->obj_size, priority);
1339
1340 if (sk != NULL) {
1341 kmemcheck_annotate_bitfield(sk, flags);
1342
1343 if (security_sk_alloc(sk, family, priority))
1344 goto out_free;
1345
1346 if (!try_module_get(prot->owner))
1347 goto out_free_sec;
1348 sk_tx_queue_clear(sk);
1349 cgroup_sk_alloc(&sk->sk_cgrp_data);
1350 }
1351
1352 return sk;
1353
1354 out_free_sec:
1355 security_sk_free(sk);
1356 out_free:
1357 if (slab != NULL)
1358 kmem_cache_free(slab, sk);
1359 else
1360 kfree(sk);
1361 return NULL;
1362 }
1363
1364 static void sk_prot_free(struct proto *prot, struct sock *sk)
1365 {
1366 struct kmem_cache *slab;
1367 struct module *owner;
1368
1369 owner = prot->owner;
1370 slab = prot->slab;
1371
1372 cgroup_sk_free(&sk->sk_cgrp_data);
1373 security_sk_free(sk);
1374 if (slab != NULL)
1375 kmem_cache_free(slab, sk);
1376 else
1377 kfree(sk);
1378 module_put(owner);
1379 }
1380
1381 /**
1382 * sk_alloc - All socket objects are allocated here
1383 * @net: the applicable net namespace
1384 * @family: protocol family
1385 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1386 * @prot: struct proto associated with this new sock instance
1387 * @kern: is this to be a kernel socket?
1388 */
1389 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1390 struct proto *prot, int kern)
1391 {
1392 struct sock *sk;
1393
1394 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1395 if (sk) {
1396 sk->sk_family = family;
1397 /*
1398 * See comment in struct sock definition to understand
1399 * why we need sk_prot_creator -acme
1400 */
1401 sk->sk_prot = sk->sk_prot_creator = prot;
1402 sock_lock_init(sk);
1403 sk->sk_net_refcnt = kern ? 0 : 1;
1404 if (likely(sk->sk_net_refcnt))
1405 get_net(net);
1406 sock_net_set(sk, net);
1407 atomic_set(&sk->sk_wmem_alloc, 1);
1408
1409 sock_update_classid(&sk->sk_cgrp_data);
1410 sock_update_netprioidx(&sk->sk_cgrp_data);
1411 }
1412
1413 return sk;
1414 }
1415 EXPORT_SYMBOL(sk_alloc);
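/*
 * Usage sketch (illustrative only): an address family's ->create() handler
 * typically pairs sk_alloc() with sock_init_data(), roughly:
 *
 *	sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * where PF_FOO and foo_proto stand in for the family's own constants and
 * struct proto. sk_free() is the matching release path.
 */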
1416
1417 void sk_destruct(struct sock *sk)
1418 {
1419 struct sk_filter *filter;
1420
1421 if (sk->sk_destruct)
1422 sk->sk_destruct(sk);
1423
1424 filter = rcu_dereference_check(sk->sk_filter,
1425 atomic_read(&sk->sk_wmem_alloc) == 0);
1426 if (filter) {
1427 sk_filter_uncharge(sk, filter);
1428 RCU_INIT_POINTER(sk->sk_filter, NULL);
1429 }
1430 if (rcu_access_pointer(sk->sk_reuseport_cb))
1431 reuseport_detach_sock(sk);
1432
1433 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1434
1435 if (atomic_read(&sk->sk_omem_alloc))
1436 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1437 __func__, atomic_read(&sk->sk_omem_alloc));
1438
1439 if (sk->sk_peer_cred)
1440 put_cred(sk->sk_peer_cred);
1441 put_pid(sk->sk_peer_pid);
1442 if (likely(sk->sk_net_refcnt))
1443 put_net(sock_net(sk));
1444 sk_prot_free(sk->sk_prot_creator, sk);
1445 }
1446
1447 static void __sk_free(struct sock *sk)
1448 {
1449 if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1450 sock_diag_broadcast_destroy(sk);
1451 else
1452 sk_destruct(sk);
1453 }
1454
1455 void sk_free(struct sock *sk)
1456 {
1457 /*
1458 	 * We subtract one from sk_wmem_alloc, and from the result we can tell
1459 	 * whether some packets are still in some tx queue.
1460 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1461 */
1462 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1463 __sk_free(sk);
1464 }
1465 EXPORT_SYMBOL(sk_free);
1466
1467 /**
1468 * sk_clone_lock - clone a socket, and lock its clone
1469 * @sk: the socket to clone
1470 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1471 *
1472 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1473 */
1474 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1475 {
1476 struct sock *newsk;
1477 bool is_charged = true;
1478
1479 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1480 if (newsk != NULL) {
1481 struct sk_filter *filter;
1482
1483 sock_copy(newsk, sk);
1484
1485 /* SANITY */
1486 if (likely(newsk->sk_net_refcnt))
1487 get_net(sock_net(newsk));
1488 sk_node_init(&newsk->sk_node);
1489 sock_lock_init(newsk);
1490 bh_lock_sock(newsk);
1491 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1492 newsk->sk_backlog.len = 0;
1493
1494 atomic_set(&newsk->sk_rmem_alloc, 0);
1495 /*
1496 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1497 */
1498 atomic_set(&newsk->sk_wmem_alloc, 1);
1499 atomic_set(&newsk->sk_omem_alloc, 0);
1500 skb_queue_head_init(&newsk->sk_receive_queue);
1501 skb_queue_head_init(&newsk->sk_write_queue);
1502
1503 rwlock_init(&newsk->sk_callback_lock);
1504 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1505 af_callback_keys + newsk->sk_family,
1506 af_family_clock_key_strings[newsk->sk_family]);
1507
1508 newsk->sk_dst_cache = NULL;
1509 newsk->sk_wmem_queued = 0;
1510 newsk->sk_forward_alloc = 0;
1511 newsk->sk_send_head = NULL;
1512 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1513
1514 sock_reset_flag(newsk, SOCK_DONE);
1515 skb_queue_head_init(&newsk->sk_error_queue);
1516
1517 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1518 if (filter != NULL)
1519 			/* Though it's an empty new sock, the charging may fail
1520 			 * if sysctl_optmem_max was changed between creation of the
1521 			 * original socket and cloning.
1522 			 */
1523 is_charged = sk_filter_charge(newsk, filter);
1524
1525 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1526 			/* It is still a raw copy of the parent, so invalidate
1527 			 * the destructor and do a plain sk_free(). */
1528 newsk->sk_destruct = NULL;
1529 bh_unlock_sock(newsk);
1530 sk_free(newsk);
1531 newsk = NULL;
1532 goto out;
1533 }
1534
1535 newsk->sk_err = 0;
1536 newsk->sk_priority = 0;
1537 newsk->sk_incoming_cpu = raw_smp_processor_id();
1538 atomic64_set(&newsk->sk_cookie, 0);
1539 /*
1540 * Before updating sk_refcnt, we must commit prior changes to memory
1541 * (Documentation/RCU/rculist_nulls.txt for details)
1542 */
1543 smp_wmb();
1544 atomic_set(&newsk->sk_refcnt, 2);
1545
1546 /*
1547 * Increment the counter in the same struct proto as the master
1548 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1549 * is the same as sk->sk_prot->socks, as this field was copied
1550 * with memcpy).
1551 *
1552 * This _changes_ the previous behaviour, where
1553 * tcp_create_openreq_child always was incrementing the
1554 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1555 * to be taken into account in all callers. -acme
1556 */
1557 sk_refcnt_debug_inc(newsk);
1558 sk_set_socket(newsk, NULL);
1559 newsk->sk_wq = NULL;
1560
1561 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1562 sock_update_memcg(newsk);
1563
1564 if (newsk->sk_prot->sockets_allocated)
1565 sk_sockets_allocated_inc(newsk);
1566
1567 if (sock_needs_netstamp(sk) &&
1568 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1569 net_enable_timestamp();
1570 }
1571 out:
1572 return newsk;
1573 }
1574 EXPORT_SYMBOL_GPL(sk_clone_lock);
1575
1576 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1577 {
1578 u32 max_segs = 1;
1579
1580 sk_dst_set(sk, dst);
1581 sk->sk_route_caps = dst->dev->features;
1582 if (sk->sk_route_caps & NETIF_F_GSO)
1583 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1584 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1585 if (sk_can_gso(sk)) {
1586 if (dst->header_len) {
1587 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1588 } else {
1589 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1590 sk->sk_gso_max_size = dst->dev->gso_max_size;
1591 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1592 }
1593 }
1594 sk->sk_gso_max_segs = max_segs;
1595 }
1596 EXPORT_SYMBOL_GPL(sk_setup_caps);
1597
1598 /*
1599 * Simple resource managers for sockets.
1600 */
1601
1602
1603 /*
1604 * Write buffer destructor automatically called from kfree_skb.
1605 */
1606 void sock_wfree(struct sk_buff *skb)
1607 {
1608 struct sock *sk = skb->sk;
1609 unsigned int len = skb->truesize;
1610
1611 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1612 /*
1613 		 * Keep a reference on sk_wmem_alloc; it will be released
1614 		 * after the sk_write_space() call.
1615 */
1616 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1617 sk->sk_write_space(sk);
1618 len = 1;
1619 }
1620 /*
1621 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1622 * could not do because of in-flight packets
1623 */
1624 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1625 __sk_free(sk);
1626 }
1627 EXPORT_SYMBOL(sock_wfree);
1628
1629 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1630 {
1631 skb_orphan(skb);
1632 skb->sk = sk;
1633 #ifdef CONFIG_INET
1634 if (unlikely(!sk_fullsock(sk))) {
1635 skb->destructor = sock_edemux;
1636 sock_hold(sk);
1637 return;
1638 }
1639 #endif
1640 skb->destructor = sock_wfree;
1641 skb_set_hash_from_sk(skb, sk);
1642 /*
1643 	 * We used to take a refcount on sk, but the following operation
1644 	 * is enough to guarantee sk_free() won't free this sock until
1645 	 * all in-flight packets are completed.
1646 */
1647 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1648 }
1649 EXPORT_SYMBOL(skb_set_owner_w);
1650
1651 void skb_orphan_partial(struct sk_buff *skb)
1652 {
1653 /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1654 	 * so we do not completely orphan the skb, but transfer all
1655 	 * accounted bytes but one, to avoid unexpected reorders.
1656 */
1657 if (skb->destructor == sock_wfree
1658 #ifdef CONFIG_INET
1659 || skb->destructor == tcp_wfree
1660 #endif
1661 ) {
1662 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1663 skb->truesize = 1;
1664 } else {
1665 skb_orphan(skb);
1666 }
1667 }
1668 EXPORT_SYMBOL(skb_orphan_partial);
1669
1670 /*
1671 * Read buffer destructor automatically called from kfree_skb.
1672 */
1673 void sock_rfree(struct sk_buff *skb)
1674 {
1675 struct sock *sk = skb->sk;
1676 unsigned int len = skb->truesize;
1677
1678 atomic_sub(len, &sk->sk_rmem_alloc);
1679 sk_mem_uncharge(sk, len);
1680 }
1681 EXPORT_SYMBOL(sock_rfree);
1682
1683 /*
1684 * Buffer destructor for skbs that are not used directly in read or write
1685 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1686 */
1687 void sock_efree(struct sk_buff *skb)
1688 {
1689 sock_put(skb->sk);
1690 }
1691 EXPORT_SYMBOL(sock_efree);
1692
1693 kuid_t sock_i_uid(struct sock *sk)
1694 {
1695 kuid_t uid;
1696
1697 read_lock_bh(&sk->sk_callback_lock);
1698 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1699 read_unlock_bh(&sk->sk_callback_lock);
1700 return uid;
1701 }
1702 EXPORT_SYMBOL(sock_i_uid);
1703
1704 unsigned long sock_i_ino(struct sock *sk)
1705 {
1706 unsigned long ino;
1707
1708 read_lock_bh(&sk->sk_callback_lock);
1709 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1710 read_unlock_bh(&sk->sk_callback_lock);
1711 return ino;
1712 }
1713 EXPORT_SYMBOL(sock_i_ino);
1714
1715 /*
1716 * Allocate a skb from the socket's send buffer.
1717 */
1718 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1719 gfp_t priority)
1720 {
1721 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1722 struct sk_buff *skb = alloc_skb(size, priority);
1723 if (skb) {
1724 skb_set_owner_w(skb, sk);
1725 return skb;
1726 }
1727 }
1728 return NULL;
1729 }
1730 EXPORT_SYMBOL(sock_wmalloc);
1731
1732 /*
1733 * Allocate a memory block from the socket's option memory buffer.
1734 */
1735 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1736 {
1737 if ((unsigned int)size <= sysctl_optmem_max &&
1738 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1739 void *mem;
1740 /* First do the add, to avoid the race if kmalloc
1741 * might sleep.
1742 */
1743 atomic_add(size, &sk->sk_omem_alloc);
1744 mem = kmalloc(size, priority);
1745 if (mem)
1746 return mem;
1747 atomic_sub(size, &sk->sk_omem_alloc);
1748 }
1749 return NULL;
1750 }
1751 EXPORT_SYMBOL(sock_kmalloc);
1752
1753 /* Free an option memory block. Note, we actually want the inline
1754 * here as this allows gcc to detect the nullify and fold away the
1755 * condition entirely.
1756 */
1757 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1758 const bool nullify)
1759 {
1760 if (WARN_ON_ONCE(!mem))
1761 return;
1762 if (nullify)
1763 kzfree(mem);
1764 else
1765 kfree(mem);
1766 atomic_sub(size, &sk->sk_omem_alloc);
1767 }
1768
1769 void sock_kfree_s(struct sock *sk, void *mem, int size)
1770 {
1771 __sock_kfree_s(sk, mem, size, false);
1772 }
1773 EXPORT_SYMBOL(sock_kfree_s);
1774
1775 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1776 {
1777 __sock_kfree_s(sk, mem, size, true);
1778 }
1779 EXPORT_SYMBOL(sock_kzfree_s);
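/*
 * Usage sketch (illustrative only): per-socket option data is usually
 * allocated and released in matched pairs so that sk_omem_alloc stays
 * balanced:
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 *
 * sock_kzfree_s() is the variant to use when the buffer held key material
 * or other data that should be zeroed before being freed.
 */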
1780
1781 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1782    I think these locks should be removed for datagram sockets.
1783 */
1784 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1785 {
1786 DEFINE_WAIT(wait);
1787
1788 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1789 for (;;) {
1790 if (!timeo)
1791 break;
1792 if (signal_pending(current))
1793 break;
1794 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1795 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1796 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1797 break;
1798 if (sk->sk_shutdown & SEND_SHUTDOWN)
1799 break;
1800 if (sk->sk_err)
1801 break;
1802 timeo = schedule_timeout(timeo);
1803 }
1804 finish_wait(sk_sleep(sk), &wait);
1805 return timeo;
1806 }
1807
1808
1809 /*
1810 * Generic send/receive buffer handlers
1811 */
1812
1813 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1814 unsigned long data_len, int noblock,
1815 int *errcode, int max_page_order)
1816 {
1817 struct sk_buff *skb;
1818 long timeo;
1819 int err;
1820
1821 timeo = sock_sndtimeo(sk, noblock);
1822 for (;;) {
1823 err = sock_error(sk);
1824 if (err != 0)
1825 goto failure;
1826
1827 err = -EPIPE;
1828 if (sk->sk_shutdown & SEND_SHUTDOWN)
1829 goto failure;
1830
1831 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1832 break;
1833
1834 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1835 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1836 err = -EAGAIN;
1837 if (!timeo)
1838 goto failure;
1839 if (signal_pending(current))
1840 goto interrupted;
1841 timeo = sock_wait_for_wmem(sk, timeo);
1842 }
1843 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1844 errcode, sk->sk_allocation);
1845 if (skb)
1846 skb_set_owner_w(skb, sk);
1847 return skb;
1848
1849 interrupted:
1850 err = sock_intr_errno(timeo);
1851 failure:
1852 *errcode = err;
1853 return NULL;
1854 }
1855 EXPORT_SYMBOL(sock_alloc_send_pskb);
1856
1857 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1858 int noblock, int *errcode)
1859 {
1860 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1861 }
1862 EXPORT_SYMBOL(sock_alloc_send_skb);
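/*
 * Usage sketch (illustrative only): a datagram sendmsg() implementation
 * typically allocates its output buffer like this, honouring MSG_DONTWAIT:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out_err;
 *
 * where hlen is whatever headroom the protocol needs in front of the len
 * bytes of payload.
 */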
1863
1864 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1865 struct sockcm_cookie *sockc)
1866 {
1867 struct cmsghdr *cmsg;
1868
1869 for_each_cmsghdr(cmsg, msg) {
1870 if (!CMSG_OK(msg, cmsg))
1871 return -EINVAL;
1872 if (cmsg->cmsg_level != SOL_SOCKET)
1873 continue;
1874 switch (cmsg->cmsg_type) {
1875 case SO_MARK:
1876 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1877 return -EPERM;
1878 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1879 return -EINVAL;
1880 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1881 break;
1882 default:
1883 return -EINVAL;
1884 }
1885 }
1886 return 0;
1887 }
1888 EXPORT_SYMBOL(sock_cmsg_send);
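/*
 * User-space view (illustrative only; requires CAP_NET_ADMIN): given a
 * struct msghdr msg that already describes the payload, a caller can attach
 * a per-packet mark via an SO_MARK control message on a protocol that feeds
 * its control messages through sock_cmsg_send():
 *
 *	char cbuf[CMSG_SPACE(sizeof(uint32_t))] = { 0 };
 *	struct cmsghdr *cm;
 *
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	cm = CMSG_FIRSTHDR(&msg);
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SO_MARK;
 *	cm->cmsg_len = CMSG_LEN(sizeof(uint32_t));
 *	*(uint32_t *)CMSG_DATA(cm) = 42;
 */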
1889
1890 /* On 32bit arches, an skb frag is limited to 2^15 */
1891 #define SKB_FRAG_PAGE_ORDER get_order(32768)
1892
1893 /**
1894 * skb_page_frag_refill - check that a page_frag contains enough room
1895 * @sz: minimum size of the fragment we want to get
1896 * @pfrag: pointer to page_frag
1897 * @gfp: priority for memory allocation
1898 *
1899 * Note: While this allocator tries to use high order pages, there is
1900 * no guarantee that allocations succeed. Therefore, @sz MUST be
1901  *	less than or equal to PAGE_SIZE.
1902 */
1903 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1904 {
1905 if (pfrag->page) {
1906 if (page_ref_count(pfrag->page) == 1) {
1907 pfrag->offset = 0;
1908 return true;
1909 }
1910 if (pfrag->offset + sz <= pfrag->size)
1911 return true;
1912 put_page(pfrag->page);
1913 }
1914
1915 pfrag->offset = 0;
1916 if (SKB_FRAG_PAGE_ORDER) {
1917 /* Avoid direct reclaim but allow kswapd to wake */
1918 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1919 __GFP_COMP | __GFP_NOWARN |
1920 __GFP_NORETRY,
1921 SKB_FRAG_PAGE_ORDER);
1922 if (likely(pfrag->page)) {
1923 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1924 return true;
1925 }
1926 }
1927 pfrag->page = alloc_page(gfp);
1928 if (likely(pfrag->page)) {
1929 pfrag->size = PAGE_SIZE;
1930 return true;
1931 }
1932 return false;
1933 }
1934 EXPORT_SYMBOL(skb_page_frag_refill);
1935
1936 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1937 {
1938 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1939 return true;
1940
1941 sk_enter_memory_pressure(sk);
1942 sk_stream_moderate_sndbuf(sk);
1943 return false;
1944 }
1945 EXPORT_SYMBOL(sk_page_frag_refill);
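/*
 * Usage sketch (illustrative only): a stream sendmsg() path refills the
 * per-socket page fragment before copying user data into it, roughly:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	... copy user data into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 */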
1946
1947 static void __lock_sock(struct sock *sk)
1948 __releases(&sk->sk_lock.slock)
1949 __acquires(&sk->sk_lock.slock)
1950 {
1951 DEFINE_WAIT(wait);
1952
1953 for (;;) {
1954 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1955 TASK_UNINTERRUPTIBLE);
1956 spin_unlock_bh(&sk->sk_lock.slock);
1957 schedule();
1958 spin_lock_bh(&sk->sk_lock.slock);
1959 if (!sock_owned_by_user(sk))
1960 break;
1961 }
1962 finish_wait(&sk->sk_lock.wq, &wait);
1963 }
1964
1965 static void __release_sock(struct sock *sk)
1966 __releases(&sk->sk_lock.slock)
1967 __acquires(&sk->sk_lock.slock)
1968 {
1969 struct sk_buff *skb = sk->sk_backlog.head;
1970
1971 do {
1972 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1973 bh_unlock_sock(sk);
1974
1975 do {
1976 struct sk_buff *next = skb->next;
1977
1978 prefetch(next);
1979 WARN_ON_ONCE(skb_dst_is_noref(skb));
1980 skb->next = NULL;
1981 sk_backlog_rcv(sk, skb);
1982
1983 /*
1984 			 * We are in process context here with softirqs
1985 			 * disabled, so use cond_resched_softirq() to reschedule when needed.
1986 * This is safe to do because we've taken the backlog
1987 * queue private:
1988 */
1989 cond_resched_softirq();
1990
1991 skb = next;
1992 } while (skb != NULL);
1993
1994 bh_lock_sock(sk);
1995 } while ((skb = sk->sk_backlog.head) != NULL);
1996
1997 /*
1998 	 * Doing the zeroing here guarantees we cannot loop forever
1999 * while a wild producer attempts to flood us.
2000 */
2001 sk->sk_backlog.len = 0;
2002 }
2003
2004 /**
2005 * sk_wait_data - wait for data to arrive at sk_receive_queue
2006 * @sk: sock to wait on
2007 * @timeo: for how long
2008 * @skb: last skb seen on sk_receive_queue
2009 *
2010  * Socket state, including sk->sk_err, is now changed only under the lock,
2011  * hence we may omit checks after joining the wait queue.
2012  * We check the receive queue before schedule() only as an optimization;
2013  * it is very likely that release_sock() added new data.
2014 */
2015 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2016 {
2017 int rc;
2018 DEFINE_WAIT(wait);
2019
2020 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2021 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2022 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2023 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2024 finish_wait(sk_sleep(sk), &wait);
2025 return rc;
2026 }
2027 EXPORT_SYMBOL(sk_wait_data);
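/*
 * Usage sketch (illustrative only): a blocking recvmsg() loop built on
 * sk_wait_data(), with the socket lock held, looks roughly like this:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *
 * passing NULL as @skb because nothing had been seen on the queue yet.
 */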
2028
2029 /**
2030 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2031 * @sk: socket
2032 * @size: memory size to allocate
2033 * @kind: allocation type
2034 *
2035 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2036 * rmem allocation. This function assumes that protocols which have
2037 * memory_pressure use sk_wmem_queued as write buffer accounting.
2038 */
2039 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2040 {
2041 struct proto *prot = sk->sk_prot;
2042 int amt = sk_mem_pages(size);
2043 long allocated;
2044
2045 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2046
2047 allocated = sk_memory_allocated_add(sk, amt);
2048
2049 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2050 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2051 goto suppress_allocation;
2052
2053 /* Under limit. */
2054 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2055 sk_leave_memory_pressure(sk);
2056 return 1;
2057 }
2058
2059 /* Under pressure. */
2060 if (allocated > sk_prot_mem_limits(sk, 1))
2061 sk_enter_memory_pressure(sk);
2062
2063 /* Over hard limit. */
2064 if (allocated > sk_prot_mem_limits(sk, 2))
2065 goto suppress_allocation;
2066
2067 /* guarantee minimum buffer size under pressure */
2068 if (kind == SK_MEM_RECV) {
2069 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2070 return 1;
2071
2072 } else { /* SK_MEM_SEND */
2073 if (sk->sk_type == SOCK_STREAM) {
2074 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2075 return 1;
2076 } else if (atomic_read(&sk->sk_wmem_alloc) <
2077 prot->sysctl_wmem[0])
2078 return 1;
2079 }
2080
2081 if (sk_has_memory_pressure(sk)) {
2082 int alloc;
2083
2084 if (!sk_under_memory_pressure(sk))
2085 return 1;
2086 alloc = sk_sockets_allocated_read_positive(sk);
2087 if (sk_prot_mem_limits(sk, 2) > alloc *
2088 sk_mem_pages(sk->sk_wmem_queued +
2089 atomic_read(&sk->sk_rmem_alloc) +
2090 sk->sk_forward_alloc))
2091 return 1;
2092 }
2093
2094 suppress_allocation:
2095
2096 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2097 sk_stream_moderate_sndbuf(sk);
2098
2099 /* Fail only if socket is _under_ its sndbuf.
2100 * In this case we cannot block, so we have to fail.
2101 */
2102 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2103 return 1;
2104 }
2105
2106 trace_sock_exceed_buf_limit(sk, prot, allocated);
2107
2108 /* Alas. Undo changes. */
2109 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2110
2111 sk_memory_allocated_sub(sk, amt);
2112
2113 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2114 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2115
2116 return 0;
2117 }
2118 EXPORT_SYMBOL(__sk_mem_schedule);
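
/*
 * Illustrative note (not part of the original source): callers normally go
 * through the sk_wmem_schedule()/sk_rmem_schedule() wrappers in
 * include/net/sock.h, which only fall back to this slow path when
 * sk_forward_alloc cannot already cover the request, e.g. roughly:
 *
 *	if (size <= sk->sk_forward_alloc)
 *		return true;
 *	return __sk_mem_schedule(sk, size, SK_MEM_SEND) != 0;
 */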
2119
2120 /**
2121 * __sk_mem_reclaim - reclaim memory_allocated
2122 * @sk: socket
2123 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2124 */
2125 void __sk_mem_reclaim(struct sock *sk, int amount)
2126 {
2127 amount >>= SK_MEM_QUANTUM_SHIFT;
2128 sk_memory_allocated_sub(sk, amount);
2129 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2130
2131 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2132 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2133
2134 if (sk_under_memory_pressure(sk) &&
2135 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2136 sk_leave_memory_pressure(sk);
2137 }
2138 EXPORT_SYMBOL(__sk_mem_reclaim);
2139
2140
2141 /*
2142 * Set of default routines for initialising struct proto_ops when
2143 * the protocol does not support a particular function. In certain
2144 * cases where it makes no sense for a protocol to have a "do nothing"
2145 * function, some default processing is provided.
2146 */
2147
2148 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2149 {
2150 return -EOPNOTSUPP;
2151 }
2152 EXPORT_SYMBOL(sock_no_bind);
2153
2154 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2155 int len, int flags)
2156 {
2157 return -EOPNOTSUPP;
2158 }
2159 EXPORT_SYMBOL(sock_no_connect);
2160
2161 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2162 {
2163 return -EOPNOTSUPP;
2164 }
2165 EXPORT_SYMBOL(sock_no_socketpair);
2166
2167 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2168 {
2169 return -EOPNOTSUPP;
2170 }
2171 EXPORT_SYMBOL(sock_no_accept);
2172
2173 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2174 int *len, int peer)
2175 {
2176 return -EOPNOTSUPP;
2177 }
2178 EXPORT_SYMBOL(sock_no_getname);
2179
2180 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2181 {
2182 return 0;
2183 }
2184 EXPORT_SYMBOL(sock_no_poll);
2185
2186 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2187 {
2188 return -EOPNOTSUPP;
2189 }
2190 EXPORT_SYMBOL(sock_no_ioctl);
2191
2192 int sock_no_listen(struct socket *sock, int backlog)
2193 {
2194 return -EOPNOTSUPP;
2195 }
2196 EXPORT_SYMBOL(sock_no_listen);
2197
2198 int sock_no_shutdown(struct socket *sock, int how)
2199 {
2200 return -EOPNOTSUPP;
2201 }
2202 EXPORT_SYMBOL(sock_no_shutdown);
2203
2204 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2205 char __user *optval, unsigned int optlen)
2206 {
2207 return -EOPNOTSUPP;
2208 }
2209 EXPORT_SYMBOL(sock_no_setsockopt);
2210
2211 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2212 char __user *optval, int __user *optlen)
2213 {
2214 return -EOPNOTSUPP;
2215 }
2216 EXPORT_SYMBOL(sock_no_getsockopt);
2217
2218 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2219 {
2220 return -EOPNOTSUPP;
2221 }
2222 EXPORT_SYMBOL(sock_no_sendmsg);
2223
2224 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2225 int flags)
2226 {
2227 return -EOPNOTSUPP;
2228 }
2229 EXPORT_SYMBOL(sock_no_recvmsg);
2230
2231 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2232 {
2233 /* Mirror missing mmap method error code */
2234 return -ENODEV;
2235 }
2236 EXPORT_SYMBOL(sock_no_mmap);
2237
2238 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2239 {
2240 ssize_t res;
2241 struct msghdr msg = {.msg_flags = flags};
2242 struct kvec iov;
2243 char *kaddr = kmap(page);
2244 iov.iov_base = kaddr + offset;
2245 iov.iov_len = size;
2246 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2247 kunmap(page);
2248 return res;
2249 }
2250 EXPORT_SYMBOL(sock_no_sendpage);
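
/*
 * Illustrative sketch (hypothetical names, not from this file): a protocol
 * that does not implement some operations can plug the sock_no_*() stubs
 * straight into its proto_ops, e.g. a connectionless family might use:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family   = PF_EXAMPLE,		// hypothetical address family
 *		.owner    = THIS_MODULE,
 *		.accept   = sock_no_accept,
 *		.listen   = sock_no_listen,
 *		.mmap     = sock_no_mmap,
 *		.sendpage = sock_no_sendpage,
 *		// ...real handlers for bind/sendmsg/recvmsg/etc.
 *	};
 */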
2251
2252 /*
2253 * Default Socket Callbacks
2254 */
2255
2256 static void sock_def_wakeup(struct sock *sk)
2257 {
2258 struct socket_wq *wq;
2259
2260 rcu_read_lock();
2261 wq = rcu_dereference(sk->sk_wq);
2262 if (skwq_has_sleeper(wq))
2263 wake_up_interruptible_all(&wq->wait);
2264 rcu_read_unlock();
2265 }
2266
2267 static void sock_def_error_report(struct sock *sk)
2268 {
2269 struct socket_wq *wq;
2270
2271 rcu_read_lock();
2272 wq = rcu_dereference(sk->sk_wq);
2273 if (skwq_has_sleeper(wq))
2274 wake_up_interruptible_poll(&wq->wait, POLLERR);
2275 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2276 rcu_read_unlock();
2277 }
2278
2279 static void sock_def_readable(struct sock *sk)
2280 {
2281 struct socket_wq *wq;
2282
2283 rcu_read_lock();
2284 wq = rcu_dereference(sk->sk_wq);
2285 if (skwq_has_sleeper(wq))
2286 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2287 POLLRDNORM | POLLRDBAND);
2288 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2289 rcu_read_unlock();
2290 }
2291
2292 static void sock_def_write_space(struct sock *sk)
2293 {
2294 struct socket_wq *wq;
2295
2296 rcu_read_lock();
2297
2298 /* Do not wake up a writer until he can make "significant"
2299 * progress. --DaveM
2300 */
2301 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2302 wq = rcu_dereference(sk->sk_wq);
2303 if (skwq_has_sleeper(wq))
2304 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2305 POLLWRNORM | POLLWRBAND);
2306
2307 /* Should agree with poll, otherwise some programs break */
2308 if (sock_writeable(sk))
2309 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2310 }
2311
2312 rcu_read_unlock();
2313 }
2314
2315 static void sock_def_destruct(struct sock *sk)
2316 {
2317 }
2318
2319 void sk_send_sigurg(struct sock *sk)
2320 {
2321 if (sk->sk_socket && sk->sk_socket->file)
2322 if (send_sigurg(&sk->sk_socket->file->f_owner))
2323 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2324 }
2325 EXPORT_SYMBOL(sk_send_sigurg);
2326
2327 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2328 unsigned long expires)
2329 {
2330 if (!mod_timer(timer, expires))
2331 sock_hold(sk);
2332 }
2333 EXPORT_SYMBOL(sk_reset_timer);
2334
2335 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2336 {
2337 if (del_timer(timer))
2338 __sock_put(sk);
2339 }
2340 EXPORT_SYMBOL(sk_stop_timer);
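
/*
 * Illustrative note (not part of the original source): these helpers tie the
 * socket refcount to timer state. sk_reset_timer() takes a reference only
 * when the timer was not already pending (mod_timer() returned 0), and
 * sk_stop_timer() drops one only when del_timer() actually removed a pending
 * timer; the timer handler itself is expected to drop its reference with
 * sock_put() once it has run, as e.g. the TCP retransmit and keepalive
 * timers do.
 */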
2341
2342 void sock_init_data(struct socket *sock, struct sock *sk)
2343 {
2344 skb_queue_head_init(&sk->sk_receive_queue);
2345 skb_queue_head_init(&sk->sk_write_queue);
2346 skb_queue_head_init(&sk->sk_error_queue);
2347
2348 sk->sk_send_head = NULL;
2349
2350 init_timer(&sk->sk_timer);
2351
2352 sk->sk_allocation = GFP_KERNEL;
2353 sk->sk_rcvbuf = sysctl_rmem_default;
2354 sk->sk_sndbuf = sysctl_wmem_default;
2355 sk->sk_state = TCP_CLOSE;
2356 sk_set_socket(sk, sock);
2357
2358 sock_set_flag(sk, SOCK_ZAPPED);
2359
2360 if (sock) {
2361 sk->sk_type = sock->type;
2362 sk->sk_wq = sock->wq;
2363 sock->sk = sk;
2364 } else
2365 sk->sk_wq = NULL;
2366
2367 rwlock_init(&sk->sk_callback_lock);
2368 lockdep_set_class_and_name(&sk->sk_callback_lock,
2369 af_callback_keys + sk->sk_family,
2370 af_family_clock_key_strings[sk->sk_family]);
2371
2372 sk->sk_state_change = sock_def_wakeup;
2373 sk->sk_data_ready = sock_def_readable;
2374 sk->sk_write_space = sock_def_write_space;
2375 sk->sk_error_report = sock_def_error_report;
2376 sk->sk_destruct = sock_def_destruct;
2377
2378 sk->sk_frag.page = NULL;
2379 sk->sk_frag.offset = 0;
2380 sk->sk_peek_off = -1;
2381
2382 sk->sk_peer_pid = NULL;
2383 sk->sk_peer_cred = NULL;
2384 sk->sk_write_pending = 0;
2385 sk->sk_rcvlowat = 1;
2386 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2387 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2388
2389 sk->sk_stamp = ktime_set(-1L, 0);
2390
2391 #ifdef CONFIG_NET_RX_BUSY_POLL
2392 sk->sk_napi_id = 0;
2393 sk->sk_ll_usec = sysctl_net_busy_read;
2394 #endif
2395
2396 sk->sk_max_pacing_rate = ~0U;
2397 sk->sk_pacing_rate = ~0U;
2398 sk->sk_incoming_cpu = -1;
2399 /*
2400 * Before updating sk_refcnt, we must commit prior changes to memory
2401 * (Documentation/RCU/rculist_nulls.txt for details)
2402 */
2403 smp_wmb();
2404 atomic_set(&sk->sk_refcnt, 1);
2405 atomic_set(&sk->sk_drops, 0);
2406 }
2407 EXPORT_SYMBOL(sock_init_data);
2408
2409 void lock_sock_nested(struct sock *sk, int subclass)
2410 {
2411 might_sleep();
2412 spin_lock_bh(&sk->sk_lock.slock);
2413 if (sk->sk_lock.owned)
2414 __lock_sock(sk);
2415 sk->sk_lock.owned = 1;
2416 spin_unlock(&sk->sk_lock.slock);
2417 /*
2418 * The sk_lock has mutex_lock() semantics here:
2419 */
2420 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2421 local_bh_enable();
2422 }
2423 EXPORT_SYMBOL(lock_sock_nested);
2424
2425 void release_sock(struct sock *sk)
2426 {
2427 /*
2428 * The sk_lock has mutex_unlock() semantics:
2429 */
2430 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2431
2432 spin_lock_bh(&sk->sk_lock.slock);
2433 if (sk->sk_backlog.tail)
2434 __release_sock(sk);
2435
2436 /* Warning: release_cb() might need to release sk ownership,
2437 * i.e., call sock_release_ownership(sk) before us.
2438 */
2439 if (sk->sk_prot->release_cb)
2440 sk->sk_prot->release_cb(sk);
2441
2442 sock_release_ownership(sk);
2443 if (waitqueue_active(&sk->sk_lock.wq))
2444 wake_up(&sk->sk_lock.wq);
2445 spin_unlock_bh(&sk->sk_lock.slock);
2446 }
2447 EXPORT_SYMBOL(release_sock);
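
/*
 * Illustrative sketch (not part of the original source): the usual ownership
 * pattern in process context is simply
 *
 *	lock_sock(sk);
 *	...work that may sleep, protected from the softirq receive path...
 *	release_sock(sk);	// also runs any backlog queued meanwhile
 */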
2448
2449 /**
2450 * lock_sock_fast - fast version of lock_sock
2451 * @sk: socket
2452 *
2453 * This version should be used for very small critical sections where the
2454 * process won't block. It returns false if the fast path was taken:
2455 * sk_lock.slock is locked, owned = 0, BHs are disabled.
2456 * It returns true if the slow path was taken:
2457 * sk_lock.slock is unlocked, owned = 1, BHs are enabled.
2458 */
2459 bool lock_sock_fast(struct sock *sk)
2460 {
2461 might_sleep();
2462 spin_lock_bh(&sk->sk_lock.slock);
2463
2464 if (!sk->sk_lock.owned)
2465 /*
2466 * Note: BHs must stay disabled; the fast path returns with slock held
2467 */
2468 return false;
2469
2470 __lock_sock(sk);
2471 sk->sk_lock.owned = 1;
2472 spin_unlock(&sk->sk_lock.slock);
2473 /*
2474 * The sk_lock has mutex_lock() semantics here:
2475 */
2476 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2477 local_bh_enable();
2478 return true;
2479 }
2480 EXPORT_SYMBOL(lock_sock_fast);
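
/*
 * Illustrative sketch (not part of the original source): callers pair this
 * with unlock_sock_fast(), e.g.
 *
 *	bool slow = lock_sock_fast(sk);
 *	...short, non-blocking critical section...
 *	unlock_sock_fast(sk, slow);
 */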
2481
2482 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2483 {
2484 struct timeval tv;
2485 if (!sock_flag(sk, SOCK_TIMESTAMP))
2486 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2487 tv = ktime_to_timeval(sk->sk_stamp);
2488 if (tv.tv_sec == -1)
2489 return -ENOENT;
2490 if (tv.tv_sec == 0) {
2491 sk->sk_stamp = ktime_get_real();
2492 tv = ktime_to_timeval(sk->sk_stamp);
2493 }
2494 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2495 }
2496 EXPORT_SYMBOL(sock_get_timestamp);
2497
2498 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2499 {
2500 struct timespec ts;
2501 if (!sock_flag(sk, SOCK_TIMESTAMP))
2502 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2503 ts = ktime_to_timespec(sk->sk_stamp);
2504 if (ts.tv_sec == -1)
2505 return -ENOENT;
2506 if (ts.tv_sec == 0) {
2507 sk->sk_stamp = ktime_get_real();
2508 ts = ktime_to_timespec(sk->sk_stamp);
2509 }
2510 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2511 }
2512 EXPORT_SYMBOL(sock_get_timestampns);
2513
2514 void sock_enable_timestamp(struct sock *sk, int flag)
2515 {
2516 if (!sock_flag(sk, flag)) {
2517 unsigned long previous_flags = sk->sk_flags;
2518
2519 sock_set_flag(sk, flag);
2520 /*
2521 * we just set one of the two flags which require net
2522 * time stamping, but time stamping might have been on
2523 * already because of the other one
2524 */
2525 if (sock_needs_netstamp(sk) &&
2526 !(previous_flags & SK_FLAGS_TIMESTAMP))
2527 net_enable_timestamp();
2528 }
2529 }
2530
2531 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2532 int level, int type)
2533 {
2534 struct sock_exterr_skb *serr;
2535 struct sk_buff *skb;
2536 int copied, err;
2537
2538 err = -EAGAIN;
2539 skb = sock_dequeue_err_skb(sk);
2540 if (skb == NULL)
2541 goto out;
2542
2543 copied = skb->len;
2544 if (copied > len) {
2545 msg->msg_flags |= MSG_TRUNC;
2546 copied = len;
2547 }
2548 err = skb_copy_datagram_msg(skb, 0, msg, copied);
2549 if (err)
2550 goto out_free_skb;
2551
2552 sock_recv_timestamp(msg, sk, skb);
2553
2554 serr = SKB_EXT_ERR(skb);
2555 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2556
2557 msg->msg_flags |= MSG_ERRQUEUE;
2558 err = copied;
2559
2560 out_free_skb:
2561 kfree_skb(skb);
2562 out:
2563 return err;
2564 }
2565 EXPORT_SYMBOL(sock_recv_errqueue);
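
/*
 * Illustrative sketch (not part of the original source): a protocol's recvmsg
 * handler typically forwards MSG_ERRQUEUE requests straight to this helper;
 * the level/type below are only an example of the cmsg a protocol might
 * report:
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len,
 *					  SOL_PACKET, PACKET_TX_TIMESTAMP);
 */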
2566
2567 /*
2568 * Get a socket option on a socket.
2569 *
2570 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2571 * asynchronous errors should be reported by getsockopt. We assume
2572 * this means if you specify SO_ERROR (otherwise what's the point of it?).
2573 */
2574 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2575 char __user *optval, int __user *optlen)
2576 {
2577 struct sock *sk = sock->sk;
2578
2579 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2580 }
2581 EXPORT_SYMBOL(sock_common_getsockopt);
2582
2583 #ifdef CONFIG_COMPAT
2584 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2585 char __user *optval, int __user *optlen)
2586 {
2587 struct sock *sk = sock->sk;
2588
2589 if (sk->sk_prot->compat_getsockopt != NULL)
2590 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2591 optval, optlen);
2592 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2593 }
2594 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2595 #endif
2596
2597 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2598 int flags)
2599 {
2600 struct sock *sk = sock->sk;
2601 int addr_len = 0;
2602 int err;
2603
2604 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2605 flags & ~MSG_DONTWAIT, &addr_len);
2606 if (err >= 0)
2607 msg->msg_namelen = addr_len;
2608 return err;
2609 }
2610 EXPORT_SYMBOL(sock_common_recvmsg);
2611
2612 /*
2613 * Set socket options on an inet socket.
2614 */
2615 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2616 char __user *optval, unsigned int optlen)
2617 {
2618 struct sock *sk = sock->sk;
2619
2620 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2621 }
2622 EXPORT_SYMBOL(sock_common_setsockopt);
2623
2624 #ifdef CONFIG_COMPAT
2625 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2626 char __user *optval, unsigned int optlen)
2627 {
2628 struct sock *sk = sock->sk;
2629
2630 if (sk->sk_prot->compat_setsockopt != NULL)
2631 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2632 optval, optlen);
2633 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2634 }
2635 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2636 #endif
2637
2638 void sk_common_release(struct sock *sk)
2639 {
2640 if (sk->sk_prot->destroy)
2641 sk->sk_prot->destroy(sk);
2642
2643 /*
2644 * Observation: when sk_common_release() is called, processes no longer
2645 * have access to the socket, but the network stack still does.
2646 * Step one: detach it from networking:
2647 *
2648 * A. Remove from hash tables.
2649 */
2650
2651 sk->sk_prot->unhash(sk);
2652
2653 /*
2654 * At this point the socket cannot receive new packets, but it is possible
2655 * that some packets are in flight, because some CPU ran the receiver and
2656 * did a hash table lookup before we unhashed the socket. They will reach
2657 * the receive queue and will be purged by the socket destructor.
2658 *
2659 * Also, we may still have packets pending on the receive queue and,
2660 * probably, our own packets waiting in device queues. sock_destroy will
2661 * drain the receive queue, but transmitted packets will delay socket
2662 * destruction until the last reference is released.
2663 */
2664
2665 sock_orphan(sk);
2666
2667 xfrm_sk_free_policy(sk);
2668
2669 sk_refcnt_debug_release(sk);
2670
2671 if (sk->sk_frag.page) {
2672 put_page(sk->sk_frag.page);
2673 sk->sk_frag.page = NULL;
2674 }
2675
2676 sock_put(sk);
2677 }
2678 EXPORT_SYMBOL(sk_common_release);
2679
2680 #ifdef CONFIG_PROC_FS
2681 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
2682 struct prot_inuse {
2683 int val[PROTO_INUSE_NR];
2684 };
2685
2686 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2687
2688 #ifdef CONFIG_NET_NS
2689 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2690 {
2691 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2692 }
2693 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2694
2695 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2696 {
2697 int cpu, idx = prot->inuse_idx;
2698 int res = 0;
2699
2700 for_each_possible_cpu(cpu)
2701 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2702
2703 return res >= 0 ? res : 0;
2704 }
2705 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2706
2707 static int __net_init sock_inuse_init_net(struct net *net)
2708 {
2709 net->core.inuse = alloc_percpu(struct prot_inuse);
2710 return net->core.inuse ? 0 : -ENOMEM;
2711 }
2712
2713 static void __net_exit sock_inuse_exit_net(struct net *net)
2714 {
2715 free_percpu(net->core.inuse);
2716 }
2717
2718 static struct pernet_operations net_inuse_ops = {
2719 .init = sock_inuse_init_net,
2720 .exit = sock_inuse_exit_net,
2721 };
2722
2723 static __init int net_inuse_init(void)
2724 {
2725 if (register_pernet_subsys(&net_inuse_ops))
2726 panic("Cannot initialize net inuse counters");
2727
2728 return 0;
2729 }
2730
2731 core_initcall(net_inuse_init);
2732 #else
2733 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2734
2735 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2736 {
2737 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2738 }
2739 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2740
2741 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2742 {
2743 int cpu, idx = prot->inuse_idx;
2744 int res = 0;
2745
2746 for_each_possible_cpu(cpu)
2747 res += per_cpu(prot_inuse, cpu).val[idx];
2748
2749 return res >= 0 ? res : 0;
2750 }
2751 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2752 #endif
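
/*
 * Illustrative note (not part of the original source): protocols bump these
 * counters from their hash/unhash paths, e.g. roughly
 * sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1) on hash and -1 on unhash;
 * the per-protocol totals appear in the "sockets" column of
 * /proc/net/protocols generated below.
 */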
2753
2754 static void assign_proto_idx(struct proto *prot)
2755 {
2756 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2757
2758 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2759 pr_err("PROTO_INUSE_NR exhausted\n");
2760 return;
2761 }
2762
2763 set_bit(prot->inuse_idx, proto_inuse_idx);
2764 }
2765
2766 static void release_proto_idx(struct proto *prot)
2767 {
2768 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2769 clear_bit(prot->inuse_idx, proto_inuse_idx);
2770 }
2771 #else
2772 static inline void assign_proto_idx(struct proto *prot)
2773 {
2774 }
2775
2776 static inline void release_proto_idx(struct proto *prot)
2777 {
2778 }
2779 #endif
2780
2781 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2782 {
2783 if (!rsk_prot)
2784 return;
2785 kfree(rsk_prot->slab_name);
2786 rsk_prot->slab_name = NULL;
2787 kmem_cache_destroy(rsk_prot->slab);
2788 rsk_prot->slab = NULL;
2789 }
2790
2791 static int req_prot_init(const struct proto *prot)
2792 {
2793 struct request_sock_ops *rsk_prot = prot->rsk_prot;
2794
2795 if (!rsk_prot)
2796 return 0;
2797
2798 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2799 prot->name);
2800 if (!rsk_prot->slab_name)
2801 return -ENOMEM;
2802
2803 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2804 rsk_prot->obj_size, 0,
2805 prot->slab_flags, NULL);
2806
2807 if (!rsk_prot->slab) {
2808 pr_crit("%s: Can't create request sock SLAB cache!\n",
2809 prot->name);
2810 return -ENOMEM;
2811 }
2812 return 0;
2813 }
2814
2815 int proto_register(struct proto *prot, int alloc_slab)
2816 {
2817 if (alloc_slab) {
2818 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2819 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2820 NULL);
2821
2822 if (prot->slab == NULL) {
2823 pr_crit("%s: Can't create sock SLAB cache!\n",
2824 prot->name);
2825 goto out;
2826 }
2827
2828 if (req_prot_init(prot))
2829 goto out_free_request_sock_slab;
2830
2831 if (prot->twsk_prot != NULL) {
2832 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2833
2834 if (prot->twsk_prot->twsk_slab_name == NULL)
2835 goto out_free_request_sock_slab;
2836
2837 prot->twsk_prot->twsk_slab =
2838 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2839 prot->twsk_prot->twsk_obj_size,
2840 0,
2841 prot->slab_flags,
2842 NULL);
2843 if (prot->twsk_prot->twsk_slab == NULL)
2844 goto out_free_timewait_sock_slab_name;
2845 }
2846 }
2847
2848 mutex_lock(&proto_list_mutex);
2849 list_add(&prot->node, &proto_list);
2850 assign_proto_idx(prot);
2851 mutex_unlock(&proto_list_mutex);
2852 return 0;
2853
2854 out_free_timewait_sock_slab_name:
2855 kfree(prot->twsk_prot->twsk_slab_name);
2856 out_free_request_sock_slab:
2857 req_prot_cleanup(prot->rsk_prot);
2858
2859 kmem_cache_destroy(prot->slab);
2860 prot->slab = NULL;
2861 out:
2862 return -ENOBUFS;
2863 }
2864 EXPORT_SYMBOL(proto_register);
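
/*
 * Illustrative sketch (hypothetical names, not from this file): a protocol
 * module typically pairs proto_register()/proto_unregister() in its module
 * init/exit paths, e.g.
 *
 *	static int __init example_proto_init(void)
 *	{
 *		int rc = proto_register(&example_prot, 1);
 *
 *		if (rc)
 *			return rc;
 *		rc = sock_register(&example_family_ops);
 *		if (rc)
 *			proto_unregister(&example_prot);
 *		return rc;
 *	}
 */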
2865
2866 void proto_unregister(struct proto *prot)
2867 {
2868 mutex_lock(&proto_list_mutex);
2869 release_proto_idx(prot);
2870 list_del(&prot->node);
2871 mutex_unlock(&proto_list_mutex);
2872
2873 kmem_cache_destroy(prot->slab);
2874 prot->slab = NULL;
2875
2876 req_prot_cleanup(prot->rsk_prot);
2877
2878 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2879 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2880 kfree(prot->twsk_prot->twsk_slab_name);
2881 prot->twsk_prot->twsk_slab = NULL;
2882 }
2883 }
2884 EXPORT_SYMBOL(proto_unregister);
2885
2886 #ifdef CONFIG_PROC_FS
2887 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2888 __acquires(proto_list_mutex)
2889 {
2890 mutex_lock(&proto_list_mutex);
2891 return seq_list_start_head(&proto_list, *pos);
2892 }
2893
2894 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2895 {
2896 return seq_list_next(v, &proto_list, pos);
2897 }
2898
2899 static void proto_seq_stop(struct seq_file *seq, void *v)
2900 __releases(proto_list_mutex)
2901 {
2902 mutex_unlock(&proto_list_mutex);
2903 }
2904
2905 static char proto_method_implemented(const void *method)
2906 {
2907 return method == NULL ? 'n' : 'y';
2908 }
2909 static long sock_prot_memory_allocated(struct proto *proto)
2910 {
2911 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2912 }
2913
2914 static char *sock_prot_memory_pressure(struct proto *proto)
2915 {
2916 return proto->memory_pressure != NULL ?
2917 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2918 }
2919
2920 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2921 {
2922
2923 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
2924 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2925 proto->name,
2926 proto->obj_size,
2927 sock_prot_inuse_get(seq_file_net(seq), proto),
2928 sock_prot_memory_allocated(proto),
2929 sock_prot_memory_pressure(proto),
2930 proto->max_header,
2931 proto->slab == NULL ? "no" : "yes",
2932 module_name(proto->owner),
2933 proto_method_implemented(proto->close),
2934 proto_method_implemented(proto->connect),
2935 proto_method_implemented(proto->disconnect),
2936 proto_method_implemented(proto->accept),
2937 proto_method_implemented(proto->ioctl),
2938 proto_method_implemented(proto->init),
2939 proto_method_implemented(proto->destroy),
2940 proto_method_implemented(proto->shutdown),
2941 proto_method_implemented(proto->setsockopt),
2942 proto_method_implemented(proto->getsockopt),
2943 proto_method_implemented(proto->sendmsg),
2944 proto_method_implemented(proto->recvmsg),
2945 proto_method_implemented(proto->sendpage),
2946 proto_method_implemented(proto->bind),
2947 proto_method_implemented(proto->backlog_rcv),
2948 proto_method_implemented(proto->hash),
2949 proto_method_implemented(proto->unhash),
2950 proto_method_implemented(proto->get_port),
2951 proto_method_implemented(proto->enter_memory_pressure));
2952 }
2953
2954 static int proto_seq_show(struct seq_file *seq, void *v)
2955 {
2956 if (v == &proto_list)
2957 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2958 "protocol",
2959 "size",
2960 "sockets",
2961 "memory",
2962 "press",
2963 "maxhdr",
2964 "slab",
2965 "module",
2966 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2967 else
2968 proto_seq_printf(seq, list_entry(v, struct proto, node));
2969 return 0;
2970 }
2971
2972 static const struct seq_operations proto_seq_ops = {
2973 .start = proto_seq_start,
2974 .next = proto_seq_next,
2975 .stop = proto_seq_stop,
2976 .show = proto_seq_show,
2977 };
2978
2979 static int proto_seq_open(struct inode *inode, struct file *file)
2980 {
2981 return seq_open_net(inode, file, &proto_seq_ops,
2982 sizeof(struct seq_net_private));
2983 }
2984
2985 static const struct file_operations proto_seq_fops = {
2986 .owner = THIS_MODULE,
2987 .open = proto_seq_open,
2988 .read = seq_read,
2989 .llseek = seq_lseek,
2990 .release = seq_release_net,
2991 };
2992
2993 static __net_init int proto_init_net(struct net *net)
2994 {
2995 if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2996 return -ENOMEM;
2997
2998 return 0;
2999 }
3000
3001 static __net_exit void proto_exit_net(struct net *net)
3002 {
3003 remove_proc_entry("protocols", net->proc_net);
3004 }
3005
3006
3007 static __net_initdata struct pernet_operations proto_net_ops = {
3008 .init = proto_init_net,
3009 .exit = proto_exit_net,
3010 };
3011
3012 static int __init proto_init(void)
3013 {
3014 return register_pernet_subsys(&proto_net_ops);
3015 }
3016
3017 subsys_initcall(proto_init);
3018
3019 #endif /* PROC_FS */