[TCP]: Simplify SKB data portion allocation with NETIF_F_SG.
net/ipv4/tcp.c
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed where wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
207 * Hirokazu Takahashi : Use copy_from_user() instead of
208 * csum_and_copy_from_user() if possible.
209 *
210 * This program is free software; you can redistribute it and/or
211 * modify it under the terms of the GNU General Public License
212 * as published by the Free Software Foundation; either version
213 * 2 of the License, or (at your option) any later version.
214 *
215 * Description of States:
216 *
217 * TCP_SYN_SENT sent a connection request, waiting for ack
218 *
219 * TCP_SYN_RECV received a connection request, sent ack,
220 * waiting for final ack in three-way handshake.
221 *
222 * TCP_ESTABLISHED connection established
223 *
224 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
225 * transmission of remaining buffered data
226 *
227 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
228 * to shutdown
229 *
230 * TCP_CLOSING both sides have shutdown but we still have
231 * data we have to finish sending
232 *
233 * TCP_TIME_WAIT timeout to catch resent junk before entering
234 * closed, can only be entered from FIN_WAIT2
235 * or CLOSING. Required because the other end
236 * may not have gotten our last ACK causing it
237 * to retransmit the data packet (which we ignore)
238 *
239 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
240 * us to finish writing our data and to shutdown
241 * (we have to close() to move on to LAST_ACK)
242 *
243 * TCP_LAST_ACK our side has shutdown after remote has
244 * shutdown. There may still be data in our
245 * buffer that we have to finish sending
246 *
247 * TCP_CLOSE socket is finished
248 */
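/*
 * Example (standalone userspace sketch, helper name hypothetical): the
 * active-close half of the state list above as an application drives it.
 * shutdown(fd, SHUT_WR) sends our FIN (FIN_WAIT1, then FIN_WAIT2 once it
 * is acked) while the peer sits in CLOSE_WAIT; when the peer closes and
 * its FIN arrives, read() returns 0 and our close() leaves us in TIME_WAIT.
 */
#include <sys/socket.h>
#include <unistd.h>

static int half_close_then_drain(int fd)
{
	char buf[4096];

	if (shutdown(fd, SHUT_WR) < 0)		/* our FIN: FIN_WAIT1 */
		return -1;
	while (read(fd, buf, sizeof(buf)) > 0)	/* FIN_WAIT2 while peer talks */
		;
	return close(fd);			/* peer's FIN seen: TIME_WAIT */
}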
249
250#include <linux/config.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/smp_lock.h>
257#include <linux/fs.h>
258#include <linux/random.h>
259#include <linux/bootmem.h>
260
261#include <net/icmp.h>
262#include <net/tcp.h>
263#include <net/xfrm.h>
264#include <net/ip.h>
265
266
267#include <asm/uaccess.h>
268#include <asm/ioctls.h>
269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274kmem_cache_t *tcp_bucket_cachep;
275kmem_cache_t *tcp_timewait_cachep;
276
277atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279int sysctl_tcp_mem[3];
280int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282
283EXPORT_SYMBOL(sysctl_tcp_mem);
284EXPORT_SYMBOL(sysctl_tcp_rmem);
285EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287atomic_t tcp_memory_allocated; /* Current allocated memory. */
288atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
289
290EXPORT_SYMBOL(tcp_memory_allocated);
291EXPORT_SYMBOL(tcp_sockets_allocated);
292
293/*
294 * Pressure flag: try to collapse.
295 * Technical note: it is used by multiple contexts non atomically.
296 * All the sk_stream_mem_schedule() is of this nature: accounting
297 * is strict, actions are advisory and have some latency.
298 */
299int tcp_memory_pressure;
300
301EXPORT_SYMBOL(tcp_memory_pressure);
302
303void tcp_enter_memory_pressure(void)
304{
305 if (!tcp_memory_pressure) {
306 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307 tcp_memory_pressure = 1;
308 }
309}
310
311EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313/*
314 * LISTEN is a special case for poll..
315 */
316static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317 poll_table *wait)
318{
319 return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320}
321
322/*
323 * Wait for a TCP event.
324 *
325 * Note that we don't need to lock the socket, as the upper poll layers
326 * take care of normal races (between the test and the event) and we don't
327 * go look at any of the socket buffers directly.
328 */
329unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330{
331 unsigned int mask;
332 struct sock *sk = sock->sk;
333 struct tcp_sock *tp = tcp_sk(sk);
334
335 poll_wait(file, sk->sk_sleep, wait);
336 if (sk->sk_state == TCP_LISTEN)
337 return tcp_listen_poll(sk, wait);
338
339 /* Socket is not locked. We are protected from async events
340 by poll logic and correct handling of state changes
341 made by other threads is impossible in any case.
342 */
343
344 mask = 0;
345 if (sk->sk_err)
346 mask = POLLERR;
347
348 /*
349 * POLLHUP is certainly not done right. But poll() doesn't
350 * have a notion of HUP in just one direction, and for a
351 * socket the read side is more interesting.
352 *
353 * Some poll() documentation says that POLLHUP is incompatible
354 * with the POLLOUT/POLLWR flags, so somebody should check this
355 * all. But careful, it tends to be safer to return too many
356 * bits than too few, and you can easily break real applications
357 * if you don't tell them that something has hung up!
358 *
359 * Check-me.
360 *
361 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
362 * our fs/select.c). It means that after we received EOF,
363 * poll always returns immediately, making impossible poll() on write()
364 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
365 * if and only if shutdown has been made in both directions.
366 * Actually, it is interesting to look how Solaris and DUX
367 * solve this dilemma. I would prefer, if POLLHUP were maskable,
368 * then we could set it on SND_SHUTDOWN. BTW examples given
369 * in Stevens' books assume exactly this behaviour, it explains
370 * why POLLHUP is incompatible with POLLOUT. --ANK
371 *
372 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
373 * blocking on fresh not-connected or disconnected socket. --ANK
374 */
375 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376 mask |= POLLHUP;
377 if (sk->sk_shutdown & RCV_SHUTDOWN)
378 mask |= POLLIN | POLLRDNORM;
379
380 /* Connected? */
381 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382 /* Potential race condition. If read of tp below will
383 * escape above sk->sk_state, we can be illegally awaken
384 * in SYN_* states. */
385 if ((tp->rcv_nxt != tp->copied_seq) &&
386 (tp->urg_seq != tp->copied_seq ||
387 tp->rcv_nxt != tp->copied_seq + 1 ||
388 sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389 mask |= POLLIN | POLLRDNORM;
390
391 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393 mask |= POLLOUT | POLLWRNORM;
394 } else { /* send SIGIO later */
395 set_bit(SOCK_ASYNC_NOSPACE,
396 &sk->sk_socket->flags);
397 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398
399 /* Race breaker. If space is freed after
400 * wspace test but before the flags are set,
401 * IO signal will be lost.
402 */
403 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404 mask |= POLLOUT | POLLWRNORM;
405 }
406 }
407
408 if (tp->urg_data & TCP_URG_VALID)
409 mask |= POLLPRI;
410 }
411 return mask;
412}
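/*
 * Example (userspace sketch, helper name hypothetical): consuming the mask
 * built by tcp_poll() above.  POLLHUP is only set once both directions are
 * shut down (or the socket is CLOSE), POLLPRI signals urgent data, and
 * POLLIN followed by a read() of 0 is the normal EOF indication.
 */
#include <poll.h>

static short wait_on_tcp_fd(int fd, int want_write)
{
	struct pollfd pfd = {
		.fd     = fd,
		.events = POLLIN | POLLPRI | (want_write ? POLLOUT : 0),
	};

	if (poll(&pfd, 1, -1) < 0)
		return -1;
	if (pfd.revents & (POLLERR | POLLHUP))
		return 0;		/* error or full shutdown: give up */
	return pfd.revents;		/* POLLIN / POLLOUT / POLLPRI */
}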
413
414int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415{
416 struct tcp_sock *tp = tcp_sk(sk);
417 int answ;
418
419 switch (cmd) {
420 case SIOCINQ:
421 if (sk->sk_state == TCP_LISTEN)
422 return -EINVAL;
423
424 lock_sock(sk);
425 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426 answ = 0;
427 else if (sock_flag(sk, SOCK_URGINLINE) ||
428 !tp->urg_data ||
429 before(tp->urg_seq, tp->copied_seq) ||
430 !before(tp->urg_seq, tp->rcv_nxt)) {
431 answ = tp->rcv_nxt - tp->copied_seq;
432
433 /* Subtract 1, if FIN is in queue. */
434 if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435 answ -=
436 ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437 } else
438 answ = tp->urg_seq - tp->copied_seq;
439 release_sock(sk);
440 break;
441 case SIOCATMARK:
442 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443 break;
444 case SIOCOUTQ:
445 if (sk->sk_state == TCP_LISTEN)
446 return -EINVAL;
447
448 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449 answ = 0;
450 else
451 answ = tp->write_seq - tp->snd_una;
452 break;
453 default:
454 return -ENOIOCTLCMD;
455 };
456
457 return put_user(answ, (int __user *)arg);
458}
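/*
 * Example (userspace sketch, helper name hypothetical): the three ioctls
 * answered above.  SIOCINQ is the byte count readable without blocking,
 * SIOCOUTQ is write_seq - snd_una (bytes not yet acknowledged), and
 * SIOCATMARK reports whether the next read starts at the urgent mark.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

static void print_tcp_queue_state(int fd)
{
	int inq = 0, outq = 0, atmark = 0;

	ioctl(fd, SIOCINQ, &inq);
	ioctl(fd, SIOCOUTQ, &outq);
	ioctl(fd, SIOCATMARK, &atmark);
	printf("inq=%d outq=%d atmark=%d\n", inq, outq, atmark);
}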
459
460
461int tcp_listen_start(struct sock *sk)
462{
463 struct inet_sock *inet = inet_sk(sk);
464 struct tcp_sock *tp = tcp_sk(sk);
465 int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467 if (rc != 0)
468 return rc;
469
470 sk->sk_max_ack_backlog = 0;
471 sk->sk_ack_backlog = 0;
472 tcp_delack_init(tp);
473
474 /* There is race window here: we announce ourselves listening,
475 * but this transition is still not validated by get_port().
476 * It is OK, because this socket enters to hash table only
477 * after validation is complete.
478 */
479 sk->sk_state = TCP_LISTEN;
480 if (!sk->sk_prot->get_port(sk, inet->num)) {
481 inet->sport = htons(inet->num);
482
483 sk_dst_reset(sk);
484 sk->sk_prot->hash(sk);
485
486 return 0;
487 }
488
489 sk->sk_state = TCP_CLOSE;
490 reqsk_queue_destroy(&tp->accept_queue);
491 return -EADDRINUSE;
492}
493
494/*
495 * This routine closes sockets which have been at least partially
496 * opened, but not yet accepted.
497 */
498
499static void tcp_listen_stop (struct sock *sk)
500{
501 struct tcp_sock *tp = tcp_sk(sk);
502 struct listen_sock *lopt;
503 struct request_sock *acc_req;
504 struct request_sock *req;
505 int i;
506
507 tcp_delete_keepalive_timer(sk);
508
509 /* make all the listen_opt local to us */
510 lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
511 acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512
513 if (lopt->qlen) {
514 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
515 while ((req = lopt->syn_table[i]) != NULL) {
516 lopt->syn_table[i] = req->dl_next;
517 lopt->qlen--;
518 reqsk_free(req);
519
520 /* Following specs, it would be better either to send FIN
521 * (and enter FIN-WAIT-1, it is normal close)
522 * or to send active reset (abort).
523 * Certainly, it is pretty dangerous while synflood, but it is
524 * bad justification for our negligence 8)
525 * To be honest, we are not able to make either
526 * of the variants now. --ANK
527 */
528 }
529 }
530 }
531 BUG_TRAP(!lopt->qlen);
532
533 kfree(lopt);
534
535 while ((req = acc_req) != NULL) {
536 struct sock *child = req->sk;
537
538 acc_req = req->dl_next;
539
540 local_bh_disable();
541 bh_lock_sock(child);
542 BUG_TRAP(!sock_owned_by_user(child));
543 sock_hold(child);
544
545 tcp_disconnect(child, O_NONBLOCK);
546
547 sock_orphan(child);
548
549 atomic_inc(&tcp_orphan_count);
550
551 tcp_destroy_sock(child);
552
553 bh_unlock_sock(child);
554 local_bh_enable();
555 sock_put(child);
556
557 sk_acceptq_removed(sk);
558 __reqsk_free(req);
559 }
560 BUG_TRAP(!sk->sk_ack_backlog);
561}
562
563static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
564{
565 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
566 tp->pushed_seq = tp->write_seq;
567}
568
569static inline int forced_push(struct tcp_sock *tp)
570{
571 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
572}
573
574static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
575 struct sk_buff *skb)
576{
577 skb->csum = 0;
578 TCP_SKB_CB(skb)->seq = tp->write_seq;
579 TCP_SKB_CB(skb)->end_seq = tp->write_seq;
580 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
581 TCP_SKB_CB(skb)->sacked = 0;
582 skb_header_release(skb);
583 __skb_queue_tail(&sk->sk_write_queue, skb);
584 sk_charge_skb(sk, skb);
585 if (!sk->sk_send_head)
586 sk->sk_send_head = skb;
587 else if (tp->nonagle&TCP_NAGLE_PUSH)
588 tp->nonagle &= ~TCP_NAGLE_PUSH;
589}
590
591static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
592 struct sk_buff *skb)
593{
594 if (flags & MSG_OOB) {
595 tp->urg_mode = 1;
596 tp->snd_up = tp->write_seq;
597 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
598 }
599}
600
601static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
602 int mss_now, int nonagle)
603{
604 if (sk->sk_send_head) {
605 struct sk_buff *skb = sk->sk_write_queue.prev;
606 if (!(flags & MSG_MORE) || forced_push(tp))
607 tcp_mark_push(tp, skb);
608 tcp_mark_urg(tp, flags, skb);
609 __tcp_push_pending_frames(sk, tp, mss_now,
610 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
611 }
612}
613
614static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
615 size_t psize, int flags)
616{
617 struct tcp_sock *tp = tcp_sk(sk);
618 int mss_now;
619 int err;
620 ssize_t copied;
621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
622
623 /* Wait for a connection to finish. */
624 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
625 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
626 goto out_err;
627
628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629
630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 copied = 0;
632
633 err = -EPIPE;
634 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
635 goto do_error;
636
637 while (psize > 0) {
638 struct sk_buff *skb = sk->sk_write_queue.prev;
639 struct page *page = pages[poffset / PAGE_SIZE];
640 int copy, i, can_coalesce;
641 int offset = poffset % PAGE_SIZE;
642 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643
644 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
645new_segment:
646 if (!sk_stream_memory_free(sk))
647 goto wait_for_sndbuf;
648
649 skb = sk_stream_alloc_pskb(sk, 0, 0,
650 sk->sk_allocation);
651 if (!skb)
652 goto wait_for_memory;
653
654 skb_entail(sk, tp, skb);
655 copy = mss_now;
656 }
657
658 if (copy > size)
659 copy = size;
660
661 i = skb_shinfo(skb)->nr_frags;
662 can_coalesce = skb_can_coalesce(skb, i, page, offset);
663 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
664 tcp_mark_push(tp, skb);
665 goto new_segment;
666 }
667 if (sk->sk_forward_alloc < copy &&
668 !sk_stream_mem_schedule(sk, copy, 0))
669 goto wait_for_memory;
670
671 if (can_coalesce) {
672 skb_shinfo(skb)->frags[i - 1].size += copy;
673 } else {
674 get_page(page);
675 skb_fill_page_desc(skb, i, page, offset, copy);
676 }
677
678 skb->len += copy;
679 skb->data_len += copy;
680 skb->truesize += copy;
681 sk->sk_wmem_queued += copy;
682 sk->sk_forward_alloc -= copy;
683 skb->ip_summed = CHECKSUM_HW;
684 tp->write_seq += copy;
685 TCP_SKB_CB(skb)->end_seq += copy;
686 skb_shinfo(skb)->tso_segs = 0;
687
688 if (!copied)
689 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
690
691 copied += copy;
692 poffset += copy;
693 if (!(psize -= copy))
694 goto out;
695
696 if (skb->len != mss_now || (flags & MSG_OOB))
697 continue;
698
699 if (forced_push(tp)) {
700 tcp_mark_push(tp, skb);
701 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
702 } else if (skb == sk->sk_send_head)
703 tcp_push_one(sk, mss_now);
704 continue;
705
706wait_for_sndbuf:
707 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
708wait_for_memory:
709 if (copied)
710 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
711
712 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
713 goto do_error;
714
715 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
716 }
717
718out:
719 if (copied)
720 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
721 return copied;
722
723do_error:
724 if (copied)
725 goto out;
726out_err:
727 return sk_stream_error(sk, flags, err);
728}
729
730ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
731 size_t size, int flags)
732{
733 ssize_t res;
734 struct sock *sk = sock->sk;
735
736#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
737
738 if (!(sk->sk_route_caps & NETIF_F_SG) ||
739 !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
740 return sock_no_sendpage(sock, page, offset, size, flags);
741
742#undef TCP_ZC_CSUM_FLAGS
743
744 lock_sock(sk);
745 TCP_CHECK_TIMER(sk);
746 res = do_tcp_sendpages(sk, &page, offset, size, flags);
747 TCP_CHECK_TIMER(sk);
748 release_sock(sk);
749 return res;
750}
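/*
 * Example (userspace sketch, helper name hypothetical): sendfile(2) on a TCP
 * socket typically ends up in tcp_sendpage() above.  The page-flipping path
 * is only taken when the route's device advertises NETIF_F_SG plus a checksum
 * offload capability; otherwise the data is copied via sock_no_sendpage().
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/sendfile.h>

static ssize_t send_whole_file(int sock_fd, int file_fd)
{
	struct stat st;
	off_t off = 0;
	ssize_t sent, total = 0;

	if (fstat(file_fd, &st) < 0)
		return -1;
	while (off < st.st_size) {
		sent = sendfile(sock_fd, file_fd, &off, st.st_size - off);
		if (sent <= 0)
			return total ? total : sent;
		total += sent;
	}
	return total;
}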
751
752#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
753#define TCP_OFF(sk) (sk->sk_sndmsg_off)
754
755static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{
757 int tmp = tp->mss_cache_std;
758
759 if (sk->sk_route_caps & NETIF_F_SG)
760 tmp = 0;
761
762 return tmp;
763}
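/*
 * Illustrative restatement of select_size() above (sketch only, names
 * hypothetical): this is the simplification named in the commit subject.
 * With a scatter-gather capable route the skb gets no linear data area at
 * all and the payload is copied into page fragments; otherwise the linear
 * area must be large enough for a full cached MSS.
 */
static inline int example_linear_alloc_size(int mss_cache_std, int route_has_sg)
{
	return route_has_sg ? 0 : mss_cache_std;
}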
764
765int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
766 size_t size)
767{
768 struct iovec *iov;
769 struct tcp_sock *tp = tcp_sk(sk);
770 struct sk_buff *skb;
771 int iovlen, flags;
772 int mss_now;
773 int err, copied;
774 long timeo;
775
776 lock_sock(sk);
777 TCP_CHECK_TIMER(sk);
778
779 flags = msg->msg_flags;
780 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
781
782 /* Wait for a connection to finish. */
783 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
784 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
785 goto out_err;
786
787 /* This should be in poll */
788 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
789
790 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
791
792 /* Ok commence sending. */
793 iovlen = msg->msg_iovlen;
794 iov = msg->msg_iov;
795 copied = 0;
796
797 err = -EPIPE;
798 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
799 goto do_error;
800
801 while (--iovlen >= 0) {
802 int seglen = iov->iov_len;
803 unsigned char __user *from = iov->iov_base;
804
805 iov++;
806
807 while (seglen > 0) {
808 int copy;
809
810 skb = sk->sk_write_queue.prev;
811
812 if (!sk->sk_send_head ||
813 (copy = mss_now - skb->len) <= 0) {
814
815new_segment:
816 /* Allocate new segment. If the interface is SG,
817 * allocate skb fitting to single page.
818 */
819 if (!sk_stream_memory_free(sk))
820 goto wait_for_sndbuf;
821
822 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
823 0, sk->sk_allocation);
824 if (!skb)
825 goto wait_for_memory;
826
827 /*
828 * Check whether we can use HW checksum.
829 */
830 if (sk->sk_route_caps &
831 (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
832 NETIF_F_HW_CSUM))
833 skb->ip_summed = CHECKSUM_HW;
834
835 skb_entail(sk, tp, skb);
836 copy = mss_now;
837 }
838
839 /* Try to append data to the end of skb. */
840 if (copy > seglen)
841 copy = seglen;
842
843 /* Where to copy to? */
844 if (skb_tailroom(skb) > 0) {
845 /* We have some space in skb head. Superb! */
846 if (copy > skb_tailroom(skb))
847 copy = skb_tailroom(skb);
848 if ((err = skb_add_data(skb, from, copy)) != 0)
849 goto do_fault;
850 } else {
851 int merge = 0;
852 int i = skb_shinfo(skb)->nr_frags;
853 struct page *page = TCP_PAGE(sk);
854 int off = TCP_OFF(sk);
855
856 if (skb_can_coalesce(skb, i, page, off) &&
857 off != PAGE_SIZE) {
858 /* We can extend the last page
859 * fragment. */
860 merge = 1;
861 } else if (i == MAX_SKB_FRAGS ||
862 (!i &&
863 !(sk->sk_route_caps & NETIF_F_SG))) {
864 /* Need to add new fragment and cannot
865 * do this because interface is non-SG,
866 * or because all the page slots are
867 * busy. */
868 tcp_mark_push(tp, skb);
869 goto new_segment;
870 } else if (page) {
871 if (off == PAGE_SIZE) {
872 put_page(page);
873 TCP_PAGE(sk) = page = NULL;
874 }
875 }
876
877 if (!page) {
878 /* Allocate new cache page. */
879 if (!(page = sk_stream_alloc_page(sk)))
880 goto wait_for_memory;
881 off = 0;
882 }
883
884 if (copy > PAGE_SIZE - off)
885 copy = PAGE_SIZE - off;
886
887 /* Time to copy data. We are close to
888 * the end! */
889 err = skb_copy_to_page(sk, from, skb, page,
890 off, copy);
891 if (err) {
892 /* If this page was new, give it to the
893 * socket so it does not get leaked.
894 */
895 if (!TCP_PAGE(sk)) {
896 TCP_PAGE(sk) = page;
897 TCP_OFF(sk) = 0;
898 }
899 goto do_error;
900 }
901
902 /* Update the skb. */
903 if (merge) {
904 skb_shinfo(skb)->frags[i - 1].size +=
905 copy;
906 } else {
907 skb_fill_page_desc(skb, i, page, off, copy);
908 if (TCP_PAGE(sk)) {
909 get_page(page);
910 } else if (off + copy < PAGE_SIZE) {
911 get_page(page);
912 TCP_PAGE(sk) = page;
913 }
914 }
915
916 TCP_OFF(sk) = off + copy;
917 }
918
919 if (!copied)
920 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
921
922 tp->write_seq += copy;
923 TCP_SKB_CB(skb)->end_seq += copy;
924 skb_shinfo(skb)->tso_segs = 0;
925
926 from += copy;
927 copied += copy;
928 if ((seglen -= copy) == 0 && iovlen == 0)
929 goto out;
930
931 if (skb->len != mss_now || (flags & MSG_OOB))
932 continue;
933
934 if (forced_push(tp)) {
935 tcp_mark_push(tp, skb);
936 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
937 } else if (skb == sk->sk_send_head)
938 tcp_push_one(sk, mss_now);
939 continue;
940
941wait_for_sndbuf:
942 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
943wait_for_memory:
944 if (copied)
945 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
946
947 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
948 goto do_error;
949
950 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
951 }
952 }
953
954out:
955 if (copied)
956 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
957 TCP_CHECK_TIMER(sk);
958 release_sock(sk);
959 return copied;
960
961do_fault:
962 if (!skb->len) {
963 if (sk->sk_send_head == skb)
964 sk->sk_send_head = NULL;
965 __skb_unlink(skb, skb->list);
966 sk_stream_free_skb(sk, skb);
967 }
968
969do_error:
970 if (copied)
971 goto out;
972out_err:
973 err = sk_stream_error(sk, flags, err);
974 TCP_CHECK_TIMER(sk);
975 release_sock(sk);
976 return err;
977}
978
979/*
980 * Handle reading urgent data. BSD has very simple semantics for
981 * this, no blocking and very strange errors 8)
982 */
983
984static int tcp_recv_urg(struct sock *sk, long timeo,
985 struct msghdr *msg, int len, int flags,
986 int *addr_len)
987{
988 struct tcp_sock *tp = tcp_sk(sk);
989
990 /* No URG data to read. */
991 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
992 tp->urg_data == TCP_URG_READ)
993 return -EINVAL; /* Yes this is right ! */
994
995 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
996 return -ENOTCONN;
997
998 if (tp->urg_data & TCP_URG_VALID) {
999 int err = 0;
1000 char c = tp->urg_data;
1001
1002 if (!(flags & MSG_PEEK))
1003 tp->urg_data = TCP_URG_READ;
1004
1005 /* Read urgent data. */
1006 msg->msg_flags |= MSG_OOB;
1007
1008 if (len > 0) {
1009 if (!(flags & MSG_TRUNC))
1010 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1011 len = 1;
1012 } else
1013 msg->msg_flags |= MSG_TRUNC;
1014
1015 return err ? -EFAULT : len;
1016 }
1017
1018 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1019 return 0;
1020
1021 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1022 * the available implementations agree in this case:
1023 * this call should never block, independent of the
1024 * blocking state of the socket.
1025 * Mike <pall@rz.uni-karlsruhe.de>
1026 */
1027 return -EAGAIN;
1028}
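/*
 * Example (userspace sketch, helper name hypothetical): reading the single
 * byte of urgent data handled above.  With SO_OOBINLINE clear, recv(fd, ...,
 * MSG_OOB) returns the byte at most once and never blocks; EINVAL and EAGAIN
 * are the "nothing urgent pending" answers described in the comments.
 */
#include <errno.h>
#include <sys/socket.h>

static int read_oob_byte(int fd, char *out)
{
	ssize_t n = recv(fd, out, 1, MSG_OOB);

	if (n == 1)
		return 1;				/* got the urgent byte */
	if (n < 0 && (errno == EINVAL || errno == EAGAIN))
		return 0;				/* no urgent data */
	return -1;					/* error or connection gone */
}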
1029
1030/* Clean up the receive buffer for full frames taken by the user,
1031 * then send an ACK if necessary. COPIED is the number of bytes
1032 * tcp_recvmsg has given to the user so far, it speeds up the
1033 * calculation of whether or not we must ACK for the sake of
1034 * a window update.
1035 */
1036static void cleanup_rbuf(struct sock *sk, int copied)
1037{
1038 struct tcp_sock *tp = tcp_sk(sk);
1039 int time_to_ack = 0;
1040
1041#if TCP_DEBUG
1042 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1043
1044 BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1045#endif
1046
1047 if (tcp_ack_scheduled(tp)) {
1048 /* Delayed ACKs frequently hit locked sockets during bulk
1049 * receive. */
1050 if (tp->ack.blocked ||
1051 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1052 tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1053 /*
1054 * If this read emptied read buffer, we send ACK, if
1055 * connection is not bidirectional, user drained
1056 * receive buffer and there was a small segment
1057 * in queue.
1058 */
1059 (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1060 !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1061 time_to_ack = 1;
1062 }
1063
1064 /* We send an ACK if we can now advertise a non-zero window
1065 * which has been raised "significantly".
1066 *
1067 * Even if window raised up to infinity, do not send window open ACK
1068 * in states, where we will not receive more. It is useless.
1069 */
1070 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1071 __u32 rcv_window_now = tcp_receive_window(tp);
1072
1073 /* Optimize, __tcp_select_window() is not cheap. */
1074 if (2*rcv_window_now <= tp->window_clamp) {
1075 __u32 new_window = __tcp_select_window(sk);
1076
1077 /* Send ACK now, if this read freed lots of space
1078 * in our buffer. Certainly, new_window is new window.
1079 * We can advertise it now, if it is not less than current one.
1080 * "Lots" means "at least twice" here.
1081 */
1082 if (new_window && new_window >= 2 * rcv_window_now)
1083 time_to_ack = 1;
1084 }
1085 }
1086 if (time_to_ack)
1087 tcp_send_ack(sk);
1088}
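/*
 * Sketch of the "significant window raise" test above (hypothetical helper,
 * not used anywhere): a window-update ACK is considered worthwhile once the
 * space freed by the reader would at least double the currently advertised
 * window, and only while that window is small relative to window_clamp.
 */
static inline int example_window_update_worthwhile(unsigned int rcv_window_now,
						   unsigned int new_window,
						   unsigned int window_clamp)
{
	return 2 * rcv_window_now <= window_clamp &&
	       new_window != 0 && new_window >= 2 * rcv_window_now;
}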
1089
1090static void tcp_prequeue_process(struct sock *sk)
1091{
1092 struct sk_buff *skb;
1093 struct tcp_sock *tp = tcp_sk(sk);
1094
1095 NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1096
1097 /* RX process wants to run with disabled BHs, though it is not
1098 * necessary */
1099 local_bh_disable();
1100 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1101 sk->sk_backlog_rcv(sk, skb);
1102 local_bh_enable();
1103
1104 /* Clear memory counter. */
1105 tp->ucopy.memory = 0;
1106}
1107
1108static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1109{
1110 struct sk_buff *skb;
1111 u32 offset;
1112
1113 skb_queue_walk(&sk->sk_receive_queue, skb) {
1114 offset = seq - TCP_SKB_CB(skb)->seq;
1115 if (skb->h.th->syn)
1116 offset--;
1117 if (offset < skb->len || skb->h.th->fin) {
1118 *off = offset;
1119 return skb;
1120 }
1121 }
1122 return NULL;
1123}
1124
1125/*
1126 * This routine provides an alternative to tcp_recvmsg() for routines
1127 * that would like to handle copying from skbuffs directly in 'sendfile'
1128 * fashion.
1129 * Note:
1130 * - It is assumed that the socket was locked by the caller.
1131 * - The routine does not block.
1132 * - At present, there is no support for reading OOB data
1133 * or for 'peeking' the socket using this routine
1134 * (although both would be easy to implement).
1135 */
1136int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1137 sk_read_actor_t recv_actor)
1138{
1139 struct sk_buff *skb;
1140 struct tcp_sock *tp = tcp_sk(sk);
1141 u32 seq = tp->copied_seq;
1142 u32 offset;
1143 int copied = 0;
1144
1145 if (sk->sk_state == TCP_LISTEN)
1146 return -ENOTCONN;
1147 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1148 if (offset < skb->len) {
1149 size_t used, len;
1150
1151 len = skb->len - offset;
1152 /* Stop reading if we hit a patch of urgent data */
1153 if (tp->urg_data) {
1154 u32 urg_offset = tp->urg_seq - seq;
1155 if (urg_offset < len)
1156 len = urg_offset;
1157 if (!len)
1158 break;
1159 }
1160 used = recv_actor(desc, skb, offset, len);
1161 if (used <= len) {
1162 seq += used;
1163 copied += used;
1164 offset += used;
1165 }
1166 if (offset != skb->len)
1167 break;
1168 }
1169 if (skb->h.th->fin) {
1170 sk_eat_skb(sk, skb);
1171 ++seq;
1172 break;
1173 }
1174 sk_eat_skb(sk, skb);
1175 if (!desc->count)
1176 break;
1177 }
1178 tp->copied_seq = seq;
1179
1180 tcp_rcv_space_adjust(sk);
1181
1182 /* Clean up data we have read: This will do ACK frames. */
1183 if (copied)
1184 cleanup_rbuf(sk, copied);
1185 return copied;
1186}
1187
1188/*
1189 * This routine copies from a sock struct into the user buffer.
1190 *
1191 * Technical note: in 2.3 we work on _locked_ socket, so that
1192 * tricks with *seq access order and skb->users are not required.
1193 * Probably, code can be easily improved even more.
1194 */
1195
1196int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1197 size_t len, int nonblock, int flags, int *addr_len)
1198{
1199 struct tcp_sock *tp = tcp_sk(sk);
1200 int copied = 0;
1201 u32 peek_seq;
1202 u32 *seq;
1203 unsigned long used;
1204 int err;
1205 int target; /* Read at least this many bytes */
1206 long timeo;
1207 struct task_struct *user_recv = NULL;
1208
1209 lock_sock(sk);
1210
1211 TCP_CHECK_TIMER(sk);
1212
1213 err = -ENOTCONN;
1214 if (sk->sk_state == TCP_LISTEN)
1215 goto out;
1216
1217 timeo = sock_rcvtimeo(sk, nonblock);
1218
1219 /* Urgent data needs to be handled specially. */
1220 if (flags & MSG_OOB)
1221 goto recv_urg;
1222
1223 seq = &tp->copied_seq;
1224 if (flags & MSG_PEEK) {
1225 peek_seq = tp->copied_seq;
1226 seq = &peek_seq;
1227 }
1228
1229 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1230
1231 do {
1232 struct sk_buff *skb;
1233 u32 offset;
1234
1235 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1236 if (tp->urg_data && tp->urg_seq == *seq) {
1237 if (copied)
1238 break;
1239 if (signal_pending(current)) {
1240 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1241 break;
1242 }
1243 }
1244
1245 /* Next get a buffer. */
1246
1247 skb = skb_peek(&sk->sk_receive_queue);
1248 do {
1249 if (!skb)
1250 break;
1251
1252 /* Now that we have two receive queues this
1253 * shouldn't happen.
1254 */
1255 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1256 printk(KERN_INFO "recvmsg bug: copied %X "
1257 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1258 break;
1259 }
1260 offset = *seq - TCP_SKB_CB(skb)->seq;
1261 if (skb->h.th->syn)
1262 offset--;
1263 if (offset < skb->len)
1264 goto found_ok_skb;
1265 if (skb->h.th->fin)
1266 goto found_fin_ok;
1267 BUG_TRAP(flags & MSG_PEEK);
1268 skb = skb->next;
1269 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1270
1271 /* Well, if we have backlog, try to process it now yet. */
1272
1273 if (copied >= target && !sk->sk_backlog.tail)
1274 break;
1275
1276 if (copied) {
1277 if (sk->sk_err ||
1278 sk->sk_state == TCP_CLOSE ||
1279 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1280 !timeo ||
1281 signal_pending(current) ||
1282 (flags & MSG_PEEK))
1283 break;
1284 } else {
1285 if (sock_flag(sk, SOCK_DONE))
1286 break;
1287
1288 if (sk->sk_err) {
1289 copied = sock_error(sk);
1290 break;
1291 }
1292
1293 if (sk->sk_shutdown & RCV_SHUTDOWN)
1294 break;
1295
1296 if (sk->sk_state == TCP_CLOSE) {
1297 if (!sock_flag(sk, SOCK_DONE)) {
1298 /* This occurs when user tries to read
1299 * from never connected socket.
1300 */
1301 copied = -ENOTCONN;
1302 break;
1303 }
1304 break;
1305 }
1306
1307 if (!timeo) {
1308 copied = -EAGAIN;
1309 break;
1310 }
1311
1312 if (signal_pending(current)) {
1313 copied = sock_intr_errno(timeo);
1314 break;
1315 }
1316 }
1317
1318 cleanup_rbuf(sk, copied);
1319
1320 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1321 /* Install new reader */
1322 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1323 user_recv = current;
1324 tp->ucopy.task = user_recv;
1325 tp->ucopy.iov = msg->msg_iov;
1326 }
1327
1328 tp->ucopy.len = len;
1329
1330 BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1331 (flags & (MSG_PEEK | MSG_TRUNC)));
1332
1333 /* Ugly... If prequeue is not empty, we have to
1334 * process it before releasing socket, otherwise
1335 * order will be broken at second iteration.
1336 * More elegant solution is required!!!
1337 *
1338 * Look: we have the following (pseudo)queues:
1339 *
1340 * 1. packets in flight
1341 * 2. backlog
1342 * 3. prequeue
1343 * 4. receive_queue
1344 *
1345 * Each queue can be processed only if the next ones
1346 * are empty. At this point we have empty receive_queue.
1347 * But prequeue _can_ be not empty after 2nd iteration,
1348 * when we jumped to start of loop because backlog
1349 * processing added something to receive_queue.
1350 * We cannot release_sock(), because backlog contains
1351 * packets arrived _after_ prequeued ones.
1352 *
1353 * Shortly, algorithm is clear --- to process all
1354 * the queues in order. We could make it more directly,
1355 * requeueing packets from backlog to prequeue, if
1356 * is not empty. It is more elegant, but eats cycles,
1357 * unfortunately.
1358 */
1359 if (skb_queue_len(&tp->ucopy.prequeue))
1360 goto do_prequeue;
1361
1362 /* __ Set realtime policy in scheduler __ */
1363 }
1364
1365 if (copied >= target) {
1366 /* Do not sleep, just process backlog. */
1367 release_sock(sk);
1368 lock_sock(sk);
1369 } else
1370 sk_wait_data(sk, &timeo);
1371
1372 if (user_recv) {
1373 int chunk;
1374
1375 /* __ Restore normal policy in scheduler __ */
1376
1377 if ((chunk = len - tp->ucopy.len) != 0) {
1378 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1379 len -= chunk;
1380 copied += chunk;
1381 }
1382
1383 if (tp->rcv_nxt == tp->copied_seq &&
1384 skb_queue_len(&tp->ucopy.prequeue)) {
1385do_prequeue:
1386 tcp_prequeue_process(sk);
1387
1388 if ((chunk = len - tp->ucopy.len) != 0) {
1389 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1390 len -= chunk;
1391 copied += chunk;
1392 }
1393 }
1394 }
1395 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1396 if (net_ratelimit())
1397 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1398 current->comm, current->pid);
1399 peek_seq = tp->copied_seq;
1400 }
1401 continue;
1402
1403 found_ok_skb:
1404 /* Ok so how much can we use? */
1405 used = skb->len - offset;
1406 if (len < used)
1407 used = len;
1408
1409 /* Do we have urgent data here? */
1410 if (tp->urg_data) {
1411 u32 urg_offset = tp->urg_seq - *seq;
1412 if (urg_offset < used) {
1413 if (!urg_offset) {
1414 if (!sock_flag(sk, SOCK_URGINLINE)) {
1415 ++*seq;
1416 offset++;
1417 used--;
1418 if (!used)
1419 goto skip_copy;
1420 }
1421 } else
1422 used = urg_offset;
1423 }
1424 }
1425
1426 if (!(flags & MSG_TRUNC)) {
1427 err = skb_copy_datagram_iovec(skb, offset,
1428 msg->msg_iov, used);
1429 if (err) {
1430 /* Exception. Bailout! */
1431 if (!copied)
1432 copied = -EFAULT;
1433 break;
1434 }
1435 }
1436
1437 *seq += used;
1438 copied += used;
1439 len -= used;
1440
1441 tcp_rcv_space_adjust(sk);
1442
1443skip_copy:
1444 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1445 tp->urg_data = 0;
1446 tcp_fast_path_check(sk, tp);
1447 }
1448 if (used + offset < skb->len)
1449 continue;
1450
1451 if (skb->h.th->fin)
1452 goto found_fin_ok;
1453 if (!(flags & MSG_PEEK))
1454 sk_eat_skb(sk, skb);
1455 continue;
1456
1457 found_fin_ok:
1458 /* Process the FIN. */
1459 ++*seq;
1460 if (!(flags & MSG_PEEK))
1461 sk_eat_skb(sk, skb);
1462 break;
1463 } while (len > 0);
1464
1465 if (user_recv) {
1466 if (skb_queue_len(&tp->ucopy.prequeue)) {
1467 int chunk;
1468
1469 tp->ucopy.len = copied > 0 ? len : 0;
1470
1471 tcp_prequeue_process(sk);
1472
1473 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1474 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1475 len -= chunk;
1476 copied += chunk;
1477 }
1478 }
1479
1480 tp->ucopy.task = NULL;
1481 tp->ucopy.len = 0;
1482 }
1483
1484 /* According to UNIX98, msg_name/msg_namelen are ignored
1485 * on connected socket. I was just happy when found this 8) --ANK
1486 */
1487
1488 /* Clean up data we have read: This will do ACK frames. */
1489 cleanup_rbuf(sk, copied);
1490
1491 TCP_CHECK_TIMER(sk);
1492 release_sock(sk);
1493 return copied;
1494
1495out:
1496 TCP_CHECK_TIMER(sk);
1497 release_sock(sk);
1498 return err;
1499
1500recv_urg:
1501 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1502 goto out;
1503}
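/*
 * Example (userspace sketch, helper name hypothetical): the 'target' that
 * tcp_recvmsg() computes via sock_rcvlowat() is what MSG_WAITALL (and
 * SO_RCVLOWAT) control from the application side.
 */
#include <sys/socket.h>

static ssize_t recv_exactly(int fd, void *buf, size_t len)
{
	/*
	 * MSG_WAITALL raises the target to len, so this returns only once
	 * len bytes have arrived, on EOF, or on error; a pending signal can
	 * still cause a short read.
	 */
	return recv(fd, buf, len, MSG_WAITALL);
}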
1504
1505/*
1506 * State processing on a close. This implements the state shift for
1507 * sending our FIN frame. Note that we only send a FIN for some
1508 * states. A shutdown() may have already sent the FIN, or we may be
1509 * closed.
1510 */
1511
1512static unsigned char new_state[16] = {
1513 /* current state: new state: action: */
1514 /* (Invalid) */ TCP_CLOSE,
1515 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1516 /* TCP_SYN_SENT */ TCP_CLOSE,
1517 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1518 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1519 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1520 /* TCP_TIME_WAIT */ TCP_CLOSE,
1521 /* TCP_CLOSE */ TCP_CLOSE,
1522 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1523 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1524 /* TCP_LISTEN */ TCP_CLOSE,
1525 /* TCP_CLOSING */ TCP_CLOSING,
1526};
1527
1528static int tcp_close_state(struct sock *sk)
1529{
1530 int next = (int)new_state[sk->sk_state];
1531 int ns = next & TCP_STATE_MASK;
1532
1533 tcp_set_state(sk, ns);
1534
1535 return next & TCP_ACTION_FIN;
1536}
1537
1538/*
1539 * Shutdown the sending side of a connection. Much like close except
1540 * that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1541 */
1542
1543void tcp_shutdown(struct sock *sk, int how)
1544{
1545 /* We need to grab some memory, and put together a FIN,
1546 * and then put it into the queue to be sent.
1547 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1548 */
1549 if (!(how & SEND_SHUTDOWN))
1550 return;
1551
1552 /* If we've already sent a FIN, or it's a closed state, skip this. */
1553 if ((1 << sk->sk_state) &
1554 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1555 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1556 /* Clear out any half completed packets. FIN if needed. */
1557 if (tcp_close_state(sk))
1558 tcp_send_fin(sk);
1559 }
1560}
1561
1562/*
1563 * At this point, there should be no process reference to this
1564 * socket, and thus no user references at all. Therefore we
1565 * can assume the socket waitqueue is inactive and nobody will
1566 * try to jump onto it.
1567 */
1568void tcp_destroy_sock(struct sock *sk)
1569{
1570 BUG_TRAP(sk->sk_state == TCP_CLOSE);
1571 BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1572
1573 /* It cannot be in hash table! */
1574 BUG_TRAP(sk_unhashed(sk));
1575
1576 /* If it has not 0 inet_sk(sk)->num, it must be bound */
1577 BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1578
1579 sk->sk_prot->destroy(sk);
1580
1581 sk_stream_kill_queues(sk);
1582
1583 xfrm_sk_free_policy(sk);
1584
1585#ifdef INET_REFCNT_DEBUG
1586 if (atomic_read(&sk->sk_refcnt) != 1) {
1587 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1588 sk, atomic_read(&sk->sk_refcnt));
1589 }
1590#endif
1591
1592 atomic_dec(&tcp_orphan_count);
1593 sock_put(sk);
1594}
1595
1596void tcp_close(struct sock *sk, long timeout)
1597{
1598 struct sk_buff *skb;
1599 int data_was_unread = 0;
1600
1601 lock_sock(sk);
1602 sk->sk_shutdown = SHUTDOWN_MASK;
1603
1604 if (sk->sk_state == TCP_LISTEN) {
1605 tcp_set_state(sk, TCP_CLOSE);
1606
1607 /* Special case. */
1608 tcp_listen_stop(sk);
1609
1610 goto adjudge_to_death;
1611 }
1612
1613 /* We need to flush the recv. buffs. We do this only on the
1614 * descriptor close, not protocol-sourced closes, because the
1615 * reader process may not have drained the data yet!
1616 */
1617 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1618 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1619 skb->h.th->fin;
1620 data_was_unread += len;
1621 __kfree_skb(skb);
1622 }
1623
1624 sk_stream_mem_reclaim(sk);
1625
1626 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1627 * 3.10, we send a RST here because data was lost. To
1628 * witness the awful effects of the old behavior of always
1629 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1630 * a bulk GET in an FTP client, suspend the process, wait
1631 * for the client to advertise a zero window, then kill -9
1632 * the FTP client, wheee... Note: timeout is always zero
1633 * in such a case.
1634 */
1635 if (data_was_unread) {
1636 /* Unread data was tossed, zap the connection. */
1637 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1638 tcp_set_state(sk, TCP_CLOSE);
1639 tcp_send_active_reset(sk, GFP_KERNEL);
1640 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1641 /* Check zero linger _after_ checking for unread data. */
1642 sk->sk_prot->disconnect(sk, 0);
1643 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1644 } else if (tcp_close_state(sk)) {
1645 /* We FIN if the application ate all the data before
1646 * zapping the connection.
1647 */
1648
1649 /* RED-PEN. Formally speaking, we have broken TCP state
1650 * machine. State transitions:
1651 *
1652 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1653 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1654 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1655 *
1656 * are legal only when FIN has been sent (i.e. in window),
1657 * rather than queued out of window. Purists blame.
1658 *
1659 * F.e. "RFC state" is ESTABLISHED,
1660 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1661 *
1662 * The visible declinations are that sometimes
1663 * we enter time-wait state, when it is not required really
1664 * (harmless), do not send active resets, when they are
1665 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1666 * they look as CLOSING or LAST_ACK for Linux)
1667 * Probably, I missed some more holelets.
1668 * --ANK
1669 */
1670 tcp_send_fin(sk);
1671 }
1672
1673 sk_stream_wait_close(sk, timeout);
1674
1675adjudge_to_death:
1676 /* It is the last release_sock in its life. It will remove backlog. */
1677 release_sock(sk);
1678
1679
1680 /* Now socket is owned by kernel and we acquire BH lock
1681 to finish close. No need to check for user refs.
1682 */
1683 local_bh_disable();
1684 bh_lock_sock(sk);
1685 BUG_TRAP(!sock_owned_by_user(sk));
1686
1687 sock_hold(sk);
1688 sock_orphan(sk);
1689
1690 /* This is a (useful) BSD violating of the RFC. There is a
1691 * problem with TCP as specified in that the other end could
1692 * keep a socket open forever with no application left this end.
1693 * We use a 3 minute timeout (about the same as BSD) then kill
1694 * our end. If they send after that then tough - BUT: long enough
1695 * that we won't make the old 4*rto = almost no time - whoops
1696 * reset mistake.
1697 *
1698 * Nope, it was not a mistake. It is really desired behaviour
1699 * f.e. on http servers, when such sockets are useless, but
1700 * consume significant resources. Let's do it with special
1701 * linger2 option. --ANK
1702 */
1703
1704 if (sk->sk_state == TCP_FIN_WAIT2) {
1705 struct tcp_sock *tp = tcp_sk(sk);
1706 if (tp->linger2 < 0) {
1707 tcp_set_state(sk, TCP_CLOSE);
1708 tcp_send_active_reset(sk, GFP_ATOMIC);
1709 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1710 } else {
1711 int tmo = tcp_fin_time(tp);
1712
1713 if (tmo > TCP_TIMEWAIT_LEN) {
1714 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1715 } else {
1716 atomic_inc(&tcp_orphan_count);
1717 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1718 goto out;
1719 }
1720 }
1721 }
1722 if (sk->sk_state != TCP_CLOSE) {
1723 sk_stream_mem_reclaim(sk);
1724 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1725 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1726 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1727 if (net_ratelimit())
1728 printk(KERN_INFO "TCP: too many orphaned "
1729 "sockets\n");
1730 tcp_set_state(sk, TCP_CLOSE);
1731 tcp_send_active_reset(sk, GFP_ATOMIC);
1732 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1733 }
1734 }
1735 atomic_inc(&tcp_orphan_count);
1736
1737 if (sk->sk_state == TCP_CLOSE)
1738 tcp_destroy_sock(sk);
1739 /* Otherwise, socket is reprieved until protocol close. */
1740
1741out:
1742 bh_unlock_sock(sk);
1743 local_bh_enable();
1744 sock_put(sk);
1745}
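/*
 * Example (userspace sketch, helper name hypothetical): the two abortive
 * close cases handled above.  Closing while unread data sits in the receive
 * queue, or closing with SO_LINGER enabled and a zero linger time, makes
 * tcp_close() send an RST instead of running the normal FIN handshake.
 */
#include <sys/socket.h>
#include <unistd.h>

static int abortive_close(int fd)
{
	struct linger lg = { .l_onoff = 1, .l_linger = 0 };

	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg)) < 0)
		return -1;
	return close(fd);	/* RST; no TIME_WAIT held on this end */
}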
1746
1747/* These states need RST on ABORT according to RFC793 */
1748
1749static inline int tcp_need_reset(int state)
1750{
1751 return (1 << state) &
1752 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1753 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1754}
1755
1756int tcp_disconnect(struct sock *sk, int flags)
1757{
1758 struct inet_sock *inet = inet_sk(sk);
1759 struct tcp_sock *tp = tcp_sk(sk);
1760 int err = 0;
1761 int old_state = sk->sk_state;
1762
1763 if (old_state != TCP_CLOSE)
1764 tcp_set_state(sk, TCP_CLOSE);
1765
1766 /* ABORT function of RFC793 */
1767 if (old_state == TCP_LISTEN) {
1768 tcp_listen_stop(sk);
1769 } else if (tcp_need_reset(old_state) ||
1770 (tp->snd_nxt != tp->write_seq &&
1771 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1772 /* The last check adjusts for the discrepancy of Linux wrt. RFC
1773 * states
1774 */
1775 tcp_send_active_reset(sk, gfp_any());
1776 sk->sk_err = ECONNRESET;
1777 } else if (old_state == TCP_SYN_SENT)
1778 sk->sk_err = ECONNRESET;
1779
1780 tcp_clear_xmit_timers(sk);
1781 __skb_queue_purge(&sk->sk_receive_queue);
1782 sk_stream_writequeue_purge(sk);
1783 __skb_queue_purge(&tp->out_of_order_queue);
1784
1785 inet->dport = 0;
1786
1787 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1788 inet_reset_saddr(sk);
1789
1790 sk->sk_shutdown = 0;
1791 sock_reset_flag(sk, SOCK_DONE);
1792 tp->srtt = 0;
1793 if ((tp->write_seq += tp->max_window + 2) == 0)
1794 tp->write_seq = 1;
1795 tp->backoff = 0;
1796 tp->snd_cwnd = 2;
1797 tp->probes_out = 0;
1798 tp->packets_out = 0;
1799 tp->snd_ssthresh = 0x7fffffff;
1800 tp->snd_cwnd_cnt = 0;
1801 tcp_set_ca_state(tp, TCP_CA_Open);
1802 tcp_clear_retrans(tp);
1803 tcp_delack_init(tp);
1804 sk->sk_send_head = NULL;
1805 tp->rx_opt.saw_tstamp = 0;
1806 tcp_sack_reset(&tp->rx_opt);
1807 __sk_dst_reset(sk);
1808
1809 BUG_TRAP(!inet->num || tp->bind_hash);
1810
1811 sk->sk_error_report(sk);
1812 return err;
1813}
1814
1815/*
1816 * Wait for an incoming connection, avoid race
1817 * conditions. This must be called with the socket locked.
1818 */
1819static int wait_for_connect(struct sock *sk, long timeo)
1820{
1821 struct tcp_sock *tp = tcp_sk(sk);
1822 DEFINE_WAIT(wait);
1823 int err;
1824
1825 /*
1826 * True wake-one mechanism for incoming connections: only
1827 * one process gets woken up, not the 'whole herd'.
1828 * Since we do not 'race & poll' for established sockets
1829 * anymore, the common case will execute the loop only once.
1830 *
1831 * Subtle issue: "add_wait_queue_exclusive()" will be added
1832 * after any current non-exclusive waiters, and we know that
1833 * it will always _stay_ after any new non-exclusive waiters
1834 * because all non-exclusive waiters are added at the
1835 * beginning of the wait-queue. As such, it's ok to "drop"
1836 * our exclusiveness temporarily when we get woken up without
1837 * having to remove and re-insert us on the wait queue.
1838 */
1839 for (;;) {
1840 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1841 TASK_INTERRUPTIBLE);
1842 release_sock(sk);
1843 if (reqsk_queue_empty(&tp->accept_queue))
1844 timeo = schedule_timeout(timeo);
1845 lock_sock(sk);
1846 err = 0;
1847 if (!reqsk_queue_empty(&tp->accept_queue))
1848 break;
1849 err = -EINVAL;
1850 if (sk->sk_state != TCP_LISTEN)
1851 break;
1852 err = sock_intr_errno(timeo);
1853 if (signal_pending(current))
1854 break;
1855 err = -EAGAIN;
1856 if (!timeo)
1857 break;
1858 }
1859 finish_wait(sk->sk_sleep, &wait);
1860 return err;
1861}
1862
1863/*
1864 * This will accept the next outstanding connection.
1865 */
1866
1867struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1868{
1869 struct tcp_sock *tp = tcp_sk(sk);
1870 struct sock *newsk;
1871 int error;
1872
1873 lock_sock(sk);
1874
1875 /* We need to make sure that this socket is listening,
1876 * and that it has something pending.
1877 */
1878 error = -EINVAL;
1879 if (sk->sk_state != TCP_LISTEN)
1880 goto out_err;
1881
1882 /* Find already established connection */
1883 if (reqsk_queue_empty(&tp->accept_queue)) {
1884 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1885
1886 /* If this is a non blocking socket don't sleep */
1887 error = -EAGAIN;
1888 if (!timeo)
1889 goto out_err;
1890
1891 error = wait_for_connect(sk, timeo);
1892 if (error)
1893 goto out_err;
1894 }
1895
1896 newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1897 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1898out:
1899 release_sock(sk);
1900 return newsk;
1901out_err:
1902 newsk = NULL;
1da177e4 1903 *err = error;
0e87506f 1904 goto out;
1905}
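
For orientation, the error codes above surface directly through accept(2): an empty accept queue on a non-blocking socket becomes EAGAIN (the !timeo path), a socket that is not in TCP_LISTEN becomes EINVAL, and success returns the child pulled off the accept queue by reqsk_queue_get_child(). A minimal user-space sketch follows; accept_nonblocking() and listen_fd are assumed names, not part of this file, and error handling is token only.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/socket.h>

/* Hedged sketch: listen_fd must be a bound, listening TCP socket. */
static int accept_nonblocking(int listen_fd)
{
	int flags = fcntl(listen_fd, F_GETFL, 0);
	int fd;

	fcntl(listen_fd, F_SETFL, flags | O_NONBLOCK);

	fd = accept(listen_fd, NULL, NULL);
	if (fd < 0) {
		if (errno == EAGAIN)		/* accept queue empty */
			fprintf(stderr, "no connection queued yet\n");
		else if (errno == EINVAL)	/* socket not in TCP_LISTEN */
			fprintf(stderr, "socket is not listening\n");
		else
			perror("accept");
	}
	return fd;	/* established child socket, or -1 */
}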
1906
1907/*
1908 * Socket option code for TCP.
1909 */
1910int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1911 int optlen)
1912{
1913 struct tcp_sock *tp = tcp_sk(sk);
1914 int val;
1915 int err = 0;
1916
1917 if (level != SOL_TCP)
1918 return tp->af_specific->setsockopt(sk, level, optname,
1919 optval, optlen);
1920
1921 /* This is a string value; all the others are ints */
1922 if (optname == TCP_CONGESTION) {
1923 char name[TCP_CA_NAME_MAX];
1924
1925 if (optlen < 1)
1926 return -EINVAL;
1927
1928 val = strncpy_from_user(name, optval,
1929 min(TCP_CA_NAME_MAX-1, optlen));
1930 if (val < 0)
1931 return -EFAULT;
1932 name[val] = 0;
1933
1934 lock_sock(sk);
1935 err = tcp_set_congestion_control(tp, name);
1936 release_sock(sk);
1937 return err;
1938 }
1939
1940 if (optlen < sizeof(int))
1941 return -EINVAL;
1942
1943 if (get_user(val, (int __user *)optval))
1944 return -EFAULT;
1945
1946 lock_sock(sk);
1947
1948 switch (optname) {
1949 case TCP_MAXSEG:
1950 /* Values greater than the interface MTU won't take effect. However,
1951 * at the point when this call is done we typically don't yet
1952 * know which interface is going to be used. */
1953 if (val < 8 || val > MAX_TCP_WINDOW) {
1954 err = -EINVAL;
1955 break;
1956 }
1957 tp->rx_opt.user_mss = val;
1958 break;
1959
1960 case TCP_NODELAY:
1961 if (val) {
1962 /* TCP_NODELAY is weaker than TCP_CORK, so that
1963 * this option on a corked socket is remembered, but
1964 * it is not activated until the cork is cleared.
1965 *
1966 * However, when TCP_NODELAY is set we make
1967 * an explicit push, which overrides even TCP_CORK
1968 * for currently queued segments.
1969 */
1970 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1971 tcp_push_pending_frames(sk, tp);
1972 } else {
1973 tp->nonagle &= ~TCP_NAGLE_OFF;
1974 }
1975 break;
1976
1977 case TCP_CORK:
1978 /* When set, indicates that non-full frames should always be queued.
1979 * Later the user clears this option and we transmit
1980 * any pending partial frames in the queue. This is
1981 * meant to be used alongside sendfile() to get properly
1982 * filled frames when the user (for example) must write
1983 * out headers with a write() call first and then use
1984 * sendfile to send out the data parts.
1985 *
1986 * TCP_CORK can be set together with TCP_NODELAY and it is
1987 * stronger than TCP_NODELAY.
1988 */
1989 if (val) {
1990 tp->nonagle |= TCP_NAGLE_CORK;
1991 } else {
1992 tp->nonagle &= ~TCP_NAGLE_CORK;
1993 if (tp->nonagle&TCP_NAGLE_OFF)
1994 tp->nonagle |= TCP_NAGLE_PUSH;
1995 tcp_push_pending_frames(sk, tp);
1996 }
1997 break;
1998
1999 case TCP_KEEPIDLE:
2000 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2001 err = -EINVAL;
2002 else {
2003 tp->keepalive_time = val * HZ;
2004 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2005 !((1 << sk->sk_state) &
2006 (TCPF_CLOSE | TCPF_LISTEN))) {
2007 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2008 if (tp->keepalive_time > elapsed)
2009 elapsed = tp->keepalive_time - elapsed;
2010 else
2011 elapsed = 0;
2012 tcp_reset_keepalive_timer(sk, elapsed);
2013 }
2014 }
2015 break;
2016 case TCP_KEEPINTVL:
2017 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2018 err = -EINVAL;
2019 else
2020 tp->keepalive_intvl = val * HZ;
2021 break;
2022 case TCP_KEEPCNT:
2023 if (val < 1 || val > MAX_TCP_KEEPCNT)
2024 err = -EINVAL;
2025 else
2026 tp->keepalive_probes = val;
2027 break;
2028 case TCP_SYNCNT:
2029 if (val < 1 || val > MAX_TCP_SYNCNT)
2030 err = -EINVAL;
2031 else
2032 tp->syn_retries = val;
2033 break;
2034
2035 case TCP_LINGER2:
2036 if (val < 0)
2037 tp->linger2 = -1;
2038 else if (val > sysctl_tcp_fin_timeout / HZ)
2039 tp->linger2 = 0;
2040 else
2041 tp->linger2 = val * HZ;
2042 break;
2043
2044 case TCP_DEFER_ACCEPT:
2045 tp->defer_accept = 0;
2046 if (val > 0) {
2047 /* Translate value in seconds to number of
2048 * retransmits */
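			 /* Worked example (assuming TCP_TIMEOUT_INIT/HZ == 3, the
			  * usual 3 second initial RTO): val = 10 advances the loop
			  * while 10 > 3 and 10 > 6, stops at 10 <= 12, leaving
			  * defer_accept == 2; the increment after the loop then
			  * stores 3.  getsockopt(TCP_DEFER_ACCEPT) reports this
			  * back as (TCP_TIMEOUT_INIT / HZ) << (3 - 1) = 12 seconds. */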
2049 while (tp->defer_accept < 32 &&
2050 val > ((TCP_TIMEOUT_INIT / HZ) <<
2051 tp->defer_accept))
2052 tp->defer_accept++;
2053 tp->defer_accept++;
2054 }
2055 break;
2056
2057 case TCP_WINDOW_CLAMP:
2058 if (!val) {
2059 if (sk->sk_state != TCP_CLOSE) {
2060 err = -EINVAL;
2061 break;
2062 }
2063 tp->window_clamp = 0;
2064 } else
2065 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2066 SOCK_MIN_RCVBUF / 2 : val;
2067 break;
2068
2069 case TCP_QUICKACK:
2070 if (!val) {
2071 tp->ack.pingpong = 1;
2072 } else {
2073 tp->ack.pingpong = 0;
2074 if ((1 << sk->sk_state) &
2075 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2076 tcp_ack_scheduled(tp)) {
2077 tp->ack.pending |= TCP_ACK_PUSHED;
2078 cleanup_rbuf(sk, 1);
2079 if (!(val & 1))
2080 tp->ack.pingpong = 1;
2081 }
2082 }
2083 break;
2084
2085 default:
2086 err = -ENOPROTOOPT;
2087 break;
2088 };
2089 release_sock(sk);
2090 return err;
2091}
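
The TCP_CORK comment above describes the intended write()-headers-then-sendfile() usage. Below is a hedged user-space sketch of that pattern; sock_fd, file_fd, hdr and the lengths are assumed to exist and error handling is dropped for brevity. Clearing the cork at the end is what lets tcp_push_pending_frames() send any remaining partial frame, while setting TCP_NODELAY instead forces an explicit push even of queued corked segments, as the comment in the TCP_NODELAY case notes.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <unistd.h>

/* Hedged sketch of the cork/uncork pattern; all names below are assumed. */
static void send_headers_then_file(int sock_fd, const char *hdr, size_t hdr_len,
				   int file_fd, size_t file_len)
{
	int on = 1, off = 0;

	setsockopt(sock_fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	write(sock_fd, hdr, hdr_len);			/* queued, not pushed */
	sendfile(sock_fd, file_fd, NULL, file_len);	/* fills frames alongside the headers */
	setsockopt(sock_fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
	/* uncorking transmits any pending partial frame */
}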
2092
2093/* Return information about the state of a TCP endpoint in API format. */
2094void tcp_get_info(struct sock *sk, struct tcp_info *info)
2095{
2096 struct tcp_sock *tp = tcp_sk(sk);
2097 u32 now = tcp_time_stamp;
2098
2099 memset(info, 0, sizeof(*info));
2100
2101 info->tcpi_state = sk->sk_state;
2102 info->tcpi_ca_state = tp->ca_state;
2103 info->tcpi_retransmits = tp->retransmits;
2104 info->tcpi_probes = tp->probes_out;
2105 info->tcpi_backoff = tp->backoff;
2106
2107 if (tp->rx_opt.tstamp_ok)
2108 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2109 if (tp->rx_opt.sack_ok)
2110 info->tcpi_options |= TCPI_OPT_SACK;
2111 if (tp->rx_opt.wscale_ok) {
2112 info->tcpi_options |= TCPI_OPT_WSCALE;
2113 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2114 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2115 }
2116
2117 if (tp->ecn_flags&TCP_ECN_OK)
2118 info->tcpi_options |= TCPI_OPT_ECN;
2119
2120 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2121 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2122 info->tcpi_snd_mss = tp->mss_cache_std;
2123 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2124
2125 info->tcpi_unacked = tp->packets_out;
2126 info->tcpi_sacked = tp->sacked_out;
2127 info->tcpi_lost = tp->lost_out;
2128 info->tcpi_retrans = tp->retrans_out;
2129 info->tcpi_fackets = tp->fackets_out;
2130
2131 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2132 info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2133 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2134
2135 info->tcpi_pmtu = tp->pmtu_cookie;
2136 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2137 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2138 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2139 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2140 info->tcpi_snd_cwnd = tp->snd_cwnd;
2141 info->tcpi_advmss = tp->advmss;
2142 info->tcpi_reordering = tp->reordering;
2143
2144 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2145 info->tcpi_rcv_space = tp->rcvq_space.space;
2146
2147 info->tcpi_total_retrans = tp->total_retrans;
2148}
2149
2150EXPORT_SYMBOL_GPL(tcp_get_info);
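
User space reads the structure filled in above via getsockopt(TCP_INFO). A minimal sketch, assuming fd is a connected TCP socket and that the libc headers expose struct tcp_info; the helper name is illustrative.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

static void dump_tcp_info(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
		printf("rtt=%uus rttvar=%uus cwnd=%u total_retrans=%u\n",
		       info.tcpi_rtt, info.tcpi_rttvar,
		       info.tcpi_snd_cwnd, info.tcpi_total_retrans);
}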
2151
2152int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2153 int __user *optlen)
2154{
2155 struct tcp_sock *tp = tcp_sk(sk);
2156 int val, len;
2157
2158 if (level != SOL_TCP)
2159 return tp->af_specific->getsockopt(sk, level, optname,
2160 optval, optlen);
2161
2162 if (get_user(len, optlen))
2163 return -EFAULT;
2164
2165 len = min_t(unsigned int, len, sizeof(int));
2166
2167 if (len < 0)
2168 return -EINVAL;
2169
2170 switch (optname) {
2171 case TCP_MAXSEG:
2172 val = tp->mss_cache_std;
2173 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2174 val = tp->rx_opt.user_mss;
2175 break;
2176 case TCP_NODELAY:
2177 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2178 break;
2179 case TCP_CORK:
2180 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2181 break;
2182 case TCP_KEEPIDLE:
2183 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2184 break;
2185 case TCP_KEEPINTVL:
2186 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2187 break;
2188 case TCP_KEEPCNT:
2189 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2190 break;
2191 case TCP_SYNCNT:
2192 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2193 break;
2194 case TCP_LINGER2:
2195 val = tp->linger2;
2196 if (val >= 0)
2197 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2198 break;
2199 case TCP_DEFER_ACCEPT:
2200 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2201 (tp->defer_accept - 1));
2202 break;
2203 case TCP_WINDOW_CLAMP:
2204 val = tp->window_clamp;
2205 break;
2206 case TCP_INFO: {
2207 struct tcp_info info;
2208
2209 if (get_user(len, optlen))
2210 return -EFAULT;
2211
2212 tcp_get_info(sk, &info);
2213
2214 len = min_t(unsigned int, len, sizeof(info));
2215 if (put_user(len, optlen))
2216 return -EFAULT;
2217 if (copy_to_user(optval, &info, len))
2218 return -EFAULT;
2219 return 0;
2220 }
2221 case TCP_QUICKACK:
2222 val = !tp->ack.pingpong;
2223 break;
2224
2225 case TCP_CONGESTION:
2226 if (get_user(len, optlen))
2227 return -EFAULT;
2228 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2229 if (put_user(len, optlen))
2230 return -EFAULT;
2231 if (copy_to_user(optval, tp->ca_ops->name, len))
2232 return -EFAULT;
2233 return 0;
2234 default:
2235 return -ENOPROTOOPT;
2236 };
2237
2238 if (put_user(len, optlen))
2239 return -EFAULT;
2240 if (copy_to_user(optval, &val, len))
2241 return -EFAULT;
2242 return 0;
2243}
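
As a counterpart to the string-valued TCP_CONGESTION case, the sketch below reads the current congestion control name from user space. The option value and buffer size are taken from this tree's headers (TCP_CONGESTION is 13 and TCP_CA_NAME_MAX is 16, per include/linux/tcp.h and include/net/tcp.h); the guard is only there because libc headers that predate the option may not define the constant.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

#ifndef TCP_CONGESTION
#define TCP_CONGESTION	13	/* matches include/linux/tcp.h in this tree */
#endif

static void show_congestion_control(int fd)
{
	char name[16 + 1];		/* TCP_CA_NAME_MAX plus a terminator */
	socklen_t len = sizeof(name) - 1;

	memset(name, 0, sizeof(name));
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
		printf("congestion control: %s\n", name);
}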
2244
2245
2246extern void __skb_cb_too_small_for_tcp(int, int);
5f8ef48d 2247extern struct tcp_congestion_ops tcp_reno;
2248
2249static __initdata unsigned long thash_entries;
2250static int __init set_thash_entries(char *str)
2251{
2252 if (!str)
2253 return 0;
2254 thash_entries = simple_strtoul(str, &str, 0);
2255 return 1;
2256}
2257__setup("thash_entries=", set_thash_entries);
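
The __setup() hook above turns thash_entries into a kernel boot parameter, so the established-hash sizing passed to alloc_large_system_hash() in tcp_init() below can be overridden at boot. An illustrative (not prescriptive) command-line fragment:

	thash_entries=131072

With no override, thash_entries stays 0 and alloc_large_system_hash() sizes the table from available memory instead.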
2258
2259void __init tcp_init(void)
2260{
2261 struct sk_buff *skb = NULL;
2262 int order, i;
2263
2264 if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2265 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2266 sizeof(skb->cb));
2267
2268 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2269 sizeof(struct tcp_bind_bucket),
2270 0, SLAB_HWCACHE_ALIGN,
2271 NULL, NULL);
2272 if (!tcp_bucket_cachep)
2273 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2274
2275 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2276 sizeof(struct tcp_tw_bucket),
2277 0, SLAB_HWCACHE_ALIGN,
2278 NULL, NULL);
2279 if (!tcp_timewait_cachep)
2280 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2281
2282 /* Size and allocate the main established and bind bucket
2283 * hash tables.
2284 *
2285 * The methodology is similar to that of the buffer cache.
2286 */
2287 tcp_ehash = (struct tcp_ehash_bucket *)
2288 alloc_large_system_hash("TCP established",
2289 sizeof(struct tcp_ehash_bucket),
2290 thash_entries,
2291 (num_physpages >= 128 * 1024) ?
2292 (25 - PAGE_SHIFT) :
2293 (27 - PAGE_SHIFT),
2294 HASH_HIGHMEM,
2295 &tcp_ehash_size,
2296 NULL,
2297 0);
2298 tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2299 for (i = 0; i < (tcp_ehash_size << 1); i++) {
2300 rwlock_init(&tcp_ehash[i].lock);
2301 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2302 }
2303
2304 tcp_bhash = (struct tcp_bind_hashbucket *)
2305 alloc_large_system_hash("TCP bind",
2306 sizeof(struct tcp_bind_hashbucket),
2307 tcp_ehash_size,
2308 (num_physpages >= 128 * 1024) ?
2309 (25 - PAGE_SHIFT) :
2310 (27 - PAGE_SHIFT),
2311 HASH_HIGHMEM,
2312 &tcp_bhash_size,
2313 NULL,
2314 64 * 1024);
2315 tcp_bhash_size = 1 << tcp_bhash_size;
2316 for (i = 0; i < tcp_bhash_size; i++) {
2317 spin_lock_init(&tcp_bhash[i].lock);
2318 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2319 }
2320
2321 /* Try to be a bit smarter and adjust defaults depending
2322 * on available memory.
2323 */
2324 for (order = 0; ((1 << order) << PAGE_SHIFT) <
2325 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2326 order++)
2327 ;
e7626486 2328 if (order >= 4) {
2329 sysctl_local_port_range[0] = 32768;
2330 sysctl_local_port_range[1] = 61000;
2331 sysctl_tcp_max_tw_buckets = 180000;
2332 sysctl_tcp_max_orphans = 4096 << (order - 4);
2333 sysctl_max_syn_backlog = 1024;
2334 } else if (order < 3) {
2335 sysctl_local_port_range[0] = 1024 * (3 - order);
2336 sysctl_tcp_max_tw_buckets >>= (3 - order);
2337 sysctl_tcp_max_orphans >>= (3 - order);
2338 sysctl_max_syn_backlog = 128;
2339 }
2340 tcp_port_rover = sysctl_local_port_range[0] - 1;
2341
2342 sysctl_tcp_mem[0] = 768 << order;
2343 sysctl_tcp_mem[1] = 1024 << order;
2344 sysctl_tcp_mem[2] = 1536 << order;
2345
2346 if (order < 3) {
2347 sysctl_tcp_wmem[2] = 64 * 1024;
2348 sysctl_tcp_rmem[0] = PAGE_SIZE;
2349 sysctl_tcp_rmem[1] = 43689;
2350 sysctl_tcp_rmem[2] = 2 * 43689;
2351 }
2352
2353 printk(KERN_INFO "TCP: Hash tables configured "
2354 "(established %d bind %d)\n",
2355 tcp_ehash_size << 1, tcp_bhash_size);
2356
2357 tcp_register_congestion_control(&tcp_reno);
2358}
2359
2360EXPORT_SYMBOL(tcp_accept);
2361EXPORT_SYMBOL(tcp_close);
2362EXPORT_SYMBOL(tcp_destroy_sock);
2363EXPORT_SYMBOL(tcp_disconnect);
2364EXPORT_SYMBOL(tcp_getsockopt);
2365EXPORT_SYMBOL(tcp_ioctl);
2366EXPORT_SYMBOL(tcp_poll);
2367EXPORT_SYMBOL(tcp_read_sock);
2368EXPORT_SYMBOL(tcp_recvmsg);
2369EXPORT_SYMBOL(tcp_sendmsg);
2370EXPORT_SYMBOL(tcp_sendpage);
2371EXPORT_SYMBOL(tcp_setsockopt);
2372EXPORT_SYMBOL(tcp_shutdown);
2373EXPORT_SYMBOL(tcp_statistics);
2374EXPORT_SYMBOL(tcp_timewait_cachep);