/* net/ipv4/tcp_ipv4.c */

/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

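/*
 * Note: secure_tcp_sequence_number() (net/core/secure_seq.c) derives the
 * ISN from a keyed hash of the connection 4-tuple mixed with a clock
 * component, along the lines of RFC 6528, so the ISN is hard to guess
 * off-path while successive connections between the same endpoints still
 * get advancing sequence spaces.
 */
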
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

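/*
 * The "+ 65535 + 2" above is intended to start the reused connection's
 * write_seq past anything the TIME-WAIT incarnation could still have in
 * flight: one maximal (unscaled) 64K window beyond tw_snd_nxt, plus two
 * for SYN/FIN sequence-space consumption, so stray old segments cannot
 * alias into the new sequence space.
 */
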
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_set_txhash(sk);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

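/*
 * Note the two-step routing dance above: ip_route_connect() runs before
 * the local port is known (sport may still be zero), and once
 * inet_hash_connect() has picked an ephemeral port the route is
 * re-derived with ip_route_newports(), since the final port pair can
 * influence flow-based routing decisions.
 */
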
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * in case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && fastopen->sk == NULL)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

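/*
 * The backoff revert above implements the idea from
 * draft-zimmermann-tcp-lcd: an ICMP net/host unreachable that matches the
 * earliest outstanding segment suggests the retransmission was lost to a
 * (possibly transient) connectivity disruption rather than to congestion,
 * so one exponential-backoff step is undone and, if the reverted RTO has
 * already expired, the segment is retransmitted immediately.
 */
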
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

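/*
 * Sequence-number choice for the RST above follows RFC 793: if the
 * offending segment carried an ACK, the RST is sent with seq = ack_seq
 * so the receiver accepts it; otherwise the RST carries no sequence of
 * its own to defend and instead ACKs exactly the sequence space the
 * offending segment consumed.
 */
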
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      u16 queue_mapping,
			      struct tcp_fastopen_cookie *foc)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

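/*
 * tcp_syncookies == 2 means cookies are sent unconditionally, not only
 * when the accept queue overflows, which is why the SYN-flood warning
 * above is suppressed in that mode: the condition is expected rather
 * than evidence of an attack.
 */
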
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

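/*
 * For reference, a minimal userspace sketch of installing a key via the
 * TCP_MD5SIG socket option, which is what lands in tcp_v4_parse_md5_keys()
 * above. The peer address, key, and socket fd are purely illustrative:
 *
 *	struct tcp_md5sig md5;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	memset(&md5, 0, sizeof(md5));
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */
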
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

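/*
 * Per RFC 2385 the digest covers, in order: the IPv4 pseudo-header, the
 * TCP header with its checksum field zeroed (options are excluded, see
 * tcp_md5_hash_header() in tcp.c), the segment data if any, and finally
 * the key itself. tcp_v4_md5_hash_skb() below is the variant that also
 * hashes the payload.
 */
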
int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->ir_loc_addr;
		daddr = inet_rsk(req)->ir_rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
				      const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
}

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	bool ret;

	rcu_read_lock();
	ret = __tcp_v4_inbound_md5_hash(sk, skb);
	rcu_read_unlock();

	return ret;
}

#endif

static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
};

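/*
 * tcp_request_sock_ipv4_ops supplies the address-family specific hooks
 * (option saving, route lookup, ISN generation, SYN-ACK emission, ...)
 * that the protocol-independent tcp_conn_request() in tcp_input.c
 * dispatches through; the IPv6 code passes an equivalent table.
 */
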
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->ir_rmt_addr;
	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	inet_set_txhash(newsk);
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev, false);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

LT
1437/* The socket must have it's spinlock held when we get
1438 * here.
1439 *
1440 * We have a potential double-lock case here, so even when
1441 * doing backlog processing we use the BH locking scheme.
1442 * This is because we cannot sleep with the original spinlock
1443 * held.
1444 */
1445int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1446{
cfb6eeb4 1447 struct sock *rsk;
cfb6eeb4 1448
1da177e4 1449 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
404e0a8b
ED
1450 struct dst_entry *dst = sk->sk_rx_dst;
1451
bdeab991 1452 sock_rps_save_rxhash(sk, skb);
404e0a8b 1453 if (dst) {
505fbcf0
ED
1454 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1455 dst->ops->check(dst, 0) == NULL) {
92101b3b
DM
1456 dst_release(dst);
1457 sk->sk_rx_dst = NULL;
1458 }
1459 }
c995ae22 1460 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1da177e4
LT
1461 return 0;
1462 }
1463
ab6a5bb6 1464 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1da177e4
LT
1465 goto csum_err;
1466
1467 if (sk->sk_state == TCP_LISTEN) {
1468 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1469 if (!nsk)
1470 goto discard;
1471
1472 if (nsk != sk) {
bdeab991 1473 sock_rps_save_rxhash(nsk, skb);
cfb6eeb4
YH
1474 if (tcp_child_process(sk, nsk, skb)) {
1475 rsk = nsk;
1da177e4 1476 goto reset;
cfb6eeb4 1477 }
1da177e4
LT
1478 return 0;
1479 }
ca55158c 1480 } else
bdeab991 1481 sock_rps_save_rxhash(sk, skb);
ca55158c 1482
aa8223c7 1483 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
cfb6eeb4 1484 rsk = sk;
1da177e4 1485 goto reset;
cfb6eeb4 1486 }
1da177e4
LT
1487 return 0;
1488
1489reset:
cfb6eeb4 1490 tcp_v4_send_reset(rsk, skb);
1da177e4
LT
1491discard:
1492 kfree_skb(skb);
1493 /* Be careful here. If this function gets more complicated and
1494 * gcc suffers from register pressure on the x86, sk (in %ebx)
1495 * might be destroyed here. This current version compiles correctly,
1496 * but you have been warned.
1497 */
1498 return 0;
1499
1500csum_err:
6a5dc9e5 1501 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
63231bdd 1502 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1503 goto discard;
1504}
4bc2f18b 1505EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4 1506
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk->sk_state != TCP_TIME_WAIT) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

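/*
 * Early demux runs from the IP receive path before routing: by finding
 * the established socket here and reusing its cached sk_rx_dst, both the
 * per-packet FIB lookup and the second socket lookup in tcp_v4_rcv()
 * (which steals skb->sk) are avoided for the common case.
 */
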
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	skb_dst_force(skb);
	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);

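/*
 * The trade-off made by the prequeue: segments are parked until the
 * blocked reader wakes, so checksum and protocol work are charged to the
 * receiving task and ACKs pace out at the application's read rate;
 * setting sysctl_tcp_low_latency bypasses it in favour of immediate
 * processing in softirq context.
 */
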
1da177e4
LT
1589/*
1590 * From tcp_input.c
1591 */
1592
1593int tcp_v4_rcv(struct sk_buff *skb)
1594{
eddc9ec5 1595 const struct iphdr *iph;
cf533ea5 1596 const struct tcphdr *th;
1da177e4
LT
1597 struct sock *sk;
1598 int ret;
a86b1e30 1599 struct net *net = dev_net(skb->dev);
1da177e4
LT
1600
1601 if (skb->pkt_type != PACKET_HOST)
1602 goto discard_it;
1603
1604 /* Count it even if it's bad */
63231bdd 1605 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1da177e4
LT
1606
1607 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1608 goto discard_it;
1609
aa8223c7 1610 th = tcp_hdr(skb);
1da177e4
LT
1611
1612 if (th->doff < sizeof(struct tcphdr) / 4)
1613 goto bad_packet;
1614 if (!pskb_may_pull(skb, th->doff * 4))
1615 goto discard_it;
1616
1617 /* An explanation is required here, I think.
1618 * Packet length and doff are validated by header prediction,
caa20d9a 1619 * provided case of th->doff==0 is eliminated.
1da177e4 1620 * So, we defer the checks. */
ed70fcfc
TH
1621
1622 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1623 goto csum_error;
1da177e4 1624
aa8223c7 1625 th = tcp_hdr(skb);
eddc9ec5 1626 iph = ip_hdr(skb);
1da177e4
LT
1627 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1628 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1629 skb->len - th->doff * 4);
1630 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1631 TCP_SKB_CB(skb)->when = 0;
b82d1bb4 1632 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4
LT
1633 TCP_SKB_CB(skb)->sacked = 0;
1634
9a1f27c4 1635 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1da177e4
LT
1636 if (!sk)
1637 goto no_tcp_socket;
1638
bb134d5d
ED
1639process:
1640 if (sk->sk_state == TCP_TIME_WAIT)
1641 goto do_time_wait;
1642
6cce09f8
ED
1643 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1644 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1645 goto discard_and_relse;
6cce09f8 1646 }
d218d111 1647
1da177e4
LT
1648 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1649 goto discard_and_relse;
9ea88a15
DP
1650
1651#ifdef CONFIG_TCP_MD5SIG
1652 /*
1653 * We really want to reject the packet as early as possible
1654 * if:
1655 * o We're expecting an MD5'd packet and this is no MD5 tcp option
1656 * o There is an MD5 option and we're not expecting one
1657 */
1658 if (tcp_v4_inbound_md5_hash(sk, skb))
1659 goto discard_and_relse;
1660#endif
1661
b59c2701 1662 nf_reset(skb);
1da177e4 1663
fda9ef5d 1664 if (sk_filter(sk, skb))
1da177e4
LT
1665 goto discard_and_relse;
1666
8b80cda5 1667 sk_mark_napi_id(sk, skb);
1da177e4
LT
1668 skb->dev = NULL;
1669
	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = net_dma_find_channel();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

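/*
 * TIME_WAIT handling: tcp_timewait_state_process() classifies the segment.
 * TCP_TW_SYN lets a valid new SYN re-create the connection if a matching
 * listener exists, TCP_TW_ACK answers with the timewait ACK, TCP_TW_RST
 * routes the segment to the reset path, and TCP_TW_SUCCESS means it has
 * been fully consumed.
 */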
do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2)) {
		inet_twsk_put(inet_twsk(sk));
		goto bad_packet;
	}
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

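/*
 * Cache the input route on the socket so later packets for this flow can
 * skip the routing lookup; rx_dst_ifindex records the ingress interface
 * the cached dst is valid for.
 */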
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	dst_hold(dst);
	sk->sk_rx_dst = dst;
	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: sk_alloc() explicitly zeroes a lot of fields, so they need not
 * be initialized here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean up the prequeue; it must really be empty by now. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk != NULL);

	/* If the socket was aborted during a connect operation, free any
	 * pending fast open request. */
	tcp_free_fastopen_req(tp);

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur.  If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero, the very first socket in the hash table is returned.
 */
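/*
 * Besides the listening sockets themselves, this walker also descends into
 * each listener's SYN table (state TCP_SEQ_STATE_OPENREQ), so embryonic
 * connections are reported alongside the listeners that own them.
 */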
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get the first established socket starting from the bucket given in
 * st->bucket.  If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

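/*
 * Note on locking: when established_get_first() returns a socket, the
 * corresponding ehash bucket lock is still held.  established_get_next()
 * keeps it held while walking within the bucket and releases it only when
 * advancing to the next bucket; tcp_seq_stop() drops it at the end of a
 * read batch.
 */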
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

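/*
 * tcp_seek_last_pos() resumes iteration from the bucket and offset recorded
 * in the iterator state instead of replaying the walk from position zero.
 * Without it, each read() chunk of a large /proc/net/tcp would rescan every
 * preceding entry, turning the dump quadratic.
 */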
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			  sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

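/*
 * Open requests (embryonic connections still in the SYN table) are shown
 * as TCP_SYN_RECV entries.  They have no inode and only the expire timer,
 * hence the hard-coded fields in the format below.
 */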
static void get_openreq4(const struct sock *sk, const struct request_sock *req,
			 struct seq_file *f, int i, kuid_t uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f), uid),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

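/*
 * Each socket becomes one fixed-width line matching the header emitted in
 * tcp4_seq_show().  An illustrative (made-up) entry for a connection to
 * 127.0.0.1:22 might look like:
 *
 *   0: 0100007F:0016 0100007F:D2A4 01 00000000:00000000 00:00000000 00000000  1000 0 12345 2 ffff88003c241700 21 4 30 10 -1
 *
 * Addresses are printed as raw 32-bit values (so 127.0.0.1 shows as
 * 0100007F on little-endian machines) while ports are in host byte order.
 */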
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		sk->sk_state == TCP_LISTEN ?
		    (fastopenq ? fastopenq->max_qlen : 0) :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	__be32 dest, src;
	__u16 destp, srcp;
	s32 delta = tw->tw_ttd - inet_tw_time_stamp();

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (sk->sk_state == TCP_TIME_WAIT)
			get_timewait4_sock(v, seq, st->num);
		else
			get_tcp4_sock(v, seq, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
		break;
	}
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

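/*
 * tcp_prot is the protocol descriptor the socket layer dispatches through
 * for every AF_INET SOCK_STREAM socket; each entry below maps a generic
 * socket operation onto its TCP implementation.
 */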
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);

static int __net_init tcp_sk_init(struct net *net)
{
	net->ipv4.sysctl_tcp_ecn = 2;
	return 0;
}

static void __net_exit tcp_sk_exit(struct net *net)
{
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

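/*
 * Boot-time initialisation: set up the hash tables and register the
 * per-netns hooks above.  TCP is not optional, so a failure here is fatal
 * and the kernel panics.
 */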
void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}